#!pip install pandas
#!pip install numpy
#!pip install plotly
#!pip install scikit-learn
#!pip install optuna
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
pd.set_option('display.float_format', lambda x: '%.3f' % x)
data_CF = pd.read_csv(path_archivos + 'mmm.csv')
data_CF.info()
data_CF['Date'] = pd.to_datetime(data_CF['Date'])
data_desc = data_CF.describe().T
data_desc['cv'] = (data_desc['std'] / data_desc['mean']).round(3)
data_desc.sort_values(by='cv', ascending=False, inplace=True)
data_desc
medios = ['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']
fig = make_subplots(rows=7, cols=1, subplot_titles=medios)
for i, medio in enumerate(medios):
fig.add_trace(go.Scatter(x=data_CF['Date'], y=data_CF[medio], name=medio), row=i+1, col=1)
fig.update_layout(width=1200, height=800)
fig.show()
Histogramas de inversiones
def plot_histogram(medio):
plot = px.histogram( data_CF[data_CF[medio] != 0],
x=medio,
marginal="box",
title="Histograma de " + medio)
return plot
plot_histogram('PayTV')
plot_histogram('OpenTV')
plot_histogram('Facebook')
plot_histogram('Radio')
plot_histogram('Google')
plot_histogram('Print')
plot_histogram('Email')
px.pie(pd.melt( data_CF,
id_vars=['Date'],
value_vars=['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV'],
var_name= "Medio",
value_name= "Inversion"),
values='Inversion',
names='Medio',
title='Share of Investment by Media')
data_CF['year_month'] = data_CF['Date'].dt.strftime('%Y-%m')
data_CF['InversionTotal'] = data_CF[['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']].sum(axis=1)
data_CF['month'] = data_CF['Date'].dt.month
data_CF['year'] = data_CF['Date'].dt.year
data_CF_com_year = data_CF.groupby(['month'], as_index = False )['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV','InversionTotal'].sum()
medios = ['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']
px.bar(data_CF_com_year, x= 'month', y = medios)
Correlacion de Inversion y Ventas
fig = make_subplots(specs=[[{"secondary_y": True}]])
data_tem = data_CF.groupby(['year_month'], as_index = False )['InversionTotal', 'Sales'].sum()
fig.add_trace(go.Bar(x=data_tem['year_month'],
y=data_tem['Sales'],
marker_color='LightSkyBlue',
name='Sales'), secondary_y=False)
fig.add_trace(go.Scatter(x=data_tem['year_month'],
y=data_tem['InversionTotal'],
marker_color='MediumPurple',
name='InversionTotal'), secondary_y=True)
fig.update_layout( title='Inversion en medios vs Ventas',
xaxis_title='Date',
yaxis_title='Inversion Total',
barmode='group',
# plot_bgcolor='white'
)
fig.show()
data_CF['year'] = data_CF['year'].astype(str)
data_CF['InversionTotal'] = data_CF[['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']].sum(axis=1)
import plotly.figure_factory as ff
df = data_CF[['Sales','Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']].corr().round(2)
fig = ff.create_annotated_heatmap( df.values.tolist(),
x=df.columns.values.tolist(),
y=df.index.values.tolist(),
colorscale='Viridis')
fig.update_layout(title_text='Correlacion')
fig['data'][0]['showscale'] = True
fig.show()
data_CF = data_CF.set_index('Date')
data_CF
data_model = data_CF[['OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email', 'Sales']]
data_model.head(3)
from sklearn.model_selection import train_test_split
train, test = train_test_split(data_model, test_size=0.2, random_state=10)
distributions= np.array([len(train), len(test)])
print(distributions)
from scipy.signal import convolve2d
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted, check_array
class ExponentialSaturation (BaseEstimator, TransformerMixin):
def __init__ (self, a = 1.):
self.a = a
def fit(self, X, y = None):
X = check_array(X)
self._check_n_features(X, reset = True)
return self
def transform (self, X):
check_is_fitted (self)
X = check_array (X)
self._check_n_features (X, reset = False)
# Adbudg
#data_CF2.OpenTV/(data_CF2.OpenTV + (data_CF2.OpenTV.mean() * rho))
# rho = self.a
#return (X / (X + (X[X != 0].mean() * self.a)))
return (X / (X + (X.mean() * self.a)))
# exponencial
#return 1 - np.exp (-self.a * X)
from sklearn.base import BaseEstimator, TransformerMixin
class ExponentialCarryover(BaseEstimator, TransformerMixin):
def __init__(self, strength = 0.5, length = 1):
self.strength = strength
self.length = length
def fit(self, X, y = None):
X = check_array(X)
self._check_n_features(X, reset=True)
self.sliding_window_ = (
self.strength ** np.arange(self.length + 1)
).reshape(-1, 1)
return self
def transform(self, X: np.ndarray):
check_is_fitted(self)
X = check_array(X)
self._check_n_features(X, reset=False)
convolution = convolve2d(X, self.sliding_window_)
if self.length > 0:
convolution = convolution[: -self.length]
return convolution
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
'OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email'
adstock = ColumnTransformer([
('OpenTV_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['OpenTV']),
('PayTV_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['PayTV']),
('Radio_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['Radio']),
('Print_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['Print']),
('Facebook_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['Facebook']),
('Google_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['Google']),
('Email_pipe', Pipeline([
('carryover', ExponentialCarryover()),
('saturation', ExponentialSaturation())
]), ['Email']),
])
model = Pipeline([
('adstock', adstock),
('regression', LinearRegression())
])
from sklearn import set_config
set_config(display="diagram")
model
X_train = train.drop(columns='Sales')
Y_train = train['Sales']
model.fit(X_train,Y_train)
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
x = data_model.drop(columns='Sales')
y = data_model['Sales']
print(cross_val_score(model, x, y, cv=TimeSeriesSplit()).mean())
print(x.columns)
print('Coeficientes :', model[1].coef_)
print('Intersección :', model[1].intercept_)
# Optuna nos ayuda a encontrar los mejores parametros
from optuna.integration import OptunaSearchCV
from optuna.distributions import UniformDistribution, IntUniformDistribution
tuned_model = OptunaSearchCV(
estimator=model,
param_distributions={
'adstock__OpenTV_pipe__carryover__strength':UniformDistribution(0, 1), #Probara con valores de 0, 1
'adstock__OpenTV_pipe__carryover__length':IntUniformDistribution(0, 6), #Probara con valores de 0, 6
'adstock__OpenTV_pipe__saturation__a': UniformDistribution(0.1,1), #Probara con valores de 0, 1
'adstock__PayTV_pipe__carryover__strength':UniformDistribution(0, 1),
'adstock__PayTV_pipe__carryover__length':IntUniformDistribution(0, 6),
'adstock__PayTV_pipe__saturation__a': UniformDistribution(0.1,1),
'adstock__Radio_pipe__carryover__strength':UniformDistribution(0, 1),
'adstock__Radio_pipe__carryover__length':IntUniformDistribution(0, 6),
'adstock__Radio_pipe__saturation__a':UniformDistribution(0.1, 1),
'adstock__Print_pipe__carryover__strength':UniformDistribution(0, 1),
'adstock__Print_pipe__carryover__length':IntUniformDistribution(0, 6),
'adstock__Print_pipe__saturation__a':UniformDistribution(0.1, 1),
'adstock__Facebook_pipe__carryover__strength':UniformDistribution(0, 1),
'adstock__Facebook_pipe__carryover__length':IntUniformDistribution(0, 3),
'adstock__Facebook_pipe__saturation__a':UniformDistribution(0.1, 1),
'adstock__Google_pipe__carryover__strength':UniformDistribution(0, 1),
'adstock__Google_pipe__carryover__length':IntUniformDistribution(0, 3),
'adstock__Google_pipe__saturation__a':UniformDistribution(0.1, 1),
'adstock__Email_pipe__carryover__strength':UniformDistribution(0, 1),
'adstock__Email_pipe__carryover__length':IntUniformDistribution(0, 4),
'adstock__Email_pipe__saturation__a':UniformDistribution(0.1, 1),
},
n_trials=1000,
cv=TimeSeriesSplit(),
random_state=0
)
cross_val_score(tuned_model, x, y, cv = TimeSeriesSplit ()).mean()
tuned_model.fit(x,y)
tuned_model.best_params_
pd.set_option('display.max_rows', 30)
best_params = pd.DataFrame({'params': (tuned_model.best_params_).keys(),
'values' : (tuned_model.best_params_).values()})
best_params[['Modelo','Medio','Feature','Parametro']] = best_params.params.str.split('__', expand = True)
best_params
best_params.to_csv('best_params.csv', index = False)
print(tuned_model.best_estimator_.named_steps ['regression']. coef_)
coefbestparam = pd.DataFrame({'Medio' : list(data_model.columns)[:7],
'Valores' : tuned_model.best_estimator_.named_steps ['regression']. coef_})
base = {'Medio':'base', 'Valores' : tuned_model.best_estimator_.named_steps ['regression'].intercept_}
coefbestparam = coefbestparam.append(base, ignore_index=True)
coefbestparam.to_csv('coefbestparam.csv', index = False)
print(tuned_model.best_estimator_.named_steps ['regression'].intercept_)
data_model[[medio + '_sat' for medio in ['OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email']]] = tuned_model.best_estimator_.named_steps['adstock'].transform(x)
data_model.to_csv('DataSaturada.csv')
df2 = data_model[['Sales','OpenTV_sat', 'PayTV_sat', 'Radio_sat', 'Print_sat', 'Facebook_sat', 'Google_sat', 'Email_sat']].corr().round(2)
fig = ff.create_annotated_heatmap( df2.values.tolist(),
x=df2.columns.values.tolist(),
y=df2.index.values.tolist(),
colorscale='Viridis')
fig.update_layout(title_text='Correlacion')
fig['data'][0]['showscale'] = True
fig.show()
fig = go.Figure()
for i, medio in enumerate(medios):
medio_sat = medio+'_sat'
data_adstock = data_model[data_model[medio_sat] != 0]
fig.add_trace(go.Scatter(x=data_adstock[medio],
y=data_adstock[medio_sat],
mode='markers',
name=medio))
fig.show()
fig = make_subplots(rows=7, cols=1, subplot_titles=medios)
for i, medio in enumerate(medios):
medio_sat = medio+'_sat'
data_adstock = data_model[data_model[medio_sat] != 0]
fig.add_trace(go.Scatter(x=data_adstock[medio], y=data_adstock[medio_sat], mode='markers', name=medio), row=i+1, col=1)
fig.update_layout(width=800, height=3000)
fig.show()
data_model.to_csv('DataSaturada.csv')
adstock_data = pd.DataFrame(
tuned_model.best_estimator_.named_steps['adstock'].transform(x),
columns=x.columns,
index=x.index
)
adstock_data.to_csv('adstock_data.csv')
weights = pd.Series(
tuned_model.best_estimator_.named_steps['regression'].coef_,
index=x.columns
)
weights
weights.to_csv('weights.csv')
adstock_data = pd.DataFrame(
tuned_model.best_estimator_.named_steps['adstock'].transform(x),
columns=x.columns,
index=x.index
)
weights = pd.Series(
tuned_model.best_estimator_.named_steps['regression'].coef_,
index=x.columns
)
base = tuned_model.best_estimator_.named_steps['regression'].intercept_
unadj_contributions = adstock_data.mul(weights).assign(Base=base)
adj_contributions = (unadj_contributions
.div(unadj_contributions.sum(axis=1), axis=0)
.mul(y, axis=0)
)
adj_contributions.to_csv('adj_contributions.csv')
adj_contributions = adj_contributions.reset_index()
inversion_melt = pd.melt(adj_contributions,
id_vars = 'Date',
value_vars = ['OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email'],
var_name='Medios',
value_name='Inversion')
inversion_melt.head()
atribucion_ventas_medio = pd.read_csv('./adj_contributions.csv')
atribucion_ventas_medio['Date'] = pd.to_datetime(atribucion_ventas_medio['Date'])
atribucion_ventas_medio = atribucion_ventas_medio.set_index('Date')
atribucion_ventas_medio = atribucion_ventas_medio.rename(columns=lambda x: x+'_revenue')
atribucion_ventas_medio = atribucion_ventas_medio.drop(columns=['Base_revenue'])
investment_medio = pd.read_csv('./Data/mmm.csv')
investment_medio['Date'] = pd.to_datetime(investment_medio['Date'])
investment_medio = investment_medio.set_index('Date')
investment_medio = investment_medio.rename(columns=lambda x: x+'_investment')
investment_medio = investment_medio.drop(columns=['Sales_investment'])
atribucion_investment_medio = pd.merge(investment_medio, atribucion_ventas_medio, how='left', left_index=True, right_index=True)
atribucion_investment_medio
ROI Analisis
# calcular el ROI para cada medio
atribucion_investment_medio = pd.merge(investment_medio, atribucion_ventas_medio, how='inner', left_index=True, right_index=True)
amedios3 = ['OpenTV', 'Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV']
for medio in amedios3:
atribucion_investment_medio[medio+'_ROI'] = (atribucion_investment_medio[medio+'_revenue'] - atribucion_investment_medio[medio+'_investment'])/atribucion_investment_medio[medio+'_investment']
amedios4 = ['OpenTV', 'Print', 'Facebook', 'Google', 'PayTV', 'Email', 'Radio']
medio_select = amedios4[1]
atribucion_investment_medio = atribucion_investment_medio[atribucion_investment_medio[medio_select+'_investment'] != 0]
roi_plot = make_subplots(specs=[[{"secondary_y": True}]])
# revenue vs investment
roi_plot.add_trace(go.Scatter( x=atribucion_investment_medio[medio_select+'_investment'],
y=atribucion_investment_medio[medio_select+'_revenue'],
mode='markers',
name='Investment vs Revenue'),
secondary_y=False)
# revenue vs ROI
roi_plot.add_trace(go.Scatter( x=atribucion_investment_medio[medio_select+'_investment'],
y=atribucion_investment_medio[medio_select+'_ROI'],
mode='markers',
name='Investment vs ROI'),
secondary_y=True)
roi_plot.update_layout( #title_text=medio_select+' ROI',
yaxis_title='Revenue',
yaxis2_title='ROI',
xaxis_title='Investment')
roi_plot.show()