Marketing Mix Model Project

#!pip install pandas #!pip install numpy #!pip install plotly #!pip install scikit-learn #!pip install optuna

import pandas as pd import numpy as np import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import os pd.set_option('display.float_format', lambda x: '%.3f' % x)

data_CF = pd.read_csv(path_archivos + 'mmm.csv')

data_CF.info()

data_CF['Date'] = pd.to_datetime(data_CF['Date'])

data_desc = data_CF.describe().T data_desc['cv'] = (data_desc['std'] / data_desc['mean']).round(3) data_desc.sort_values(by='cv', ascending=False, inplace=True) data_desc

medios = ['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']

fig = make_subplots(rows=7, cols=1, subplot_titles=medios) for i, medio in enumerate(medios): fig.add_trace(go.Scatter(x=data_CF['Date'], y=data_CF[medio], name=medio), row=i+1, col=1) fig.update_layout(width=1200, height=800) fig.show()

Histogramas de inversiones

def plot_histogram(medio): plot = px.histogram( data_CF[data_CF[medio] != 0], x=medio, marginal="box", title="Histograma de " + medio) return plot

plot_histogram('PayTV')

plot_histogram('OpenTV')

plot_histogram('Facebook')

plot_histogram('Radio')

plot_histogram('Google')

plot_histogram('Print')

plot_histogram('Email')

px.pie(pd.melt( data_CF, id_vars=['Date'], value_vars=['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV'], var_name= "Medio", value_name= "Inversion"), values='Inversion', names='Medio', title='Share of Investment by Media')

data_CF['year_month'] = data_CF['Date'].dt.strftime('%Y-%m')

data_CF['InversionTotal'] = data_CF[['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']].sum(axis=1) data_CF['month'] = data_CF['Date'].dt.month data_CF['year'] = data_CF['Date'].dt.year

data_CF_com_year = data_CF.groupby(['month'], as_index = False )['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV','InversionTotal'].sum() medios = ['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV'] px.bar(data_CF_com_year, x= 'month', y = medios)

Correlacion de Inversion y Ventas

fig = make_subplots(specs=[[{"secondary_y": True}]]) data_tem = data_CF.groupby(['year_month'], as_index = False )['InversionTotal', 'Sales'].sum() fig.add_trace(go.Bar(x=data_tem['year_month'], y=data_tem['Sales'], marker_color='LightSkyBlue', name='Sales'), secondary_y=False) fig.add_trace(go.Scatter(x=data_tem['year_month'], y=data_tem['InversionTotal'], marker_color='MediumPurple', name='InversionTotal'), secondary_y=True) fig.update_layout( title='Inversion en medios vs Ventas', xaxis_title='Date', yaxis_title='Inversion Total', barmode='group', # plot_bgcolor='white' ) fig.show()

data_CF['year'] = data_CF['year'].astype(str) data_CF['InversionTotal'] = data_CF[['Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']].sum(axis=1)

import plotly.figure_factory as ff df = data_CF[['Sales','Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV', 'OpenTV']].corr().round(2) fig = ff.create_annotated_heatmap( df.values.tolist(), x=df.columns.values.tolist(), y=df.index.values.tolist(), colorscale='Viridis') fig.update_layout(title_text='Correlacion') fig['data'][0]['showscale'] = True fig.show()

data_CF = data_CF.set_index('Date') data_CF

data_model = data_CF[['OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email', 'Sales']] data_model.head(3)

from sklearn.model_selection import train_test_split train, test = train_test_split(data_model, test_size=0.2, random_state=10) distributions= np.array([len(train), len(test)]) print(distributions)

from scipy.signal import convolve2d from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted, check_array class ExponentialSaturation (BaseEstimator, TransformerMixin): def __init__ (self, a = 1.): self.a = a def fit(self, X, y = None): X = check_array(X) self._check_n_features(X, reset = True) return self def transform (self, X): check_is_fitted (self) X = check_array (X) self._check_n_features (X, reset = False) # Adbudg #data_CF2.OpenTV/(data_CF2.OpenTV + (data_CF2.OpenTV.mean() * rho)) # rho = self.a #return (X / (X + (X[X != 0].mean() * self.a))) return (X / (X + (X.mean() * self.a))) # exponencial #return 1 - np.exp (-self.a * X)

from sklearn.base import BaseEstimator, TransformerMixin class ExponentialCarryover(BaseEstimator, TransformerMixin): def __init__(self, strength = 0.5, length = 1): self.strength = strength self.length = length def fit(self, X, y = None): X = check_array(X) self._check_n_features(X, reset=True) self.sliding_window_ = ( self.strength ** np.arange(self.length + 1) ).reshape(-1, 1) return self def transform(self, X: np.ndarray): check_is_fitted(self) X = check_array(X) self._check_n_features(X, reset=False) convolution = convolve2d(X, self.sliding_window_) if self.length > 0: convolution = convolution[: -self.length] return convolution

from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.linear_model import LinearRegression

'OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email' adstock = ColumnTransformer([ ('OpenTV_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['OpenTV']), ('PayTV_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['PayTV']), ('Radio_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['Radio']), ('Print_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['Print']), ('Facebook_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['Facebook']), ('Google_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['Google']), ('Email_pipe', Pipeline([ ('carryover', ExponentialCarryover()), ('saturation', ExponentialSaturation()) ]), ['Email']), ]) model = Pipeline([ ('adstock', adstock), ('regression', LinearRegression()) ])

from sklearn import set_config set_config(display="diagram") model

X_train = train.drop(columns='Sales') Y_train = train['Sales']

model.fit(X_train,Y_train)

from sklearn.model_selection import cross_val_score, TimeSeriesSplit x = data_model.drop(columns='Sales') y = data_model['Sales'] print(cross_val_score(model, x, y, cv=TimeSeriesSplit()).mean())

print(x.columns) print('Coeficientes :', model[1].coef_) print('Intersección :', model[1].intercept_)

# Optuna nos ayuda a encontrar los mejores parametros from optuna.integration import OptunaSearchCV from optuna.distributions import UniformDistribution, IntUniformDistribution

tuned_model = OptunaSearchCV( estimator=model, param_distributions={ 'adstock__OpenTV_pipe__carryover__strength':UniformDistribution(0, 1), #Probara con valores de 0, 1 'adstock__OpenTV_pipe__carryover__length':IntUniformDistribution(0, 6), #Probara con valores de 0, 6 'adstock__OpenTV_pipe__saturation__a': UniformDistribution(0.1,1), #Probara con valores de 0, 1 'adstock__PayTV_pipe__carryover__strength':UniformDistribution(0, 1), 'adstock__PayTV_pipe__carryover__length':IntUniformDistribution(0, 6), 'adstock__PayTV_pipe__saturation__a': UniformDistribution(0.1,1), 'adstock__Radio_pipe__carryover__strength':UniformDistribution(0, 1), 'adstock__Radio_pipe__carryover__length':IntUniformDistribution(0, 6), 'adstock__Radio_pipe__saturation__a':UniformDistribution(0.1, 1), 'adstock__Print_pipe__carryover__strength':UniformDistribution(0, 1), 'adstock__Print_pipe__carryover__length':IntUniformDistribution(0, 6), 'adstock__Print_pipe__saturation__a':UniformDistribution(0.1, 1), 'adstock__Facebook_pipe__carryover__strength':UniformDistribution(0, 1), 'adstock__Facebook_pipe__carryover__length':IntUniformDistribution(0, 3), 'adstock__Facebook_pipe__saturation__a':UniformDistribution(0.1, 1), 'adstock__Google_pipe__carryover__strength':UniformDistribution(0, 1), 'adstock__Google_pipe__carryover__length':IntUniformDistribution(0, 3), 'adstock__Google_pipe__saturation__a':UniformDistribution(0.1, 1), 'adstock__Email_pipe__carryover__strength':UniformDistribution(0, 1), 'adstock__Email_pipe__carryover__length':IntUniformDistribution(0, 4), 'adstock__Email_pipe__saturation__a':UniformDistribution(0.1, 1), }, n_trials=1000, cv=TimeSeriesSplit(), random_state=0 )

cross_val_score(tuned_model, x, y, cv = TimeSeriesSplit ()).mean()

tuned_model.fit(x,y)

tuned_model.best_params_

pd.set_option('display.max_rows', 30)

best_params = pd.DataFrame({'params': (tuned_model.best_params_).keys(), 'values' : (tuned_model.best_params_).values()})

best_params[['Modelo','Medio','Feature','Parametro']] = best_params.params.str.split('__', expand = True) best_params

best_params.to_csv('best_params.csv', index = False)

print(tuned_model.best_estimator_.named_steps ['regression']. coef_)

coefbestparam = pd.DataFrame({'Medio' : list(data_model.columns)[:7], 'Valores' : tuned_model.best_estimator_.named_steps ['regression']. coef_}) base = {'Medio':'base', 'Valores' : tuned_model.best_estimator_.named_steps ['regression'].intercept_} coefbestparam = coefbestparam.append(base, ignore_index=True) coefbestparam.to_csv('coefbestparam.csv', index = False)

print(tuned_model.best_estimator_.named_steps ['regression'].intercept_)

data_model[[medio + '_sat' for medio in ['OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email']]] = tuned_model.best_estimator_.named_steps['adstock'].transform(x)

data_model.to_csv('DataSaturada.csv')

df2 = data_model[['Sales','OpenTV_sat', 'PayTV_sat', 'Radio_sat', 'Print_sat', 'Facebook_sat', 'Google_sat', 'Email_sat']].corr().round(2) fig = ff.create_annotated_heatmap( df2.values.tolist(), x=df2.columns.values.tolist(), y=df2.index.values.tolist(), colorscale='Viridis') fig.update_layout(title_text='Correlacion') fig['data'][0]['showscale'] = True fig.show()

fig = go.Figure() for i, medio in enumerate(medios): medio_sat = medio+'_sat' data_adstock = data_model[data_model[medio_sat] != 0] fig.add_trace(go.Scatter(x=data_adstock[medio], y=data_adstock[medio_sat], mode='markers', name=medio)) fig.show()

fig = make_subplots(rows=7, cols=1, subplot_titles=medios) for i, medio in enumerate(medios): medio_sat = medio+'_sat' data_adstock = data_model[data_model[medio_sat] != 0] fig.add_trace(go.Scatter(x=data_adstock[medio], y=data_adstock[medio_sat], mode='markers', name=medio), row=i+1, col=1) fig.update_layout(width=800, height=3000) fig.show()

data_model.to_csv('DataSaturada.csv')

adstock_data = pd.DataFrame( tuned_model.best_estimator_.named_steps['adstock'].transform(x), columns=x.columns, index=x.index )

adstock_data.to_csv('adstock_data.csv')

weights = pd.Series( tuned_model.best_estimator_.named_steps['regression'].coef_, index=x.columns ) weights

weights.to_csv('weights.csv')

adstock_data = pd.DataFrame( tuned_model.best_estimator_.named_steps['adstock'].transform(x), columns=x.columns, index=x.index ) weights = pd.Series( tuned_model.best_estimator_.named_steps['regression'].coef_, index=x.columns ) base = tuned_model.best_estimator_.named_steps['regression'].intercept_ unadj_contributions = adstock_data.mul(weights).assign(Base=base) adj_contributions = (unadj_contributions .div(unadj_contributions.sum(axis=1), axis=0) .mul(y, axis=0) )

adj_contributions.to_csv('adj_contributions.csv')

adj_contributions = adj_contributions.reset_index()

inversion_melt = pd.melt(adj_contributions, id_vars = 'Date', value_vars = ['OpenTV', 'PayTV', 'Radio', 'Print', 'Facebook', 'Google', 'Email'], var_name='Medios', value_name='Inversion') inversion_melt.head()

atribucion_ventas_medio = pd.read_csv('./adj_contributions.csv') atribucion_ventas_medio['Date'] = pd.to_datetime(atribucion_ventas_medio['Date']) atribucion_ventas_medio = atribucion_ventas_medio.set_index('Date') atribucion_ventas_medio = atribucion_ventas_medio.rename(columns=lambda x: x+'_revenue') atribucion_ventas_medio = atribucion_ventas_medio.drop(columns=['Base_revenue']) investment_medio = pd.read_csv('./Data/mmm.csv') investment_medio['Date'] = pd.to_datetime(investment_medio['Date']) investment_medio = investment_medio.set_index('Date') investment_medio = investment_medio.rename(columns=lambda x: x+'_investment') investment_medio = investment_medio.drop(columns=['Sales_investment']) atribucion_investment_medio = pd.merge(investment_medio, atribucion_ventas_medio, how='left', left_index=True, right_index=True) atribucion_investment_medio

ROI Analisis

# calcular el ROI para cada medio atribucion_investment_medio = pd.merge(investment_medio, atribucion_ventas_medio, how='inner', left_index=True, right_index=True) amedios3 = ['OpenTV', 'Print', 'Email', 'Radio', 'Facebook', 'Google', 'PayTV'] for medio in amedios3: atribucion_investment_medio[medio+'_ROI'] = (atribucion_investment_medio[medio+'_revenue'] - atribucion_investment_medio[medio+'_investment'])/atribucion_investment_medio[medio+'_investment'] amedios4 = ['OpenTV', 'Print', 'Facebook', 'Google', 'PayTV', 'Email', 'Radio'] medio_select = amedios4[1] atribucion_investment_medio = atribucion_investment_medio[atribucion_investment_medio[medio_select+'_investment'] != 0] roi_plot = make_subplots(specs=[[{"secondary_y": True}]]) # revenue vs investment roi_plot.add_trace(go.Scatter( x=atribucion_investment_medio[medio_select+'_investment'], y=atribucion_investment_medio[medio_select+'_revenue'], mode='markers', name='Investment vs Revenue'), secondary_y=False) # revenue vs ROI roi_plot.add_trace(go.Scatter( x=atribucion_investment_medio[medio_select+'_investment'], y=atribucion_investment_medio[medio_select+'_ROI'], mode='markers', name='Investment vs ROI'), secondary_y=True) roi_plot.update_layout( #title_text=medio_select+' ROI', yaxis_title='Revenue', yaxis2_title='ROI', xaxis_title='Investment') roi_plot.show()

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Histogramas de inversiones

Correlacion de Inversion y Ventas

ROI Analisis

Histogramas de inversiones