!ls /datasets/datahub/ml-repo/multi-touch-attribution
import itertools
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from PIL import Image
from gekko import GEKKO
from pandas_profiling import ProfileReport
%matplotlib inline
# Importing the dataset (read_csv already returns a DataFrame, so no extra conversion is needed)
data = pd.read_csv("/datasets/datahub/ml-repo/multi-touch-attribution/attribution_data.csv")
data
# Creating an exploratory data analysis report using the pandas-profiling module
profile = ProfileReport(data, title='Attribution Data Report', explorative=True)
# Displaying the report
profile.to_widgets()
profile.to_file("Report.html")  # Saving the same report as an HTML file in the working directory
# Here's an image for a better understanding of the last-touch attribution model
im = Image.open(requests.get('https://adidodigital.s3.amazonaws.com/Blogs-2020/last%20touch%20attribution%20model.jpg', stream=True).raw)
im
def last_touch_model(dt, conv_col, channel_col):  # Defining the function for the last-touch model
    last_touch = dt.loc[dt[conv_col] == 1]  # Extracting the rows where conversion is 1
    # Calculating the weightage: the share of conversions whose last touch was each channel
    res_last_touch = pd.DataFrame(round(last_touch[channel_col].value_counts(normalize=True) * 100, 2))
    res_last_touch.columns = ['Weightage(%)']
    return res_last_touch
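# A quick sanity check on a toy dataframe (made-up values, not from the
# attribution dataset): the two converting touches are 'Paid Search' and
# 'Facebook', so each should receive a weightage of 50%.
toy = pd.DataFrame({'channel': ['Display', 'Paid Search', 'Facebook'],
                    'conversion': [0, 1, 1]})
last_touch_model(toy, 'conversion', 'channel')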
last_touch=last_touch_model(data, 'conversion', 'channel') #Calling the last touch model function
last_touch
#Plotting the weightage with respect to channels
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=last_touch.index, data=last_touch)
plt.show()
# Here's an image for a better understanding of the first-touch attribution model
im = Image.open(requests.get('https://adidodigital.s3.amazonaws.com/Blogs-2020/first-touch.jpg', stream=True).raw)
im
def first_touch_model(dt, conv_col, channel_col, user_id):
    temp = dt.loc[dt[conv_col] == 1]  # Keeping the rows where conversion is 1
    # Grouping by cookie and keeping only the first touch of every cookie, one row per cookie
    # (assumes the rows of each cookie are in chronological order)
    first_touch = dt.groupby(user_id).first()
    cookie_index = temp[user_id].unique()  # The converting cookies, each counted once
    mid_first_touch = first_touch.loc[cookie_index]  # Locating the converting cookies in the first-touch dataframe
    res_first_touch = pd.DataFrame(round(mid_first_touch[channel_col].value_counts(normalize=True) * 100, 2))  # Calculating the weightage
    res_first_touch.columns = ['Weightage(%)']
    return res_first_touch
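# Sanity check on a toy dataframe (made-up values): u1's path is
# Display -> Facebook and u2's is Facebook only, so the first touches are
# Display and Facebook and each channel should get 50%.
toy = pd.DataFrame({'cookie': ['u1', 'u1', 'u2'],
                    'channel': ['Display', 'Facebook', 'Facebook'],
                    'conversion': [0, 1, 1]})
first_touch_model(toy, 'conversion', 'channel', 'cookie')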
first_touch = first_touch_model(data, 'conversion', 'channel', 'cookie')
first_touch
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=first_touch.index, data=first_touch)
plt.show()
# Here's an image for a better understanding of the last-non-direct attribution model
im = Image.open(requests.get('https://cloudrock.asia/wp-content/uploads/2016/10/Last-Non-Direct-Click-Attribution-Modelling.png', stream=True).raw)
im
def last_non_direct_model(dt, conv_col, channel_col, user_id):
    slp = pd.DataFrame(dt.groupby(user_id).tail(2))  # Grouping by cookie and keeping the last two observations of each cookie
    temp = slp.loc[slp[conv_col] == 1]
    last_non_direct = slp.groupby(user_id).first()  # The touch just before the last one (or the only touch), one row per cookie
    cookie_index = temp[user_id].unique()
    mid_last_non_direct = last_non_direct.loc[cookie_index]
    res_last_non_direct = pd.DataFrame(round(mid_last_non_direct[channel_col].value_counts(normalize=True) * 100, 2))
    res_last_non_direct.columns = ['Weightage(%)']
    return res_last_non_direct
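# Sanity check on a toy dataframe (made-up values): u1 converts on a direct
# visit, so the credit should go to the previous touch, 'Paid Search'.
toy = pd.DataFrame({'cookie': ['u1', 'u1', 'u1'],
                    'channel': ['Facebook', 'Paid Search', 'Direct'],
                    'conversion': [0, 0, 1]})
last_non_direct_model(toy, 'conversion', 'channel', 'cookie')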
last_non_direct = last_non_direct_model(data, 'conversion', 'channel', 'cookie')
last_non_direct
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=last_non_direct.index, data=last_non_direct)
plt.show()
# Here's an image for a better understanding of the linear attribution model
im = Image.open(requests.get('https://www.marketingevolution.com/hs-fs/hubfs/linear%20attribution%20model.png?width=525&name=linear%20attribution%20model.png', stream=True).raw)
im
def linear_model(dt, conv_col, channel_col, user_id):
    # Keeping the data of only those users who convert in the end
    pd.options.mode.chained_assignment = None  # Silencing pandas chained-assignment warnings
    dt = dt.copy()  # Working on a copy so the caller's dataframe is not modified
    temp = dt.loc[dt[conv_col] == 1]
    cookie_index = list(temp[user_id])
    dt['new'] = dt[user_id].isin(cookie_index)
    dt_conv = dt[dt['new']]
    count = Counter(dt_conv[user_id])
    dt_conv['clicks'] = dt_conv[user_id].map(count)  # Click count is the total number of channels visited by a user
    dt_conv = dt_conv.assign(click_per=lambda x: round(100 / x['clicks'], 2))  # Splitting the 100% equally across the touches of each path
    # Getting the mean weightage of every channel
    res_linear = dt_conv.groupby(channel_col, as_index=False)['click_per'].mean()
    total = res_linear['click_per'].sum()
    res_linear['Weightage(%)'] = res_linear.apply(lambda x: round((x['click_per'] / total) * 100, 2), axis=1)
    res_linear.drop(['click_per'], inplace=True, axis=1)
    res_linear = res_linear.set_index(channel_col)
    res_linear.index.name = None
    return res_linear
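# Sanity check on a toy dataframe (made-up values): u1's two touches get 50%
# each and u2's single Facebook touch gets 100%, so the mean weightages are
# Display 50 and Facebook 75, which normalize to 40% and 60%.
toy = pd.DataFrame({'cookie': ['u1', 'u1', 'u2'],
                    'channel': ['Display', 'Facebook', 'Facebook'],
                    'conversion': [0, 1, 1]})
linear_model(toy, 'conversion', 'channel', 'cookie')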
linear = linear_model(data, 'conversion', 'channel', 'cookie') #Calling the linear model function
linear
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=linear.index, data=linear)
plt.show()
# Here's an image for a better understanding of the position-based (U-shaped) attribution model
im = Image.open(requests.get('https://www.marketingevolution.com/hs-fs/hubfs/U-shaped%20attribution%20model.png?width=525&name=U-shaped%20attribution%20model.png', stream=True).raw)
im
def calc_attribution(click_pos, total_clicks):
    # Assigning weightage according to the position of the channel
    default_att = 0.5  # If a user has visited only 2 channels, each gets half
    extreme_touch_att_1 = 0.4  # Weightage of the first and the last channel
    intermed_att_1 = 0.2  # Total weightage shared by the remaining channels
    if total_clicks == 2:
        return default_att
    elif total_clicks == 1:
        return 1
    else:
        if click_pos == total_clicks or click_pos == 1:
            return extreme_touch_att_1
        else:
            return intermed_att_1 / (total_clicks - 2)  # Splitting the remaining 20% equally among the middle channels
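# Hedged sanity check: for a four-touch path the position weights should be
# 40% / 10% / 10% / 40%, summing to 1.
print([calc_attribution(pos, 4) for pos in range(1, 5)])  # [0.4, 0.1, 0.1, 0.4]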
def u_shaped_model(dt, conv_col, channel_col, user_id):
    # Keeping the data of only those users who convert in the end
    pd.options.mode.chained_assignment = None  # Ignoring pandas chained-assignment warnings
    dt = dt.copy()  # Working on a copy so the caller's dataframe is not modified
    temp = dt.loc[dt[conv_col] == 1]
    cookie_index = list(temp[user_id])
    dt['new'] = dt[user_id].isin(cookie_index)
    dt_conv = dt[dt['new']]
    count = Counter(dt_conv[user_id])
    dt_conv['clicks'] = dt_conv[user_id].map(count)
    dt_conv['click_pos'] = dt_conv.groupby(user_id).cumcount() + 1  # Ranking the channels of each user_id by position
    dt_Ushaped = dt_conv
    dt_Ushaped['U_Shape'] = dt_conv.apply(lambda val: round(calc_attribution(val.click_pos, val.clicks) * 100, 2), axis=1)
    # Getting the mean weightage of every channel
    res_Ushaped = dt_Ushaped.groupby(channel_col, as_index=False)['U_Shape'].mean()
    total = res_Ushaped['U_Shape'].sum()
    res_Ushaped['Weightage(%)'] = res_Ushaped.apply(lambda x: round((x['U_Shape'] / total) * 100, 2), axis=1)
    res_Ushaped.drop(['U_Shape'], inplace=True, axis=1)
    res_Ushaped = res_Ushaped.set_index(channel_col)
    res_Ushaped.index.name = None
    return res_Ushaped
u_shaped = u_shaped_model(data, 'conversion', 'channel', 'cookie') #Calling u_shaped model function
u_shaped
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=u_shaped.index, data=u_shaped)
plt.show()
# Here's an image for a better understanding of the position decay attribution model
im = Image.open(requests.get('https://www.marketingevolution.com/hs-fs/hubfs/time%20decay%20attribution%20model.gif?width=525&name=time%20decay%20attribution%20model.gif', stream=True).raw)
im
def calc_decay_attribution(click_pos, total_clicks):  # Renamed so it does not shadow the U-shaped calc_attribution above
    rel_pos = total_clicks - click_pos
    attribution = pow(2, -rel_pos)  # Weighting each channel by a negative power of 2: the closer to the conversion, the higher the weight
    return attribution
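# Hedged sanity check: for a three-touch path the raw decay weights should be
# 0.25, 0.5 and 1.0, i.e. each step away from the conversion halves the weight.
print([calc_decay_attribution(pos, 3) for pos in range(1, 4)])  # [0.25, 0.5, 1.0]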
def pos_decay_model(dt, conv_col, channel_col, user_id):
    # Keeping the data of only those users who convert in the end
    pd.options.mode.chained_assignment = None
    dt = dt.copy()  # Working on a copy so the caller's dataframe is not modified
    temp = dt.loc[dt[conv_col] == 1]
    cookie_index = list(temp[user_id])
    dt['new'] = dt[user_id].isin(cookie_index)
    dt_conv = dt[dt['new']]
    count = Counter(dt_conv[user_id])
    dt_conv['clicks'] = dt_conv[user_id].map(count)
    dt_conv['click_pos'] = dt_conv.groupby(user_id).cumcount() + 1  # Ranking the channels of each user_id by position
    dt_conv['PosDecay'] = dt_conv.apply(lambda val: calc_decay_attribution(val.click_pos, val.clicks) * 100, axis=1)
    dt_pos_decay = dt_conv
    # Getting the mean weightage of every channel (using channel_col instead of a hard-coded column name)
    res_pos_decay = dt_pos_decay.groupby(channel_col, as_index=False)['PosDecay'].mean()
    total = res_pos_decay['PosDecay'].sum()
    res_pos_decay['Weightage(%)'] = res_pos_decay.apply(lambda x: round((x['PosDecay'] / total) * 100, 2), axis=1)
    res_pos_decay.drop(['PosDecay'], axis=1, inplace=True)
    res_pos_decay = res_pos_decay.set_index(channel_col)
    res_pos_decay.index.name = None
    return res_pos_decay
pos_decay = pos_decay_model(data, 'conversion', 'channel', 'cookie')
pos_decay
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=pos_decay.index, data=pos_decay)
plt.show()
# Here's an image for a better understanding of the Markov attribution model
im = Image.open(requests.get('https://media.geeksforgeeks.org/wp-content/uploads/Markov-chain-1.png', stream=True).raw)
im
def transition_states(list_of_paths):
    list_of_unique_channels = set(x for element in list_of_paths for x in element)
    transition_states = {x + '>' + y: 0 for x in list_of_unique_channels for y in list_of_unique_channels}  # Creating every possible transition between two channels
    # Counting the frequency of each transition across all user paths
    for possible_state in list_of_unique_channels:
        if possible_state not in ['Conversion', 'Null']:
            for user_path in list_of_paths:
                if possible_state in user_path:
                    indices = [i for i, s in enumerate(user_path) if possible_state in s]
                    for col in indices:
                        transition_states[user_path[col] + '>' + user_path[col + 1]] += 1
    return transition_states
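# Hedged toy check with two made-up paths: 'Start>A' should be counted twice,
# 'A>Conversion' and 'A>Null' once each.
toy_paths = [['Start', 'A', 'Conversion'], ['Start', 'A', 'Null']]
toy_counts = transition_states(toy_paths)
print({k: v for k, v in toy_counts.items() if v > 0})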
def transition_prob(trans_dict, list_of_paths):
    # Converting the transition counts into probabilities for each pair of channels
    list_of_unique_channels = set(x for element in list_of_paths for x in element)
    trans_prob = defaultdict(dict)
    keys = list(trans_dict)
    for state in list_of_unique_channels:
        if state not in ['Conversion', 'Null']:
            counter = 0
            index = [i for i, s in enumerate(keys) if state + '>' in s]
            for col in index:
                if trans_dict[keys[col]] > 0:
                    counter += trans_dict[keys[col]]
            for col in index:
                if trans_dict[keys[col]] > 0:
                    trans_prob[keys[col]] = trans_dict[keys[col]] / counter
    return trans_prob
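# Continuing the toy example above: every count is divided by the total number
# of transitions out of its origin state, so 'Start>A' should be 1.0 and
# 'A>Conversion' and 'A>Null' should be 0.5 each.
print(dict(transition_prob(toy_counts, toy_paths)))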
def transition_matrix(list_of_paths, transition_probabilities):
    # Building the transition matrix from the probabilities of each transition
    trans_matrix = pd.DataFrame()
    list_of_unique_channels = set(x for element in list_of_paths for x in element)
    for channel in list_of_unique_channels:
        trans_matrix[channel] = 0.00
        trans_matrix.loc[channel] = 0.00
        trans_matrix.loc[channel, channel] = 1.0 if channel in ['Conversion', 'Null'] else 0.0  # Conversion and Null are absorbing states
    for key, value in transition_probabilities.items():
        origin, destination = key.split('>')
        trans_matrix.at[origin, destination] = value
    return trans_matrix
def removal_effects(dt, conversion_rate):
    # Calculating each channel's removal effect: how much the conversion rate
    # would drop if the channel were removed from the transition matrix
    removal_effects_dict = {}
    channels = [channel for channel in dt.columns if channel not in ['Start', 'Null', 'Conversion']]
    for channel in channels:
        removal_dt = dt.drop(channel, axis=1).drop(channel, axis=0)
        for column in removal_dt.columns:
            row_sum = np.sum(list(removal_dt.loc[column]))
            null_pct = float(1) - row_sum
            if null_pct != 0:
                removal_dt.loc[column, 'Null'] = null_pct  # Redirecting the removed channel's probability mass to Null
        removal_dt.loc['Null', 'Null'] = 1.0
        removal_to_conv = removal_dt[['Null', 'Conversion']].drop(['Null', 'Conversion'], axis=0)
        removal_to_non_conv = removal_dt.drop(['Null', 'Conversion'], axis=1).drop(['Null', 'Conversion'], axis=0)
        # Conversion probability of the reduced chain via the fundamental matrix (I - Q)^-1
        removal_inv_diff = np.linalg.inv(np.identity(len(removal_to_non_conv.columns)) - np.asarray(removal_to_non_conv))
        removal_dot_prod = np.dot(removal_inv_diff, np.asarray(removal_to_conv))
        removal_cvr = pd.DataFrame(removal_dot_prod, index=removal_to_conv.index)[[1]].loc['Start'].values[0]  # Column 1 is 'Conversion'
        removal_effect = 1 - removal_cvr / conversion_rate
        removal_effects_dict[channel] = removal_effect
    return removal_effects_dict
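# Hedged illustration of the linear algebra above with made-up numbers: for an
# absorbing chain with transient states {Start, A} and absorbing states
# {Null, Conversion}, the absorption probabilities are (I - Q)^-1 R, where Q
# holds transient-to-transient and R transient-to-absorbing probabilities.
Q = np.array([[0.0, 1.0],    # Start -> A with probability 1
              [0.0, 0.0]])   # A never returns to a transient state
R = np.array([[0.0, 0.0],    # Start never jumps straight to Null or Conversion
              [0.4, 0.6]])   # A -> Null 40%, A -> Conversion 60%
print(np.dot(np.linalg.inv(np.identity(2) - Q), R))  # Row 0: P(Null), P(Conversion) from Start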
def markov_chain_allocations(removal_effects, total_conversions):
    # Distributing the total conversions in proportion to each channel's removal effect
    re_sum = np.sum(list(removal_effects.values()))
    return {k: (v / re_sum) * total_conversions for k, v in removal_effects.items()}
def markov_model(df, conv_col, channel_col, user_id):
    pd.options.mode.chained_assignment = None
    df = df.sort_values(user_id)
    # Collapsing each user's touches into a single path of unique channels
    df_paths = df.groupby(user_id)[channel_col].aggregate(lambda x: x.unique().tolist()).reset_index()
    df_last_interaction = df.drop_duplicates(user_id, keep='last')[[user_id, conv_col]]
    df_paths = pd.merge(df_paths, df_last_interaction, how='left', on=user_id)
    # Wrapping every path with a 'Start' state and a 'Conversion' or 'Null' absorbing state
    df_paths['start'] = [['Start'] for i in range(len(df_paths[conv_col]))]
    df_paths['buff'] = [['Conversion'] for i in range(len(df_paths[conv_col]))]
    df_paths['null'] = [['Null'] for i in range(len(df_paths[conv_col]))]
    df_paths['path'] = np.where(df_paths[conv_col] == 0,
                                df_paths['start'] + df_paths[channel_col] + df_paths['null'],
                                df_paths['start'] + df_paths[channel_col] + df_paths['buff'])
    df_paths = df_paths[[user_id, 'path']]
    list_of_paths = df_paths['path']
    total_conversions = sum(path.count('Conversion') for path in df_paths['path'])
    base_conversion_rate = total_conversions / len(list_of_paths)
    trans_states = transition_states(list_of_paths)
    trans_prob = transition_prob(trans_states, list_of_paths)
    trans_matrix = transition_matrix(list_of_paths, trans_prob)
    removal_effects_dict = removal_effects(trans_matrix, base_conversion_rate)  # Creating a dictionary of the removal effect of each channel
    attributions = markov_chain_allocations(removal_effects_dict, total_conversions)  # Allocating the conversions across channels
    res_markov = pd.DataFrame(attributions.values(), index=attributions.keys())
    res_markov.columns = ['weightage']
    total = res_markov['weightage'].sum()
    res_markov['Weightage(%)'] = res_markov.apply(lambda x: round((x['weightage'] / total) * 100, 2), axis=1)
    res_markov.drop(['weightage'], axis=1, inplace=True)
    res_markov = res_markov.sort_index()
    return res_markov
markov = markov_model(data, 'conversion', 'channel', 'cookie')
markov
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=markov.index, data=markov)
plt.show()
def power_set(List):
    # Creating the power set (every non-empty subset) of a given list
    PS = [list(j) for i in range(len(List)) for j in itertools.combinations(List, i + 1)]
    return PS
def subsets(s):
    '''
    This function returns all the possible subsets of a set of channels.
    input:
    - s: a set of channels.
    '''
    if len(s) == 1:
        return s
    else:
        sub_channels = []
        for i in range(1, len(s) + 1):
            sub_channels.extend(map(list, itertools.combinations(s, i)))
        return list(map(",".join, map(sorted, sub_channels)))
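# Hedged example of the helper: the non-empty subsets of {A, B} are A, B and
# A,B, each rendered as an alphabetically sorted comma-joined string.
print(subsets(['A', 'B']))  # ['A', 'B', 'A,B']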
def v_function(A, C_values):
    '''
    This function computes the worth of each coalition.
    inputs:
    - A: a coalition of channels.
    - C_values: a dictionary containing the number of conversions that each subset of channels has yielded.
    '''
    subsets_of_A = subsets(A)
    worth_of_A = 0
    for subset in subsets_of_A:
        if subset in C_values:
            worth_of_A += C_values[subset]
    return worth_of_A
def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)
def calculate_shapley(df, col_name):
    '''
    This function returns the Shapley values.
    - df: a dataframe with the two columns ['channels_subset', col_name].
    - col_name: the name of the column with the conversions in the dataframe.
    **Make sure that each value in channels_subset is in alphabetical order: 'Facebook,Paid Search' and
    'Paid Search,Facebook' are the same for this analysis and should be combined under 'Facebook,Paid Search'.
    ***Be careful with the number of distinct channels, because it can significantly slow this function down.
    '''
    c_values = df.set_index("channels_subset").to_dict()[col_name]
    df['channels'] = df['channels_subset'].apply(lambda x: x if len(x.split(",")) == 1 else np.nan)
    channels = list(df['channels'].dropna().unique())
    v_values = {}
    for A in power_set(channels):
        v_values[','.join(sorted(A))] = v_function(A, c_values)
    n = len(channels)
    shapley_values = defaultdict(int)
    for channel in channels:
        for A in v_values.keys():
            if channel not in A.split(","):
                cardinal_A = len(A.split(","))
                A_with_channel = A.split(",")
                A_with_channel.append(channel)
                A_with_channel = ",".join(sorted(A_with_channel))
                # Weight = |S|!(n-|S|-1)!/n!
                weight = factorial(cardinal_A) * factorial(n - cardinal_A - 1) / factorial(n)
                # Marginal contribution = v(S U {i}) - v(S)
                contrib = v_values[A_with_channel] - v_values[A]
                shapley_values[channel] += weight * contrib
        # Adding the term corresponding to the empty coalition: weight 1/n times v({i})
        shapley_values[channel] += v_values[channel] / n
    return shapley_values
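# Hedged sanity check with made-up counts: with c_values A=10, B=20 and A,B=5,
# the worths are v(A)=10, v(B)=20 and v(A,B)=35, and the Shapley values
# (A: 12.5, B: 22.5) sum to the worth of the grand coalition.
toy_subsets = pd.DataFrame({'channels_subset': ['A', 'B', 'A,B'],
                            'conversion_sum': [10, 20, 5]})
print(dict(calculate_shapley(toy_subsets, 'conversion_sum')))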
def shapley_model(df, conv_col, channel_col, user_id):
    # Keeping only the users who convert, so each path counts as one conversion
    # (the original version ignored conv_col and counted every user)
    converted = df.loc[df[conv_col] == 1, user_id].unique()
    df = df[df[user_id].isin(converted)]
    dt_paths = df.sort_values(channel_col).groupby(user_id)[channel_col].aggregate(lambda x: x.unique().tolist()).reset_index()
    dt_paths['channels'] = [str(x) for x in dt_paths[channel_col]]
    channel_count = Counter(dt_paths['channels'])
    channel_ct = pd.DataFrame(channel_count.items())
    channel_ct[0] = channel_ct[0].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", "").replace(", ", ","))
    channel_ct.columns = ['channels_subset', 'conversion_sum']
    attribution = calculate_shapley(channel_ct, 'conversion_sum')
    res_shapley = pd.DataFrame(attribution.values(), index=attribution.keys())
    res_shapley.columns = ['weightage']
    total = res_shapley['weightage'].sum()
    res_shapley['Weightage(%)'] = res_shapley.apply(lambda x: round((x['weightage'] / total) * 100, 2), axis=1)
    res_shapley.drop(['weightage'], axis=1, inplace=True)
    res_shapley = res_shapley.sort_index()
    return res_shapley
shapley=shapley_model(data, 'conversion', 'channel', 'cookie')
shapley
# Plotting the weightage with respect to channels
plt.subplots(figsize=(18, 6))
sns.barplot(y='Weightage(%)', x=shapley.index, data=shapley)
plt.show()
Combined_dataframe = pd.concat([last_touch,first_touch,last_non_direct,linear,u_shaped,pos_decay,markov,shapley],axis=1)
Combined_dataframe.columns=['Last-Touch','First-Touch','Last-Non-direct','Linear','U-shaped','Position Decay','Markov','Shapley']
Combined_dataframe['Mean'] = round(Combined_dataframe.mean(axis=1),2)
Combined_dataframe
# Sorting every model by channel so the bars and the tick labels line up
# (each result frame has its own row order)
channels = sorted(first_touch.index)
X_axis = np.arange(len(channels))
plt.bar(X_axis - 0.2, last_touch['Weightage(%)'].sort_index(), 0.2, label='last touch')
plt.bar(X_axis, first_touch['Weightage(%)'].sort_index(), 0.2, label='first touch')
plt.bar(X_axis + 0.2, last_non_direct['Weightage(%)'].sort_index(), 0.2, label='last non-direct touch')
plt.xticks(X_axis, channels)
plt.legend()
plt.show()
print()
X_axis = np.arange(len(channels))
plt.bar(X_axis - 0.2, linear['Weightage(%)'].sort_index(), 0.2, label='linear model')
plt.bar(X_axis, u_shaped['Weightage(%)'].sort_index(), 0.2, label='U-shaped model')
plt.bar(X_axis + 0.2, pos_decay['Weightage(%)'].sort_index(), 0.2, label='position decay model')
plt.xticks(X_axis, channels)
plt.legend()
plt.show()
print()
X_axis = np.arange(len(channels))
plt.bar(X_axis - 0.2, markov['Weightage(%)'].sort_index(), 0.4, label='Markov-chain model')
plt.bar(X_axis + 0.2, shapley['Weightage(%)'].sort_index(), 0.4, label='Shapley value model')
plt.xticks(X_axis, channels)
plt.legend()
plt.show()
m = GEKKO()
budget = int(input("Enter campaign budget: "))  # Taking the campaign budget as input
# Sorting by channel name so that the coefficients line up with the channel names below
coeff_A = Combined_dataframe['Mean'].sort_index().tolist()
ch_names = sorted(Combined_dataframe.index)
n = len(ch_names)
print()
# One spend variable per channel, with a lower bound of 100 and the total budget as the upper bound
x = [m.Var(lb=100, ub=budget) for _ in range(n)]
for j in range(n):
    print("Channel", j + 1, "should not exceed : ", end='')  # Taking the upper spending bound of each channel as input
    z = int(input())
    m.Equation(x[j] <= z)
m.Equation(m.sum(x) <= budget)  # The total spend must stay within the budget
m.Maximize(m.sum([coeff_A[j] * x[j] for j in range(n)]))  # Maximizing the spend weighted by the mean channel weightage
m.solve(disp=False)
# Printing the optimal budget along with the channel names
print('\n\nBudgets:\n')
for name, var in zip(ch_names, x):
    print(name + ": " + str(round(var.value[0], 0)))