# imports
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
from dateutil import parser
# Load the two raw take-home tables; both CSV files are read from the
# current working directory.
user_engagement = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv')

# Notebook-style previews of the first rows (no effect when run as a script).
users.head()
user_engagement.head()
def _has_active_window(timestamps, sessions=3, window_days=7):
    """Return True if any `sessions` consecutive logins fit in `window_days`.

    Args:
        timestamps: iterable of datetime-like values for one user. They are
            sorted here, so callers need not pre-sort them.
        sessions: number of logins that must fall inside one window.
        window_days: maximum span of the window, compared against the
            whole-day part of the difference (``.days``) between the first
            and last login of each candidate window.

    Returns:
        bool: True as soon as one qualifying window exists, else False
        (including when there are fewer than `sessions` timestamps).
    """
    ordered = sorted(timestamps)
    last_offset = sessions - 1
    # Slide a window of `sessions` consecutive logins over the sorted list.
    return any(
        (ordered[start + last_offset] - ordered[start]).days <= window_days
        for start in range(len(ordered) - last_offset)
    )


# Only users that have at least 3 sessions in the user_engagement table can
# potentially be adopted. `user_id_counts` is a boolean Series indexed by user.
user_id_counts = user_engagement.user_id.value_counts() >= 3
potential_adopted_users = user_id_counts[user_id_counts].index.tolist()

# Parse every timestamp once and group by user, instead of re-filtering the
# whole engagement table and re-parsing strings for each candidate user.
session_times = pd.to_datetime(user_engagement.time_stamp)
sessions_by_user = session_times.groupby(user_engagement.user_id)

# Add a user to the adopted list if they have 3 sessions in a 7 day period.
adopted_users = [
    user
    for user in potential_adopted_users
    if _has_active_window(sessions_by_user.get_group(user))
]
len(adopted_users)
# create adopted feature
def adopted_function(x, adopted_users_list):
    """Return True when `x` is a member of `adopted_users_list`, else False."""
    return x in adopted_users_list
# Flag each user whose object_id appears in adopted_users. Series.isin does the
# membership test in one vectorised pass (hash-based) instead of scanning the
# Python list once per row, and yields the same boolean column.
users['adopted'] = users['object_id'].isin(adopted_users)
# Notebook-style check of the class balance.
users.adopted.value_counts()
# Collect the email domains used by more than 5 users (the filter is a strict
# `> 5`); the result keeps value_counts order.
# NOTE(review): `email_domain` is not created in the visible code; presumably
# it is derived from the email column elsewhere. Confirm.
domain_counts = users.email_domain.value_counts()
common_emails = domain_counts[domain_counts > 5].index.tolist()


# created adjusted email feature
def email_function(email):
    """Return `email` unchanged if it is in `common_emails`, else 'other'."""
    return email if email in common_emails else 'other'


users['adjusted_email'] = users['email_domain'].apply(email_function)
# Count plot of adopted vs. non-adopted users.
sns.set_style('darkgrid')
fig, ax1 = plt.subplots(figsize=(8, 8))
sns.countplot(x=users.adopted, ax=ax1)

# Titles, axis labels and a fixed y-range.
fig.suptitle('Distribution of Adopted Users', fontsize=26, y=0.96, fontweight='bold')
ax1.set_ylabel('Number of users', labelpad=10, fontsize=12, fontweight='bold')
ax1.set_xlabel('Adopted user?', labelpad=10, fontsize=12, fontweight='bold')
ax1.set_ylim(0, 12000)

# Write each bar's height at the bar's top-left corner.
for bar in ax1.patches:
    bar_height = bar.get_height()
    ax1.text(bar.get_x(), bar_height, str(bar_height.round(2)))
def plot_percent_by_adopted(column, suptitle, y_max, group_col='adopted'):
    """Plot the percentage breakdown of `column` within each `group_col` value.

    Reads the module-level `users` DataFrame, normalises the value counts of
    `column` inside each group to percentages, and draws them as a grouped
    bar chart with every bar annotated with its height.

    Args:
        column: name of the `users` column whose distribution is shown (hue).
        suptitle: bold figure title.
        y_max: upper limit of the y axis, in percent.
        group_col: column placed on the x axis; defaults to 'adopted'.

    Returns:
        tuple: (figure, axes, DataFrame of percentages that was plotted).
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    # Group first so percentages sum to 100 within each group, not overall.
    percent_df = (users
                  .groupby(group_col)[column]
                  .value_counts(normalize=True)
                  .mul(100)
                  .rename('percent')
                  .reset_index())
    sns.barplot(x=group_col, y='percent', hue=column, data=percent_df, ax=ax)

    # Titles and axis labels.
    ax.set_ylim(0, y_max)
    fig.suptitle(suptitle, fontsize=30, y=0.98, fontweight='bold')
    ax.set_title('Distribution across adopted and non-adopted users',
                 fontsize=15, style='italic')
    ax.set_ylabel('Percent of users', labelpad=10, fontsize=12, fontweight='bold')
    ax.set_xlabel('Adopted user?', labelpad=10, fontsize=12, fontweight='bold')

    # Label each bar with its percentage at the bar's top-left corner.
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

    return fig, ax, percent_df


# One chart per feature: (column, figure title, y-axis upper limit).
# x, y, fig, ax1 and viz_df stay bound at module level (to the last chart),
# as they were when each chart was written out by hand.
x = 'adopted'
for y, chart_title, y_limit in (
    ('opted_in_to_mailing_list', 'Mailing List Status', 100),
    ('enabled_for_marketing_drip', 'Marketing Drip Status', 100),
    ('adjusted_email', 'Signup Email Domain', 40),
    ('creation_source', 'Account Creation Source', 45),
):
    fig, ax1, viz_df = plot_percent_by_adopted(y, chart_title, y_limit, group_col=x)
# Only organizations with more than 10 users are considered (strict `> 10`);
# proportions for smaller organizations are too extreme.
org_sizes = users.org_id.value_counts()
common_orgs = org_sizes[org_sizes > 10].index.tolist()

# Share of adopted users per organization. The mean of the boolean `adopted`
# column is the adopted proportion, and is 0.0 when nobody is adopted. This
# replaces a `value_counts()[1]` lookup wrapped in a bare `except`, which
# relied on ambiguous label/positional indexing of a boolean index.
adoption_rate_by_org = users.groupby('org_id')['adopted'].mean()
org_adopted_rates = adoption_rate_by_org.reindex(common_orgs).tolist()

# Histogram of the per-organization adoption rates.
fig, ax1 = plt.subplots(figsize=(8, 8))
sns.histplot(x=org_adopted_rates, ax=ax1)
# set titles
fig.suptitle('Organization Adoption', fontsize=30, y=0.98, fontweight='bold')
ax1.set_title("Proportion of an organization's users that are adopted",
              fontsize=15, style='italic')
ax1.set_ylabel('Frequency', labelpad=10, fontsize=12, fontweight='bold')
ax1.set_xlabel('Proportion of adopted users', labelpad=10, fontsize=12, fontweight='bold')
# imports
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample
# Balance the classes by upsampling the adopted (minority) rows, with
# replacement, to the size of the non-adopted (majority) class.
df = users.copy()
df_majority = df[~df.adopted]
df_minority = df[df.adopted]
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=df_majority.shape[0],
                                 random_state=123)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Features are the columns at positions 4, 8 and 12 (org_id is one of them);
# the target is the column at position 11 -- presumably `adopted`; verify
# against the column order. The explicit .copy() makes X an independent frame
# so the org_id assignment does not raise pandas' SettingWithCopyWarning.
X = df_upsampled.iloc[:, [4, 8, 12]].copy()
y = df_upsampled.iloc[:, 11]

# Cast org_id to string so get_dummies one-hot encodes it as a category.
X['org_id'] = X['org_id'].astype(str)
X = pd.get_dummies(X)

# NOTE(review): upsampling happens before the split, so copies of the same
# minority row can land in both the train and the test set, which can inflate
# test accuracy. Consider splitting first and upsampling only the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Captured notebook output from the cell above: pandas emitted a
# SettingWithCopyWarning (presumably for the X['org_id'] assignment, line 2 of
# that cell). The author considers the warning a false positive.
#   /shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
#   A value is trying to be set on a copy of a slice from a DataFrame.
#   Try using .loc[row_indexer,col_indexer] = value instead
#   See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Logistic regression classifier, allowed up to 1000 solver iterations.
model = LogisticRegression(max_iter=1000)

# Mean accuracy over 5 cross-validation folds of the training split.
results = cross_val_score(estimator=model, X=X_train, y=y_train,
                          cv=5, scoring='accuracy')
results.mean()

# Fit on the whole training split, then score predictions on the test split.
fittedmodel = model.fit(X_train, y_train)
predictions = fittedmodel.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)
def plot_confusion_matrix(conf_mat, labels, model_name):
    """Draw `conf_mat` as an annotated heatmap on a new 4x4 figure.

    Args:
        conf_mat: integer confusion matrix (cells are formatted with 'd').
        labels: tick labels, applied to both the y and the x axis.
        model_name: italic subtitle shown under the 'Confusion Matrix' title.

    Returns:
        None. The figure is left open for the active matplotlib backend.
    """
    fig, ax1 = plt.subplots(figsize=(4, 4))
    sns.heatmap(conf_mat, ax=ax1, annot=True, fmt='d', cmap='Blues',
                square=True, cbar=False)

    # Class names on both axes; x labels are rotated to run vertically.
    ax1.set_xticklabels(labels, rotation=90, fontsize="10", va="top")
    ax1.set_yticklabels(labels, rotation=0, fontsize="10", va="center")

    # Titles and axis labels (rows are actual classes, columns predicted).
    fig.suptitle('Confusion Matrix', fontsize=18, y=1, fontweight='bold')
    ax1.set_title(model_name, fontsize=10, style='italic')
    ax1.set_xlabel('Predicted', labelpad=10, fontsize=12, fontweight='bold')
    ax1.set_ylabel('Actual', labelpad=10, fontsize=12, fontweight='bold')
# Confusion matrix of the test-set predictions, drawn with readable labels.
conf_mat = confusion_matrix(y_test, predictions)
plot_confusion_matrix(
    conf_mat=conf_mat,
    labels=['not adopted', 'adopted'],
    model_name='Logistic Regression',
)
# imports
from mlxtend.evaluate import paired_ttest_5x2cv
from sklearn.dummy import DummyClassifier
# Baseline that always predicts the most common class. On the upsampled,
# class-balanced X / y its accuracy should be about 0.5.
model1 = DummyClassifier(strategy='most_frequent')
model2 = LogisticRegression(max_iter=1000)

# Repeat the 5x2cv paired t-test 10 times and average the p-values. Each
# repetition gets its own fixed seed so the reported mean is reproducible,
# in line with the seeded resample and train/test split used earlier.
p_values = []
for i in range(10):
    _, p = paired_ttest_5x2cv(estimator1=model1, estimator2=model2,
                              X=X, y=y, scoring='accuracy',
                              random_seed=i)
    p_values.append(p)
np.mean(p_values)