# Converting the event_time column
df['event_time'] = pd.to_datetime(df['event_time'], format='%Y-%m-%d %H:%M:%S UTC')
# Split time column into month day year hour
df['event_day'] = df['event_time'].dt.day
df['event_month'] = df['event_time'].dt.month
df['event_year'] = df['event_time'].dt.year
df['event_hour'] = df['event_time'].dt.hour
# df = df.drop(['event_time'], axis=1) # Remove the column after it is processed.
brands = df.brand.unique() # Get list of unique brands.
brands = [b for b in brands if str(b) != 'nan']
for brand in brands:
    brand_group = df[df["brand"] == brand]  # Filter dataset by brand.
    common_category = brand_group.mode()['category_code'][0]  # Get modal category_code per brand.
    same_brand = df["brand"] == brand
    null_category = df["category_code"].isna()
    df.loc[same_brand & null_category, 'category_code'] = common_category
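# The loop above rescans the whole frame once per brand. A hedged, loop-free sketch of the same
# imputation (assumes each such brand has at least one non-null category_code):
# modal_category = (
#     df.dropna(subset=['category_code'])
#       .groupby('brand')['category_code']
#       .agg(lambda s: s.mode().iloc[0])  # modal category_code per brand
# )
# df['category_code'] = df['category_code'].fillna(df['brand'].map(modal_category))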
category_codes = df.category_code.unique() # Get list of unique category codes.
category_codes = [c for c in category_codes if str(c) != 'nan']
for category_code in category_codes:
    code_group = df[df["category_code"] == category_code]  # Filter dataset by category_code.
    common_brand = code_group.mode()['brand'][0]  # Get modal brand per category_code.
    same_category = df["category_code"] == category_code
    null_brand = df["brand"].isna()
    df.loc[same_category & null_brand, 'brand'] = common_brand
# df.columns[df.isna().any()].tolist()
# Splitting the category into category, sub_category and item_type.
df[["category", "sub_category", "item"]] = df['category_code'].str.split('.', expand=True)
# What are we good at? What are we not so good at?
views = df[df["event_type"] == "view"].copy()
purchases = df[df["event_type"] == "purchase"].copy()
removals = df[df["event_type"] == "remove_from_cart"].copy()  # .copy() avoids SettingWithCopyWarning when these slices are modified later.
total_made = round(purchases['price'].sum(), 2)
total_lost = round(removals['price'].sum(), 2)
print(f"Total Made: ${total_made:,}")
print(f"Total Lost: ${total_lost:,}")
Total Made: $1,531,016.9
Total Lost: $4,714,207.04
from IPython.display import display, Markdown, Latex
var = 10
display(Markdown(f"# Getting to a {var}% increase"))
# If you particularly want to display maths, the Latex display class imported above is more direct.
unique_customers = df.user_id.nunique() # number of customers # 368,232
print("Number of customers: ",unique_customers)
# Spend per customer
user_spending = purchases.groupby(['user_id'])['price'].sum().reset_index()
spend_per_customer = user_spending['price'].mean()
print("Spend per customer: ", spend_per_customer) # $48.56
# Loss per customer
user_removals = removals.groupby(['user_id'])['price'].sum().reset_index()
loss_per_customer = user_removals['price'].mean()
print("Loss per customer: ", loss_per_customer) # $86.29
# Loss per session?
user_removals = removals.groupby(['user_session'])['price'].sum().to_frame().reset_index()
loss_per_session = user_removals['price'].mean()
print("Loss per session: ", loss_per_session) # $42.08
# Number of successful recoveries needed to recoup the loss.
spending_target = total_lost / loss_per_customer
print("Number of recoveries needed: ", spending_target) # 54627.99
recoveries_per_customer = unique_customers/spending_target
print("Recoveries per customer: ", recoveries_per_customer) # 6.74, i.e. roughly one recovery for every 6.7 customers.
# Number of sessions per user per month.
session_counts = df.groupby(['user_id'])['user_session'].size().to_frame().reset_index()
user_sessions = session_counts['user_session'].mean()
print("user_sessions per month: ", user_sessions) # 12.58
# Number of recoveries per month.
# purchases_n_removals = df[df["event_type"].isin(["remove_from_cart", "purchase"])]
# user_activity = purchases_n_removals.groupby(["user_id", "event_type", "product_id", "user_session"]).size().to_frame().reset_index()
# user_activity.drop_duplicates(inplace=True)
# user_activity.drop(["event_type"], axis=1, inplace=True)
# user_activity.groupby(user_activity.columns.tolist(),as_index=False).size()
# print("Recoveries this month", len(recoveries[recoveries[0] > 1]))
# How much money is recovered?
#
# df.groupby(df.columns.tolist(),as_index=False).size()
# go back to removals and purchases
# add a purchase date and a removal date
purchases["purchase_date"] = purchases['event_time']
removals["removal_date"] = removals['event_time']
criteria = ["user_id", "product_id", "user_session", "price"]
recovery_data = pd.merge(left=purchases, right=removals, on=criteria)
# join on user_id and product_id
# look for where purchase date is greater than removal date
recoveries = recovery_data.query('purchase_date > removal_date')
print("Recoveries: ", len(recoveries)) # 26746
print("Total recovered: ", recoveries["price"].sum()) # $148635.94
updated_goal = total_lost - recoveries["price"].sum()
print("New target: ", updated_goal)
# 4565571.1
Number of customers: 368232
Spend per customer: 48.56670790508844
Loss per customer: 86.29653364575039
Loss per session: 42.07606245369075
Number of recoveries needed:  54627.999999999396
Recoveries per customer: 6.740719045178372
user_sessions per month: 12.589446327315388
Recoveries: 26407
Total recovered: 148635.94
New target: 4565571.1
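# The merge above pairs every qualifying removal/purchase combination for the same product and session,
# so repeated remove/re-add cycles can be counted more than once. A hedged de-duplication check,
# keeping one recovery per user/session/product:
unique_recoveries = recoveries.drop_duplicates(subset=["user_id", "user_session", "product_id"])
print("Unique recoveries: ", len(unique_recoveries))
print("Total recovered (deduplicated): ", unique_recoveries["price"].sum())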
# Bubble Chart showing the money lost by category.
category_data = removals.groupby(['category'])["price"].sum().reset_index()
fig = px.scatter(category_data, x="price", y="price",
size="price", color="category",
hover_name="category", log_x=True, size_max=60)
fig.show()
# Most popular Sub Categories.
sub_category_data = removals.groupby(['sub_category'])["price"].sum().reset_index()
fig = px.pie(sub_category_data, values='price', names='sub_category', title='Revenue Lost by Sub Category')
fig.show()
majority_lost = removals[removals.category.isin(['furniture', 'appliances'])]['price'].sum()
majority_lost = float("{:.2f}".format(majority_lost))
print(f"Money lost from furniture and appliances: ${majority_lost:,}")
Money lost from furniture and appliances: $1,058,743.46
# Creating a Tree-Map Diagram for drill downs.
# import plotly.express as px
# df = px.data.tips()
# fig = px.treemap(df, path=['day', 'time', 'sex'], values='total_bill')
# fig.show()
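# A sketch adapting that treemap template to this dataset: drill down from category to sub_category to item,
# sized by revenue lost (assumes the category/sub_category/item split above; rows missing those levels are
# dropped because px.treemap cannot handle None values in the path).
treemap_data = removals.dropna(subset=['category', 'sub_category', 'item'])
fig = px.treemap(treemap_data, path=['category', 'sub_category', 'item'], values='price',
                 title='Revenue Lost, Drill-down by Category')
fig.show()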
# Users by day of month. (Note: .size() counts events per day; .nunique() would count distinct users.)
mau_day_df = df.groupby(['event_day'])['user_id'].size().reset_index()
fig = px.bar(mau_day_df, x='event_day', y='user_id', title="Users by Day of Month",
labels={
"event_day": "Day of Month",
"user_id": "Users",
})
fig.add_vline(x=2.5, line_width=3, line_dash="dash", line_color="green")  # add_vline takes x; add_hline expects y.
fig.show()
# Number of users per hour. # Should the data be restricted by event type?
mau_hour_df = df.groupby(['event_hour'])['user_id'].size().reset_index()
fig = px.bar(mau_hour_df, x='event_hour', y='user_id', title="Users by Time of Day",
labels={
"event_hour": "Hour",
"user_id": "Users",
})
fig.show()
# event_type (object) can certainly be encoded, but note it is the target.
# There will only be two labels: purchase, remove_from_cart.
# obj_df["body_style_cat"] = obj_df["body_style"].cat.codes
# data = df[df.event_type.isin(['remove_from_cart', 'purchase'])]
data = pd.concat([purchases,removals])
print(data.columns)
drop_columns = ["category_code", "category_id", "user_session"]
one_hot_encode_columns = ["category", "sub_category", "item"]
target_encode_columns = ["brand"]
data.drop(drop_columns, axis=1, inplace=True)
# pd.get_dummies(obj_df, columns=["drive_wheels"]).head()
for column in one_hot_encode_columns:
    print(pd.get_dummies(data, columns=[column]).head())  # Preview only; the encoded frame is not assigned back.
    # data.drop(column)
# encoding the target column itself, Event Type
# oe_event = OneHotEncoder()
# oe_results = oe_event.fit_transform(obj_df[["event_type"]])
# pd.DataFrame(oe_results.toarray(), columns=oe_event.categories_).head()
ord_enc = OrdinalEncoder()
# data["make_code"] = ord_enc.fit_transform(data[["event_type"]])
data["event_type"] = ord_enc.fit_transform(data[["event_type"]])
# Target Encoding
encodings = data.groupby('brand')['event_type'].mean().reset_index()
encodings
exit()
# data = data.merge(encodings, how='left', on='brand')
# data.drop('brand', axis=1, inplace=True)
# # https://towardsdatascience.com/handling-categorical-data-the-right-way-9d1279956fc6
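# A hedged sketch of finishing the target encoding (only reachable if the exit() above is removed);
# the rename avoids the merged column clashing with the event_type target:
# encodings = encodings.rename(columns={'event_type': 'brand_encoded'})
# data = data.merge(encodings, how='left', on='brand')
# data.drop('brand', axis=1, inplace=True)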
Index(['event_time', 'event_type', 'product_id', 'category_id',
'category_code', 'brand', 'price', 'user_id', 'user_session',
'event_day', 'event_month', 'event_year', 'event_hour', 'category',
'sub_category', 'item', 'purchase_date', 'removal_date'],
dtype='object')
# Preparing for Machine Learning.
# Should you restrict the data to those two event types only?
data = df[df.event_type.isin(['remove_from_cart', 'purchase'])]
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)
label = 'event_type'
y_train = train_data[label]
y_test = test_data[label]
X_train = pd.DataFrame(train_data.drop(columns=[label]))
X_test = pd.DataFrame(test_data.drop(columns=[label]))
dir = "AutogluonModels"
# predictor = TabularPredictor(label=label, problem_type='binary').fit(train_data, time_limit=20)
model = TabularPredictor(label=label, problem_type='binary', path=dir).fit(train_data, time_limit=20)
predictor = TabularPredictor(label=label, problem_type='binary').fit(train_data, time_limit=20)
# performance = model.evaluate(test_data)
preds = model = model.predict(test_data)
accuracy = float(np.sum(preds==test_data[target]))/test_data[target].shape[0]
print("Model ready.") # Notify user of model being ready.
print("Model accuracy: %f" % (accuracy)) # Print accuracy.
Warning: path already exists! This predictor may overwrite an existing predictor! path="AutogluonModels"
Beginning AutoGluon training ... Time limit = 20s
AutoGluon will save models to "AutogluonModels/"
AutoGluon Version: 0.1.0
Train Data Rows: 998318
Train Data Columns: 15
Preprocessing data ...
Selected class <--> label mapping: class 1 = remove_from_cart, class 0 = purchase
Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (remove_from_cart) vs negative (purchase) class.
To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 20930.27 MB
Train Data (Original) Memory Usage: 380.97 MB (1.8% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Fitting CategoryFeatureGenerator...
Fitting CategoryMemoryMinimizeFeatureGenerator...
Fitting DatetimeFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Useless Original Features (Count: 2): ['event_month', 'event_year']
These features carry no predictive signal and should be manually investigated.
This is typically a feature which has the same value for all rows.
These features do not need to be present at inference time.
Types of features in original data (raw dtype, special dtypes):
('datetime', []) : 1 | ['event_time']
('float', []) : 1 | ['price']
('int', []) : 5 | ['product_id', 'category_id', 'user_id', 'event_day', 'event_hour']
('object', []) : 6 | ['category_code', 'brand', 'user_session', 'category', 'sub_category', ...]
Types of features in processed data (raw dtype, special dtypes):
('category', []) : 6 | ['category_code', 'brand', 'user_session', 'category', 'sub_category', ...]
('float', []) : 1 | ['price']
('int', []) : 5 | ['product_id', 'category_id', 'user_id', 'event_day', 'event_hour']
('int', ['datetime_as_int']) : 1 | ['event_time']
6.8s = Fit runtime
13 features in original data used to generate 13 features in processed data.
Train Data (Processed) Memory Usage: 66.85 MB (0.3% of available memory)
Data preprocessing and feature engineering runtime = 8.14s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric argument of fit()
Automatically generating train/validation split with holdout_frac=0.01, Train Rows: 988334, Val Rows: 9984
Fitting model: RandomForestGini ... Training model for up to 11.86s of the 11.86s of remaining time.
Warning: Reducing model 'n_estimators' from 300 -> 138 due to low memory. Expected memory usage reduced from 32.44% -> 15.0% of available memory...
Warning: Model is expected to require 334.7s to train, which exceeds the maximum time limit of 11.9s, skipping model...
Time limit exceeded... Skipping RandomForestGini.
Fitting model: RandomForestEntr ... Training model for up to 2.02s of the 2.02s of remaining time.
Warning: Reducing model 'n_estimators' from 300 -> 142 due to low memory. Expected memory usage reduced from 31.56% -> 15.0% of available memory...
Warning: Model is expected to require 406.7s to train, which exceeds the maximum time limit of 2.0s, skipping model...
Time limit exceeded... Skipping RandomForestEntr.
No base models to train on, skipping weighted ensemble...
ValueError: AutoGluon did not successfully train any models
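# The 20-second budget above was too small for ~1M rows, so every model was skipped. A hedged re-fit
# sketch with a longer time limit and a lighter preset (standard AutoGluon 0.1 arguments; the timings
# chosen here are assumptions):
model = TabularPredictor(label=label, problem_type='binary', path=model_dir).fit(
    train_data,
    time_limit=600,                         # ten minutes instead of twenty seconds
    presets='medium_quality_faster_train',  # favour fast training over leaderboard accuracy
)
performance = model.evaluate(test_data)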
# Load our Machine Learning Model.
# model = TabularPredictor.load("AutogluonModels/")
# shap.dependence_plot("education-num", shap_values, X_test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)
label = 'event_type'
y_train = train_data[label]
y_test = test_data[label]
X_train = pd.DataFrame(train_data.drop(columns=[label]))
X_test = pd.DataFrame(test_data.drop(columns=[label]))
# Plotting the Feature Importance.
# Importance scores...
importance_scores = model.feature_importance(test_data)
importance_scores["feature"] = importance_scores.index
mms = MinMaxScaler(feature_range=(-100, 100))
importance_scores[['importance']] = mms.fit_transform(importance_scores[['importance']])
fig = px.bar(importance_scores, x='feature', y='importance',
hover_data=['feature', 'importance'], color='importance')
fig.show()
KeyError: 'event_type'
df.user_id.nunique() # number of customers # 368,232
# Spend per customer
user_spending = purchases.groupby(['user_id'])['price'].sum().reset_index()
spend_per_customer = user_spending['price'].mean()
print(spend_per_customer) # $48.56
# Loss per customer
user_removals = removals.groupby(['user_id'])['price'].sum().reset_index()
loss_per_customer = user_removals['price'].mean()
print(loss_per_customer) # $86.29
# Number of successful recoveries.
print(471420.70/ 86.29)
# Number of sessions per month.
# Number of recoveries per month.
# who contributed the most to the loss?
highest_loss = user_removals['price'].max()
biggest_loser = user_removals.loc[user_removals['price'] == highest_loss, 'user_id'].iloc[0]
print(biggest_loser) # 500179293
500179293
# Let's work with that one user.
# user_session
# Find the user session with the most loss?
# Find the session where products of a particular category were removed.
category_ = X_train["category"] == "appliances"
sub_category_ = X_train["sub_category"] == "environment"
item_ = X_train["item"] == 'vacuum'
datapoint = X_train[category_ & sub_category_ & item_].sample(n=1, random_state=0)
datapoint
class AutogluonWrapper:
    def __init__(self, predictor, feature_names):
        self.ag_model = predictor
        self.feature_names = feature_names

    def predict_proba(self, X):
        if isinstance(X, pd.Series):
            X = X.values.reshape(1, -1)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names)
        return self.ag_model.predict_proba(X)
baseline = X_train.sample(100)
# X_train.mode() could also be a reasonable baseline for both numerical and categorical features, rather than sampling the dataset.
# We can now create a KernelExplainer which will return Kernel SHAP values to explain particular AutoGluon predictions.
ag_wrapper = AutogluonWrapper(model, X_train.columns)
explainer = shap.KernelExplainer(ag_wrapper.predict_proba, baseline)
print("Baseline prediction: ", np.mean(ag_wrapper.predict_proba(baseline)))
# this is the same as explainer.expected_value
NSHAP_SAMPLES = 100  # How many samples to use to approximate each Shapley value; larger values are slower but more accurate.
Baseline prediction: purchase 0.341199
remove_from_cart 0.658801
dtype: float64
ROW_INDEX = 0 # index of an example datapoint
# removals[removals[] == ""].sample(n=1, random_state=0)
# single_datapoint = X_train.iloc[[ROW_INDEX]]
single_datapoint = datapoint
single_prediction = ag_wrapper.predict_proba(single_datapoint)
shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES)
shap.force_plot(explainer.expected_value, shap_values_single, X_train.iloc[ROW_INDEX,:])
TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
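# The TypeError above typically comes from passing the multi-output expected_value and list of shap_values
# straight into force_plot. A hedged sketch that indexes a single class (index 1 = remove_from_cart, per the
# baseline output above) and passes the same row that was explained:
shap.force_plot(explainer.expected_value[1],
                shap_values_single[1],
                single_datapoint)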
shap_values = explainer.shap_values(X_test, nsamples=NSHAP_SAMPLES)
shap.force_plot(explainer.expected_value, shap_values, X_test)
NameError: name 'explainer' is not defined
shap.summary_plot(shap_values, X_test)
# predictor.leaderboard(silent=True)
shap.dependence_plot("education-num", shap_values, X_test)
# df.loc[df['First Season'] > 1990, 'First Season'] = 1
# df['First Season'] = (df['First Season'] > 1990).astype(int)
# selecting and changing