# Install packages
!pip install lifetimes
# Importing the libraries
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import lifetimes
from lifetimes.utils import summary_data_from_transaction_data
from datetime import datetime
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore")
# connect to the database
db_conn = sql.connect("/datasets/hidrive/base2.db")
# Show all tables in the database
pd.read_sql(
    """
    SELECT *
    FROM sqlite_master
    """, db_conn)
# We don't want to evaluate canceled orders, so we first check which order statuses exist
pd.read_sql(
    """
    SELECT DISTINCT order_status
    FROM orders
    """, db_conn)
# pd.read_sql already returns a DataFrame, so we can assign the result directly
df = pd.read_sql(
    """
    SELECT
        c.customer_unique_id,
        o.order_purchase_timestamp,
        p.payment_value
    FROM customers AS c
    JOIN orders AS o ON o.customer_id = c.customer_id
    JOIN order_payments AS p ON o.order_id = p.order_id
    WHERE o.order_status <> 'canceled'
    """, db_conn)
# Checking our dataframe
df.info()
df.head()
# Checking for null values
df.isnull().sum(axis=0)
#Counting unique customers
df['customer_unique_id'].nunique()
# Parse the raw timestamp strings into pandas datetimes
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'], format="%Y-%m-%d %H:%M:%S")
# .dt.date drops the time-of-day component, leaving python datetime.date objects
df['order_purchase_timestamp'] = df.order_purchase_timestamp.dt.date
# Convert back to datetime64 (now truncated to midnight) for date arithmetic,
# and explore the minimum (first purchase) and maximum (last purchase) values
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_purchase_timestamp'].describe()
# The last purchase in the dataset was on August 29, 2018, so we place our snapshot
# date a few days after it to simulate an immediate study of the company's transactions
today = '2018-09-03'
date_today = datetime.strptime(today, '%Y-%m-%d')  # strptime() parses a string into a datetime according to the given format
print(date_today)
print(type(date_today))
# RECENCY: for every customer, find the first (min) and last (max) purchase dates,
# then take the difference between them
r = df.groupby('customer_unique_id').agg(['min', 'max'])['order_purchase_timestamp']
r['recency'] = r['max'] - r['min']
# T: time from the customer's first purchase to the snapshot date
r['T'] = date_today - r['min']
r = r[['recency', 'T']]
# Let's take a look at our new variables
r.head()
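# Note (our own addition): recency and T above are pandas Timedeltas, while the
# lifetimes models expect plain numbers in a chosen unit. A minimal sketch
# converting them to weeks:
r_weeks = r.copy()
r_weeks['recency'] = r_weeks['recency'].dt.days / 7
r_weeks['T'] = r_weeks['T'].dt.days / 7
r_weeks.head()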
# Creating an aggregation dictionary
aggregations = {
    'order_purchase_timestamp': 'count',
    'payment_value': 'sum'}
# Using it in groupby to obtain the frequency table f
f = df.groupby('customer_unique_id').agg(aggregations)
# Creating a new column 'frequency': one less than the total number of purchases,
# i.e. the number of repeat purchases
f['frequency'] = f['order_purchase_timestamp'] - 1
f = f[['frequency']]
# Merging r and f
rf = pd.merge(r,f, left_index=True, right_index=True) # 'right_index' = Use the index from the right DataFrame as the join key.
rf.head()
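# Quick sanity check (our own addition): one-time buyers should have frequency 0
# and recency 0 days under these definitions
rf[rf['frequency'] == 0].head()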
# The lifetimes helper computes frequency, recency, T and monetary value in one call
rfm = summary_data_from_transaction_data(
    df,
    customer_id_col='customer_unique_id',
    datetime_col='order_purchase_timestamp',
    monetary_value_col='payment_value',
    observation_period_end='2018-08-29',
    datetime_format='%Y-%m-%d',
    freq='W')  # Options: Y, M, W, D - years, months, weeks, days
rfm.head()
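# Optional cross-check against the manual rf frame built above. The library counts
# at most one transaction per period (freq='W') and our manual count is one row per
# payment, so small differences are expected.
(rf['frequency'] - rfm['frequency']).describe()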
import plotly.express as px
px.histogram(rfm, x='frequency', title='Frequency of purchase',
             labels={'frequency': 'Frequency'},
             opacity=0.8, marginal='violin',
             color_discrete_sequence=['indianred'])
px.histogram(rfm, x='recency', title='Recency of purchase',
             labels={'recency': 'Recency'}, nbins=50,
             opacity=0.8, marginal='violin',
             color_discrete_sequence=['indianred'])
px.histogram(rfm, x='T', title='Time from first purchase',
             labels={'T': 'Weeks'},
             opacity=0.8, marginal='violin',
             color_discrete_sequence=['indianred'])
from lifetimes import BetaGeoFitter
bgf = BetaGeoFitter(penalizer_coef=0.001)
# Fit the BG/NBD model. Returns the fitted BetaGeoFitter with estimated parameters
bgf.fit(rfm['frequency'], rfm['recency'], rfm['T'], verbose=True) #Verbose: Set to true to print out convergence diagnostics.
print(bgf)
bgf.summary
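# Usage sketch (our own addition, not part of the original flow): score every
# customer with the fitted BG/NBD model
t = 4  # predict over the next 4 weeks, matching freq='W'
rfm['pred_purchases_4w'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, rfm['frequency'], rfm['recency'], rfm['T'])
rfm['p_alive'] = bgf.conditional_probability_alive(
    rfm['frequency'], rfm['recency'], rfm['T'])
rfm.sort_values('pred_purchases_4w', ascending=False).head()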
# That's our trained model; lifetimes can use it for the following visualizations.
from lifetimes.plotting import plot_frequency_recency_matrix
from lifetimes.plotting import plot_probability_alive_matrix
fig = plt.figure(figsize=(12,8))
plot_frequency_recency_matrix(bgf, T=4) # T = number of future time units (weeks here) to predict over
fig = plt.figure(figsize=(12,8))
plot_probability_alive_matrix(bgf)
from lifetimes.utils import calibration_and_holdout_data
rfm_val = calibration_and_holdout_data(
    df,
    customer_id_col='customer_unique_id',
    datetime_col='order_purchase_timestamp',
    monetary_value_col='payment_value',
    calibration_period_end='2018-05-29',
    observation_period_end='2018-08-29',
    datetime_format='%Y-%m-%d',
    freq='W')
rfm_val.head(5)
bgf_val = BetaGeoFitter(penalizer_coef=0.001)
bgf_val.fit(rfm_val['frequency_cal'], rfm_val['recency_cal'], rfm_val['T_cal'], verbose=True)
print(bgf_val)
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
fig = plt.figure(figsize=(12,8))
plot_calibration_purchases_vs_holdout_purchases(model=bgf_val, calibration_holdout_matrix=rfm_val)
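# A rough numeric companion to the plot (assumption: mean absolute error as the
# metric): predict holdout-period purchases from the calibration fit and compare
# them with the actual holdout counts
holdout_len = rfm_val['duration_holdout'].iloc[0]
pred_holdout = bgf_val.conditional_expected_number_of_purchases_up_to_time(
    holdout_len, rfm_val['frequency_cal'], rfm_val['recency_cal'], rfm_val['T_cal'])
print('Holdout MAE:', (pred_holdout - rfm_val['frequency_holdout']).abs().mean())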
# The Gamma-Gamma model only applies to customers with at least one repeat purchase
rfm_gg = rfm[rfm['frequency'] > 0]
len(rfm_gg) # number of returning customers
# The model also assumes frequency and monetary value are uncorrelated, so we check
rfm_gg[['monetary_value', 'frequency']].corr()
from lifetimes import GammaGammaFitter
ggf = GammaGammaFitter(penalizer_coef = 0.0)
ggf.fit(rfm_gg['frequency'], rfm_gg['monetary_value'])
print(ggf)
rfm['avg_transaction'] = round(ggf.conditional_expected_average_profit(
    rfm_gg['frequency'], rfm_gg['monetary_value']), 2)
# One-time buyers are not in rfm_gg, so index alignment leaves them NaN; fill with 0
rfm['avg_transaction'] = rfm['avg_transaction'].fillna(0)
rfm.sort_values(by='avg_transaction', ascending=False)
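# A natural final step (sketch, assuming a 12-month horizon and a 1% monthly
# discount rate): combine the BG/NBD and Gamma-Gamma models into a CLV estimate
rfm['clv'] = ggf.customer_lifetime_value(
    bgf,                 # the BG/NBD model fitted above
    rfm['frequency'],
    rfm['recency'],
    rfm['T'],
    rfm['monetary_value'],
    time=12,             # months
    discount_rate=0.01,  # monthly discount rate (assumption)
    freq='W')            # our RFM summary is in weeks
rfm.sort_values(by='clv', ascending=False).head()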