# Install packages
!pip install lifetimes
# Importing the libraries
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import lifetimes
from lifetimes.utils import summary_data_from_transaction_data
from datetime import datetime
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore")
# connect to the database
db_conn = sql.connect("/datasets/hidrive/base2.db")
# Show all tables in the database
pd.read_sql(
    """
    SELECT *
    FROM sqlite_master
    """, db_conn)
# We don't want to evaluate canceled orders, so we first check which order statuses exist
pd.read_sql(
    """
    SELECT DISTINCT order_status
    FROM orders
    """, db_conn)
# pd.read_sql already returns a DataFrame, so we can assign the result directly
df = pd.read_sql(
    """
    SELECT
        c.customer_unique_id,
        o.order_purchase_timestamp,
        p.payment_value
    FROM customers AS c
    JOIN orders AS o ON o.customer_id = c.customer_id
    JOIN order_payments AS p ON o.order_id = p.order_id
    WHERE o.order_status <> 'canceled'
    """, db_conn)
# Checking our dataframe
df.info()
df.head()
# Checking for null values
df.isnull().sum(axis=0)
#Counting unique customers
df['customer_unique_id'].nunique()
# Parse the raw timestamp strings into pandas datetimes
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'], format="%Y-%m-%d %H:%M:%S")
# .dt.date drops the time-of-day component, leaving python datetime.date objects
df['order_purchase_timestamp'] = df.order_purchase_timestamp.dt.date
# Convert back to datetime64 (now truncated to midnight) for date arithmetic,
# and explore the minimum (first purchase) and maximum (last purchase) values
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_purchase_timestamp'].describe()
# The last purchase in the dataset was on August 29, 2018, so we place our snapshot
# date a few days after it to simulate an immediate study of the company's transactions
today = '2018-09-03'
date_today = datetime.strptime(today, '%Y-%m-%d')  # strptime() parses a string into a datetime according to the given format
print(date_today)
print(type(date_today))
# RECENCY: for every customer, find the first (min) and last (max) purchase dates,
# then take the difference between them
r = df.groupby('customer_unique_id').agg(['min', 'max'])['order_purchase_timestamp']
r['recency'] = r['max'] - r['min']
# T: time from the customer's first purchase to the snapshot date
r['T'] = date_today - r['min']
r = r[['recency', 'T']]
# Let's take a look at our new variables
r.head()
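# Note (our own addition): recency and T above are pandas Timedeltas, while the
# lifetimes models expect plain numbers in a chosen unit. A minimal sketch
# converting them to weeks:
r_weeks = r.copy()
r_weeks['recency'] = r_weeks['recency'].dt.days / 7
r_weeks['T'] = r_weeks['T'].dt.days / 7
r_weeks.head()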
# Creating an aggregation dictionary
aggregations = {
    'order_purchase_timestamp': 'count',
    'payment_value': 'sum'}
# Using it in groupby to obtain the frequency table f
f = df.groupby('customer_unique_id').agg(aggregations)
# Creating a new column 'frequency': one less than the total number of purchases,
# i.e. the number of repeat purchases
f['frequency'] = f['order_purchase_timestamp'] - 1
f = f[['frequency']]
# Merging r and f
rf = pd.merge(r,f, left_index=True, right_index=True) # 'right_index' = Use the index from the right DataFrame as the join key.
rf.head()
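# Quick sanity check (our own addition): one-time buyers should have frequency 0
# and recency 0 days under these definitions
rf[rf['frequency'] == 0].head()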
# The lifetimes helper computes frequency, recency, T and monetary value in one call
rfm = summary_data_from_transaction_data(
    df,
    customer_id_col='customer_unique_id',
    datetime_col='order_purchase_timestamp',
    monetary_value_col='payment_value',
    observation_period_end='2018-08-29',
    datetime_format='%Y-%m-%d',
    freq='W')  # Options: Y, M, W, D - years, months, weeks, days
rfm.head()
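# Optional cross-check against the manual rf frame built above. The library counts
# at most one transaction per period (freq='W') and our manual count is one row per
# payment, so small differences are expected.
(rf['frequency'] - rfm['frequency']).describe()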
import plotly.express as px
px.histogram(rfm, x='frequency', title='Frequency of purchase',
             labels={'frequency': 'Frequency'},
             opacity=0.8, marginal='violin',
             color_discrete_sequence=['indianred'])
px.histogram(rfm, x='recency', title='Recency of purchase',
             labels={'recency': 'Recency'}, nbins=50,
             opacity=0.8, marginal='violin',
             color_discrete_sequence=['indianred'])
px.histogram(rfm, x='T', title='Time from first purchase',
             labels={'T': 'Weeks'},
             opacity=0.8, marginal='violin',
             color_discrete_sequence=['indianred'])
from lifetimes import BetaGeoFitter
bgf = BetaGeoFitter(penalizer_coef=0.001)
# Fit the BG/NBD model. Returns the fitted BetaGeoFitter with estimated parameters
bgf.fit(rfm['frequency'], rfm['recency'], rfm['T'], verbose=True) #Verbose: Set to true to print out convergence diagnostics.
print(bgf)
bgf.summary
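# Usage sketch (our own addition, not part of the original flow): score every
# customer with the fitted BG/NBD model
t = 4  # predict over the next 4 weeks, matching freq='W'
rfm['pred_purchases_4w'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, rfm['frequency'], rfm['recency'], rfm['T'])
rfm['p_alive'] = bgf.conditional_probability_alive(
    rfm['frequency'], rfm['recency'], rfm['T'])
rfm.sort_values('pred_purchases_4w', ascending=False).head()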
# That's our trained model; lifetimes can use it for the following visualizations.
from lifetimes.plotting import plot_frequency_recency_matrix
from lifetimes.plotting import plot_probability_alive_matrix
fig = plt.figure(figsize=(12,8))
plot_frequency_recency_matrix(bgf, T=4) # T = number of future time units (weeks here) to predict over
fig = plt.figure(figsize=(12,8))
plot_probability_alive_matrix(bgf)
from lifetimes.utils import calibration_and_holdout_data
rfm_val = calibration_and_holdout_data(
    df,
    customer_id_col='customer_unique_id',
    datetime_col='order_purchase_timestamp',
    monetary_value_col='payment_value',
    calibration_period_end='2018-05-29',
    observation_period_end='2018-08-29',
    datetime_format='%Y-%m-%d',
    freq='W')
rfm_val.head(5)
bgf_val = BetaGeoFitter(penalizer_coef=0.001)
bgf_val.fit(rfm_val['frequency_cal'], rfm_val['recency_cal'], rfm_val['T_cal'], verbose=True)
print(bgf_val)
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
fig = plt.figure(figsize=(12,8))
plot_calibration_purchases_vs_holdout_purchases(model=bgf_val, calibration_holdout_matrix=rfm_val)
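# A rough numeric companion to the plot (assumption: mean absolute error as the
# metric): predict holdout-period purchases from the calibration fit and compare
# them with the actual holdout counts
holdout_len = rfm_val['duration_holdout'].iloc[0]
pred_holdout = bgf_val.conditional_expected_number_of_purchases_up_to_time(
    holdout_len, rfm_val['frequency_cal'], rfm_val['recency_cal'], rfm_val['T_cal'])
print('Holdout MAE:', (pred_holdout - rfm_val['frequency_holdout']).abs().mean())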
# The Gamma-Gamma model only applies to customers with at least one repeat purchase
rfm_gg = rfm[rfm['frequency'] > 0]
len(rfm_gg) # number of returning customers
# The model also assumes frequency and monetary value are uncorrelated, so we check
rfm_gg[['monetary_value', 'frequency']].corr()
from lifetimes import GammaGammaFitter
ggf = GammaGammaFitter(penalizer_coef = 0.0)
ggf.fit(rfm_gg['frequency'], rfm_gg['monetary_value'])
print(ggf)
rfm['avg_transaction'] = round(ggf.conditional_expected_average_profit(
    rfm_gg['frequency'], rfm_gg['monetary_value']), 2)
# One-time buyers are not in rfm_gg, so index alignment leaves them NaN; fill with 0
rfm['avg_transaction'] = rfm['avg_transaction'].fillna(0)
rfm.sort_values(by='avg_transaction', ascending=False)
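# A natural final step (sketch, assuming a 12-month horizon and a 1% monthly
# discount rate): combine the BG/NBD and Gamma-Gamma models into a CLV estimate
rfm['clv'] = ggf.customer_lifetime_value(
    bgf,                 # the BG/NBD model fitted above
    rfm['frequency'],
    rfm['recency'],
    rfm['T'],
    rfm['monetary_value'],
    time=12,             # months
    discount_rate=0.01,  # monthly discount rate (assumption)
    freq='W')            # our RFM summary is in weeks
rfm.sort_values(by='clv', ascending=False).head()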