Setup
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from operator import attrgetter
import matplotlib.colors as mcolors
import numpy as np
df = pd.read_excel('Online Retail.xlsx',
dtype={'CustomerID': str,
'InvoiceID': str},
engine='openpyxl',
parse_dates=['InvoiceDate'])
df.head()
Exploratory analysis
df.info()
# drop rows without customer ID
df2 = df.dropna(subset=['CustomerID'], inplace=False)
df2.head()
df2.describe().transpose()
# how many orders were placed by each customer?
n_orders = df2.groupby('CustomerID')['InvoiceNo'].nunique()
mult_orders_percent = np.sum(n_orders > 1) / df2['CustomerID'].nunique()
print(f'{100 * mult_orders_percent:.2f}% of customers ordered more than once.')
# look at distribution of the number of orders per customer
ax = sns.distplot(n_orders, kde=False, hist=True)
ax.set_title('Distribution of the number of orders per customer')
ax.set(xlabel='Number of orders', ylabel='Number of customers')
Cohort retention analysis
df2 = df2[['InvoiceNo', 'CustomerID', 'InvoiceDate']].drop_duplicates()
# create cohort and order month variables
df2['order_month'] = df2['InvoiceDate'].dt.to_period('M')
df2['cohort'] = df2.groupby('CustomerID')['order_month'].transform('min')
# count the number of unique customers in each cohort and order month
df_cohort = df2.groupby(['cohort', 'order_month']).agg(n_customers=('CustomerID', 'nunique')).reset_index(drop=False)
# find number of months between the cohort month and the month of the purchase
df_cohort['period_num'] = (df_cohort['order_month'] - df_cohort['cohort']).apply(attrgetter('n'))
# pivot df_cohort on cohort
cohort_pivot = df_cohort.pivot(index='cohort', columns='period_num', values='n_customers')
cohort_pivot
# divide number of customers in each subsequent period by cohort size
cohort_size = cohort_pivot.iloc[:, 0]
retention_matrix = cohort_pivot.divide(cohort_size, axis=0)
# plot retention matrix as heatmap
with sns.axes_style("white"):
fig, ax = plt.subplots(1, 2, figsize=(12, 8), sharey=True, gridspec_kw={'width_ratios': [1, 11]})
# retention matrix
sns.heatmap(retention_matrix,
mask=retention_matrix.isnull(),
annot=True,
fmt='.0%',
cmap='RdYlGn',
ax=ax[1])
ax[1].set_title('Monthly Cohorts: User Retention', fontsize=16)
ax[1].set(xlabel='# of periods',
ylabel='')
# cohort size
cohort_size_df = pd.DataFrame(cohort_size).rename(columns={0: 'cohort_size'})
white_cmap = mcolors.ListedColormap(['white'])
sns.heatmap(cohort_size_df,
annot=True,
cbar=False,
fmt='g',
cmap=white_cmap,
ax=ax[0])
fig.tight_layout()