Cohort Analysis Sample

import pandas as pd data = pd.read_csv('Advent_Emmail_Receipt_Case_Study.csv', dtype = {'product_description': 'str'}) data['email_time'] = pd.to_datetime(data['email_time']) data['quarter'] = data['email_time'].dt.to_period('Q') data = data[(data.email_time>='2019-1-1') & (data.email_time<='2020-12-31')] data = data[(data.item_price>0) & (data.order_subtotal>0)] data['product_subtitle']=data.product_subtitle.fillna('') data['desc'] = data.product_description +' '+data.product_subtitle

pdes = data.desc print(len(pdes)) brand=[] sku = [] for x in pdes: b,s = str_proc(x) brand.append(b) sku.append(s) data['brand'] = brand data['sku'] = sku data['sku'] = data.sku.str.slice(stop=15) data = data[data.sku!='']

unique_buyers = data.groupby('brand')['user_id'].nunique() unique_buyers.to_csv('unique_buyers.csv')

unique_buyers

brandname = ['ALLBIRDS','CROCS','DR. MARTENS', 'PUMA','SKECHERS', 'STEVE MADDEN','TIMBERLAND','UGG'] sdata = data[data["brand"].isin(brandname)].copy() brand_user = sdata.groupby(['brand','user_id']).count() brand_user.reset_index(inplace=True) brand_user = brand_user[['brand','user_id']] # Loop to calculate number of crossover customers for each brand overlap = pd.DataFrame(columns=brandname, index=brandname) for b1 in brandname: user1 = set(brand_user.user_id[brand_user.brand == b1]) for b2 in brandname: user2 = set(brand_user.user_id[brand_user.brand == b2]) overlap.loc[b1,b2] = len(user1.intersection(user2)) print(overlap)

             ALLBIRDS CROCS DR. MARTENS  PUMA SKECHERS STEVE MADDEN  \
ALLBIRDS          194     6           4     8        9            7   
CROCS               6  9417         197   519     1480          657   
DR. MARTENS         4   197        3769   195      244          565   
PUMA                8   519         195  9810     1204          806   
SKECHERS            9  1480         244  1204    31106         1412   
STEVE MADDEN        7   657         565   806     1412        13514   
TIMBERLAND          9   280         187   442      616          573   
UGG                24  1321         822  1226     2804         3137   

             TIMBERLAND    UGG  
ALLBIRDS              9     24  
CROCS               280   1321  
DR. MARTENS         187    822  
PUMA                442   1226  
SKECHERS            616   2804  
STEVE MADDEN        573   3137  
TIMBERLAND         5874   1073  
UGG                1073  25619

import numpy as np arr = overlap.to_numpy()

row_labels = ['ALLBIRDS', 'CROCS','DR. MARTENS', 'PUMA', 'SKECHERS', 'STEVE MADDEN', 'TIMBERLAND', 'UGG'] column_labels = ['ALLBIRDS', 'CROCS','DR. MARTENS', 'PUMA', 'SKECHERS', 'STEVE MADDEN', 'TIMBERLAND', 'UGG'] crossover = pd.DataFrame(arr, columns=column_labels, index=row_labels)

# used np array to improve readability of output crossover

quarters = pd.period_range('2019-4','2020-12', freq='Q') retention = pd.DataFrame(columns=quarters, index = brandname) # Loop to calculate quarterly customer retention rate as percent for b1 in brandname: bdata = sdata[sdata.brand==b1] for qt in quarters: qt0 = qt-1 user_qt0 = set(bdata.user_id[bdata.quarter==qt0]) user_qt1 = set(bdata.user_id[bdata.quarter==qt]) retention.loc[b1, qt] = len(user_qt0.intersection(user_qt1))/len(user_qt0)*100 print(retention)

                2019Q2     2019Q3     2019Q4    2020Q1     2020Q2     2020Q3  \
ALLBIRDS      5.555556  11.111111  13.513514       5.0  23.076923       25.0   
CROCS         7.174103   4.321729   4.983923  3.809524   8.695652   4.897706   
DR. MARTENS   4.659498   4.402516   5.592105  4.513064   5.080214   8.333333   
PUMA          6.980057   5.704971   5.873494  5.815832   7.434053   7.352062   
SKECHERS      11.90303  10.518877  11.157755  9.074373  11.462551   9.298532   
STEVE MADDEN  9.437751   9.301397   9.530324  7.139839   8.695652   9.394108   
TIMBERLAND    3.370787   6.148867   6.425703  2.720637   5.156538   5.236486   
UGG           3.938731   8.357075    21.0625   6.76815   5.394685  10.239521   

                 2020Q4  
ALLBIRDS      14.814815  
CROCS          5.608215  
DR. MARTENS    6.902985  
PUMA           5.197245  
SKECHERS      11.230926  
STEVE MADDEN   9.727626  
TIMBERLAND     7.009346  
UGG           19.877676

# Get percentage of loyal customers for each brand loyal = {} for b1 in brandname: user1 = set(brand_user.user_id[brand_user.brand == b1]) numtotal = len(user1) for b2 in brandname: if b1 == b2: continue user2 = set(brand_user.user_id[brand_user.brand == b2]) user1 = user1.difference(user2) loyal[b1] = len(user1)/numtotal*100 loyal_cust = pd.DataFrame({'Company':loyal.keys(), 'Loyal_customer':loyal.values()}).sort_values(['Loyal_customer'], ascending=False)

loyal_cust

sdata2 = sdata.copy() # Number of unique shoe skus for each brand usku = sdata2.groupby(['brand'])['sku'].agg(Num_uniuqe_SKU = lambda x: len(x.unique())) usku = usku.reset_index() print(usku)

          brand  Num_uniuqe_SKU
0      ALLBIRDS              90
1         CROCS            2299
2   DR. MARTENS            1054
3          PUMA            4164
4      SKECHERS            7268
5  STEVE MADDEN            4246
6    TIMBERLAND            1730
7           UGG            5251

popsku = sdata2.groupby(['brand','sku'])['item_quantity'].sum() popsku = popsku.reset_index() # Identify the most popular shoe sku for each brand brandpop = pd.DataFrame() for b in brandname: p = popsku[popsku.brand == b].sort_values('item_quantity',ascending = False) brandpop = brandpop.append(p.iloc[0,:]) print('Most popular shoe SKU') print(brandpop)

Most popular shoe SKU
              brand              sku  item_quantity
78         ALLBIRDS  WOOL RUNNERS NA           55.0
445           CROCS     CLASSIC CLOG          477.0
2600    DR. MARTENS  COMBAT BOOT LEA          250.0
4324           PUMA  COOL CAT SPORT           238.0
14707      SKECHERS  WORK RELAXED FI          745.0
15046  STEVE MADDEN  BANDI PLATFORM           415.0
19379    TIMBERLAND  CH PREMIUM WATE          236.0
24015           UGG  KOOLABURRA VICT         1543.0

pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None)

pdata = data[data.sku.isin(brandpop.sku)].copy() # Show quarterly price changes of most popular shoe skus pricechange = pdata.groupby(['quarter','brand','sku'])['item_price'].mean() pricechange = pricechange.reset_index() pricepivot = pricechange.pivot(index=['brand','sku'], columns='quarter', values='item_price')

pricepivot

x=data.groupby(['brand'])['order_total_amount'].agg(order_average='mean') # Average order size for each brand y=data.groupby(['brand','quarter'])['user_id'].agg(freq=lambda x: len(x)/len(x.unique())) quarterlyfreq = y.groupby(['brand'])['freq'].agg(freq='mean') # Average freq of orders in any given quarter # Force rank 8 companies by the dollar value of their customers quarterlyfreq['customer_value'] = quarterlyfreq.freq*x.order_average cust = quarterlyfreq['customer_value'] cust = cust.reset_index() cust = cust[cust.brand!=''].sort_values('customer_value', ascending=False) print(cust)

          brand  customer_value
8           UGG      234.285991
7    TIMBERLAND      201.672742
6  STEVE MADDEN      198.752948
3   DR. MARTENS      183.324555
4          PUMA      132.480963
5      SKECHERS      128.981963
1      ALLBIRDS      105.927484
2         CROCS       98.342576