import pandas as pd
data = pd.read_csv('Advent_Emmail_Receipt_Case_Study.csv', dtype = {'product_description': 'str'})
data['email_time'] = pd.to_datetime(data['email_time'])
data['quarter'] = data['email_time'].dt.to_period('Q')
data = data[(data.email_time>='2019-1-1') & (data.email_time<='2020-12-31')]
data = data[(data.item_price>0) & (data.order_subtotal>0)]
data['product_subtitle']=data.product_subtitle.fillna('')
data['desc'] = data.product_description +' '+data.product_subtitle
# use re to extract brand name from desc and identify unique shoe skus
import re
def str_proc(instr):
pt_brand = re.compile('allbirds|crocs|Dr. Martens|puma|skechers|steve madden|timberland|ugg',re.IGNORECASE)
pt_wordout = re.compile(r'\\swoman|womens|men|woman|mens|shoes|size|by|from|with|white|black|sz|small|color|\
medium|large|grey|blue|navy|pink|lavender|slate|us|espresso|brown|uk|eu|walnut|mint|\
sea|yellow|red|green|gold|silver|graphite|floral|leopard|khaki|coral|oyster|candy|rare|\
euc|gray|mossy|oak|lemon|army|lapis|in|great|condition|paypal|bright|dark|lime|punch|\
lightening|flames|pool|melon|galaxy|neo|neon|pepper|purple|tangerine|nwob|volt|mineral|\
fresco|blossom|cantaloupe|citrus|dusty|new|spicy|camo|hot|barely|carnation|rose|teal|\
tropical|grass|burgundy|garnet|mw|taupe|wheat|rust|nubuck|all|steeple|beige|violet|fawn|\
chestnut\\W',re.IGNORECASE)
pt_specialout = re.compile('[^a-zA-Z_]')
brand = pt_brand.search(instr)
if brand == None:
brand = ''
else:
brand = brand.group()
out = pt_brand.sub('', instr)
out = re.sub('\s\w\s', ' ', out) #get rid of a single letter
out = re.sub('\s\w$','', out) #get rid of a single letter in the end
out = re.sub('\s+','_', out) #replace splace by _
out = pt_specialout.sub('',out) #get rid of special char
out = pt_wordout.sub('',out) #get rid of common words
out = re.sub('_+',' ', out) #replace multiple _ to space
out = re.sub('\s\w\s', ' ', out)
out = re.sub('\s\w$','', out)
out = re.sub('\s*$','', out)
out = re.sub('^\s*','',out)
return brand.upper(),out.upper()
pdes = data.desc
print(len(pdes))
brand=[]
sku = []
for x in pdes:
b,s = str_proc(x)
brand.append(b)
sku.append(s)
data['brand'] = brand
data['sku'] = sku
data['sku'] = data.sku.str.slice(stop=15)
data = data[data.sku!='']
159776
unique_buyers = data.groupby('brand')['user_id'].nunique()
unique_buyers.to_csv('unique_buyers.csv')
unique_buyers
brandname = ['ALLBIRDS','CROCS','DR. MARTENS', 'PUMA','SKECHERS', 'STEVE MADDEN','TIMBERLAND','UGG']
sdata = data[data["brand"].isin(brandname)].copy()
brand_user = sdata.groupby(['brand','user_id']).count()
brand_user.reset_index(inplace=True)
brand_user = brand_user[['brand','user_id']]
# Loop to calculate number of crossover customers for each brand
overlap = pd.DataFrame(columns=brandname, index=brandname)
for b1 in brandname:
user1 = set(brand_user.user_id[brand_user.brand == b1])
for b2 in brandname:
user2 = set(brand_user.user_id[brand_user.brand == b2])
overlap.loc[b1,b2] = len(user1.intersection(user2))
print(overlap)
ALLBIRDS CROCS DR. MARTENS PUMA SKECHERS STEVE MADDEN \
ALLBIRDS 194 6 4 8 9 7
CROCS 6 9417 197 519 1480 657
DR. MARTENS 4 197 3769 195 244 565
PUMA 8 519 195 9810 1204 806
SKECHERS 9 1480 244 1204 31106 1412
STEVE MADDEN 7 657 565 806 1412 13514
TIMBERLAND 9 280 187 442 616 573
UGG 24 1321 822 1226 2804 3137
TIMBERLAND UGG
ALLBIRDS 9 24
CROCS 280 1321
DR. MARTENS 187 822
PUMA 442 1226
SKECHERS 616 2804
STEVE MADDEN 573 3137
TIMBERLAND 5874 1073
UGG 1073 25619
import numpy as np
arr = overlap.to_numpy()
row_labels = ['ALLBIRDS', 'CROCS','DR. MARTENS', 'PUMA', 'SKECHERS', 'STEVE MADDEN', 'TIMBERLAND', 'UGG']
column_labels = ['ALLBIRDS', 'CROCS','DR. MARTENS', 'PUMA', 'SKECHERS', 'STEVE MADDEN', 'TIMBERLAND', 'UGG']
crossover = pd.DataFrame(arr, columns=column_labels, index=row_labels)
# used np array to improve readability of output
crossover
quarters = pd.period_range('2019-4','2020-12', freq='Q')
retention = pd.DataFrame(columns=quarters, index = brandname)
# Loop to calculate quarterly customer retention rate as percent
for b1 in brandname:
bdata = sdata[sdata.brand==b1]
for qt in quarters:
qt0 = qt-1
user_qt0 = set(bdata.user_id[bdata.quarter==qt0])
user_qt1 = set(bdata.user_id[bdata.quarter==qt])
retention.loc[b1, qt] = len(user_qt0.intersection(user_qt1))/len(user_qt0)*100
print(retention)
2019Q2 2019Q3 2019Q4 2020Q1 2020Q2 2020Q3 \
ALLBIRDS 5.555556 11.111111 13.513514 5.0 23.076923 25.0
CROCS 7.174103 4.321729 4.983923 3.809524 8.695652 4.897706
DR. MARTENS 4.659498 4.402516 5.592105 4.513064 5.080214 8.333333
PUMA 6.980057 5.704971 5.873494 5.815832 7.434053 7.352062
SKECHERS 11.90303 10.518877 11.157755 9.074373 11.462551 9.298532
STEVE MADDEN 9.437751 9.301397 9.530324 7.139839 8.695652 9.394108
TIMBERLAND 3.370787 6.148867 6.425703 2.720637 5.156538 5.236486
UGG 3.938731 8.357075 21.0625 6.76815 5.394685 10.239521
2020Q4
ALLBIRDS 14.814815
CROCS 5.608215
DR. MARTENS 6.902985
PUMA 5.197245
SKECHERS 11.230926
STEVE MADDEN 9.727626
TIMBERLAND 7.009346
UGG 19.877676
# Get percentage of loyal customers for each brand
loyal = {}
for b1 in brandname:
user1 = set(brand_user.user_id[brand_user.brand == b1])
numtotal = len(user1)
for b2 in brandname:
if b1 == b2:
continue
user2 = set(brand_user.user_id[brand_user.brand == b2])
user1 = user1.difference(user2)
loyal[b1] = len(user1)/numtotal*100
loyal_cust = pd.DataFrame({'Company':loyal.keys(), 'Loyal_customer':loyal.values()}).sort_values(['Loyal_customer'], ascending=False)
loyal_cust
sdata2 = sdata.copy()
# Number of unique shoe skus for each brand
usku = sdata2.groupby(['brand'])['sku'].agg(Num_uniuqe_SKU = lambda x: len(x.unique()))
usku = usku.reset_index()
print(usku)
brand Num_uniuqe_SKU
0 ALLBIRDS 90
1 CROCS 2299
2 DR. MARTENS 1054
3 PUMA 4164
4 SKECHERS 7268
5 STEVE MADDEN 4246
6 TIMBERLAND 1730
7 UGG 5251
popsku = sdata2.groupby(['brand','sku'])['item_quantity'].sum()
popsku = popsku.reset_index()
# Identify the most popular shoe sku for each brand
brandpop = pd.DataFrame()
for b in brandname:
p = popsku[popsku.brand == b].sort_values('item_quantity',ascending = False)
brandpop = brandpop.append(p.iloc[0,:])
print('Most popular shoe SKU')
print(brandpop)
Most popular shoe SKU
brand sku item_quantity
78 ALLBIRDS WOOL RUNNERS NA 55.0
445 CROCS CLASSIC CLOG 477.0
2600 DR. MARTENS COMBAT BOOT LEA 250.0
4324 PUMA COOL CAT SPORT 238.0
14707 SKECHERS WORK RELAXED FI 745.0
15046 STEVE MADDEN BANDI PLATFORM 415.0
19379 TIMBERLAND CH PREMIUM WATE 236.0
24015 UGG KOOLABURRA VICT 1543.0
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pdata = data[data.sku.isin(brandpop.sku)].copy()
# Show quarterly price changes of most popular shoe skus
pricechange = pdata.groupby(['quarter','brand','sku'])['item_price'].mean()
pricechange = pricechange.reset_index()
pricepivot = pricechange.pivot(index=['brand','sku'], columns='quarter', values='item_price')
pricepivot
x=data.groupby(['brand'])['order_total_amount'].agg(order_average='mean') # Average order size for each brand
y=data.groupby(['brand','quarter'])['user_id'].agg(freq=lambda x: len(x)/len(x.unique()))
quarterlyfreq = y.groupby(['brand'])['freq'].agg(freq='mean') # Average freq of orders in any given quarter
# Force rank 8 companies by the dollar value of their customers
quarterlyfreq['customer_value'] = quarterlyfreq.freq*x.order_average
cust = quarterlyfreq['customer_value']
cust = cust.reset_index()
cust = cust[cust.brand!=''].sort_values('customer_value', ascending=False)
print(cust)
brand customer_value
8 UGG 234.285991
7 TIMBERLAND 201.672742
6 STEVE MADDEN 198.752948
3 DR. MARTENS 183.324555
4 PUMA 132.480963
5 SKECHERS 128.981963
1 ALLBIRDS 105.927484
2 CROCS 98.342576