import pandas as pd
data = pd.read_csv('Advent_Emmail_Receipt_Case_Study.csv', dtype = {'product_description': 'str'})
data['email_time'] = pd.to_datetime(data['email_time'])
data['quarter'] = data['email_time'].dt.to_period('Q')
data = data[(data.email_time>='2019-1-1') & (data.email_time<='2020-12-31')]
data = data[(data.item_price>0) & (data.order_subtotal>0)]
data['product_subtitle']=data.product_subtitle.fillna('')
data['desc'] = data.product_description +' '+data.product_subtitle
# use re to extract brand name from desc and identify unique shoe skus
import re
def str_proc(instr):
pt_brand = re.compile('allbirds|crocs|Dr. Martens|puma|skechers|steve madden|timberland|ugg',re.IGNORECASE)
pt_wordout = re.compile(r'\\swoman|womens|men|woman|mens|shoes|size|by|from|with|white|black|sz|small|color|\
medium|large|grey|blue|navy|pink|lavender|slate|us|espresso|brown|uk|eu|walnut|mint|\
sea|yellow|red|green|gold|silver|graphite|floral|leopard|khaki|coral|oyster|candy|rare|\
euc|gray|mossy|oak|lemon|army|lapis|in|great|condition|paypal|bright|dark|lime|punch|\
lightening|flames|pool|melon|galaxy|neo|neon|pepper|purple|tangerine|nwob|volt|mineral|\
fresco|blossom|cantaloupe|citrus|dusty|new|spicy|camo|hot|barely|carnation|rose|teal|\
tropical|grass|burgundy|garnet|mw|taupe|wheat|rust|nubuck|all|steeple|beige|violet|fawn|\
chestnut\\W',re.IGNORECASE)
pt_specialout = re.compile('[^a-zA-Z_]')
brand = pt_brand.search(instr)
if brand == None:
brand = ''
else:
brand = brand.group()
out = pt_brand.sub('', instr)
out = re.sub('\s\w\s', ' ', out) #get rid of a single letter
out = re.sub('\s\w$','', out) #get rid of a single letter in the end
out = re.sub('\s+','_', out) #replace splace by _
out = pt_specialout.sub('',out) #get rid of special char
out = pt_wordout.sub('',out) #get rid of common words
out = re.sub('_+',' ', out) #replace multiple _ to space
out = re.sub('\s\w\s', ' ', out)
out = re.sub('\s\w$','', out)
out = re.sub('\s*$','', out)
out = re.sub('^\s*','',out)
return brand.upper(),out.upper()
pdes = data.desc
print(len(pdes))
brand=[]
sku = []
for x in pdes:
b,s = str_proc(x)
brand.append(b)
sku.append(s)
data['brand'] = brand
data['sku'] = sku
data['sku'] = data.sku.str.slice(stop=15)
data = data[data.sku!='']
unique_buyers = data.groupby('brand')['user_id'].nunique()
unique_buyers.to_csv('unique_buyers.csv')
unique_buyers
brandname = ['ALLBIRDS','CROCS','DR. MARTENS', 'PUMA','SKECHERS', 'STEVE MADDEN','TIMBERLAND','UGG']
sdata = data[data["brand"].isin(brandname)].copy()
brand_user = sdata.groupby(['brand','user_id']).count()
brand_user.reset_index(inplace=True)
brand_user = brand_user[['brand','user_id']]
# Loop to calculate number of crossover customers for each brand
overlap = pd.DataFrame(columns=brandname, index=brandname)
for b1 in brandname:
user1 = set(brand_user.user_id[brand_user.brand == b1])
for b2 in brandname:
user2 = set(brand_user.user_id[brand_user.brand == b2])
overlap.loc[b1,b2] = len(user1.intersection(user2))
print(overlap)
import numpy as np
arr = overlap.to_numpy()
row_labels = ['ALLBIRDS', 'CROCS','DR. MARTENS', 'PUMA', 'SKECHERS', 'STEVE MADDEN', 'TIMBERLAND', 'UGG']
column_labels = ['ALLBIRDS', 'CROCS','DR. MARTENS', 'PUMA', 'SKECHERS', 'STEVE MADDEN', 'TIMBERLAND', 'UGG']
crossover = pd.DataFrame(arr, columns=column_labels, index=row_labels)
# used np array to improve readability of output
crossover
quarters = pd.period_range('2019-4','2020-12', freq='Q')
retention = pd.DataFrame(columns=quarters, index = brandname)
# Loop to calculate quarterly customer retention rate as percent
for b1 in brandname:
bdata = sdata[sdata.brand==b1]
for qt in quarters:
qt0 = qt-1
user_qt0 = set(bdata.user_id[bdata.quarter==qt0])
user_qt1 = set(bdata.user_id[bdata.quarter==qt])
retention.loc[b1, qt] = len(user_qt0.intersection(user_qt1))/len(user_qt0)*100
print(retention)
# Get percentage of loyal customers for each brand
loyal = {}
for b1 in brandname:
user1 = set(brand_user.user_id[brand_user.brand == b1])
numtotal = len(user1)
for b2 in brandname:
if b1 == b2:
continue
user2 = set(brand_user.user_id[brand_user.brand == b2])
user1 = user1.difference(user2)
loyal[b1] = len(user1)/numtotal*100
loyal_cust = pd.DataFrame({'Company':loyal.keys(), 'Loyal_customer':loyal.values()}).sort_values(['Loyal_customer'], ascending=False)
loyal_cust
sdata2 = sdata.copy()
# Number of unique shoe skus for each brand
usku = sdata2.groupby(['brand'])['sku'].agg(Num_uniuqe_SKU = lambda x: len(x.unique()))
usku = usku.reset_index()
print(usku)
popsku = sdata2.groupby(['brand','sku'])['item_quantity'].sum()
popsku = popsku.reset_index()
# Identify the most popular shoe sku for each brand
brandpop = pd.DataFrame()
for b in brandname:
p = popsku[popsku.brand == b].sort_values('item_quantity',ascending = False)
brandpop = brandpop.append(p.iloc[0,:])
print('Most popular shoe SKU')
print(brandpop)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pdata = data[data.sku.isin(brandpop.sku)].copy()
# Show quarterly price changes of most popular shoe skus
pricechange = pdata.groupby(['quarter','brand','sku'])['item_price'].mean()
pricechange = pricechange.reset_index()
pricepivot = pricechange.pivot(index=['brand','sku'], columns='quarter', values='item_price')
pricepivot
x=data.groupby(['brand'])['order_total_amount'].agg(order_average='mean') # Average order size for each brand
y=data.groupby(['brand','quarter'])['user_id'].agg(freq=lambda x: len(x)/len(x.unique()))
quarterlyfreq = y.groupby(['brand'])['freq'].agg(freq='mean') # Average freq of orders in any given quarter
# Force rank 8 companies by the dollar value of their customers
quarterlyfreq['customer_value'] = quarterlyfreq.freq*x.order_average
cust = quarterlyfreq['customer_value']
cust = cust.reset_index()
cust = cust[cust.brand!=''].sort_values('customer_value', ascending=False)
print(cust)