import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import ipywidgets
from numpy import mean
from IPython.display import display_html
from itertools import chain,cycle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import gc
# import the sales tables from folder
path = r'/work/data/Sales'
sales_files = glob.glob(path+'/*.csv')
sales_list = []
for file in sales_files:
df = pd.read_csv(file, index_col = None, header= 0)
sales_list.append(df)
sales = pd.concat(sales_list, axis = 0, ignore_index = True)
# import products table from csv file
products = pd.read_csv('data/Contoso_Products.csv')
# import the customers table from csv file
customers = pd.read_csv('data/Contoso_Customers.csv', encoding= 'unicode_escape')
#import territories table from csv file
territories = pd.read_csv('data/Contoso_Territories.csv')
# make a list for tables
tables_list = [sales,products,customers,territories]
# iterate over the tables-list and print info for each
for table in tables_list:
print(table.info(),'\n')
totalCustomers = customers['CustomerKey'].count()
print(f'There are total of {totalCustomers} customer in the "customer" table')
salesCustomers = sales['CustomerKey'].drop_duplicates().count()
print(f'There are total of {salesCustomers} customer in the "sales" table')
# merge the sales dataframe with the products data frame and assign that to sales_product
sales_products = sales.merge(
products[['ProductKey','ProductPrice']],left_on = 'ProductKey', right_on = 'ProductKey')
sales_products['Revenue'] = round(sales_products['ProductPrice']*sales_products['OrderQuantity'],2)
sales_products.head(2)
# merge the sales_products dataframe with the customer dataframe and assign it to customer_analysis_df
customer_analysis= sales_products.merge(customers, left_on = 'CustomerKey', right_on='CustomerKey')
# add 'Customer' column which include the customer's 'FirstName' and 'lastName'
customer_analysis['Customer'] = customer_analysis['FirstName']+' '+customer_analysis['LastName']
# Capitalize the 'Customer' column
customer_analysis['Customer'] = [name.title() for name in customer_analysis['Customer'].values]
# delete the 'FirstName' and 'LastName' columns
customer_analysis.drop(['FirstName','LastName'], axis = 1, inplace = True)
# clean the 'AnnualIncome' column by removing the $ sign and convert its data type into intger
customer_analysis['AnnualIncome'] = customer_analysis['AnnualIncome'].str.split('$').str.get(1).str.replace(',','').astype('int')
# convert the 'orderDate' and 'birthDate' column into datetime dataframe
customer_analysis['OrderDate'] = pd.to_datetime(customer_analysis['OrderDate'])
customer_analysis['BirthDate'] = pd.to_datetime(customer_analysis['BirthDate'])
def annual_income_level(x):
if x < 40000:
return 'Low Income Level'
elif x <= 80000:
return 'Medium Income Level'
elif x > 80000:
return 'High Income Level'
customer_analysis['IncomeLevel'] = customer_analysis['AnnualIncome'].\
apply(lambda x : annual_income_level(x))
customer_analysis['AgeBasedOnOrderDate'] = (
customer_analysis['OrderDate'].dt.year - customer_analysis['BirthDate'].dt.year)
customer_analysis['AgeBasedOnPresentDate'] = (
customer_analysis['OrderDate'].dt.year.max() - customer_analysis['BirthDate'].dt.year)
print(f"Maximum age based on order date is {customer_analysis['AgeBasedOnOrderDate'].max()}")
print(f"Minimum age based on order date is {customer_analysis['AgeBasedOnOrderDate'].min()}")
print('\n')
print(f"Maximum age based on present date is {customer_analysis['AgeBasedOnPresentDate'].max()}")
print(f"Minimum age based on present date is {customer_analysis['AgeBasedOnPresentDate'].min()}")
# assuming that no customer will be less than 14 years old, make a function to assign the age groups for
# the two age columns
def age_group_order_date(x):
if x < 20:
return '14-19'
elif x < 40:
return '20-39'
elif x < 60:
return '40-59'
elif x < 80:
return '60-79'
elif x < 100:
return '80-99'
else:
return '>= 100'
customer_analysis['AgeGroup_BasedOnOrderDate'] = customer_analysis['AgeBasedOnOrderDate'].\
apply(lambda x: age_group_order_date(x))
customer_analysis['AgeGroup_BasedOnPresentDate'] = customer_analysis['AgeBasedOnPresentDate'].\
apply(lambda x: age_group_order_date(x))
customer_analysis = customer_analysis.merge(
territories, left_on = 'TerritoryKey', right_on = 'SalesTerritoryKey')
def region_function(x):
if x == 'Southwest':
return 'USA-SW'
elif x == 'Northwest':
return 'USA-NW'
elif x == 'Southeast':
return 'USA-SE'
elif x == 'Northeast':
return 'USA-NE'
elif x == 'Central':
return 'USA-C'
else:
return x
customer_analysis['Region'] = customer_analysis['Region'].apply(lambda x : region_function(x))
# delete tthe unnecessary columns
customer_analysis.drop(
['StockDate','TerritoryKey','SalesTerritoryKey','EmailAddress','Prefix','ProductPrice','OrderLineItem',
'BirthDate'], axis = 1, inplace = True)
# delete the sales_products dataframe
del sales_products
# convert the object columns with repeated values into category type
cols_names = [str(col) for col in customer_analysis.columns]
for col in cols_names:
if (customer_analysis[col].dtype == 'O') & (len(customer_analysis[col].value_counts()) < 20):
customer_analysis[col] = customer_analysis[col].astype('category')
customer_analysis.info()
def make_customer_category_rev_df(df,col):
"""a function that takes a dataframe and a name of a categorical column then aggreagte the dataframe
by "CustomerKey" and the categorical column to add the "TotalRevenue"."""
customer_category_rev_df = df.groupby(['CustomerKey',col]).\
agg({'Revenue':'sum'}).reset_index().\
rename(columns = {'Revenue':'TotalRevenue'})
customer_category_rev_df = customer_category_rev_df[customer_category_rev_df['TotalRevenue']!=0]
if col == 'AgeGroup_BasedOnPresentDate':
customer_category_rev_df.rename(columns = {'AgeGroup_BasedOnPresentDate':'AgeGroup'}, inplace = True)
if col == 'AgeGroup_BasedOnOrderDate':
customer_category_rev_df.rename(columns = {'AgeGroup_BasedOnOrderDate':'AgeGroup'}, inplace = True)
if col == 'TotalChildren':
customer_category_rev_df[col] = customer_category_rev_df[col].astype('str')
return customer_category_rev_df
def make_customer_count(df,col):
"""a function that takes in a dataframe and one of its categorical columns' name then calculates the
customer count for each category in a categorical column."""
df_count = df.groupby(col).agg({'CustomerKey':'count'}).\
rename(columns = {'CustomerKey':'CustomersCount'}).\
sort_values('CustomersCount', ascending =False)
if col == 'AgeGroup':
df_count.drop('>= 100',axis=0, inplace=True)
return df_count
# write a function that display the customers count per category
def make_count_vis(df,title):
"""a function that takes in a dataframe and a title then visualizes the customers count for each category
in a column in bar chart and their percentage in and pie chart."""
print(df)
chart_order = df.index
if title == 'Occupation':
explode_list = [0,0,0,0,0.11]
elif title == 'Age group':
explode_list = [0,0,0,0.14]
elif title == 'Region':
explode_list = [0,0,0.11]
else:
explode_list =[]
for i in range(len(df.index)):
explode_list.append(0)
plt.figure(figsize = (18,6), facecolor = 'lightblue')
plt.suptitle(f'Number of Customers by {title}', size = 22)
plt.subplot(1,2,1)
sns.barplot(x = df.index, y= 'CustomersCount', data = df, order = chart_order )
# plt.title(f'{title} by Count', size = 18)
plt.xticks(size = 14)
plt.xlabel(f'{title}', size = 18)
plt.ylabel('Count of Customers',size = 18)
plt.yticks(size = 18)
plt.subplot(1,2,2)
if len(df.index) == 2:
plt.pie(x = df['CustomersCount'],
labels = df.index,
startangle= 90,
counterclock= False,
textprops={'fontsize': 14},
autopct= '%2.1F%%')
else:
plt.pie(x = df['CustomersCount'],
labels = df.index,
startangle= 90,
counterclock= False,
textprops={'fontsize': 14},
autopct= '%2.1F%%',
explode = explode_list
)
# plt.title(f'{title} by Percentage of Total', size = 18)
circle = plt.Circle((0,0), 0.78, color = 'white')
get_current_figure = plt.gcf()
get_current_figure.gca().add_artist(circle)
plt.tight_layout()
plt.show()
# write a function to display tables horizontally with their titles
def display_side_by_side(*args,titles=cycle([''])):
"""a function that takes in a list of dataframes and their titles then displays them side by side."""
html_str=''
for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
html_str+='<th style="text-align:center"><td style="vertical-align:top">'
html_str+=f'<h6>{title}</h6>'
html_str+=df.to_html().replace('table','table style="display:inline"')
html_str+='</td></th>'
display_html(html_str,raw=True)
# write a function to calculate the basic statistics of a revenue vs. a categories and display the output
def make_basic_stats(df,col):
"""a function that takes in a dataframe and one of its categorical columns' name then calculates the basic
statistics for the revenue of each category in a categorical column then uses display_side_by_side to display the dataframes of the basic statistics side by side."""
categories_list = df[col].value_counts().index
if col == 'AgeGroup':
categories_list = ['20-39','40-59','60-79','80-99','>= 100']
names = []
stats_list = []
for category in categories_list:
name = category+ ' Revenue Statistics'
names.append(name)
category_stats = df[df[col] == category]['TotalRevenue'].describe().to_frame()
stats_list.append(category_stats)
if col == 'HomeOwner':
names = ['Homowners Revenue Statistics','Non-Homeowners Revenue Statistics']
if col == 'Gender':
names = ['Males Revenue Statistics','Females Revenue Statistics']
display_side_by_side(*stats_list, titles = names)
# make_rev_agg
def make_agg_rev(df, col):
"""a function that takes in a dataframe and a categorical column name then aggregates the Revenue of each
category in the column and return dataframe that includes Total Revenue, Average Revenue and Medain Revenue."""
df_agg_rev = df.groupby(col).agg({'TotalRevenue':['sum','mean','median']})
df_agg_rev = df_agg_rev['TotalRevenue'].\
rename(columns = {'sum':'TotalRevenue','mean':'AverageRevenue','median':'MedianRevenue'}).\
sort_values('AverageRevenue', ascending = False)
if col == 'AgeGroup':
df_agg_rev.drop('>= 100',axis=0, inplace=True)
return df_agg_rev
# make_rev_vis
def make_rev_vis(df1,df2,title,col):
"""a function that takes in 2 dataframes, a title and a categorical column name then visualizes the Total
Revenue, Average Revenue and Median Revenue of each category in the column in multiple charts."""
print(df1)
chart_order = df1.index
if (title == 'Education Level') | (title =='Occupation'):
explode_list = [0,0,0,0,0.11]
elif title =='Region':
explode_list = [0,0.14,0,-0.1,0.14]
elif title == 'Age Group':
explode_list = [0,0,0,0.14]
else:
explode_list = None
fig = plt.figure(figsize=(26,20), facecolor = 'lightblue')
plt.suptitle(f'Revenue by {title}', size = 22)
plt.subplot(3,2,1)
sns.barplot(x=df1.index, y='TotalRevenue', data=df1, order = chart_order)
plt.title(f'Total Revenue by {title}', size =18)
plt.xlabel(f'{title}', size = 18)
plt.ylabel('Total Revenue', size=18)
plt.xticks(size = 18)
plt.yticks(size = 18)
plt.subplot(3,2,2)
plt.pie(x = df1['TotalRevenue'],
labels = df1.index,
radius= 1.1,
startangle= 90,
counterclock= False,
autopct= '%2.1F%%',
textprops={'fontsize': 18},
explode = explode_list
)
circle = plt.Circle((0,0), 0.9, color = 'white')
get_current_figure = plt.gcf()
get_current_figure.gca().add_artist(circle)
plt.subplot(3,2,3)
sns.barplot(x=df1.index, y='AverageRevenue', data=df1, order = chart_order)
plt.title(f'Average Revenue by {title}', size =18)
plt.xlabel(f'{title}', size = 18)
plt.ylabel('Average Revenue', size=18)
plt.xticks(size = 18)
plt.yticks(size = 18)
plt.subplot(3,2,4)
sns.barplot(x=df1.index, y='MedianRevenue', data=df1, order = chart_order)
plt.title(f'Median Revenue by {title}', size =18)
plt.xlabel(f'{title}', size = 18)
plt.ylabel('Median Revenue', size=18)
plt.xticks(size = 18)
plt.yticks(size = 18)
plt.subplot(3,2,5)
sns.boxplot(x='TotalRevenue', y=col, data=df2, order = chart_order)
plt.title(f'Revenue Distribution by {title}', size = 18)
plt.xlabel('Total Revenue', size =18)
plt.ylabel('Total Children', size = 18)
plt.xticks(size = 18)
plt.yticks(size = 18, rotation = 60)
plt.subplot(3,2,6)
ax6 = sns.kdeplot(data = df2, x = 'TotalRevenue', hue=col)
plt.title(f'Revenue Kde Distribution by {title}', size = 18)
plt.xlabel('Total Revenue', size = 18)
plt.ylabel('Density',size =18)
plt.setp(ax6.get_legend().get_texts(), fontsize='18')
plt.setp(ax6.get_legend().get_title(), fontsize='22')
plt.tight_layout()
plt.show()
def make_stats_test(df1,df2,col):
"""a function that takes in 2 dataframes and categorical column then perform hypothesis testing on
the 'TotalRevenue' of 2 categorical values in the categorical column"""
category1 = df2.index[0]
val1 = df2['AverageRevenue'][0]
category2 = df2.index[1]
val2 = df2['AverageRevenue'][1]
_,p_ev = stats.levene(
df1[df1[col] == category1]['TotalRevenue'],
df1[df1[col] == category2]['TotalRevenue'],
center='mean')
if p_ev > 0.05:
equal_variance = True
else:
equal_variance = False
s_avg,p_avg = stats.ttest_ind(
df1[df1[col] == category1]['TotalRevenue'],
df1[df1[col] == category2]['TotalRevenue'],
equal_var = equal_variance)
print('Statistics:',s_avg)
print('P_value',p_avg)
if col == 'TotalChildren':
category1 = '1 child'
category2 = '3 children'
if col == 'HomeOwner':
category1 = 'homeowner'
category2 = 'non homeowner'
if col == 'MaritalStatus':
category1 = 'single'
category2 = 'married'
if col == 'Gender':
category1 = 'female'
category2 = 'male'
if p_avg > 0.05:
print(f"There is no significant evidence that {category1} customers spend more than {category2} customers on average")
else:
print(f"There is a significant evidence that {category1} customers spend more than {category2 } customers on average")
customer_gender_rev = make_customer_category_rev_df(customer_analysis,'Gender')
customer_gender_rev.head()
gender_count = make_customer_count(customer_gender_rev, 'Gender')
gender_count
# colors = ['blue','orange'],
make_count_vis(gender_count, 'Gender')
make_basic_stats(customer_gender_rev,'Gender')
rev_by_gender = make_agg_rev(customer_gender_rev,'Gender')
rev_by_gender
gender_list = ['All']+rev_by_gender.index.tolist()
make_rev_vis(rev_by_gender,customer_gender_rev,'Gender','Gender')
make_stats_test(customer_gender_rev,rev_by_gender,'Gender')
del (customer_gender_rev,
gender_count,
rev_by_gender)
gc.collect()
customer_maritalStatus_rev = make_customer_category_rev_df(customer_analysis,'MaritalStatus')
customer_maritalStatus_rev.head()
maritalStatus_count = make_customer_count(customer_maritalStatus_rev,'MaritalStatus')
maritalStatus_count
make_count_vis(maritalStatus_count,'Marital Status')
make_basic_stats(customer_maritalStatus_rev,'MaritalStatus')
rev_by_maritalStatus = make_agg_rev(customer_maritalStatus_rev,'MaritalStatus')
rev_by_maritalStatus
maritalStatus_list = ['All']+rev_by_maritalStatus.index.tolist()
make_rev_vis(rev_by_maritalStatus,customer_maritalStatus_rev,'Marital Stataus','MaritalStatus')
make_stats_test(customer_maritalStatus_rev,rev_by_maritalStatus,'MaritalStatus')
del (customer_maritalStatus_rev,
maritalStatus_count,
rev_by_maritalStatus
)
customer_homeowner_rev = make_customer_category_rev_df(customer_analysis,'HomeOwner')
customer_homeowner_rev.head()
homeowner_count = make_customer_count(customer_homeowner_rev, 'HomeOwner')
homeowner_count
make_count_vis(homeowner_count,'Homewonership')
make_basic_stats(customer_homeowner_rev,'HomeOwner')
rev_by_homeowner = make_agg_rev(customer_homeowner_rev,'HomeOwner')
rev_by_homeowner
home_ownership_list = ['All','Y',"N"]
make_rev_vis(rev_by_homeowner, customer_homeowner_rev,'Homeownership','HomeOwner')
make_stats_test(customer_homeowner_rev,rev_by_homeowner,'HomeOwner')
del(customer_homeowner_rev,
homeowner_count,
rev_by_homeowner
)
gc.collect()
customer_eduLevel_rev = make_customer_category_rev_df(customer_analysis,'EducationLevel')
customer_eduLevel_rev.head()
eduLevel_count = make_customer_count(customer_eduLevel_rev,'EducationLevel')
eduLevel_count
make_count_vis(eduLevel_count,'Education Level')
make_basic_stats(customer_eduLevel_rev,'EducationLevel')
rev_by_eduLevel = make_agg_rev(customer_eduLevel_rev,'EducationLevel')
rev_by_eduLevel
make_rev_vis(rev_by_eduLevel, customer_eduLevel_rev,'Education Level','EducationLevel')
make_stats_test(customer_eduLevel_rev,rev_by_eduLevel,'EducationLevel')
del(customer_eduLevel_rev,
eduLevel_count,
rev_by_eduLevel)
gc.collect()
customer_occupation_rev = make_customer_category_rev_df(customer_analysis,'Occupation')
customer_occupation_rev.head()
occupation_count = make_customer_count(customer_occupation_rev,'Occupation')
occupation_count
make_count_vis(occupation_count,'Occupation')
make_basic_stats(customer_occupation_rev,'Occupation')
rev_by_occupation = make_agg_rev(customer_occupation_rev,'Occupation')
rev_by_occupation
make_rev_vis(rev_by_occupation,customer_occupation_rev,'Occupation','Occupation')
make_stats_test(customer_occupation_rev,rev_by_occupation,'Occupation')
del (customer_occupation_rev,
occupation_count,
rev_by_occupation)
gc.collect()
customer_incomeLevel_rev = make_customer_category_rev_df(customer_analysis,'IncomeLevel')
customer_incomeLevel_rev.head()
incomeLevel_count = make_customer_count(customer_incomeLevel_rev,'IncomeLevel')
incomeLevel_count
make_count_vis(incomeLevel_count,'Income Level')
make_basic_stats(customer_incomeLevel_rev,'IncomeLevel')
rev_by_incomeLevel = make_agg_rev(customer_incomeLevel_rev,'IncomeLevel')
rev_by_incomeLevel
make_rev_vis(rev_by_incomeLevel,customer_incomeLevel_rev,'Income Level','IncomeLevel')
make_stats_test(customer_incomeLevel_rev,rev_by_incomeLevel,'IncomeLevel')
del (customer_incomeLevel_rev,
incomeLevel_count,
rev_by_incomeLevel)
gc.collect()
customer_ageGroup1_rev = make_customer_category_rev_df(customer_analysis,'AgeGroup_BasedOnPresentDate')
customer_ageGroup1_rev.head()
ageGroup_count = make_customer_count(customer_ageGroup1_rev,'AgeGroup')
ageGroup_count
make_count_vis(ageGroup_count,'Age group')
customer_ageGroup2_rev = make_customer_category_rev_df(customer_analysis,'AgeGroup_BasedOnOrderDate')
customer_ageGroup2_rev.head()
make_basic_stats(customer_ageGroup2_rev,'AgeGroup')
rev_by_ageGroup = make_agg_rev(customer_ageGroup2_rev,'AgeGroup')
rev_by_ageGroup
make_rev_vis(rev_by_ageGroup,customer_ageGroup2_rev,'Age Group','AgeGroup')
make_stats_test(customer_ageGroup2_rev,rev_by_ageGroup,'AgeGroup')
del (customer_ageGroup1_rev,
ageGroup_count,
customer_ageGroup2_rev,
rev_by_ageGroup)
gc.collect()
customer_tchildren_rev=make_customer_category_rev_df(customer_analysis,'TotalChildren')
customer_tchildren_rev.head()
tchildren_count = make_customer_count(customer_tchildren_rev,'TotalChildren')
tchildren_count
make_count_vis(tchildren_count, 'Total Children')
make_basic_stats(customer_tchildren_rev,'TotalChildren')
rev_by_tchildren = make_agg_rev(customer_tchildren_rev,'TotalChildren')
rev_by_tchildren
make_rev_vis(rev_by_tchildren,customer_tchildren_rev,'Total Children','TotalChildren')
make_stats_test(customer_tchildren_rev,rev_by_tchildren,'TotalChildren')
del (customer_tchildren_rev,
tchildren_count,
rev_by_tchildren)
gc.collect()
customer_country_rev = make_customer_category_rev_df(customer_analysis,'Country')
customer_country_rev.head()
country_count = make_customer_count(customer_country_rev,'Country')
country_count
make_count_vis(country_count,'Country')
make_basic_stats(customer_country_rev,'Country')
rev_by_country = make_agg_rev(customer_country_rev,'Country')
rev_by_country
make_rev_vis(rev_by_country,customer_country_rev,'Country','Country')
make_stats_test(customer_country_rev,rev_by_country,'Country')
del (customer_country_rev,country_count,rev_by_country)
gc.collect()
df = customer_analysis[customer_analysis['Country']=='United States'].\
groupby(['CustomerKey','Region']).\
agg({'Revenue':'sum'}).rename(columns={'Revenue':'TotalRevenue'})
df=df[df['TotalRevenue']!=0]
df.reset_index(inplace = True)
for region in df['Region'].value_counts().index:
regional_avg = df[df['Region']==region]['TotalRevenue'].mean()
regional_customer_keys = df[df['TotalRevenue']>regional_avg]['CustomerKey'].tolist()
regional_customers_info = customers[customers['CustomerKey']. isin(regional_customer_keys)]
regional_customers_info.to_csv(f'results/{region}_customers_above_regional_average.csv',index=False)
del df
gc.collect()
test_df = customer_analysis.\
groupby(['CustomerKey','Gender','HomeOwner']).\
agg({'Revenue':'sum'})
test_df=test_df[test_df['Revenue']!=0]
test_df.reset_index(inplace = True)
test_df.drop(['CustomerKey','Revenue'], axis = 1, inplace=True)
# H0: There is no association between gender and homeownership
# Ha: There is an association between gender and homeownership
alpha = 0.05
cont_table = pd.crosstab(test_df['Gender'],test_df['HomeOwner'])
s,p_value,df,ev = stats.chi2_contingency(cont_table)
print('alpha: ',alpha)
print('p_value: ',p_value)
if p_value < alpha:
print('There is an association between gender and homeownership')
else:
print('There is no assocaition between gender and homeownership')
del test_df,cont_table,alpha,p_value
gc.collect()
# make the gender widget
gender = ipywidgets.Dropdown(options =gender_list,
value = 'All',
description = 'Gender',
disabled = False);
# make the marital_status widget
marital_status = ipywidgets.Dropdown(options = maritalStatus_list,
value = 'All',
description = 'Marital Status',
disabled = False);
# make the home_ownership widget
home_ownership = ipywidgets.Dropdown(options =home_ownership_list,
value = 'All',
description = 'Home Ownership',
disabled = False);
# define the plotting of the AnnualIncome distribution
def dashboard(gender, marital_status,home_ownership):
dashb_df = customer_analysis[customer_analysis['Country']=='United States']
dashb_df = dashb_df.set_index('OrderDate',drop=False).sort_index().to_period('M')
dashb_df.index.name = 'OrderPeriod'
dashb_df.reset_index(inplace = True)
dashb_df['OrderPeriod'] = dashb_df['OrderPeriod'].astype('str')
if gender != 'All':
dashb_df = dashb_df[dashb_df['Gender'] == gender]
if marital_status != 'All':
dashb_df = dashb_df[dashb_df['MaritalStatus']==marital_status]
if home_ownership != 'All':
dashb_df= dashb_df[dashb_df['HomeOwner'] == home_ownership]
top20_customers = dashb_df.groupby('Customer').\
agg({'Revenue':'sum'}).\
rename(columns ={'Revenue':'TotalRevenue'}).\
sort_values('TotalRevenue', ascending = False).\
head(20)
region_rev = dashb_df.groupby('Region').\
agg({'Revenue':'sum'}).\
rename(columns = {'Revenue':'TotalRevenue'}).\
sort_values('TotalRevenue', ascending = False)
region_rev = region_rev[region_rev['TotalRevenue'] != 0]
southeast_central_northeast = (region_rev.loc['USA-NE'] +
region_rev.loc['USA-C'] +
region_rev.loc['USA-SE'])
region_rev.drop(labels = ['USA-SE','USA-C','USA-NE'], axis = 0, inplace = True)
region_rev.loc['USA-SE, USA-C & USA-NE']=southeast_central_northeast
fig = plt.figure(figsize=(14,10), constrained_layout=True, facecolor = 'lightblue')
fig.suptitle('Revenue Breakdown in USA', size =18)
spec = fig.add_gridspec(2,3)
ax1 = fig.add_subplot(spec[:, 0])
ax1 = sns.barplot(y = top20_customers.index, x = 'TotalRevenue',
data = top20_customers, color = 'blue')
plt.title('Top 20 Customers by Revenue')
for index, value in enumerate(top20_customers['TotalRevenue']):
label = format(int(value),',')
plt.annotate(label, xy=(value - 4700, index), color='white', size = 14)
ax2 = fig.add_subplot(spec[0, 1:])
ax2 = sns.lineplot(x ='OrderPeriod', y = 'Revenue', data = dashb_df, estimator='sum', ci=None)
plt.title('Revenue Timeline')
plt.ylabel('TotalRevenue')
plt.xticks(rotation = 45)
ax3 = fig.add_subplot(spec[1, 1:])
ax3 = sns.barplot(x = region_rev.index, y = 'TotalRevenue', data = region_rev, ci = None)
plt.title('Revenue by Region')
for index, value in enumerate(region_rev['TotalRevenue']):
label = format(int(value),',')
plt.annotate(label, xy=(index-0.25, value-500), color='black')
#add interaction to the funtion
ipywidgets.interact(dashboard,
gender = gender,
marital_status = marital_status,
home_ownership = home_ownership)
rfm_df = customer_analysis.groupby(['CustomerKey','OrderNumber']).\
agg({'Revenue':'sum','OrderDate':'max','AnnualIncome':'mean'}).\
reset_index().\
groupby('CustomerKey').\
agg({'OrderNumber':'count','Revenue':'sum','OrderDate':'max','AnnualIncome':'mean'}).\
reset_index().\
rename(columns = {'OrderNumber':'Frequency','Revenue':'Monetary'})
rfm_df['Recency'] = (rfm_df['OrderDate'].max()-rfm_df['OrderDate'])
rfm_df['Recency'] = rfm_df['Recency'].astype(str).str.split(' ').str.get(0).astype(int)
rfm_df.head()
rfm_stats = rfm_df[['Frequency','Monetary','Recency']].describe()
rfm_stats
f_quarter = (rfm_df['Frequency'].max() - rfm_df['Frequency'].min())/4
f_quarter
rfm_quantiles = {
'recency':{'0.25':rfm_stats.loc['25%','Recency'],'0.50':rfm_stats.loc['50%','Recency'],'0.75':rfm_stats.loc['75%','Recency']},
'frequency':{'0.25':f_quarter,'0.50':(f_quarter*2),'0.75':(3*f_quarter)},
'monetary':{'0.25':rfm_stats.loc['25%','Monetary'],'0.50':rfm_stats.loc['50%','Monetary'],'0.75':rfm_stats.loc['75%','Monetary']}
}
rfm_quantiles
# write a function that return the recency score based on the Recency value
def r_scroe(x):
if x <= rfm_quantiles['recency']['0.25']:
return 4
elif x <= rfm_quantiles['recency']['0.50']:
return 3
elif x <= rfm_quantiles['recency']['0.75']:
return 2
else:
return 1
# write a function that return the frequency score based on the Frequency value
def f_scroe(x):
if x <= rfm_quantiles['frequency']['0.25']:
return 1
elif x <= rfm_quantiles['frequency']['0.50']:
return 2
elif x <= rfm_quantiles['frequency']['0.75']:
return 3
else:
return 4
# write a function that return the moonetary score based on the Monetary value
def m_scroe(x):
if x <= rfm_quantiles['monetary']['0.25']:
return 1
elif x <= rfm_quantiles['monetary']['0.50']:
return 2
elif x <= rfm_quantiles['monetary']['0.75']:
return 3
else:
return 4
# get the r-score
rfm_df['R-Score'] = rfm_df['Recency'].apply(lambda x : r_scroe(x))
# get the f-score
rfm_df['F-Score'] = rfm_df['Frequency'].apply(lambda x : f_scroe(x))
# get the m-score
rfm_df['M-Score'] = rfm_df['Monetary'].apply(lambda x : m_scroe(x))
# get the RFM-Score
rfm_df['RFM-Score'] = (rfm_df['R-Score']*100) + (rfm_df['F-Score']*10) + (rfm_df['M-Score'])
# sort the values based on RFM-Score
rfm_df.sort_values('RFM-Score', ascending = False, inplace =True)
# make a new column 'RFM-Loyalty-Level' based on the RFM-Score
loyalty_level = ['Bronze','Silver','Gold','Platinum']
score_cuts = pd.qcut(rfm_df['RFM-Score'],4,loyalty_level)
rfm_df['RFM-Loyalty-Level'] = score_cuts
rfm_df.head()
for level in loyalty_level:
level_customers = rfm_df[rfm_df['RFM-Loyalty-Level'] == level]['CustomerKey']
level_customers = customers[customers['CustomerKey'].isin(level_customers)]
level_customers.to_csv(f'results/{level}_customers.csv', index = False)