Contoso_Customer

import pandas as pd import glob import matplotlib.pyplot as plt import seaborn as sns from scipy import stats import ipywidgets from numpy import mean from IPython.display import display_html from itertools import chain,cycle import warnings warnings.simplefilter(action='ignore', category=FutureWarning) import gc

# import the sales tables from folder path = r'/work/data/Sales' sales_files = glob.glob(path+'/*.csv') sales_list = [] for file in sales_files: df = pd.read_csv(file, index_col = None, header= 0) sales_list.append(df) sales = pd.concat(sales_list, axis = 0, ignore_index = True) # import products table from csv file products = pd.read_csv('data/Contoso_Products.csv') # import the customers table from csv file customers = pd.read_csv('data/Contoso_Customers.csv', encoding= 'unicode_escape') #import territories table from csv file territories = pd.read_csv('data/Contoso_Territories.csv') # make a list for tables tables_list = [sales,products,customers,territories] # iterate over the tables-list and print info for each for table in tables_list: print(table.info(),'\n')

totalCustomers = customers['CustomerKey'].count() print(f'There are total of {totalCustomers} customer in the "customer" table')

salesCustomers = sales['CustomerKey'].drop_duplicates().count() print(f'There are total of {salesCustomers} customer in the "sales" table')

# merge the sales dataframe with the products data frame and assign that to sales_product sales_products = sales.merge( products[['ProductKey','ProductPrice']],left_on = 'ProductKey', right_on = 'ProductKey') sales_products['Revenue'] = round(sales_products['ProductPrice']*sales_products['OrderQuantity'],2) sales_products.head(2)

# merge the sales_products dataframe with the customer dataframe and assign it to customer_analysis_df customer_analysis= sales_products.merge(customers, left_on = 'CustomerKey', right_on='CustomerKey')

# add 'Customer' column which include the customer's 'FirstName' and 'lastName' customer_analysis['Customer'] = customer_analysis['FirstName']+' '+customer_analysis['LastName'] # Capitalize the 'Customer' column customer_analysis['Customer'] = [name.title() for name in customer_analysis['Customer'].values] # delete the 'FirstName' and 'LastName' columns customer_analysis.drop(['FirstName','LastName'], axis = 1, inplace = True) # clean the 'AnnualIncome' column by removing the $ sign and convert its data type into intger customer_analysis['AnnualIncome'] = customer_analysis['AnnualIncome'].str.split('$').str.get(1).str.replace(',','').astype('int') # convert the 'orderDate' and 'birthDate' column into datetime dataframe customer_analysis['OrderDate'] = pd.to_datetime(customer_analysis['OrderDate']) customer_analysis['BirthDate'] = pd.to_datetime(customer_analysis['BirthDate'])

def annual_income_level(x): if x < 40000: return 'Low Income Level' elif x <= 80000: return 'Medium Income Level' elif x > 80000: return 'High Income Level' customer_analysis['IncomeLevel'] = customer_analysis['AnnualIncome'].\ apply(lambda x : annual_income_level(x))

customer_analysis['AgeBasedOnOrderDate'] = ( customer_analysis['OrderDate'].dt.year - customer_analysis['BirthDate'].dt.year) customer_analysis['AgeBasedOnPresentDate'] = ( customer_analysis['OrderDate'].dt.year.max() - customer_analysis['BirthDate'].dt.year)

print(f"Maximum age based on order date is {customer_analysis['AgeBasedOnOrderDate'].max()}") print(f"Minimum age based on order date is {customer_analysis['AgeBasedOnOrderDate'].min()}") print('\n') print(f"Maximum age based on present date is {customer_analysis['AgeBasedOnPresentDate'].max()}") print(f"Minimum age based on present date is {customer_analysis['AgeBasedOnPresentDate'].min()}")

# assuming that no customer will be less than 14 years old, make a function to assign the age groups for # the two age columns def age_group_order_date(x): if x < 20: return '14-19' elif x < 40: return '20-39' elif x < 60: return '40-59' elif x < 80: return '60-79' elif x < 100: return '80-99' else: return '>= 100' customer_analysis['AgeGroup_BasedOnOrderDate'] = customer_analysis['AgeBasedOnOrderDate'].\ apply(lambda x: age_group_order_date(x)) customer_analysis['AgeGroup_BasedOnPresentDate'] = customer_analysis['AgeBasedOnPresentDate'].\ apply(lambda x: age_group_order_date(x))

customer_analysis = customer_analysis.merge( territories, left_on = 'TerritoryKey', right_on = 'SalesTerritoryKey')

def region_function(x): if x == 'Southwest': return 'USA-SW' elif x == 'Northwest': return 'USA-NW' elif x == 'Southeast': return 'USA-SE' elif x == 'Northeast': return 'USA-NE' elif x == 'Central': return 'USA-C' else: return x customer_analysis['Region'] = customer_analysis['Region'].apply(lambda x : region_function(x))

# delete tthe unnecessary columns customer_analysis.drop( ['StockDate','TerritoryKey','SalesTerritoryKey','EmailAddress','Prefix','ProductPrice','OrderLineItem', 'BirthDate'], axis = 1, inplace = True) # delete the sales_products dataframe del sales_products # convert the object columns with repeated values into category type cols_names = [str(col) for col in customer_analysis.columns] for col in cols_names: if (customer_analysis[col].dtype == 'O') & (len(customer_analysis[col].value_counts()) < 20): customer_analysis[col] = customer_analysis[col].astype('category')

customer_analysis.info()

def make_customer_category_rev_df(df,col): """a function that takes a dataframe and a name of a categorical column then aggreagte the dataframe by "CustomerKey" and the categorical column to add the "TotalRevenue".""" customer_category_rev_df = df.groupby(['CustomerKey',col]).\ agg({'Revenue':'sum'}).reset_index().\ rename(columns = {'Revenue':'TotalRevenue'}) customer_category_rev_df = customer_category_rev_df[customer_category_rev_df['TotalRevenue']!=0] if col == 'AgeGroup_BasedOnPresentDate': customer_category_rev_df.rename(columns = {'AgeGroup_BasedOnPresentDate':'AgeGroup'}, inplace = True) if col == 'AgeGroup_BasedOnOrderDate': customer_category_rev_df.rename(columns = {'AgeGroup_BasedOnOrderDate':'AgeGroup'}, inplace = True) if col == 'TotalChildren': customer_category_rev_df[col] = customer_category_rev_df[col].astype('str') return customer_category_rev_df

def make_customer_count(df,col): """a function that takes in a dataframe and one of its categorical columns' name then calculates the customer count for each category in a categorical column.""" df_count = df.groupby(col).agg({'CustomerKey':'count'}).\ rename(columns = {'CustomerKey':'CustomersCount'}).\ sort_values('CustomersCount', ascending =False) if col == 'AgeGroup': df_count.drop('>= 100',axis=0, inplace=True) return df_count

# write a function that display the customers count per category def make_count_vis(df,title): """a function that takes in a dataframe and a title then visualizes the customers count for each category in a column in bar chart and their percentage in and pie chart.""" print(df) chart_order = df.index if title == 'Occupation': explode_list = [0,0,0,0,0.11] elif title == 'Age group': explode_list = [0,0,0,0.14] elif title == 'Region': explode_list = [0,0,0.11] else: explode_list =[] for i in range(len(df.index)): explode_list.append(0) plt.figure(figsize = (18,6), facecolor = 'lightblue') plt.suptitle(f'Number of Customers by {title}', size = 22) plt.subplot(1,2,1) sns.barplot(x = df.index, y= 'CustomersCount', data = df, order = chart_order ) # plt.title(f'{title} by Count', size = 18) plt.xticks(size = 14) plt.xlabel(f'{title}', size = 18) plt.ylabel('Count of Customers',size = 18) plt.yticks(size = 18) plt.subplot(1,2,2) if len(df.index) == 2: plt.pie(x = df['CustomersCount'], labels = df.index, startangle= 90, counterclock= False, textprops={'fontsize': 14}, autopct= '%2.1F%%') else: plt.pie(x = df['CustomersCount'], labels = df.index, startangle= 90, counterclock= False, textprops={'fontsize': 14}, autopct= '%2.1F%%', explode = explode_list ) # plt.title(f'{title} by Percentage of Total', size = 18) circle = plt.Circle((0,0), 0.78, color = 'white') get_current_figure = plt.gcf() get_current_figure.gca().add_artist(circle) plt.tight_layout() plt.show()

# write a function to display tables horizontally with their titles def display_side_by_side(*args,titles=cycle([''])): """a function that takes in a list of dataframes and their titles then displays them side by side.""" html_str='' for df,title in zip(args, chain(titles,cycle(['</br>'])) ): html_str+='<th style="text-align:center"><td style="vertical-align:top">' html_str+=f'<h6>{title}</h6>' html_str+=df.to_html().replace('table','table style="display:inline"') html_str+='</td></th>' display_html(html_str,raw=True) # write a function to calculate the basic statistics of a revenue vs. a categories and display the output def make_basic_stats(df,col): """a function that takes in a dataframe and one of its categorical columns' name then calculates the basic statistics for the revenue of each category in a categorical column then uses display_side_by_side to display the dataframes of the basic statistics side by side.""" categories_list = df[col].value_counts().index if col == 'AgeGroup': categories_list = ['20-39','40-59','60-79','80-99','>= 100'] names = [] stats_list = [] for category in categories_list: name = category+ ' Revenue Statistics' names.append(name) category_stats = df[df[col] == category]['TotalRevenue'].describe().to_frame() stats_list.append(category_stats) if col == 'HomeOwner': names = ['Homowners Revenue Statistics','Non-Homeowners Revenue Statistics'] if col == 'Gender': names = ['Males Revenue Statistics','Females Revenue Statistics'] display_side_by_side(*stats_list, titles = names)

# make_rev_agg def make_agg_rev(df, col): """a function that takes in a dataframe and a categorical column name then aggregates the Revenue of each category in the column and return dataframe that includes Total Revenue, Average Revenue and Medain Revenue.""" df_agg_rev = df.groupby(col).agg({'TotalRevenue':['sum','mean','median']}) df_agg_rev = df_agg_rev['TotalRevenue'].\ rename(columns = {'sum':'TotalRevenue','mean':'AverageRevenue','median':'MedianRevenue'}).\ sort_values('AverageRevenue', ascending = False) if col == 'AgeGroup': df_agg_rev.drop('>= 100',axis=0, inplace=True) return df_agg_rev

# make_rev_vis def make_rev_vis(df1,df2,title,col): """a function that takes in 2 dataframes, a title and a categorical column name then visualizes the Total Revenue, Average Revenue and Median Revenue of each category in the column in multiple charts.""" print(df1) chart_order = df1.index if (title == 'Education Level') | (title =='Occupation'): explode_list = [0,0,0,0,0.11] elif title =='Region': explode_list = [0,0.14,0,-0.1,0.14] elif title == 'Age Group': explode_list = [0,0,0,0.14] else: explode_list = None fig = plt.figure(figsize=(26,20), facecolor = 'lightblue') plt.suptitle(f'Revenue by {title}', size = 22) plt.subplot(3,2,1) sns.barplot(x=df1.index, y='TotalRevenue', data=df1, order = chart_order) plt.title(f'Total Revenue by {title}', size =18) plt.xlabel(f'{title}', size = 18) plt.ylabel('Total Revenue', size=18) plt.xticks(size = 18) plt.yticks(size = 18) plt.subplot(3,2,2) plt.pie(x = df1['TotalRevenue'], labels = df1.index, radius= 1.1, startangle= 90, counterclock= False, autopct= '%2.1F%%', textprops={'fontsize': 18}, explode = explode_list ) circle = plt.Circle((0,0), 0.9, color = 'white') get_current_figure = plt.gcf() get_current_figure.gca().add_artist(circle) plt.subplot(3,2,3) sns.barplot(x=df1.index, y='AverageRevenue', data=df1, order = chart_order) plt.title(f'Average Revenue by {title}', size =18) plt.xlabel(f'{title}', size = 18) plt.ylabel('Average Revenue', size=18) plt.xticks(size = 18) plt.yticks(size = 18) plt.subplot(3,2,4) sns.barplot(x=df1.index, y='MedianRevenue', data=df1, order = chart_order) plt.title(f'Median Revenue by {title}', size =18) plt.xlabel(f'{title}', size = 18) plt.ylabel('Median Revenue', size=18) plt.xticks(size = 18) plt.yticks(size = 18) plt.subplot(3,2,5) sns.boxplot(x='TotalRevenue', y=col, data=df2, order = chart_order) plt.title(f'Revenue Distribution by {title}', size = 18) plt.xlabel('Total Revenue', size =18) plt.ylabel('Total Children', size = 18) plt.xticks(size = 18) plt.yticks(size = 18, rotation = 60) plt.subplot(3,2,6) ax6 = sns.kdeplot(data = df2, x = 'TotalRevenue', hue=col) plt.title(f'Revenue Kde Distribution by {title}', size = 18) plt.xlabel('Total Revenue', size = 18) plt.ylabel('Density',size =18) plt.setp(ax6.get_legend().get_texts(), fontsize='18') plt.setp(ax6.get_legend().get_title(), fontsize='22') plt.tight_layout() plt.show()

def make_stats_test(df1,df2,col): """a function that takes in 2 dataframes and categorical column then perform hypothesis testing on the 'TotalRevenue' of 2 categorical values in the categorical column""" category1 = df2.index[0] val1 = df2['AverageRevenue'][0] category2 = df2.index[1] val2 = df2['AverageRevenue'][1] _,p_ev = stats.levene( df1[df1[col] == category1]['TotalRevenue'], df1[df1[col] == category2]['TotalRevenue'], center='mean') if p_ev > 0.05: equal_variance = True else: equal_variance = False s_avg,p_avg = stats.ttest_ind( df1[df1[col] == category1]['TotalRevenue'], df1[df1[col] == category2]['TotalRevenue'], equal_var = equal_variance) print('Statistics:',s_avg) print('P_value',p_avg) if col == 'TotalChildren': category1 = '1 child' category2 = '3 children' if col == 'HomeOwner': category1 = 'homeowner' category2 = 'non homeowner' if col == 'MaritalStatus': category1 = 'single' category2 = 'married' if col == 'Gender': category1 = 'female' category2 = 'male' if p_avg > 0.05: print(f"There is no significant evidence that {category1} customers spend more than {category2} customers on average") else: print(f"There is a significant evidence that {category1} customers spend more than {category2 } customers on average")

customer_gender_rev = make_customer_category_rev_df(customer_analysis,'Gender') customer_gender_rev.head()

gender_count = make_customer_count(customer_gender_rev, 'Gender') gender_count

# colors = ['blue','orange'], make_count_vis(gender_count, 'Gender')

make_basic_stats(customer_gender_rev,'Gender')

rev_by_gender = make_agg_rev(customer_gender_rev,'Gender') rev_by_gender

gender_list = ['All']+rev_by_gender.index.tolist()

make_rev_vis(rev_by_gender,customer_gender_rev,'Gender','Gender')

make_stats_test(customer_gender_rev,rev_by_gender,'Gender')

del (customer_gender_rev, gender_count, rev_by_gender) gc.collect()

customer_maritalStatus_rev = make_customer_category_rev_df(customer_analysis,'MaritalStatus') customer_maritalStatus_rev.head()

maritalStatus_count = make_customer_count(customer_maritalStatus_rev,'MaritalStatus') maritalStatus_count

make_count_vis(maritalStatus_count,'Marital Status')

make_basic_stats(customer_maritalStatus_rev,'MaritalStatus')

rev_by_maritalStatus = make_agg_rev(customer_maritalStatus_rev,'MaritalStatus') rev_by_maritalStatus

maritalStatus_list = ['All']+rev_by_maritalStatus.index.tolist()

make_rev_vis(rev_by_maritalStatus,customer_maritalStatus_rev,'Marital Stataus','MaritalStatus')

make_stats_test(customer_maritalStatus_rev,rev_by_maritalStatus,'MaritalStatus')

del (customer_maritalStatus_rev, maritalStatus_count, rev_by_maritalStatus )

customer_homeowner_rev = make_customer_category_rev_df(customer_analysis,'HomeOwner') customer_homeowner_rev.head()

homeowner_count = make_customer_count(customer_homeowner_rev, 'HomeOwner') homeowner_count

make_count_vis(homeowner_count,'Homewonership')

make_basic_stats(customer_homeowner_rev,'HomeOwner')

rev_by_homeowner = make_agg_rev(customer_homeowner_rev,'HomeOwner') rev_by_homeowner

home_ownership_list = ['All','Y',"N"]

make_rev_vis(rev_by_homeowner, customer_homeowner_rev,'Homeownership','HomeOwner')

make_stats_test(customer_homeowner_rev,rev_by_homeowner,'HomeOwner')

del(customer_homeowner_rev, homeowner_count, rev_by_homeowner ) gc.collect()

customer_eduLevel_rev = make_customer_category_rev_df(customer_analysis,'EducationLevel') customer_eduLevel_rev.head()

eduLevel_count = make_customer_count(customer_eduLevel_rev,'EducationLevel') eduLevel_count

make_count_vis(eduLevel_count,'Education Level')

make_basic_stats(customer_eduLevel_rev,'EducationLevel')

rev_by_eduLevel = make_agg_rev(customer_eduLevel_rev,'EducationLevel') rev_by_eduLevel

make_rev_vis(rev_by_eduLevel, customer_eduLevel_rev,'Education Level','EducationLevel')

make_stats_test(customer_eduLevel_rev,rev_by_eduLevel,'EducationLevel')

del(customer_eduLevel_rev, eduLevel_count, rev_by_eduLevel) gc.collect()

customer_occupation_rev = make_customer_category_rev_df(customer_analysis,'Occupation') customer_occupation_rev.head()

occupation_count = make_customer_count(customer_occupation_rev,'Occupation') occupation_count

make_count_vis(occupation_count,'Occupation')

make_basic_stats(customer_occupation_rev,'Occupation')

rev_by_occupation = make_agg_rev(customer_occupation_rev,'Occupation') rev_by_occupation

make_rev_vis(rev_by_occupation,customer_occupation_rev,'Occupation','Occupation')

make_stats_test(customer_occupation_rev,rev_by_occupation,'Occupation')

del (customer_occupation_rev, occupation_count, rev_by_occupation) gc.collect()

customer_incomeLevel_rev = make_customer_category_rev_df(customer_analysis,'IncomeLevel') customer_incomeLevel_rev.head()

incomeLevel_count = make_customer_count(customer_incomeLevel_rev,'IncomeLevel') incomeLevel_count

make_count_vis(incomeLevel_count,'Income Level')

make_basic_stats(customer_incomeLevel_rev,'IncomeLevel')

rev_by_incomeLevel = make_agg_rev(customer_incomeLevel_rev,'IncomeLevel') rev_by_incomeLevel

make_rev_vis(rev_by_incomeLevel,customer_incomeLevel_rev,'Income Level','IncomeLevel')

make_stats_test(customer_incomeLevel_rev,rev_by_incomeLevel,'IncomeLevel')

del (customer_incomeLevel_rev, incomeLevel_count, rev_by_incomeLevel) gc.collect()

customer_ageGroup1_rev = make_customer_category_rev_df(customer_analysis,'AgeGroup_BasedOnPresentDate') customer_ageGroup1_rev.head()

ageGroup_count = make_customer_count(customer_ageGroup1_rev,'AgeGroup') ageGroup_count

make_count_vis(ageGroup_count,'Age group')

customer_ageGroup2_rev = make_customer_category_rev_df(customer_analysis,'AgeGroup_BasedOnOrderDate') customer_ageGroup2_rev.head()

make_basic_stats(customer_ageGroup2_rev,'AgeGroup')

rev_by_ageGroup = make_agg_rev(customer_ageGroup2_rev,'AgeGroup') rev_by_ageGroup

make_rev_vis(rev_by_ageGroup,customer_ageGroup2_rev,'Age Group','AgeGroup')

make_stats_test(customer_ageGroup2_rev,rev_by_ageGroup,'AgeGroup')

del (customer_ageGroup1_rev, ageGroup_count, customer_ageGroup2_rev, rev_by_ageGroup) gc.collect()

customer_tchildren_rev=make_customer_category_rev_df(customer_analysis,'TotalChildren') customer_tchildren_rev.head()

tchildren_count = make_customer_count(customer_tchildren_rev,'TotalChildren') tchildren_count

make_count_vis(tchildren_count, 'Total Children')

make_basic_stats(customer_tchildren_rev,'TotalChildren')

rev_by_tchildren = make_agg_rev(customer_tchildren_rev,'TotalChildren') rev_by_tchildren

make_rev_vis(rev_by_tchildren,customer_tchildren_rev,'Total Children','TotalChildren')

make_stats_test(customer_tchildren_rev,rev_by_tchildren,'TotalChildren')

del (customer_tchildren_rev, tchildren_count, rev_by_tchildren) gc.collect()

customer_country_rev = make_customer_category_rev_df(customer_analysis,'Country') customer_country_rev.head()

country_count = make_customer_count(customer_country_rev,'Country') country_count

make_count_vis(country_count,'Country')

make_basic_stats(customer_country_rev,'Country')

rev_by_country = make_agg_rev(customer_country_rev,'Country') rev_by_country

make_rev_vis(rev_by_country,customer_country_rev,'Country','Country')

make_stats_test(customer_country_rev,rev_by_country,'Country')

del (customer_country_rev,country_count,rev_by_country) gc.collect()

df = customer_analysis[customer_analysis['Country']=='United States'].\ groupby(['CustomerKey','Region']).\ agg({'Revenue':'sum'}).rename(columns={'Revenue':'TotalRevenue'}) df=df[df['TotalRevenue']!=0] df.reset_index(inplace = True) for region in df['Region'].value_counts().index: regional_avg = df[df['Region']==region]['TotalRevenue'].mean() regional_customer_keys = df[df['TotalRevenue']>regional_avg]['CustomerKey'].tolist() regional_customers_info = customers[customers['CustomerKey']. isin(regional_customer_keys)] regional_customers_info.to_csv(f'results/{region}_customers_above_regional_average.csv',index=False)

del df gc.collect()

test_df = customer_analysis.\ groupby(['CustomerKey','Gender','HomeOwner']).\ agg({'Revenue':'sum'}) test_df=test_df[test_df['Revenue']!=0] test_df.reset_index(inplace = True) test_df.drop(['CustomerKey','Revenue'], axis = 1, inplace=True) # H0: There is no association between gender and homeownership # Ha: There is an association between gender and homeownership alpha = 0.05 cont_table = pd.crosstab(test_df['Gender'],test_df['HomeOwner']) s,p_value,df,ev = stats.chi2_contingency(cont_table) print('alpha: ',alpha) print('p_value: ',p_value) if p_value < alpha: print('There is an association between gender and homeownership') else: print('There is no assocaition between gender and homeownership')

del test_df,cont_table,alpha,p_value gc.collect()

# make the gender widget gender = ipywidgets.Dropdown(options =gender_list, value = 'All', description = 'Gender', disabled = False); # make the marital_status widget marital_status = ipywidgets.Dropdown(options = maritalStatus_list, value = 'All', description = 'Marital Status', disabled = False); # make the home_ownership widget home_ownership = ipywidgets.Dropdown(options =home_ownership_list, value = 'All', description = 'Home Ownership', disabled = False); # define the plotting of the AnnualIncome distribution def dashboard(gender, marital_status,home_ownership): dashb_df = customer_analysis[customer_analysis['Country']=='United States'] dashb_df = dashb_df.set_index('OrderDate',drop=False).sort_index().to_period('M') dashb_df.index.name = 'OrderPeriod' dashb_df.reset_index(inplace = True) dashb_df['OrderPeriod'] = dashb_df['OrderPeriod'].astype('str') if gender != 'All': dashb_df = dashb_df[dashb_df['Gender'] == gender] if marital_status != 'All': dashb_df = dashb_df[dashb_df['MaritalStatus']==marital_status] if home_ownership != 'All': dashb_df= dashb_df[dashb_df['HomeOwner'] == home_ownership] top20_customers = dashb_df.groupby('Customer').\ agg({'Revenue':'sum'}).\ rename(columns ={'Revenue':'TotalRevenue'}).\ sort_values('TotalRevenue', ascending = False).\ head(20) region_rev = dashb_df.groupby('Region').\ agg({'Revenue':'sum'}).\ rename(columns = {'Revenue':'TotalRevenue'}).\ sort_values('TotalRevenue', ascending = False) region_rev = region_rev[region_rev['TotalRevenue'] != 0] southeast_central_northeast = (region_rev.loc['USA-NE'] + region_rev.loc['USA-C'] + region_rev.loc['USA-SE']) region_rev.drop(labels = ['USA-SE','USA-C','USA-NE'], axis = 0, inplace = True) region_rev.loc['USA-SE, USA-C & USA-NE']=southeast_central_northeast fig = plt.figure(figsize=(14,10), constrained_layout=True, facecolor = 'lightblue') fig.suptitle('Revenue Breakdown in USA', size =18) spec = fig.add_gridspec(2,3) ax1 = fig.add_subplot(spec[:, 0]) ax1 = sns.barplot(y = top20_customers.index, x = 'TotalRevenue', data = top20_customers, color = 'blue') plt.title('Top 20 Customers by Revenue') for index, value in enumerate(top20_customers['TotalRevenue']): label = format(int(value),',') plt.annotate(label, xy=(value - 4700, index), color='white', size = 14) ax2 = fig.add_subplot(spec[0, 1:]) ax2 = sns.lineplot(x ='OrderPeriod', y = 'Revenue', data = dashb_df, estimator='sum', ci=None) plt.title('Revenue Timeline') plt.ylabel('TotalRevenue') plt.xticks(rotation = 45) ax3 = fig.add_subplot(spec[1, 1:]) ax3 = sns.barplot(x = region_rev.index, y = 'TotalRevenue', data = region_rev, ci = None) plt.title('Revenue by Region') for index, value in enumerate(region_rev['TotalRevenue']): label = format(int(value),',') plt.annotate(label, xy=(index-0.25, value-500), color='black') #add interaction to the funtion ipywidgets.interact(dashboard, gender = gender, marital_status = marital_status, home_ownership = home_ownership)

rfm_df = customer_analysis.groupby(['CustomerKey','OrderNumber']).\ agg({'Revenue':'sum','OrderDate':'max','AnnualIncome':'mean'}).\ reset_index().\ groupby('CustomerKey').\ agg({'OrderNumber':'count','Revenue':'sum','OrderDate':'max','AnnualIncome':'mean'}).\ reset_index().\ rename(columns = {'OrderNumber':'Frequency','Revenue':'Monetary'}) rfm_df['Recency'] = (rfm_df['OrderDate'].max()-rfm_df['OrderDate']) rfm_df['Recency'] = rfm_df['Recency'].astype(str).str.split(' ').str.get(0).astype(int) rfm_df.head()

rfm_stats = rfm_df[['Frequency','Monetary','Recency']].describe() rfm_stats

f_quarter = (rfm_df['Frequency'].max() - rfm_df['Frequency'].min())/4 f_quarter

rfm_quantiles = { 'recency':{'0.25':rfm_stats.loc['25%','Recency'],'0.50':rfm_stats.loc['50%','Recency'],'0.75':rfm_stats.loc['75%','Recency']}, 'frequency':{'0.25':f_quarter,'0.50':(f_quarter*2),'0.75':(3*f_quarter)}, 'monetary':{'0.25':rfm_stats.loc['25%','Monetary'],'0.50':rfm_stats.loc['50%','Monetary'],'0.75':rfm_stats.loc['75%','Monetary']} } rfm_quantiles

# write a function that return the recency score based on the Recency value def r_scroe(x): if x <= rfm_quantiles['recency']['0.25']: return 4 elif x <= rfm_quantiles['recency']['0.50']: return 3 elif x <= rfm_quantiles['recency']['0.75']: return 2 else: return 1 # write a function that return the frequency score based on the Frequency value def f_scroe(x): if x <= rfm_quantiles['frequency']['0.25']: return 1 elif x <= rfm_quantiles['frequency']['0.50']: return 2 elif x <= rfm_quantiles['frequency']['0.75']: return 3 else: return 4 # write a function that return the moonetary score based on the Monetary value def m_scroe(x): if x <= rfm_quantiles['monetary']['0.25']: return 1 elif x <= rfm_quantiles['monetary']['0.50']: return 2 elif x <= rfm_quantiles['monetary']['0.75']: return 3 else: return 4

# get the r-score rfm_df['R-Score'] = rfm_df['Recency'].apply(lambda x : r_scroe(x)) # get the f-score rfm_df['F-Score'] = rfm_df['Frequency'].apply(lambda x : f_scroe(x)) # get the m-score rfm_df['M-Score'] = rfm_df['Monetary'].apply(lambda x : m_scroe(x)) # get the RFM-Score rfm_df['RFM-Score'] = (rfm_df['R-Score']*100) + (rfm_df['F-Score']*10) + (rfm_df['M-Score']) # sort the values based on RFM-Score rfm_df.sort_values('RFM-Score', ascending = False, inplace =True) # make a new column 'RFM-Loyalty-Level' based on the RFM-Score loyalty_level = ['Bronze','Silver','Gold','Platinum'] score_cuts = pd.qcut(rfm_df['RFM-Score'],4,loyalty_level) rfm_df['RFM-Loyalty-Level'] = score_cuts rfm_df.head()

for level in loyalty_level: level_customers = rfm_df[rfm_df['RFM-Loyalty-Level'] == level]['CustomerKey'] level_customers = customers[customers['CustomerKey'].isin(level_customers)] level_customers.to_csv(f'results/{level}_customers.csv', index = False)