import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# for ABC product analysis
import inventorize3 as inv
# for market basket analysis
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# import the sales tables from folder
path = r'/work/Sales'
sales_files = glob.glob(path+'/*.csv')
sales_list = []
for file in sales_files:
df = pd.read_csv(file, index_col = None, header= 0)
sales_list.append(df)
sales = pd.concat(sales_list, axis = 0, ignore_index = True)
# import products table from csv file
products = pd.read_csv('/work/Contoso_Products.csv')
#import territories table from csv file
territories = pd.read_csv('/work/Contoso_Territories.csv')
# import categories table from csv file
categories = pd.read_csv('/work/Contoso_Categories.csv')
# import subcategories tbale from csv file
subcategories = pd.read_csv('/work/Contoso_Product_Subcategories.csv')
# make a list for tables
tables_list = [sales,products,territories,categories,subcategories]
# iterate over the tables-list and print info for each
for table in tables_list:
print(table.info(),'\n')
# pproduct= purchased product
# merge the sales dataframe with the products data frame and assign that to sales_product
pproduct_analysis = sales.merge(
products,left_on = 'ProductKey', right_on = 'ProductKey')
pproduct_analysis = pproduct_analysis.merge(
subcategories, left_on = 'ProductSubcategoryKey', right_on = 'ProductSubcategoryKey')
pproduct_analysis = pproduct_analysis.merge(
categories, left_on = 'ProductCategoryKey', right_on = 'ProductCategoryKey')
pproduct_analysis = pproduct_analysis.merge(
territories, left_on ='TerritoryKey',right_on='SalesTerritoryKey')
pproduct_analysis = pproduct_analysis[pproduct_analysis['Country']=='United States']
pproduct_analysis['Revenue'] = round(
pproduct_analysis['ProductPrice']*pproduct_analysis['OrderQuantity'],2)
pproduct_analysis['Cost'] = round(
pproduct_analysis['ProductCost']*pproduct_analysis['OrderQuantity'],2)
pproduct_analysis['GrossProfit'] = (pproduct_analysis['Revenue'] - pproduct_analysis['Cost'])
pproduct_analysis['GrossMargin'] = round(
pproduct_analysis['GrossProfit']/pproduct_analysis['Revenue'],2)
pproduct_analysis['OrderDate'] = pd.to_datetime(pproduct_analysis['OrderDate'])
pproduct_analysis['StockDate'] = pd.to_datetime(pproduct_analysis['StockDate'])
pproduct_analysis['ProductShelfTime'] = (
pproduct_analysis['OrderDate'] - pproduct_analysis['StockDate']).astype(str).\
str.split(' ').\
str.get(0).\
astype(int)
pproduct_analysis.drop(
['TerritoryKey','ProductSubcategoryKey',
'SalesTerritoryKey','ProductSubcategoryKey',
'CustomerKey','Country','Continent'],
axis=1,inplace=True)
pproduct_analysis.head()
# optimazing the storage
categorical_cols_names = [col for col in pproduct_analysis.select_dtypes(include= 'O').columns if col != 'OrderNumber']
for col in categorical_cols_names:
pproduct_analysis[col] = pproduct_analysis[col].astype('category')
pproduct_analysis.info()
tot_orders = len(pproduct_analysis['OrderNumber'].value_counts().index)
print(f'Total Orders = {tot_orders:,} orders')
tot_qty = pproduct_analysis['OrderQuantity'].sum()
print(f'Total Quantity Sold = {tot_qty:,} items')
tot_rev = pproduct_analysis['Revenue'].sum().round(0)
print(f'Total Revenue = ${tot_rev:,}')
tot_cost = pproduct_analysis['Cost'].sum().round(0)
print(f'Total Cost = ${tot_cost:,}')
tot_gross_pro = pproduct_analysis['GrossProfit'].sum().round(0)
print(f'Total Gross Profit = ${tot_gross_pro:,}')
avg_order_size = round(tot_qty/tot_orders,2)
print(f'Average Order Size = {avg_order_size}')
avg_rev_per_order = round(tot_rev/tot_orders,2)
print(f'Average Revenue per Order = ${avg_rev_per_order}')
avg_rev_per_item = round(tot_rev/tot_qty,2)
print(f'Average Revenue per Itme = ${avg_rev_per_item}')
avg_cost_per_item = round(tot_cost/tot_qty,2)
print(f'Average Cost per Item = {avg_cost_per_item}')
avg_gross_pro_per_order = (tot_gross_pro/tot_orders).round(2)
print(f'Average Gross Profit per Order = ${avg_gross_pro_per_order}')
avg_gross_pro_per_item = (tot_gross_pro/tot_qty).round(2)
print(f'Average Gross Profit per Item = ${avg_gross_pro_per_item}')
avg_shelf_time = pproduct_analysis['ProductShelfTime'].mean()
print(f'Average Shelf-Time = {avg_shelf_time:.0f}')
tot_gross_mar = tot_gross_pro/tot_rev
print(f'Total Gross Margin = %{tot_gross_mar*100:.2f}')
order_rev = pproduct_analysis[['OrderNumber','Revenue']].\
groupby('OrderNumber').\
agg(TotalRevenue=('Revenue','sum'))
print(order_rev.describe())
plt.figure(figsize=(18,6), facecolor='lightblue')
plt.suptitle('Revenue per Order Distribution', size= 18)
plt.subplot(1,2,1)
sns.histplot(data=order_rev, bins = 30)
plt.subplot(1,2,2)
sns.boxplot(x = order_rev['TotalRevenue'])
plt.tight_layout()
plt.show()
categories_rev = pproduct_analysis.loc[:,['CategoryName','Revenue']].\
groupby('CategoryName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending=False)
print(categories_rev)
plt.figure(figsize=(8,4),facecolor='lightblue')
plt.title('Total Revenue by Category', size=18)
sns.barplot(x=categories_rev['TotalRevenue'],
y=categories_rev.index,
order = categories_rev.index.tolist()
)
plt.xlabel('Total Revenue', size=14)
plt.ylabel('Category', size=14)
plt.show()
best_categories_by_rev = categories_rev.index.tolist()[0]
print(f'Best category by Gross Profit is "{best_categories_by_rev}"')
subcategories_rev = pproduct_analysis.loc[:,['SubcategoryName','Revenue']].\
groupby('SubcategoryName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending=False)
subcategories_rev['CumulativeTotalRevenue'] = subcategories_rev['TotalRevenue'].cumsum()
plt.figure(figsize=(14,8),facecolor='lightblue')
plt.subplot(2,1,1)
plt.title('Total Revenue by Subategory', size=18)
sns.barplot(x=subcategories_rev.index,
y='TotalRevenue',
data = subcategories_rev,
order = subcategories_rev.index,
color='#7B68EE')
plt.xlabel('Total Revenue', size=14)
plt.ylabel('Subcategory', size=14)
plt.xticks(rotation=45)
plt.subplot(2,1,2)
plt.title('Cumulative Total Revenue by Subategory', size=18)
plt.plot(subcategories_rev.index,
subcategories_rev['CumulativeTotalRevenue'])
plt.xlabel('Cumulative Total Revenue', size=14)
plt.ylabel('Subcategory', size=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
models_rev = pproduct_analysis.loc[:,['ModelName','Revenue']].\
groupby('ModelName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending=False)
models_rev['CumulativeTotalRevenue'] = models_rev['TotalRevenue'].cumsum()
plt.figure(figsize=(22,14), facecolor='lightblue')
plt.subplot(2,1,1)
plt.title('Total Revenue by Model', size=18)
sns.barplot(x=models_rev.index,
y='TotalRevenue',
data = models_rev,
order = models_rev.index,
color='#7B68EE')
plt.xlabel('Model', size=14)
plt.ylabel('Total Revenue', size=14)
plt.xticks(size = 14,rotation=85)
plt.subplot(2,1,2)
plt.title('Cumulative Total Revenue by Model', size=18)
plt.plot(models_rev.index,
models_rev['CumulativeTotalRevenue'])
plt.xlabel('Modele', size=14)
plt.ylabel('Cumulative Total Revenue', size=14)
plt.xticks(size = 14,rotation=85)
plt.tight_layout()
plt.show()
products_rev = pproduct_analysis.loc[:,['ProductSKU','Revenue']].\
groupby('ProductSKU').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending=False)
products_rev.index = [str(val) for val in products_rev.index]
products_rev['CumulativeTotalRevenue'] = products_rev['TotalRevenue'].cumsum()
products_rev['PercentofTotalRevenue'] = products_rev['TotalRevenue']/products_rev['TotalRevenue'].sum()
products_rev['CumulativePercentofTotalRevenue'] = products_rev['PercentofTotalRevenue'].cumsum()
iloc_index_80 = len(products_rev[products_rev['CumulativePercentofTotalRevenue'] >= 0.8])
products_rev.iloc[:iloc_index_80].index
top10_products_rev = pproduct_analysis.loc[:,['ProductName','Revenue']].\
groupby('ProductName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending=False).\
head(10)
print(top10_products_rev)
plt.figure(figsize=(8,4),facecolor='lightblue')
plt.title('Top 10 Products by Total Gross Profit', size=18)
sns.barplot(x=top10_products_rev['TotalRevenue'],
y=top10_products_rev.index,
order = top10_products_rev.index.tolist(),
color='#BC8F8F')
plt.xlabel('Total Revenue', size=14)
plt.ylabel('Product', size=14)
plt.show()
bottom10_products_rev = pproduct_analysis.loc[:,['ProductName','Revenue']].\
groupby('ProductName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue').\
head(10)
print(bottom10_products_rev)
plt.figure(figsize=(8,4),facecolor='lightblue')
plt.title('Bottom 10 Products by Total Gross Profit', size=18)
sns.barplot(x=bottom10_products_rev['TotalRevenue'],
y=bottom10_products_rev.index,
order = bottom10_products_rev.index.tolist(),
color='#BC8F8F')
plt.xlabel('Total Revenue', size=14)
plt.ylabel('Product', size=14)
plt.show()
abc_analysis = (pproduct_analysis.loc[:,['ProductSKU','OrderQuantity','Revenue']].
groupby('ProductSKU').agg(TotalQuantity=('OrderQuantity','sum'),
TotalRevenue=('Revenue','sum')).
reset_index()
)
abc_analysis.head()
a = inv. productmix(abc_analysis['ProductSKU'],
abc_analysis['TotalQuantity'],
abc_analysis['TotalRevenue']
)
a = a[['sales_category','revenue_category','product_mix']].\
rename(columns={'sales_category':'QuantityRank',
'revenue_category':'RevenueRank',
'product_mix':'ProductRank'
}
)
abc_analysis = pd.concat([abc_analysis,a],axis=1)
abc_analysis.head()
prod_rank_name = {'A_A':'Margin & Volume Driver',
'A_B':'Volume Driver',
'A_C':'Volume Driver',
'B_A':'Margin Driver',
'B_B':'Regular',
'B_C':'Regular',
'C_A':'Margin Driver',
'C_B':'Regular',
'C_C':'Slow Moving'
}
abc_analysis['ProductRankName'] = abc_analysis['ProductRank'].map(prod_rank_name)
abc_analysis.head()
abc_analysis[['TotalQuantity','TotalRevenue']].describe(percentiles=[0.333,0.666])
def make_count_qty_rev_vis(df,criteria):
if criteria=='Qunatity':
facecolor='#FFE4C4'
elif criteria=='Revenue':
facecolor='#F0E68C'
else:
facecolor='#E6E6FA'
plt.figure(figsize=(22,14), facecolor=facecolor)
plt.suptitle(f'Total Revenue, Total Quantity Sold and Total Product Count by {criteria} ABC Rank',
size=24)
plt.subplot(1,3,1)
plt.title(f'Products Count by {criteria} ABC Rank', size=20)
sns.barplot(x='TotalProductCount',
y=df.index,
color='#4682B4',
order=df.index,
orient='h',
data=df
)
plt.xlabel('Total Product Count', color='black', size=18)
plt.ylabel(f'{criteria} ABC Rank', color='black', size=18)
plt.xticks(color='black', size = 18)
plt.yticks(color='black', size = 18)
plt.subplot(1,3,2)
plt.title(f'Quantity Sold by {criteria} ABC Rank', size = 20)
sns.barplot(x='TotalQuantitySold',
y=df.index,
color='#191970',
order=df.index,
orient='h',
data=df
)
plt.xlabel('Total Quantity Sold', color='black', size=18)
plt.ylabel('')
plt.xticks(color='black', size = 18)
plt.yticks(color='black', size = 18)
plt.subplot(1,3,3)
plt.title(f'Revenue by {criteria} ABC Rank', size=20)
sns.barplot(x='TotalRevenue',
y=df.index,
color='#1E90FF',
order=df.index,
orient='h',
data=df
)
plt.xlabel('Total Revenue', color='black', size=18)
plt.ylabel('')
plt.xticks(color='black', size = 18)
plt.yticks(color='black', size = 18)
plt.tight_layout()
plt.show()
abc_qty_rank_agg = (abc_analysis.groupby('QuantityRank').
agg(TotalProductCount=('QuantityRank','count'),
TotalQuantitySold=('TotalQuantity','sum'),
TotalRevenue=('TotalRevenue','sum'))
)
abc_qty_rank_agg
make_count_qty_rev_vis(abc_qty_rank_agg,'Quantity')
abc_rev_rank_agg = (abc_analysis.groupby('RevenueRank').
agg(TotalProductCount=('RevenueRank','count'),
TotalQuantitySold=('TotalQuantity','sum'),
TotalRevenue=('TotalRevenue','sum'))
)
abc_rev_rank_agg
make_count_qty_rev_vis(abc_rev_rank_agg,'Revenue')
abc_prod_rank_agg = (abc_analysis.groupby('ProductRank').
agg(TotalProductCount=('ProductRank','count'),
TotalQuantitySold=('TotalQuantity','sum'),
TotalRevenue=('TotalRevenue','sum'))
)
abc_prod_rank_agg
make_count_qty_rev_vis(abc_rev_rank_agg,'Product')
regions = pproduct_analysis['Region'].astype('O').value_counts().index.tolist()
for region in regions:
region_df = pproduct_analysis.loc[pproduct_analysis['Region']==region,['CategoryName','Revenue']]
best_category_in_region = region_df.\
groupby('CategoryName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue',ascending=False)
plt.figure(figsize=(8,4),facecolor='lightblue')
plt.title(f'Total Revenue by Category in {region}', size=18)
sns.barplot(x=best_category_in_region['TotalRevenue'],
y=best_category_in_region.index,
order = best_category_in_region.index.tolist())
plt.xlabel('Total Revenue', size=14)
plt.ylabel('Category', size=14)
# best_category_in_region = best_category_in_region.index.tolist()[0]
# print(f'Best category in {region} is {best_category_in_region}')
best_subcategory_in_region_in_category = {'Region':[],'Category':[],'Subcategory':[]}
for region in pproduct_analysis['Region'].astype('O').value_counts().index.tolist():
for category in pproduct_analysis['CategoryName'].astype('O').value_counts().index.tolist():
region_condi = pproduct_analysis['Region']==region
category_condi = pproduct_analysis["CategoryName"]==category
region_category_df = pproduct_analysis.loc[(region_condi)&(category_condi),
['SubcategoryName','Revenue']]
best_subcategory_in_region_category = region_category_df.\
groupby('SubcategoryName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending = False)
best_subcategory_in_region_category = best_subcategory_in_region_category.index.tolist()[0]
best_subcategory_in_region_in_category['Region'].append(region)
best_subcategory_in_region_in_category['Category'].append(category)
best_subcategory_in_region_in_category['Subcategory'].append(best_subcategory_in_region_category)
region_category_subcategory_df = pd.DataFrame(best_subcategory_in_region_in_category)
region_category_subcategory_pivoted_df = region_category_subcategory_df.\
pivot('Region','Category').\
loc[:,'Subcategory']
print('Best Subcategories by Revenue in each Category in each Region')
region_category_subcategory_pivoted_df
best_subcategories = {}
for index, row in region_category_subcategory_pivoted_df.iterrows():
best_subcategories[index] = row.tolist()
for region in best_subcategories.keys():
best_subcategories[region] =[val for val in best_subcategories[region] if str(val)!='nan']
for region in best_subcategories.keys():
print(best_subcategories[region])
best_model_in_region_in_subcategory = {'Region':[],'Subcategory':[],'Model':[]}
for region in best_subcategories.keys():
for subcategory in best_subcategories[region]:
region_condi = pproduct_analysis['Region']==region
subcategory_condi = pproduct_analysis["SubcategoryName"]==subcategory
region_subcategory_df = pproduct_analysis.loc[(region_condi)&(subcategory_condi),
['ModelName','Revenue']]
best_model_in_region_subcategory = (region_subcategory_df.\
groupby('ModelName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending = False))
best_model_in_region_subcategory = (best_model_in_region_subcategory.index.tolist()[0])
best_model_in_region_in_subcategory['Region'].append(region)
best_model_in_region_in_subcategory['Subcategory'].append(subcategory)
best_model_in_region_in_subcategory['Model'].append(best_model_in_region_subcategory)
region_subcategory_model_df = pd.DataFrame(best_model_in_region_in_subcategory)
region_subcategory_model_pivoted_df = region_subcategory_model_df.\
pivot('Region','Subcategory').\
loc[:,'Model']
print('Best Models in best Subcategories by Revenue in each category in each Region')
region_subcategory_model_pivoted_df
best_models = {}
for index, row in region_subcategory_model_pivoted_df.iterrows():
best_models[index] = row.tolist()
for region in best_models.keys():
best_models[region] =[val for val in best_models[region] if str(val)!='nan']
for region in best_models.keys():
print(best_models[region])
best_product_in_region_in_model = {'Region':[],'Model':[],'Product':[]}
for region in best_models.keys():
for model in best_models[region]:
region_condi = pproduct_analysis['Region']==region
subcategory_condi = pproduct_analysis["ModelName"]==model
region_model_df = pproduct_analysis.loc[(region_condi)&(subcategory_condi),
['ProductName','Revenue']]
best_product_in_region_model = (region_model_df.\
groupby('ProductName').\
agg(TotalRevenue=('Revenue','sum')).\
sort_values('TotalRevenue', ascending = False))
best_product_in_region_model = (
best_product_in_region_model.index.tolist()[0])
best_product_in_region_in_model['Region'].append(region)
best_product_in_region_in_model['Model'].append(model)
best_product_in_region_in_model['Product'].append(best_product_in_region_model)
region_model_product_df = pd.DataFrame(best_product_in_region_in_model)
region_model_product_pivoted_df = region_model_product_df.\
pivot('Region','Model').\
loc[:,'Product']
print('Best Product in best Models in best Subcateogries by Revenue in each Category in each Region')
region_model_product_pivoted_df
bst_prod_in_bst_mdl_in_bst_subcate_in_cate_in_region = (
region_model_product_df.merge(region_subcategory_model_df,
left_on=['Region','Model'],
right_on=['Region','Model']).\
merge(region_category_subcategory_df,
left_on=['Region','Subcategory'],
right_on=['Region','Subcategory'])
)
bst_prod_in_bst_mdl_in_bst_subcate_in_cate_in_region = (
bst_prod_in_bst_mdl_in_bst_subcate_in_cate_in_region[['Region','Category',
'Subcategory','Model',
'Product']])
bst_prod_in_bst_mdl_in_bst_subcate_in_cate_in_region.rename(
columns={'Subcategory':'BestSubcategoryByRevenue',
'Model':'BestModelByRevenue',
'Product':'BestProductByRevenue'},
inplace=True)
bst_prod_in_bst_mdl_in_bst_subcate_in_cate_in_region
basket = pproduct_analysis.loc[:,['OrderNumber','ProductDescription','OrderQuantity']].\
set_index(['OrderNumber','ProductDescription']).\
unstack().\
fillna(0)
basket = basket.loc[:,'OrderQuantity']
basket.head(3)
def encode_units(x):
if x<=0:
return 0
if x>=1:
return 1
basket = basket.applymap(encode_units)
# generate frequent itemsets
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
#generate rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
rules
print('The following products should be placed close to each other in the store or recommended to the \
customer buys one of them.')
rules[['antecedents','consequents']]