!pip install statsmodels==0.13.5
# import libarary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
from scipy.stats import iqr
# import dataset
df = pd.read_csv('product_sales.csv')
# dataset overview
pd.set_option('display.max_columns', None)
print(df.head(10))
print(df.info())
# check how many sales methods were used for that customer
print(df['sales_method'].drop_duplicates())
# Number of sales methods used
number_method = df.groupby('sales_method')['customer_id'].count()
print(number_method)
ax = number_method.plot.bar()
ax.bar_label(ax.containers[0], label_type='edge')
plt.show()
# treating duplicates to ensure 3 sales methods
df['sales_method'] = df['sales_method'].replace('em + call','Email + Call')
df['sales_method'] = df['sales_method'].replace('email','Email')
print(df['sales_method'].drop_duplicates())
# check the customer_id is unique
print(df[df.duplicated(subset='customer_id', keep=False)])
# Check the relationship between revenue column and nb_sold column by sales_method Email
df_Email = df[df['sales_method'] == 'Email']
print(df_Email[['revenue','nb_sold']].corr())
# Check the relationship between revenue column and nb_sold column by sales_method Email + Call
df_Email_Call = df[df['sales_method'] == 'Email + Call']
print(df_Email_Call[['revenue','nb_sold']].corr())
# Check the relationship between revenue column and nb_sold column by sales_method Call
df_Call = df[df['sales_method'] == 'Call']
print(df_Email_Call[['revenue','nb_sold']].corr())
# Visualize the relationship between revenue column and nb_sold column classified by sales_method.
sns.relplot(x='nb_sold', y='revenue', data=df,hue='sales_method')
# Create df_trainmodels
df_trainmodels = df[~df['revenue'].isna()]
# Create df_results
df_results = df[df['revenue'].isna()]
# Create the model then predict the missing values
x = smf.ols('revenue ~ nb_sold + C(sales_method)', data = df_trainmodels).fit()
results = x.predict(df_results)
print(results)
# Fill missing values with results
df['revenue'] = df['revenue'].fillna(results)
print(df.head())
df['revenue'] = df['revenue'].round(2)
# Visualize outliers
sns.boxplot(data = df, x = 'sales_method', y = 'revenue')
# Find outliers
def detect_outliers_iqr(data):
outliers_lwr = []
outliers_upr = []
data = sorted(data)
q1 = np.quantile(data, 0.25)
q3 = np.quantile(data, 0.75)
IQR = q3-q1
lwr_bound = q1-(1.5*IQR)
upr_bound = q3+(1.5*IQR)
for i in data:
if i < lwr_bound:
outliers_lwr.append(i)
elif i > upr_bound:
outliers_upr.append(i)
return outliers_lwr, outliers_upr, lwr_bound, upr_bound
Email_outliers_lwr,Email_outliers_upr,Email_lwr_bound,Email_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Email', 'revenue'])
print(Email_outliers_lwr,Email_outliers_upr,Email_lwr_bound,Email_upr_bound)
Email_Call_outliers_lwr,Email_Call_outliers_upr,Email_Call_lwr_bound,Email_Call_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Email + Call', 'revenue'])
print(Email_Call_outliers_lwr,Email_Call_outliers_upr,Email_Call_lwr_bound,Email_Call_upr_bound)
Call_outliers_lwr,Call_outliers_upr,Call_lwr_bound,Call_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Call', 'revenue'])
print(Call_outliers_lwr,Call_outliers_upr,Call_lwr_bound,Call_upr_bound)
# Assign outliers to the upper bound of Email
x = (df.loc[df['sales_method'] == 'Email', 'revenue'].isin(Email_outliers_upr)) & (df['sales_method'] == 'Email')
df.revenue[x] = Email_upr_bound
# Assign outliers to the lower bound of Email + Call
y = (df.loc[df['sales_method'] == 'Email + Call', 'revenue'].isin(Email_Call_outliers_lwr)) & (df['sales_method'] == 'Email + Call')
df.revenue[y] = Email_Call_lwr_bound
# Assign outliers to the upper bound of Email + Call
z = (df.loc[df['sales_method'] == 'Email + Call', 'revenue'].isin(Email_Call_outliers_upr)) & (df['sales_method'] == 'Email + Call')
df.revenue[z] = Email_Call_upr_bound
# Assign outliers to the upper bound of Call
t = (df.loc[df['sales_method'] == 'Call', 'revenue'].isin(Call_outliers_upr)) & (df['sales_method'] == 'Call')
df.revenue[t] = Call_upr_bound
sns.boxplot(data = df, x = 'sales_method', y = 'revenue')
# How many customers were there for each approach?
Customers = df.groupby('sales_method')['customer_id'].count()
print(Customers)
Customers.plot(kind='pie', y='sales_method', autopct='%1.0f%%')
plt.show()
# What does the spread of the revenue look like overall?
df.hist('revenue', bins = 200)
# What does the spread of the revenue look like for each method?
Mean_revenue = df.groupby('sales_method')['revenue'].sum()
print(Mean_revenue)
cx = Mean_revenue.plot.bar()
cx.bar_label(cx.containers[0], label_type='edge')
plt.show()
# Was there any difference in revenue over time for each of the methods?
cx = df.groupby(['week','sales_method'])['revenue'].sum()
cx.unstack().plot()
print(cx)
dx = df.groupby(['week','sales_method'])['revenue'].mean()
dx.unstack().plot()
print(dx)