!pip install statsmodels==0.13.5
# import libarary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
from scipy.stats import iqr
# import dataset
df = pd.read_csv('product_sales.csv')
# dataset overview
pd.set_option('display.max_columns', None)
print(df.head(10))
week sales_method customer_id nb_sold revenue \
0 2 Email 2e72d641-95ac-497b-bbf8-4861764a7097 10 NaN
1 6 Email + Call 3998a98d-70f5-44f7-942e-789bb8ad2fe7 15 225.47
2 5 Call d1de9884-8059-4065-b10f-86eef57e4a44 11 52.55
3 4 Email 78aa75a4-ffeb-4817-b1d0-2f030783c5d7 11 NaN
4 3 Email 10e6d446-10a5-42e5-8210-1b5438f70922 9 90.49
5 6 Call 6489e678-40f2-4fed-a48e-d0dff9c09205 13 65.01
6 4 Email eb6bd5f1-f115-4e4b-80a6-5e67fcfbfb94 11 113.38
7 1 Email 047df079-071b-4380-9012-2bfe9bce45d5 10 99.94
8 5 Email 771586bd-7b64-40be-87df-afe884d2af9e 11 108.34
9 5 Call 56491dae-bbe7-49f0-a651-b823a01103d8 11 53.82
years_as_customer nb_site_visits state
0 0 24 Arizona
1 1 28 Kansas
2 6 26 Wisconsin
3 3 25 Indiana
4 0 28 Illinois
5 10 24 Mississippi
6 9 28 Georgia
7 1 22 Oklahoma
8 10 31 Massachusetts
9 7 23 Missouri
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 week 15000 non-null int64
1 sales_method 15000 non-null object
2 customer_id 15000 non-null object
3 nb_sold 15000 non-null int64
4 revenue 13926 non-null float64
5 years_as_customer 15000 non-null int64
6 nb_site_visits 15000 non-null int64
7 state 15000 non-null object
dtypes: float64(1), int64(4), object(3)
memory usage: 937.6+ KB
None
# check how many sales methods were used for that customer
print(df['sales_method'].drop_duplicates())
0 Email
1 Email + Call
2 Call
22 em + call
1693 email
Name: sales_method, dtype: object
# Number of sales methods used
number_method = df.groupby('sales_method')['customer_id'].count()
print(number_method)
ax = number_method.plot.bar()
ax.bar_label(ax.containers[0], label_type='edge')
plt.show()
sales_method
Call 4962
Email 7456
Email + Call 2549
em + call 23
email 10
Name: customer_id, dtype: int64
# treating duplicates to ensure 3 sales methods
df['sales_method'] = df['sales_method'].replace('em + call','Email + Call')
df['sales_method'] = df['sales_method'].replace('email','Email')
print(df['sales_method'].drop_duplicates())
0 Email
1 Email + Call
2 Call
Name: sales_method, dtype: object
# check the customer_id is unique
print(df[df.duplicated(subset='customer_id', keep=False)])
Empty DataFrame
Columns: [week, sales_method, customer_id, nb_sold, revenue, years_as_customer, nb_site_visits, state]
Index: []
# Check the relationship between revenue column and nb_sold column by sales_method Email
df_Email = df[df['sales_method'] == 'Email']
print(df_Email[['revenue','nb_sold']].corr())
# Check the relationship between revenue column and nb_sold column by sales_method Email + Call
df_Email_Call = df[df['sales_method'] == 'Email + Call']
print(df_Email_Call[['revenue','nb_sold']].corr())
# Check the relationship between revenue column and nb_sold column by sales_method Call
df_Call = df[df['sales_method'] == 'Call']
print(df_Email_Call[['revenue','nb_sold']].corr())
# Visualize the relationship between revenue column and nb_sold column classified by sales_method.
sns.relplot(x='nb_sold', y='revenue', data=df,hue='sales_method')
revenue nb_sold
revenue 1.000000 0.967843
nb_sold 0.967843 1.000000
revenue nb_sold
revenue 1.000000 0.989591
nb_sold 0.989591 1.000000
revenue nb_sold
revenue 1.000000 0.989591
nb_sold 0.989591 1.000000
# Create df_trainmodels
df_trainmodels = df[~df['revenue'].isna()]
# Create df_results
df_results = df[df['revenue'].isna()]
# Create the model then predict the missing values
x = smf.ols('revenue ~ nb_sold + C(sales_method)', data = df_trainmodels).fit()
results = x.predict(df_results)
print(results)
0 99.551250
3 108.302383
16 99.551250
17 208.270929
28 117.053516
...
14951 51.964797
14957 25.711398
14970 182.017530
14992 182.017530
14999 190.768663
Length: 1074, dtype: float64
# Fill missing values with results
df['revenue'] = df['revenue'].fillna(results)
print(df.head())
week sales_method customer_id nb_sold \
0 2 Email 2e72d641-95ac-497b-bbf8-4861764a7097 10
1 6 Email + Call 3998a98d-70f5-44f7-942e-789bb8ad2fe7 15
2 5 Call d1de9884-8059-4065-b10f-86eef57e4a44 11
3 4 Email 78aa75a4-ffeb-4817-b1d0-2f030783c5d7 11
4 3 Email 10e6d446-10a5-42e5-8210-1b5438f70922 9
revenue years_as_customer nb_site_visits state
0 99.551250 0 24 Arizona
1 225.470000 1 28 Kansas
2 52.550000 6 26 Wisconsin
3 108.302383 3 25 Indiana
4 90.490000 0 28 Illinois
df['revenue'] = df['revenue'].round(2)
# Visualize outliers
sns.boxplot(data = df, x = 'sales_method', y = 'revenue')
# Find outliers
def detect_outliers_iqr(data):
outliers_lwr = []
outliers_upr = []
data = sorted(data)
q1 = np.quantile(data, 0.25)
q3 = np.quantile(data, 0.75)
IQR = q3-q1
lwr_bound = q1-(1.5*IQR)
upr_bound = q3+(1.5*IQR)
for i in data:
if i < lwr_bound:
outliers_lwr.append(i)
elif i > upr_bound:
outliers_upr.append(i)
return outliers_lwr, outliers_upr, lwr_bound, upr_bound
Email_outliers_lwr,Email_outliers_upr,Email_lwr_bound,Email_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Email', 'revenue'])
print(Email_outliers_lwr,Email_outliers_upr,Email_lwr_bound,Email_upr_bound)
Email_Call_outliers_lwr,Email_Call_outliers_upr,Email_Call_lwr_bound,Email_Call_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Email + Call', 'revenue'])
print(Email_Call_outliers_lwr,Email_Call_outliers_upr,Email_Call_lwr_bound,Email_Call_upr_bound)
Call_outliers_lwr,Call_outliers_upr,Call_lwr_bound,Call_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Call', 'revenue'])
print(Call_outliers_lwr,Call_outliers_upr,Call_lwr_bound,Call_upr_bound)
[] [131.16, 131.19, 131.2, 131.36, 131.41, 131.59, 131.63, 131.65, 131.66, 131.71, 131.74, 131.75, 131.78, 131.88, 131.91, 131.97, 132.01, 132.06, 132.12, 132.18, 132.21, 132.23, 132.26, 132.41, 132.44, 132.52, 132.55, 132.97, 133.07, 133.08, 133.09, 133.09, 133.13, 133.15, 133.22, 133.26, 133.27, 133.3, 133.38, 133.47, 133.59, 133.6, 133.82, 134.09, 134.11, 134.22, 134.23, 134.27, 134.37, 134.39, 134.47, 134.56, 134.95, 134.97, 135.1, 135.32, 135.42, 135.43, 135.46, 135.66, 135.81, 135.98, 136.23, 136.28, 136.34, 136.47, 136.62, 136.83, 136.89, 137.14, 137.29, 137.65, 137.82, 138.09, 138.12, 138.95, 139.14, 139.16, 139.2, 139.22, 139.7, 140.01, 140.66, 141.84, 144.01, 145.15, 145.71, 148.97] 62.43500000000001 131.015
[122.11, 122.32] [231.56, 231.62, 231.62, 231.65, 231.65, 231.74, 231.75, 231.75, 231.77, 231.83, 231.86, 231.94, 231.94, 232.01, 232.01, 232.02, 232.03, 232.07, 232.14, 232.21, 232.36, 232.47, 232.52, 232.55, 232.68, 232.73, 232.79, 232.83, 232.86, 232.89, 232.97, 233.03, 233.05, 233.09, 233.32, 233.37, 233.39, 233.77, 233.8, 233.97, 234.03, 234.08, 234.1, 234.44, 234.98, 235.4, 235.58, 235.59, 236.75, 237.19, 238.23, 238.32] 124.15000000000002 231.55
[] [69.6, 69.61, 69.9, 70.11, 70.16, 70.31, 70.67, 70.69, 71.04, 71.06, 71.36, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 78.22, 86.97, 86.97, 86.97] 24.599999999999998 69.56
# Assign outliers to the upper bound of Email
x = (df.loc[df['sales_method'] == 'Email', 'revenue'].isin(Email_outliers_upr)) & (df['sales_method'] == 'Email')
df.revenue[x] = Email_upr_bound
# Assign outliers to the lower bound of Email + Call
y = (df.loc[df['sales_method'] == 'Email + Call', 'revenue'].isin(Email_Call_outliers_lwr)) & (df['sales_method'] == 'Email + Call')
df.revenue[y] = Email_Call_lwr_bound
# Assign outliers to the upper bound of Email + Call
z = (df.loc[df['sales_method'] == 'Email + Call', 'revenue'].isin(Email_Call_outliers_upr)) & (df['sales_method'] == 'Email + Call')
df.revenue[z] = Email_Call_upr_bound
# Assign outliers to the upper bound of Call
t = (df.loc[df['sales_method'] == 'Call', 'revenue'].isin(Call_outliers_upr)) & (df['sales_method'] == 'Call')
df.revenue[t] = Call_upr_bound
/tmp/ipykernel_88/3977186669.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.revenue[x] = Email_upr_bound
/tmp/ipykernel_88/3977186669.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.revenue[y] = Email_Call_lwr_bound
/tmp/ipykernel_88/3977186669.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.revenue[z] = Email_Call_upr_bound
/tmp/ipykernel_88/3977186669.py:12: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df.revenue[t] = Call_upr_bound
sns.boxplot(data = df, x = 'sales_method', y = 'revenue')
# How many customers were there for each approach?
Customers = df.groupby('sales_method')['customer_id'].count()
print(Customers)
Customers.plot(kind='pie', y='sales_method', autopct='%1.0f%%')
plt.show()
sales_method
Call 4962
Email 7466
Email + Call 2572
Name: customer_id, dtype: int64
# What does the spread of the revenue look like overall?
df.hist('revenue', bins = 200)
# What does the spread of the revenue look like for each method?
Mean_revenue = df.groupby('sales_method')['revenue'].sum()
print(Mean_revenue)
cx = Mean_revenue.plot.bar()
cx.bar_label(cx.containers[0], label_type='edge')
plt.show()
sales_method
Call 236340.10
Email 725212.99
Email + Call 473143.42
Name: revenue, dtype: float64
# Was there any difference in revenue over time for each of the methods?
cx = df.groupby(['week','sales_method'])['revenue'].sum()
cx.unstack().plot()
print(cx)
week sales_method
1 Call 26630.710
Email 246454.250
Email + Call 19493.370
2 Call 35044.570
Email 148786.270
Email + Call 31182.310
3 Call 37480.160
Email 106860.140
Email + Call 54495.340
4 Call 51794.430
Email 116899.560
Email + Call 89616.440
5 Call 55685.900
Email 80814.535
Email + Call 147495.190
6 Call 29704.330
Email 25398.235
Email + Call 130860.770
Name: revenue, dtype: float64
dx = df.groupby(['week','sales_method'])['revenue'].mean()
dx.unstack().plot()
print(dx)
week sales_method
1 Call 35.132863
Email 87.550355
Email + Call 131.711959
2 Call 43.533627
Email 100.125350
Email + Call 155.911550
3 Call 41.552284
Email 92.921861
Email + Call 151.797604
4 Call 51.536746
Email 108.743777
Email + Call 181.043313
5 Call 53.338985
Email 108.767880
Email + Call 187.414473
6 Call 66.304308
Email 128.925051
Email + Call 224.461012
Name: revenue, dtype: float64