Data Analyst Professional Certification

!pip install statsmodels==0.13.5

# import libarary import pandas as pd import numpy as np import matplotlib.pyplot as plt import statsmodels.formula.api as smf import seaborn as sns from scipy.stats import iqr # import dataset df = pd.read_csv('product_sales.csv') # dataset overview pd.set_option('display.max_columns', None) print(df.head(10))

print(df.info())

# check how many sales methods were used for that customer print(df['sales_method'].drop_duplicates())

# Number of sales methods used number_method = df.groupby('sales_method')['customer_id'].count() print(number_method) ax = number_method.plot.bar() ax.bar_label(ax.containers[0], label_type='edge') plt.show()

# treating duplicates to ensure 3 sales methods df['sales_method'] = df['sales_method'].replace('em + call','Email + Call') df['sales_method'] = df['sales_method'].replace('email','Email') print(df['sales_method'].drop_duplicates())

# check the customer_id is unique print(df[df.duplicated(subset='customer_id', keep=False)])

# Check the relationship between revenue column and nb_sold column by sales_method Email df_Email = df[df['sales_method'] == 'Email'] print(df_Email[['revenue','nb_sold']].corr()) # Check the relationship between revenue column and nb_sold column by sales_method Email + Call df_Email_Call = df[df['sales_method'] == 'Email + Call'] print(df_Email_Call[['revenue','nb_sold']].corr()) # Check the relationship between revenue column and nb_sold column by sales_method Call df_Call = df[df['sales_method'] == 'Call'] print(df_Email_Call[['revenue','nb_sold']].corr()) # Visualize the relationship between revenue column and nb_sold column classified by sales_method. sns.relplot(x='nb_sold', y='revenue', data=df,hue='sales_method')

# Create df_trainmodels df_trainmodels = df[~df['revenue'].isna()] # Create df_results df_results = df[df['revenue'].isna()] # Create the model then predict the missing values x = smf.ols('revenue ~ nb_sold + C(sales_method)', data = df_trainmodels).fit() results = x.predict(df_results) print(results)

# Fill missing values with results df['revenue'] = df['revenue'].fillna(results) print(df.head())

df['revenue'] = df['revenue'].round(2)

# Visualize outliers sns.boxplot(data = df, x = 'sales_method', y = 'revenue')

# Find outliers def detect_outliers_iqr(data): outliers_lwr = [] outliers_upr = [] data = sorted(data) q1 = np.quantile(data, 0.25) q3 = np.quantile(data, 0.75) IQR = q3-q1 lwr_bound = q1-(1.5*IQR) upr_bound = q3+(1.5*IQR) for i in data: if i < lwr_bound: outliers_lwr.append(i) elif i > upr_bound: outliers_upr.append(i) return outliers_lwr, outliers_upr, lwr_bound, upr_bound Email_outliers_lwr,Email_outliers_upr,Email_lwr_bound,Email_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Email', 'revenue']) print(Email_outliers_lwr,Email_outliers_upr,Email_lwr_bound,Email_upr_bound) Email_Call_outliers_lwr,Email_Call_outliers_upr,Email_Call_lwr_bound,Email_Call_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Email + Call', 'revenue']) print(Email_Call_outliers_lwr,Email_Call_outliers_upr,Email_Call_lwr_bound,Email_Call_upr_bound) Call_outliers_lwr,Call_outliers_upr,Call_lwr_bound,Call_upr_bound = detect_outliers_iqr(df.loc[df['sales_method'] == 'Call', 'revenue']) print(Call_outliers_lwr,Call_outliers_upr,Call_lwr_bound,Call_upr_bound)

# Assign outliers to the upper bound of Email x = (df.loc[df['sales_method'] == 'Email', 'revenue'].isin(Email_outliers_upr)) & (df['sales_method'] == 'Email') df.revenue[x] = Email_upr_bound # Assign outliers to the lower bound of Email + Call y = (df.loc[df['sales_method'] == 'Email + Call', 'revenue'].isin(Email_Call_outliers_lwr)) & (df['sales_method'] == 'Email + Call') df.revenue[y] = Email_Call_lwr_bound # Assign outliers to the upper bound of Email + Call z = (df.loc[df['sales_method'] == 'Email + Call', 'revenue'].isin(Email_Call_outliers_upr)) & (df['sales_method'] == 'Email + Call') df.revenue[z] = Email_Call_upr_bound # Assign outliers to the upper bound of Call t = (df.loc[df['sales_method'] == 'Call', 'revenue'].isin(Call_outliers_upr)) & (df['sales_method'] == 'Call') df.revenue[t] = Call_upr_bound

sns.boxplot(data = df, x = 'sales_method', y = 'revenue')

# How many customers were there for each approach? Customers = df.groupby('sales_method')['customer_id'].count() print(Customers) Customers.plot(kind='pie', y='sales_method', autopct='%1.0f%%') plt.show()

# What does the spread of the revenue look like overall? df.hist('revenue', bins = 200)

# What does the spread of the revenue look like for each method? Mean_revenue = df.groupby('sales_method')['revenue'].sum() print(Mean_revenue) cx = Mean_revenue.plot.bar() cx.bar_label(cx.containers[0], label_type='edge') plt.show()

# Was there any difference in revenue over time for each of the methods? cx = df.groupby(['week','sales_method'])['revenue'].sum() cx.unstack().plot() print(cx)

dx = df.groupby(['week','sales_method'])['revenue'].mean() dx.unstack().plot() print(dx)