# installing required package
pip install openpyxl
# imported the dataset and performing eda analysis
import pandas as pd
df = pd.read_excel("/work/2019 Winter Data Science Intern Challenge Data Set.xlsx")
df.head()
# to understand how many 30-day window provided
df.sort_values(by=['created_at'])['created_at']
df.describe()
# check any duplicate rows
df.duplicated().sum()
total_revenue_by_shopid = df.groupby(['shop_id'])['order_amount'].agg('sum')
total_revenue_by_shopid.name ='total_revenue'
total_orders_by_shopid = df.groupby(['shop_id'])['total_items'].agg('sum')
total_orders_by_shopid.name = 'total_orders'
df_by_shopid = pd.concat([total_revenue_by_shopid, total_orders_by_shopid], axis=1)
aov_by_shopid = df_by_shopid['total_revenue']/df_by_shopid['total_orders']
aov_by_shopid.name = 'aov'
df_by_shopid = pd.concat([total_revenue_by_shopid, total_orders_by_shopid , aov_by_shopid], axis=1)
df_by_shopid
df_by_shopid.describe()
df.sort_values(by=['order_amount'], ascending= False)
df_shopid_42 = df[df['shop_id'] == 42]
df_shopid_42.describe()
df_shopid_78 = df[df['shop_id'] == 78]
df_shopid_78.describe()
# dropping aov_per_shop record for shop_id 78
clean_df_by_shopid = df_by_shopid.drop([78])
clean_df_by_shopid.describe()
print("Mean AOV:",clean_df_by_shopid['aov'].mean())
print("Median AOV:",clean_df_by_shopid['aov'].median())
print("Mode AOV:",clean_df_by_shopid['aov'].mode()[0])
print("Mean AOV:",df_by_shopid['aov'].mean())
print("Median AOV:",df_by_shopid['aov'].median())
print("Mode AOV:",df_by_shopid['aov'].mode()[0])