Summer 2022 Data Science Intern Challenge
Question 1:
Question 2:
Exploratory Data Analysis for Question 1
import pandas as pd
import numpy as np
# Read the challenge data set CSV into a DataFrame, then echo it so the
# notebook renders a preview of the rows.
csv_path = "/work/Summer-2022-Data-Science-Intern-Challenge-/2019 Winter Data Science Intern Challenge Data Set - Sheet1.csv"
df = pd.read_csv(csv_path)
df
order_id (int64)
1 - 5000
shop_id (int64)
1 - 100
0
1
53
1
2
92
2
3
44
3
4
18
4
5
18
5
6
58
6
7
87
7
8
22
8
9
64
9
10
52
# Summary statistics of the two raw columns used in the analysis below.
df['order_amount'].describe()
df['total_items'].describe()

# Per-shop totals of order amount and item count.
# The columns are selected with a list (double brackets): indexing a GroupBy
# with several bare keys is implicitly a tuple, which pandas deprecated
# (FutureWarning) and newer releases reject outright.
aov = df.groupby(['shop_id'])[['order_amount', 'total_items']].sum()

# NOTE: this divides a shop's total order amount by its total items sold, so
# the value is an amount per item rather than per order, despite the column
# name. The name is kept because later cells reference it.
aov['average_order_value'] = aov['order_amount'] / aov['total_items']
aov
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
"""Entry point for launching an IPython kernel.
order_amount (int64)
6840 - 11990176
total_items (int64)
67 - 34063
1
13588
86
2
9588
102
3
14652
99
4
13184
103
5
13064
92
6
22627
121
7
12208
109
8
11088
84
9
13806
117
10
17612
119
# Summary statistics of the per-shop ratio computed above.
aov['average_order_value'].describe()

# Lift the display row limit so the sorted table is shown in full.
# The fully qualified option name is used on purpose: the abbreviated pattern
# "max_rows" is matched against every registered option and is ambiguous on
# pandas versions that also define "styler.render.max_rows", which makes
# set_option raise OptionError.
pd.set_option("display.max_rows", None)

# Shops ordered from the highest to the lowest per-shop value, to surface
# the outliers at the top.
aov.sort_values(by='average_order_value', ascending=False)
order_amount (int64)
6840 - 11990176
total_items (int64)
67 - 34063
78
2263800
88
42
11990176
34063
12
18693
93
89
23128
118
99
18330
94
50
17756
92
38
13680
72
51
16643
89
6
22627
121
11
17480
95
# Bind the per-shop column once and reuse it for both statistics.
per_shop_value = aov['average_order_value']

# Median of the per-shop values.
per_shop_value.median()

# Mode of the per-shop values (returned by pandas as a Series, since there
# can be more than one most-frequent value).
per_shop_value.mode()