import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
filename = "shopify-selling-data.csv"
sns.set(style="darkgrid")
sns.set(rc={'figure.figsize':(20,8)})
# The original table, sorted by date created:
db = 'challenge1_format.db'
def run_query(q):
with sqlite3.connect(db) as conn:
return pd.read_sql(q,conn)
def show_table():
q = '''
SELECT * FROM challenge1_format
ORDER BY created_at;
'''
return run_query(q)
t = show_table()
t
order_idint64
1 - 5000
shop_idint64
1 - 100
0
1863
39
1
1742
39
2
3229
97
3
1268
80
4
2690
49
5
1442
39
6
1116
52
7
3225
22
8
1687
74
9
3957
52
# The original table's summary of the order_amount column:
t.drop(columns = ['order_id', 'shop_id', 'user_id', 'total_items']).describe().round(2)
order_amountfloat64
count
5000
mean
3145.13
std
41282.54
min
90
25%
163
50%
284
75%
390
max
704000
q0 = '''
SELECT shop_id,
SUM(order_amount)/SUM(total_items) AS price_per_unit,
SUM(total_items) AS total_item_sold,
COUNT(order_id) AS shop_num_order,
SUM(order_amount) AS total_order_value,
ROUND(SUM(order_amount)*1.0/COUNT(order_id), 2) AS shop_AOV
FROM challenge1_format
GROUP BY shop_id
ORDER BY total_order_value DESC;
'''
t0 = run_query(q0)
t0.head(12)
shop_idint64
6 - 90
price_per_unitint64
153 - 25725
0
42
352
1
78
25725
2
89
196
3
81
177
4
6
187
5
13
160
6
59
178
7
71
164
8
19
163
9
70
173
t0.drop(columns='shop_id').describe().round(2)
price_per_unitfloat64
total_item_soldfloat64
count
100
100
mean
407.99
439.36
std
2557.46
3396.37
min
90
67
25%
132.75
88
50%
153
100
75%
168.25
111.25
max
25725
34063
fig_1_1 = t0.sort_values(by = 'total_item_sold', ascending = False)
# fig_1_1 = fig_1_1[fig_1_1.shop_id != 78]
# fig_1_1 = fig_1_1.drop(index = 0) #dropping shop 42 and 78 row.
fig_1_1.shop_id = fig_1_1.shop_id.astype(str)
sns.set(font_scale = 0.6)
fig_1_1 = sns.scatterplot(data = fig_1_1, x = 'shop_id', y = 'total_item_sold')
fig_1_1.set_title('Figure 1.1: Total item sold for each shop, including the outliers.', fontsize = 14)
fig_1_1.set_xlabel("Shop ID", fontsize = 14)
fig_1_1.set_ylabel("Total items sold", fontsize = 14)
fig_1_2 = t0.sort_values(by = 'price_per_unit', ascending = False)
fig_1_2.shop_id = fig_1_2.shop_id.astype(str)
sns.set(font_scale = 0.6)
fig_1_2 = sns.scatterplot(data = fig_1_2, x = 'shop_id', y = 'price_per_unit')
fig_1_2.set_title('Figure 1.2: Price per Unit for each shop, including the outliers.', fontsize = 14)
fig_1_2.set_xlabel("Shop ID", fontsize = 14)
fig_1_2.set_ylabel("Price per Unit", fontsize = 14)
adj_t = t.copy()
mask = (adj_t['shop_id'] == 42) & (adj_t['total_items'] == 2000)
adj_t.loc[mask, 'total_items'] = adj_t.loc[mask, 'total_items']/1000
adj_t.loc[mask, 'order_amount'] = adj_t.loc[mask, 'order_amount']/1000
# adj_t.query('shop_id == 42').sort_values(by = 'total_items', ascending=False) #check the result
adj_detailed = adj_t.groupby('shop_id').agg({'order_id':'count', 'order_amount': 'sum', 'total_items':'sum'})
adj_detailed.reset_index(inplace = True)
adj_detailed = adj_detailed.rename(columns = {'order_id': 'shop_num_order', 'order_amount':'total_order_value', 'total_items': 'total_item_sold'})
adj_detailed.sort_values(by = 'total_item_sold', ascending = False)
shop_idint64
1 - 100
shop_num_orderint64
35 - 68
12
13
63
83
84
59
70
71
66
52
53
68
80
81
59
40
41
59
23
24
55
18
19
64
13
14
58
68
69
60
mov = adj_t['order_amount'].median()
print("The Median Order Value is: " + str(mov))
The Median Order Value is: 284.0
# # Basic statistics for detailed table:
# adj_detailed.describe().apply(lambda s: s.apply('{0:.2f}'.format))
adj_detailed.sort_values(by = 'total_item_sold', ascending = False).head(10).drop(columns = ['shop_num_order', 'total_order_value'])
shop_idint64
13 - 84
total_item_soldfloat64
121.0 - 136.0
12
13
136
83
84
132
70
71
130
52
53
130
80
81
128
40
41
127
23
24
126
18
19
126
13
14
121
68
69
121
rev.sort_values(by = 'total_order_value', ascending = False).drop(columns = ['shop_num_order', 'total_item_sold']).head(15)
shop_idint64
6 - 90
total_order_valuefloat64
18693.0 - 34144.0
41
42
34144
88
89
23128
80
81
22656
5
6
22627
12
13
21760
58
59
21538
70
71
21320
18
19
20538
69
70
20241
83
84
20196