import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
filename = "shopify-selling-data.csv"
sns.set(style="darkgrid")
sns.set(rc={'figure.figsize':(20,8)})
# The original table, sorted by date created:
db = 'challenge1_format.db'
def run_query(q):
with sqlite3.connect(db) as conn:
return pd.read_sql(q,conn)
def show_table():
q = '''
SELECT * FROM challenge1_format
ORDER BY created_at;
'''
return run_query(q)
t = show_table()
t
# The original table's summary of the order_amount column:
t.drop(columns = ['order_id', 'shop_id', 'user_id', 'total_items']).describe().round(2)
q0 = '''
SELECT shop_id,
SUM(order_amount)/SUM(total_items) AS price_per_unit,
SUM(total_items) AS total_item_sold,
COUNT(order_id) AS shop_num_order,
SUM(order_amount) AS total_order_value,
ROUND(SUM(order_amount)*1.0/COUNT(order_id), 2) AS shop_AOV
FROM challenge1_format
GROUP BY shop_id
ORDER BY total_order_value DESC;
'''
t0 = run_query(q0)
t0.head(12)
t0.drop(columns='shop_id').describe().round(2)
fig_1_1 = t0.sort_values(by = 'total_item_sold', ascending = False)
# fig_1_1 = fig_1_1[fig_1_1.shop_id != 78]
# fig_1_1 = fig_1_1.drop(index = 0) #dropping shop 42 and 78 row.
fig_1_1.shop_id = fig_1_1.shop_id.astype(str)
sns.set(font_scale = 0.6)
fig_1_1 = sns.scatterplot(data = fig_1_1, x = 'shop_id', y = 'total_item_sold')
fig_1_1.set_title('Figure 1.1: Total item sold for each shop, including the outliers.', fontsize = 14)
fig_1_1.set_xlabel("Shop ID", fontsize = 14)
fig_1_1.set_ylabel("Total items sold", fontsize = 14)
fig_1_2 = t0.sort_values(by = 'price_per_unit', ascending = False)
fig_1_2.shop_id = fig_1_2.shop_id.astype(str)
sns.set(font_scale = 0.6)
fig_1_2 = sns.scatterplot(data = fig_1_2, x = 'shop_id', y = 'price_per_unit')
fig_1_2.set_title('Figure 1.2: Price per Unit for each shop, including the outliers.', fontsize = 14)
fig_1_2.set_xlabel("Shop ID", fontsize = 14)
fig_1_2.set_ylabel("Price per Unit", fontsize = 14)
adj_t = t.copy()
mask = (adj_t['shop_id'] == 42) & (adj_t['total_items'] == 2000)
adj_t.loc[mask, 'total_items'] = adj_t.loc[mask, 'total_items']/1000
adj_t.loc[mask, 'order_amount'] = adj_t.loc[mask, 'order_amount']/1000
# adj_t.query('shop_id == 42').sort_values(by = 'total_items', ascending=False) #check the result
adj_detailed = adj_t.groupby('shop_id').agg({'order_id':'count', 'order_amount': 'sum', 'total_items':'sum'})
adj_detailed.reset_index(inplace = True)
adj_detailed = adj_detailed.rename(columns = {'order_id': 'shop_num_order', 'order_amount':'total_order_value', 'total_items': 'total_item_sold'})
adj_detailed.sort_values(by = 'total_item_sold', ascending = False)
mov = adj_t['order_amount'].median()
print("The Median Order Value is: " + str(mov))
# # Basic statistics for detailed table:
# adj_detailed.describe().apply(lambda s: s.apply('{0:.2f}'.format))
adj_detailed.sort_values(by = 'total_item_sold', ascending = False).head(10).drop(columns = ['shop_num_order', 'total_order_value'])
rev.sort_values(by = 'total_order_value', ascending = False).drop(columns = ['shop_num_order', 'total_item_sold']).head(15)