import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,3)
%config InlineBackend.figure_format='retina'
!ls e-commerce-data
olist_customers_dataset.csv olist_orders_dataset.csv
olist_geolocation_dataset.csv olist_products_dataset.csv
olist_order_items_dataset.csv olist_sellers_dataset.csv
olist_order_payments_dataset.csv product_category_name_translation.csv
olist_order_reviews_dataset.csv
# load tables
orders = pd.read_csv('./e-commerce-data/olist_orders_dataset.csv')
orders_payments = pd.read_csv('./e-commerce-data/olist_order_payments_dataset.csv')
orders.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 99441 non-null object
1 customer_id 99441 non-null object
2 order_status 99441 non-null object
3 order_purchase_timestamp 99441 non-null object
4 order_approved_at 99281 non-null object
5 order_delivered_carrier_date 97658 non-null object
6 order_delivered_customer_date 96476 non-null object
7 order_estimated_delivery_date 99441 non-null object
dtypes: object(8)
memory usage: 6.1+ MB
orders_payments.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 103886 non-null object
1 payment_sequential 103886 non-null int64
2 payment_type 103886 non-null object
3 payment_installments 103886 non-null int64
4 payment_value 103886 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
# merged tables and convert timestamp to pandas-like format
orders_merged = orders.merge(orders_payments, how='inner', on='order_id')
orders_merged['order_purchase_timestamp'] = pd.to_datetime(orders_merged['order_purchase_timestamp'])
# multiply the number of installments with the payment value
orders_merged['order_value'] = orders_merged['payment_value'] * orders_merged['payment_installments']
# group by dimension and plot
orders_merged.groupby(orders_merged['order_purchase_timestamp'].dt.day)['order_value'].sum().plot()
plt.title('Sales by day')
orders_merged.groupby(orders_merged['order_purchase_timestamp'].dt.week)['order_value'].sum().plot()
plt.title('Sales by week')
orders_merged.groupby(orders_merged['order_purchase_timestamp'].dt.month)['order_value'].sum().plot()
plt.title('Sales by month')
print('Average value of order by month')
avg_value = orders_merged.groupby(orders_merged['order_purchase_timestamp'].dt.month)['order_value'].mean().reset_index()
avg_value.columns = ['Month', 'Value']
avg_value = avg_value.round(2)
avg_value['Month'] = pd.to_datetime(avg_value['Month'], format='%m').dt.month_name().str.slice(stop=3)
avg_value
Average value of order by month
avg_value
customers = pd.read_csv('e-commerce-data/olist_customers_dataset.csv')
customers.head()
customers['customer_city'].value_counts().to_frame()
orders = pd.read_csv('e-commerce-data/olist_orders_dataset.csv')
orders.info()
print('\n')
order_items = pd.read_csv('e-commerce-data/olist_order_items_dataset.csv')
order_items.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 99441 non-null object
1 customer_id 99441 non-null object
2 order_status 99441 non-null object
3 order_purchase_timestamp 99441 non-null object
4 order_approved_at 99281 non-null object
5 order_delivered_carrier_date 97658 non-null object
6 order_delivered_customer_date 96476 non-null object
7 order_estimated_delivery_date 99441 non-null object
dtypes: object(8)
memory usage: 6.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 112650 non-null object
1 order_item_id 112650 non-null int64
2 product_id 112650 non-null object
3 seller_id 112650 non-null object
4 shipping_limit_date 112650 non-null object
5 price 112650 non-null float64
6 freight_value 112650 non-null float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
orders_merged_items = orders.merge(order_items, how='inner', on='order_id')[['product_id']]
orders_merged_items.head()
product_desc = pd.read_csv('e-commerce-data/olist_products_dataset.csv')
product_desc.head()
# join orders_merged_items with product_desc on product_id column
product_count = orders_merged_items.merge(product_desc, how='inner', on='product_id')
product_count.head()
eng_cat = pd.read_csv('e-commerce-data/product_category_name_translation.csv')
eng_cat.head()
product_count = product_count.merge(eng_cat)
product_count['product_category_name_english'].value_counts()[0:20].plot(kind='bar')