# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import plotly.express as px
import os
import warnings
# Local server SQL database
import sqlite3 as sq
# Setting of Large numbers format
pd.options.display.float_format = '{:,.2f}'.format
# Set data frame display max 10 rows
pd.set_option('display.max_rows', 10)
# Warning is suppressed
warnings.simplefilter(action='ignore', category=FutureWarning)
# Set-up a connection to a newly named project1.db
con = sq.connect('project1.db')
cur = con.cursor()
# Get the count of customers_dataset table
cur.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='../input/brazilian-ecommerce/olist_customers_dataset.csv' OR name='customers_dataset' ''')
# If the count is 1, then customers_dataset table already exists
if cur.fetchone()[0]==1 : {
print('Table already created.')
}
else: # Read all files from csv to db format
path = '/work/Data'
all_files = glob.glob(path + "/*.csv")
for file in all_files: # For all files in our directory
df = pd.read_csv(file, index_col=0) # Read each CSV file
df.to_sql(file, con) # Create the read file as a table in the database.
# Get the count of customers_dataset table
cur.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='customers_dataset' ''')
# If the count is 1, then customers_dataset table already renamed
if cur.fetchone()[0]==1:
print('Table already renamed.')
else:
# Rename all tables
rename_tables_query1 = 'ALTER TABLE "/work/Data/olist_customers_dataset.csv" RENAME TO "customers_dataset"'
rename_tables_query2 = 'ALTER TABLE "/work/Data/olist_geolocation_dataset.csv" RENAME TO "geolocation_dataset"'
rename_tables_query3 = 'ALTER TABLE "/work/Data/olist_orders_dataset.csv" RENAME TO "orders_dataset"'
rename_tables_query4 = 'ALTER TABLE "/work/Data/olist_order_items_dataset.csv" RENAME TO "order_items_dataset"'
rename_tables_query5 = 'ALTER TABLE "/work/Data/olist_order_payments_dataset.csv" RENAME TO "order_payments_dataset"'
rename_tables_query6 = 'ALTER TABLE "/work/Data/olist_order_reviews_dataset.csv" RENAME TO "order_reviews_dataset"'
rename_tables_query7 = 'ALTER TABLE "/work/Data/olist_products_dataset.csv" RENAME TO "products_dataset"'
rename_tables_query8 = 'ALTER TABLE "/work/Data/olist_sellers_dataset.csv" RENAME TO "sellers_dataset"'
rename_tables_query9 = 'ALTER TABLE "/work/Data/product_category_name_translation.csv" RENAME TO "product_category_name_translation"'
# The function read_sql takes a query string and a database connection, and performs the query.
rename_tables1 = pd.read_sql(rename_tables_query1, con)
rename_tables2 = pd.read_sql(rename_tables_query2, con)
rename_tables3 = pd.read_sql(rename_tables_query3, con)
rename_tables4 = pd.read_sql(rename_tables_query4, con)
rename_tables5 = pd.read_sql(rename_tables_query5, con)
rename_tables6 = pd.read_sql(rename_tables_query6, con)
rename_tables7 = pd.read_sql(rename_tables_query7, con)
rename_tables8 = pd.read_sql(rename_tables_query8, con)
rename_tables9 = pd.read_sql(rename_tables_query9, con)
# Read all table names
table_list = [a for a in cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
# Table list
print(table_list)
# Write a SQL query of Total Orders and Total Sales
q1 = (
'SELECT count(a.order_id) AS Total_Orders, '
' sum(b.price + b.freight_value) AS Total_Sales '
'FROM orders_dataset AS a '
'INNER JOIN order_items_dataset AS b '
'ON a.order_id = b.order_id '
)
# Convert the SQL query to Pandas data Frame
r1 = pd.read_sql(q1, con)
r1
# Write a SQL query of Orders by Time
q2 = (
'SELECT strftime("%Y-%m", order_approved_at) AS date, '
' COUNT(order_id) AS order_qty '
'FROM orders_dataset '
'GROUP BY date '
'ORDER BY date '
)
# Convert the SQL query to Pandas data Frame
r2 = pd.read_sql(q2, con)
r2.head(5)
# Plotting orders by time data
fig1 = px.bar(r2, x="date", y="order_qty", orientation='v', title='Orders by Date in Brazilian E-Commerce (2016-2018)')
fig1.show()
# Write a SQL query of Sales by Time
q3 = (
'SELECT strftime("%Y-%m", a.order_approved_at) AS date, '
' SUM(b.price) AS sales '
'FROM orders_dataset AS a '
'INNER JOIN order_items_dataset AS b '
'ON a.order_id = b.order_id '
'GROUP BY date '
'ORDER BY date '
)
# Convert the SQL query to Pandas data Frame
r3 = pd.read_sql(q3, con)
r3.head(5)
# Plotting sales by time data
fig2 = px.line(r3, x="date", y="sales", title='Sales by Time in Brazilian E-Commerce (2016-2018)')
fig2.show()
# Write a SQL query of Selling Product Categories Quantity
q4 = (
'SELECT a.product_category_name_english AS product, '
' COUNT(b.product_category_name) AS qty '
'FROM product_category_name_translation AS a '
'INNER JOIN products_dataset AS b '
'ON a.product_category_name = b.product_category_name '
'GROUP BY product '
'ORDER BY qty DESC '
)
# Convert the SQL query to Pandas data Frame
r4 = pd.read_sql(q4, con)
r4_top = r4.head(5)
r4_top
# Plotting selling product categories quantity data
fig3 = px.bar(r4_top, x="qty", y="product", orientation='h', barmode="group", title='Top 5 Selling Product Categories in Brazilian E-Commerce (2016-2018)')
fig3.update_layout(yaxis={'categoryorder':'total ascending'})
fig3.show()
# Write a SQL query of Sellers by City
q5 = (
'WITH temp_sellers AS '
'( '
' SELECT a.seller_city AS seller_city, '
' COUNT(b.order_id) AS sales_qty, '
' COUNT(b.order_id) * 100.0 / SUM(COUNT(b.order_id)) OVER () AS temp_sales_percentage '
' FROM sellers_dataset AS a '
' INNER JOIN order_items_dataset AS b '
' ON a.seller_id = b.seller_id '
' GROUP BY seller_city '
' ORDER BY sales_qty DESC '
') '
'SELECT seller_city, '
' sales_qty, '
' printf("%.2f", temp_sales_percentage) AS sales_percentage '
'FROM temp_sellers '
)
# Convert the SQL query to Pandas data Frame
r5 = pd.read_sql(q5, con)
r5_top = r5.head(5)
r5_top
# Plotting selling sellers by city data
fig4 = px.bar(r5_top, x="sales_qty", y="seller_city", orientation='h', hover_data=['sales_percentage'], color='sales_percentage', title='Top 5 Sellers by City in Brazilian E-Commerce (2016-2018)')
fig4.update_layout(yaxis={'categoryorder':'total ascending'})
fig4.show()
# Write a SQL query of Customers by City
q6 = (
'WITH temp_customer AS '
'( '
' SELECT a.customer_city AS customer_city, '
' COUNT(b.order_id) AS order_qty, '
' COUNT(b.order_id) * 100.0 / SUM(COUNT(b.order_id)) OVER () AS temp_order_percentage '
' FROM customers_dataset AS a '
' INNER JOIN orders_dataset AS b '
' ON a.customer_id = b.customer_id '
' GROUP BY customer_city '
' ORDER BY order_qty DESC '
') '
'SELECT customer_city, '
' order_qty, '
' printf("%.2f", temp_order_percentage) AS order_percentage '
'FROM temp_customer '
)
# Convert the SQL query to Pandas data Frame
r6 = pd.read_sql(q6, con)
r6_top = r6.head(5)
r6_top
# Plotting selling customer by city data
fig5 = px.bar(r6_top, x="order_qty", y="customer_city", orientation='h', hover_data=['order_percentage'], color='order_percentage', title='Top 5 Customers by City in Brazilian E-Commerce (2016-2018)')
fig5.update_layout(yaxis={'categoryorder':'total ascending'})
fig5.show()
# Write a SQL query of Average, Max, and Min Products
q7 = (
'SELECT AVG(price) AS Average_Price, '
' MAX(price) AS Max_Price, '
' MIN(price) AS Min_Price '
'FROM order_items_dataset '
)
# Convert the SQL query to Pandas data Frame
r7 = pd.read_sql(q7, con)
r7
q8 = (
'WITH average_time AS '
'( '
' SELECT julianday(order_estimated_delivery_date) - julianday(order_delivered_customer_date) AS delivery_time '
' FROM orders_dataset '
' WHERE order_status = "delivered" '
') '
'SELECT AVG(delivery_time) AS "Average_Delivery_Time_Interval_(Estimated_vs_Actual)" '
'FROM average_time '
)
# Convert the SQL query to Pandas data Frame
r8 = pd.read_sql(q8, con)
r8
# Write a SQL query of Delivery Interval (Estimated vs Actual) per Month
q9 = (
'WITH average_time AS '
'( '
' SELECT strftime("%Y-%m", order_delivered_customer_date) AS date, '
' julianday(order_estimated_delivery_date) - julianday(order_delivered_customer_date) AS delivery_time, '
' COUNT(order_id) AS qty'
' FROM orders_dataset '
' WHERE order_status = "delivered" '
' GROUP BY date '
' ORDER BY date '
') '
'SELECT date AS Date, delivery_time AS Day, qty AS Qty '
'FROM average_time '
)
# Convert the SQL query to Pandas data Frame
r9 = pd.read_sql(q9, con)
r9
# Plotting Delivery Interval (Estimated vs Actual) per Month data
fig6 = px.bar(r9, x="Date", y="Day", orientation='v', hover_data=['Qty'], color='Qty', title='Delivery Interval (Estimated vs Actual) per Month in<br>Brazilian E-Commerce (2016-2018)')
fig6.show()
# Close connection to Database
con.close()