import pandas as pd
import numpy as np
df = pd.read_csv('/work/2019 Winter Data Science Intern Challenge Data Set - Sheet1.csv')
df['order_amount'].describe()
df['order_amount'].median()
biggest_orders_by_price_df = df.sort_values(by='order_amount', ascending=False)
biggest_orders_by_price_df.head(20)
"""
Python Equivalent of SQL Statement:
WITH order_amount_counts AS (
SELECT order_amount, COUNT(order_amount)
FROM table
GROUP BY order_amount
)
TODO: debug why nunique returns a series instead of a dataframe
"""
order_amount_counts = df.groupby(['order_amount'])['order_id'].nunique()
order_amount_counts = order_amount_counts.to_frame()
order_amount_counts = order_amount_counts.reset_index()
order_amount_counts = order_amount_counts.rename(columns={'order_id':'count'})
order_amount_counts.head(5)
biggest_orders_by_volume_df = df.sort_values(by='total_items', ascending=False)
biggest_orders_by_volume_df.head(20)
order_volume_counts = df.groupby(['total_items'])['order_id'].nunique()
order_volume_counts = order_volume_counts.to_frame()
order_volume_counts = order_volume_counts.reset_index()
order_volume_counts = order_volume_counts.rename(columns={'order_id':'count'})
order_volume_counts
import seaborn as sns
sns.barplot(data=order_volume_counts, x='total_items', y='count')