from pathlib import Path
import numpy as np
import pandas as pd
from dataset import data
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.style as style
%matplotlib inline
# Use the FiveThirtyEight theme for every figure and keep the colors of its
# property cycle so later plots can pick from them by position.
style.use("fivethirtyeight")
prop_cycle = plt.rcParams['axes.prop_cycle']
style_colors = prop_cycle.by_key()['color']
# Instantiate the `data` helper imported from the `dataset` module.
# Bare names below are notebook-style echoes of the value just built.
d = data()
d

# Load the two tables by name.
items_clicked = d.load("Items Clicked")
items_on_cart = d.load("Items put on cart")

# Parse TIME_DIM_KEY with the %Y%m%d format so it holds real timestamps.
day_keys = items_clicked["TIME_DIM_KEY"]
items_clicked["TIME_DIM_KEY"] = pd.to_datetime(day_keys, format="%Y%m%d")
items_clicked

# Number of rows in items_clicked for each distinct day.
day_column = items_clicked["TIME_DIM_KEY"]
clicks_per_day = day_column.value_counts()
clicks_per_day

# How many distinct days appear in the data.
len(clicks_per_day)

# count / mean / std / min / quartiles / max of the daily totals.
summary_stats = clicks_per_day.describe()
summary_stats
# Distribution of the daily click totals:
# histogram on the left axis, box plot on the right axis.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
ax1, ax2 = axes
ax1.hist(clicks_per_day)
sns.boxplot(x=clicks_per_day, ax=ax2)
plt.show()
# Upper outlier cutoff: third quartile plus 1.5 times the inter-quartile range.
click_iqr = stats.iqr(clicks_per_day)
threshold = summary_stats["75%"] + 1.5 * click_iqr

# Order the days from most to fewest clicks and relabel each one with its
# YYYY-MM-DD string so the index can serve as categorical bar positions.
clicks_per_day_sorted = clicks_per_day.sort_values(ascending=False)
clicks_per_day_sorted.index = clicks_per_day_sorted.index.map(
    lambda day: day.strftime('%Y-%m-%d')
)

# Boolean mask over the sorted days; its sum is the number of outlier days.
is_outlier = clicks_per_day_sorted > threshold
is_outlier.sum()
# Same histogram / box plot pair as before, restricted to the days whose
# click total does not exceed the outlier threshold.
is_outlier2 = clicks_per_day > threshold
non_outlier_clicks = clicks_per_day[~is_outlier2]

fig, axes = plt.subplots(1, 2, figsize=(20, 5))
ax1, ax2 = axes
ax1.hist(non_outlier_clicks)
sns.boxplot(x=non_outlier_clicks, ax=ax2)
plt.show()
# Bar chart of clicks per day in descending order, with the days above the
# outlier threshold drawn in a second color.
fig, ax = plt.subplots(figsize=(15, 10))

# Split the sorted series once; each subset supplies both the bar heights and
# the (date-string) bar positions.
outlier_days = clicks_per_day_sorted[is_outlier]
regular_days = clicks_per_day_sorted[~is_outlier]
ax.bar(x=outlier_days.index, height=outlier_days,
       label="outlier", color=style_colors[1])
ax.bar(x=regular_days.index, height=regular_days,
       color=style_colors[0], label="non outlier")

# Hide all four spines. The loop body must be indented under the `for`;
# left at column 0 it is an IndentationError and the script cannot run.
for location in ['left', 'right', 'bottom', 'top']:
    ax.spines[location].set_visible(False)

# One tick per day would be unreadable, so drop the x ticks entirely.
ax.set_xticks([])
ax.tick_params(pad=5)
ax.legend(prop={"size": 20})
ax.set_xlabel("Days", fontsize=20, labelpad=30)
ax.set_ylabel("Clicks", fontsize=20, labelpad=30)
ax.set_title("Days With Most Clicks")
plt.show()