import pandas as pd
# Load the daily-visit records from the CSV in the working directory.
wyday = pd.read_csv('wyday.csv')

# Summary statistics of the daily visit counts.
# NOTE: the bare expressions below only display a value in an interactive
# session (REPL / notebook); run as a script they are evaluated and discarded.
wyday['dailyvisit'].describe()

# Number of records with at least 100 visits, at least 1000 visits,
# and exactly zero visits.
len(wyday[wyday['dailyvisit'] >= 100])
len(wyday[wyday['dailyvisit'] >= 1000])
len(wyday[wyday['dailyvisit'] == 0])
# Histogram of daily visits.
# The x-range is limited to [0, 100] because most observations have fewer
# than 100 daily visits; 20 bins over that range gives a bin width of 5.
plot1 = wyday['dailyvisit'].hist(bins=20, range=(0, 100))
plot1.set(
    xlabel='Daily Visits (bin width: 5)',
    ylabel='Number of Observations',
    title='Daily Visit Distribution',
)
# Compare the distribution of daily visits on weekdays vs. weekends
from matplotlib import pyplot as plt

# Start a new figure: plt.hist() draws on the *current* axes, which would
# otherwise still be the basic daily-visit histogram when run as a script.
plt.figure()

# Both groups share the same binning (25 bins over [0, 50], i.e. width 2) and
# are semi-transparent so the overlapping bars stay visible.
# The `weekend` column is 0 for the 'Weekday' group and 1 for 'Weekend'.
for weekend_flag, group_label in ((0, 'Weekday'), (1, 'Weekend')):
    plt.hist(
        wyday.loc[wyday.weekend == weekend_flag, 'dailyvisit'],
        bins=25,
        range=[0, 50],
        alpha=0.5,
        label=group_label,
    )

plt.xlabel('Daily Visits')
plt.ylabel('Number of Observations')
plt.title('Daily Visits by Weekday and Weekend')
plt.legend()
plt.show()
# Total daily visits for each day of the week.
# Only `dailyvisit` is aggregated: summing the whole frame would also "sum"
# every other column (e.g. concatenate the raw text date columns) just to
# throw the result away.  `as_index=False` keeps `dayofweek2` as a column.
weekdaysum = wyday.groupby('dayofweek2', as_index=False)['dailyvisit'].sum()

# Map the numeric day code (0 = Monday ... 6 = Sunday) to a short label.
dic = {0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
weekdaysum['DW'] = [dic[day_code] for day_code in weekdaysum.dayofweek2]
weekdaysum
# Pie chart of total daily visits, one slice per day of the week.
# Each slice is annotated with its share as a percentage (two decimals).
plt.pie(
    weekdaysum['dailyvisit'],
    labels=weekdaysum['DW'],
    autopct='%1.2f%%',
)
# Anchor the legend's upper-right corner outside the axes so it does not
# cover the pie.
plt.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
plt.title('Total Daily Visits')
# Average daily visits for each day of the week.
# Select `dailyvisit` *before* averaging: the date columns are still raw CSV
# text at this point, and on pandas >= 2.0 GroupBy.mean() over a frame that
# contains non-numeric columns raises TypeError instead of dropping them.
# `as_index=False` keeps `dayofweek2` as a regular column.
weekdaymean = wyday.groupby('dayofweek2', as_index=False)['dailyvisit'].mean()

# Map the numeric day code (0 = Monday ... 6 = Sunday) to a short label.
dic = {0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
weekdaymean['DW'] = [dic[day_code] for day_code in weekdaymean.dayofweek2]
weekdaymean
# Bar chart of the average daily visits per day of the week.
# Start a new figure so the bars are not drawn on top of whatever axes are
# current (e.g. the preceding pie chart).
plt.figure()
plt.bar(x=weekdaymean.DW, height=weekdaymean.dailyvisit)
plt.xlabel("Day of Week")
plt.ylabel("Average Daily Visits")
plt.title("Average Daily Visits by Day")

# Print each bar's value just above it.  Categorical bars sit at x = 0, 1, 2,
# ..., so the running position doubles as the x coordinate; the small offsets
# roughly centre the text over the bar and lift it off the bar top.
for position, mean_visits in enumerate(weekdaymean.dailyvisit):
    plt.text(position - 0.2, mean_visits + 0.05, f'{mean_visits:2.1f}')
plt.show()
# Parse the period boundaries so they can be compared against real dates.
wyday['date_range_end'] = pd.to_datetime(wyday.date_range_end)
wyday['date_range_start'] = pd.to_datetime(wyday.date_range_start)

# Split the records around 2020-03-13 (labelled below as before/after COVID).
# Records whose period straddles the cutoff (starts on/before it and ends
# after it) fall into neither subset.
covid_cutoff = pd.to_datetime('2020-3-13')
wyday_before = wyday[wyday.date_range_end <= covid_cutoff]
wyday_after = wyday[wyday.date_range_start > covid_cutoff]

# Numeric day code (0 = Monday ... 6 = Sunday) -> short label.
dic = {0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}


def _weekday_mean(frame, covid_label):
    """Return the mean `dailyvisit` per `dayofweek2` for *frame*.

    The result has the columns `dayofweek2`, `dailyvisit`, `DW` (short day
    name) and `COVID` (set to *covid_label* on every row).

    Only `dailyvisit` is averaged: on pandas >= 2.0, GroupBy.mean() over a
    frame that still contains non-numeric columns raises TypeError.
    """
    means = frame.groupby('dayofweek2', as_index=False)['dailyvisit'].mean()
    means['DW'] = [dic[day_code] for day_code in means.dayofweek2]
    means['COVID'] = covid_label
    return means


weekdaymean_before = _weekday_mean(wyday_before, 'Before')
weekdaymean_after = _weekday_mean(wyday_after, 'After')

# Long-format table (one row per day of week and period) for grouped plotting.
weekdaymean_COVID = pd.concat([weekdaymean_before, weekdaymean_after])
# Grouped bar chart (seaborn): average daily visits per day of the week,
# with one bar per value of the COVID column.
import seaborn as sns
myplot = sns.catplot(
    data=weekdaymean_COVID,
    kind='bar',
    x='DW',
    y='dailyvisit',
    hue='COVID',
)
plt.title("Average Daily Visits by Day")
plt.xlabel("Day of Week")
plt.ylabel("Average Daily Visits")
# Mean daily visits for each period start date.
# Only `dailyvisit` is averaged, so any non-numeric column in the frame cannot
# make GroupBy.mean() fail (pandas >= 2.0 no longer drops such columns
# silently) and no work is spent on columns that are discarded anyway.
# NOTE(review): the plots label these periods as months - presumably
# `date_range_start` is the first day of a month; confirm against the data.
avgmonth = wyday.groupby('date_range_start', as_index=False)['dailyvisit'].mean()
avgmonth
# Line chart of the average daily visits per month (matplotlib)
plt.rcParams["figure.figsize"] = 10, 5
# Start a new figure (picking up the size set above) instead of drawing on
# whatever axes are current, e.g. those of the preceding seaborn chart.
plt.figure()
plt.plot(avgmonth.date_range_start, avgmonth.dailyvisit, label='Average Daily Visit by Month')

# Vertical markers spanning y = 4..10: black at the start of each year,
# red at 2020-03-01.
# NOTE(review): the red line presumably marks the start of COVID - confirm.
ystart = [pd.to_datetime(day) for day in ('2019-1-1', '2020-1-1', '2021-1-1')]
plt.vlines(ystart, 4, 10, color='k')
plt.vlines(pd.to_datetime('2020-3-1'), 4, 10, color='r')

plt.xlabel('Year-Month')
plt.ylabel('Average Daily Visits')
plt.title('Average Daily Visits by Month')
plt.legend()
plt.show()
# Median and quartiles of daily visits for each period start date.
# The grouping is built once and reused for all three quantiles; each result
# gets `date_range_start` back as a regular column.
_visits_by_start = wyday[['date_range_start', 'dailyvisit']].groupby('date_range_start')
medmonth = _visits_by_start.quantile(q=0.5).reset_index()
q1month = _visits_by_start.quantile(q=0.25).reset_index()
q3month = _visits_by_start.quantile(q=0.75).reset_index()
# Plot descriptive stats (mean, median, quartiles) of daily visits by month
plt.rcParams["figure.figsize"] = 10, 5
# Start a new figure so these lines are not overlaid on the axes of the
# previous chart.
plt.figure()
plt.plot(avgmonth.date_range_start, avgmonth.dailyvisit, label='Mean')
plt.plot(medmonth.date_range_start, medmonth.dailyvisit, label='Median')
plt.plot(q1month.date_range_start, q1month.dailyvisit, label='25th Percentile')
plt.plot(q3month.date_range_start, q3month.dailyvisit, label='75th Percentile')
# Shade the interquartile range (between the 25th and 75th percentiles).
plt.fill_between(avgmonth.date_range_start, q1month.dailyvisit, q3month.dailyvisit, color='gray')

# Vertical markers spanning y = 0..15: black at the start of each year,
# red at 2020-03-01.
# NOTE(review): the red line presumably marks the start of COVID - confirm.
ystart = [pd.to_datetime(day) for day in ('2019-1-1', '2020-1-1', '2021-1-1')]
plt.vlines(ystart, 0, 15, color='k')
plt.vlines(pd.to_datetime('2020-3-1'), 0, 15, color='r')

plt.xlabel('Year-Month')
plt.ylabel('Average Daily Visits')
plt.title('Average Daily Visits by Month')
plt.legend()
plt.show()
# Derive calendar year and quarter from the period start date, plus a
# combined "<year>-<quarter>" label (e.g. '2020-1') used to group by quarter.
wyday['year'] = wyday.date_range_start.dt.year
wyday['quarter'] = wyday.date_range_start.dt.quarter
# Vectorised string concatenation; yields the same labels as formatting each
# row individually, without a Python-level loop over .iloc.
wyday['year_quarter'] = wyday['year'].astype(str) + '-' + wyday['quarter'].astype(str)
# Box plot (seaborn) of daily visits for each quarter.
# sns.boxplot is an axes-level function that draws on the current axes, so
# open a new figure first to avoid painting over the previous chart.
plt.figure()
# `order` fixes the left-to-right sequence of the quarter labels
# chronologically (2019 Q1 through 2021 Q4).
ax = sns.boxplot(
    x="year_quarter",
    y="dailyvisit",
    data=wyday,
    order=[
        '2019-1', '2019-2', '2019-3', '2019-4',
        '2020-1', '2020-2', '2020-3', '2020-4',
        '2021-1', '2021-2', '2021-3', '2021-4',
    ],
)
plt.title('Daily Visits by Quarter')
plt.ylabel('Daily Visits')
plt.show()