# Daily visit data in all 50 US states from 2019 - 2021
import pandas as pd
fname1 = '1H2019patterns.csv'
fname2 = '1H2020patterns.csv'
fname3 = '1H2021patterns.csv'
fname4 = '2H2019patterns.csv'
fname5 = '2H2020patterns.csv'
fname6 = '2H2021patterns.csv'
#read in data to create a wide sample
dt1 = pd.read_csv(fname1)
dt2 = pd.read_csv(fname2)
dt3 = pd.read_csv(fname3)
dt4 = pd.read_csv(fname4)
dt5 = pd.read_csv(fname5)
dt6 = pd.read_csv(fname6)
total_obs = dt1.shape[0]+dt2.shape[0]+dt3.shape[0]+dt4.shape[0]+dt5.shape[0]+dt6.shape[0]
print(f'The total number of obersvations is: {total_obs}')
# Identify target state for analysis: Wyoming
mystate = 'WY'
obs_wy1 = dt1[dt1.region == mystate]
obs_wy2 = dt2[dt2.region == mystate]
obs_wy3 = dt3[dt3.region == mystate]
obs_wy4 = dt4[dt4.region == mystate]
obs_wy5 = dt5[dt5.region == mystate]
obs_wy6 = dt6[dt6.region == mystate]
obs_wy = pd.concat([obs_wy1, obs_wy2])
obs_wy = pd.concat([obs_wy, obs_wy3])
obs_wy = pd.concat([obs_wy, obs_wy4])
obs_wy = pd.concat([obs_wy, obs_wy5])
obs_wy = pd.concat([obs_wy, obs_wy6])
obs_shape = obs_wy.shape
print(f'The total number of obersvations is: {obs_shape}')
obs_wy['date_range_start'] = pd.to_datetime(obs_wy['date_range_start'], utc=True).dt.date
obs_wy['date_range_end'] = pd.to_datetime(obs_wy['date_range_end'], utc=True).dt.date
bdate = obs_wy.date_range_start.iloc[0]
edate = obs_wy.date_range_end.iloc[-1]
print(f'Beginning date: {bdate}')
print(f'Ending date: {edate}')
#construct 31 columns
cnames = ['dailyvisit'+str(i) for i in range(1,32)]
vday = pd.DataFrame(columns=cnames)
rowidx = 0
for v in obs_wy.visits_by_day:
vlist = eval(v)
patch = [-1 for i in range(31-len(vlist))]
vlist.extend(patch)
vday.loc[rowidx]=vlist
rowidx = rowidx+1
if (rowidx % 10000) == 0:
print(rowidx)
obs_wyj = obs_wy.join(vday)
obs_wyj.shape
obs_wyj.reset_index(inplace=True)
obs_wy_long = pd.wide_to_long(obs_wyj, ['dailyvisit'], i=['index'], j='day')
#get rid of rows with fake visit value -1
obs_wy_long2 = obs_wy_long[obs_wy_long.dailyvisit > -1]
print(obs_wy_long2.shape) #create long sample
# basic descriptive statistics of daily visits in WY in long sample
meanvalue = obs_wy_long2.dailyvisit.mean()
stdvalue = obs_wy_long2.dailyvisit.std()
minvalue = obs_wy_long2.dailyvisit.min()
maxvalue = obs_wy_long2.dailyvisit.max()
print(f'Mean: {meanvalue}')
print(f'Standard Deviation: {stdvalue}')
print(f'Minimum value: {minvalue}')
print(f'Maximum value: {maxvalue}')
#define outliers as data points outside mean +/- 2*std
upperbound = meanvalue+stdvalue*2
lowerbound = meanvalue-stdvalue*2
upout = sum(obs_wy_long2.dailyvisit>upperbound)
lowout = sum(obs_wy_long2.dailyvisit<lowerbound)
print(f'Number of outliers above upper bound: {upout}')
print(f'Number of outliers above lower bound: {lowout}')
#All of the outliers are above the upper bound
obs_wy_long2 = obs_wy_long2.drop(columns = ['raw_visitor_counts','poi_cbg', 'visits_by_day', 'Unnamed: 0','raw_visit_counts'])
wyday = obs_wy_long2.reset_index()
wyday['dailyvisit'] = wyday['dailyvisit'].astype(int) #force dailyvisit column to be numeric to calculate the mean
wyct = wyday.groupby(by=['date_range_start', 'date_range_end']).count().drop(columns=['index'])
wymean = wyday.groupby(by=['date_range_start', 'date_range_end']).mean().drop(columns=['index'])
wyct = wyct.join(wymean, lsuffix='ct',rsuffix='mean').drop(columns=['city', 'dailyvisitct', 'daymean'])
wyct = wyct.rename(columns={'dayct':'num_observations'})
print(wyct)
#define new variable ' dayofweek'
wyday['dayofweek'] = pd.to_datetime(wyday.date_range_start).dt.weekday
#need to adjust the day of week
#the exact date for each daily visit is date_range_start + day -1
#the results are in dayofweek2
wyday['dayofweek2']=(wyday.dayofweek+wyday.day-1)%7
wyct = wyday.groupby(by=['dayofweek2']).count().drop(columns=['index'])
wymean = wyday.groupby(by=['dayofweek2']).mean().drop(columns=['index'])
wyct = wyct.join(wymean, lsuffix='ct',rsuffix='mean')
weekday_sta = wyct.drop(columns = ['city', 'date_range_start', 'date_range_end', 'dailyvisitct', 'daymean'])
weekday_sta = weekday_sta.rename(columns = {'dayct':'num_observations'})
print(weekday_sta)
wyday['positivevisits'] = 1
wyday.loc[wyday.dailyvisit==0, 'positivevisits'] = 0
wyct = wyday.groupby(by=['dayofweek2', 'positivevisits']).count()
print(wyct['index'])
wyday['weekend'] = 1
wyday.loc[wyday.dayofweek2<5, 'weekend'] = 0
wyday.to_csv('wyday.csv')
wyct = wyday.groupby(by=['weekend']).count().drop(columns=['index'])
wymean = wyday.groupby(by=['weekend']).mean().drop(columns=['index'])
wyct = wyct.join(wymean, lsuffix='ct',rsuffix='mean')
weekday_sta = wyct[['dayct', 'dailyvisitmean']]
weekday_sta = weekday_sta.rename(columns={'dayct':'num_observations'})
print(weekday_sta)