# Daily visit data in all 50 US states from 2019 - 2021
import pandas as pd
fname1 = '1H2019patterns.csv'
fname2 = '1H2020patterns.csv'
fname3 = '1H2021patterns.csv'
fname4 = '2H2019patterns.csv'
fname5 = '2H2020patterns.csv'
fname6 = '2H2021patterns.csv'
#read in data to create a wide sample
dt1 = pd.read_csv(fname1)
dt2 = pd.read_csv(fname2)
dt3 = pd.read_csv(fname3)
dt4 = pd.read_csv(fname4)
dt5 = pd.read_csv(fname5)
dt6 = pd.read_csv(fname6)
total_obs = dt1.shape[0]+dt2.shape[0]+dt3.shape[0]+dt4.shape[0]+dt5.shape[0]+dt6.shape[0]
print(f'The total number of obersvations is: {total_obs}')
The total number of obersvations is: 25331975
# Identify target state for analysis: Wyoming
mystate = 'WY'
obs_wy1 = dt1[dt1.region == mystate]
obs_wy2 = dt2[dt2.region == mystate]
obs_wy3 = dt3[dt3.region == mystate]
obs_wy4 = dt4[dt4.region == mystate]
obs_wy5 = dt5[dt5.region == mystate]
obs_wy6 = dt6[dt6.region == mystate]
obs_wy = pd.concat([obs_wy1, obs_wy2])
obs_wy = pd.concat([obs_wy, obs_wy3])
obs_wy = pd.concat([obs_wy, obs_wy4])
obs_wy = pd.concat([obs_wy, obs_wy5])
obs_wy = pd.concat([obs_wy, obs_wy6])
obs_shape = obs_wy.shape
print(f'The total number of obersvations is: {obs_shape}')
The total number of obersvations is: (55172, 9)
obs_wy['date_range_start'] = pd.to_datetime(obs_wy['date_range_start'], utc=True).dt.date
obs_wy['date_range_end'] = pd.to_datetime(obs_wy['date_range_end'], utc=True).dt.date
bdate = obs_wy.date_range_start.iloc[0]
edate = obs_wy.date_range_end.iloc[-1]
print(f'Beginning date: {bdate}')
print(f'Ending date: {edate}')
Beginning date: 2019-02-01
Ending date: 2021-11-01
#construct 31 columns
cnames = ['dailyvisit'+str(i) for i in range(1,32)]
vday = pd.DataFrame(columns=cnames)
rowidx = 0
for v in obs_wy.visits_by_day:
vlist = eval(v)
patch = [-1 for i in range(31-len(vlist))]
vlist.extend(patch)
vday.loc[rowidx]=vlist
rowidx = rowidx+1
if (rowidx % 10000) == 0:
print(rowidx)
10000
20000
30000
40000
50000
obs_wyj = obs_wy.join(vday)
obs_wyj.shape
obs_wyj.reset_index(inplace=True)
obs_wy_long = pd.wide_to_long(obs_wyj, ['dailyvisit'], i=['index'], j='day')
#get rid of rows with fake visit value -1
obs_wy_long2 = obs_wy_long[obs_wy_long.dailyvisit > -1]
print(obs_wy_long2.shape) #create long sample
(1679836, 11)
# basic descriptive statistics of daily visits in WY in long sample
meanvalue = obs_wy_long2.dailyvisit.mean()
stdvalue = obs_wy_long2.dailyvisit.std()
minvalue = obs_wy_long2.dailyvisit.min()
maxvalue = obs_wy_long2.dailyvisit.max()
print(f'Mean: {meanvalue}')
print(f'Standard Deviation: {stdvalue}')
print(f'Minimum value: {minvalue}')
print(f'Maximum value: {maxvalue}')
Mean: 6.749877368981258
Standard Deviation: 9.792662434796089
Minimum value: 0
Maximum value: 366
#define outliers as data points outside mean +/- 2*std
upperbound = meanvalue+stdvalue*2
lowerbound = meanvalue-stdvalue*2
upout = sum(obs_wy_long2.dailyvisit>upperbound)
lowout = sum(obs_wy_long2.dailyvisit<lowerbound)
print(f'Number of outliers above upper bound: {upout}')
print(f'Number of outliers above lower bound: {lowout}')
#All of the outliers are above the upper bound
Number of outliers above upper bound: 70492
Number of outliers above lower bound: 0
obs_wy_long2 = obs_wy_long2.drop(columns = ['raw_visitor_counts','poi_cbg', 'visits_by_day', 'Unnamed: 0','raw_visit_counts'])
wyday = obs_wy_long2.reset_index()
wyday['dailyvisit'] = wyday['dailyvisit'].astype(int) #force dailyvisit column to be numeric to calculate the mean
wyct = wyday.groupby(by=['date_range_start', 'date_range_end']).count().drop(columns=['index'])
wymean = wyday.groupby(by=['date_range_start', 'date_range_end']).mean().drop(columns=['index'])
wyct = wyct.join(wymean, lsuffix='ct',rsuffix='mean').drop(columns=['city', 'dailyvisitct', 'daymean'])
wyct = wyct.rename(columns={'dayct':'num_observations'})
print(wyct)
num_observations placekey region \
date_range_start date_range_end
2019-01-01 2019-02-01 46500 46500 46500
2019-02-01 2019-03-01 42196 42196 42196
2019-03-01 2019-04-01 46562 46562 46562
2019-04-01 2019-05-01 45510 45510 45510
2019-05-01 2019-06-01 47399 47399 47399
2019-06-01 2019-07-01 46080 46080 46080
2019-07-01 2019-08-01 47616 47616 47616
2019-08-01 2019-09-01 47678 47678 47678
2019-09-01 2019-10-01 45630 45630 45630
2019-10-01 2019-11-01 47058 47058 47058
2019-11-01 2019-12-01 45000 45000 45000
2019-12-01 2020-01-01 46593 46593 46593
2020-01-01 2020-02-01 47151 47151 47151
2020-02-01 2020-03-01 43877 43877 43877
2020-03-01 2020-04-01 46779 46779 46779
2020-04-01 2020-05-01 44670 44670 44670
2020-05-01 2020-06-01 46748 46748 46748
2020-06-01 2020-07-01 45960 45960 45960
2020-07-01 2020-08-01 47957 47957 47957
2020-08-01 2020-09-01 47895 47895 47895
2020-09-01 2020-10-01 46410 46410 46410
2020-10-01 2020-11-01 47740 47740 47740
2020-11-01 2020-12-01 45450 45450 45450
2020-12-01 2021-01-01 46779 46779 46779
2021-01-01 2021-02-01 47120 47120 47120
2021-02-01 2021-03-01 42392 42392 42392
2021-03-01 2021-04-01 47399 47399 47399
2021-04-01 2021-05-01 46050 46050 46050
2021-05-01 2021-06-01 48019 48019 48019
2021-06-01 2021-07-01 46650 46650 46650
2021-07-01 2021-08-01 48732 48732 48732
2021-08-01 2021-09-01 49259 49259 49259
2021-09-01 2021-10-01 47730 47730 47730
2021-10-01 2021-11-01 49352 49352 49352
2021-11-01 2021-12-01 47070 47070 47070
2021-12-01 2022-01-01 48825 48825 48825
dailyvisitmean
date_range_start date_range_end
2019-01-01 2019-02-01 5.320516
2019-02-01 2019-03-01 5.746469
2019-03-01 2019-04-01 6.174284
2019-04-01 2019-05-01 6.050319
2019-05-01 2019-06-01 7.194287
2019-06-01 2019-07-01 8.280122
2019-07-01 2019-08-01 8.503591
2019-08-01 2019-09-01 7.702106
2019-09-01 2019-10-01 7.377274
2019-10-01 2019-11-01 6.241723
2019-11-01 2019-12-01 5.786422
2019-12-01 2020-01-01 6.335093
2020-01-01 2020-02-01 7.391084
2020-02-01 2020-03-01 6.967272
2020-03-01 2020-04-01 5.231493
2020-04-01 2020-05-01 3.712380
2020-05-01 2020-06-01 5.054462
2020-06-01 2020-07-01 7.221432
2020-07-01 2020-08-01 8.247201
2020-08-01 2020-09-01 7.760643
2020-09-01 2020-10-01 6.971170
2020-10-01 2020-11-01 6.247507
2020-11-01 2020-12-01 4.830319
2020-12-01 2021-01-01 5.388123
2021-01-01 2021-02-01 5.339516
2021-02-01 2021-03-01 5.632855
2021-03-01 2021-04-01 6.217557
2021-04-01 2021-05-01 6.226145
2021-05-01 2021-06-01 7.383161
2021-06-01 2021-07-01 9.501329
2021-07-01 2021-08-01 9.994213
2021-08-01 2021-09-01 8.429952
2021-09-01 2021-10-01 7.594888
2021-10-01 2021-11-01 6.995259
2021-11-01 2021-12-01 6.600850
2021-12-01 2022-01-01 6.585950
#define new variable ' dayofweek'
wyday['dayofweek'] = pd.to_datetime(wyday.date_range_start).dt.weekday
#need to adjust the day of week
#the exact date for each daily visit is date_range_start + day -1
#the results are in dayofweek2
wyday['dayofweek2']=(wyday.dayofweek+wyday.day-1)%7
wyct = wyday.groupby(by=['dayofweek2']).count().drop(columns=['index'])
wymean = wyday.groupby(by=['dayofweek2']).mean().drop(columns=['index'])
wyct = wyct.join(wymean, lsuffix='ct',rsuffix='mean')
weekday_sta = wyct.drop(columns = ['city', 'date_range_start', 'date_range_end', 'dailyvisitct', 'daymean'])
weekday_sta = weekday_sta.rename(columns = {'dayct':'num_observations'})
print(weekday_sta)
num_observations placekey region dayofweekct dailyvisitmean \
dayofweek2
0 239102 239102 239102 239102 6.194917
1 240601 240601 240601 240601 6.360389
2 240634 240634 240634 240634 6.599450
3 240652 240652 240652 240652 6.831907
4 240667 240667 240667 240667 7.910923
5 239103 239103 239103 239103 7.497581
6 239077 239077 239077 239077 5.849153
dayofweekmean
dayofweek2
0 2.895233
1 2.768895
2 2.730691
3 2.794699
4 2.876801
5 2.953192
6 3.030087
wyday['positivevisits'] = 1
wyday.loc[wyday.dailyvisit==0, 'positivevisits'] = 0
wyct = wyday.groupby(by=['dayofweek2', 'positivevisits']).count()
print(wyct['index'])
dayofweek2 positivevisits
0 0 50210
1 188892
1 0 46169
1 194432
2 0 45083
1 195551
3 0 44658
1 195994
4 0 40711
1 199956
5 0 47430
1 191673
6 0 64046
1 175031
Name: index, dtype: int64
wyday['weekend'] = 1
wyday.loc[wyday.dayofweek2<5, 'weekend'] = 0
wyday.to_csv('wyday.csv')
wyct = wyday.groupby(by=['weekend']).count().drop(columns=['index'])
wymean = wyday.groupby(by=['weekend']).mean().drop(columns=['index'])
wyct = wyct.join(wymean, lsuffix='ct',rsuffix='mean')
weekday_sta = wyct[['dayct', 'dailyvisitmean']]
weekday_sta = weekday_sta.rename(columns={'dayct':'num_observations'})
print(weekday_sta)
num_observations dailyvisitmean
weekend
0 1201656 6.780306
1 478180 6.673412