import pandas as pd

# obs_wylong.csv: long-format daily visit observations for Wyoming places (one row per placekey and date)
wyday = pd.read_csv('obs_wylong.csv')
# statecoreplaces.csv: core places (points of interest) with NAICS codes and location attributes
statecoreplaces = pd.read_csv('statecoreplaces.csv')

# keep only places located in Wyoming; copy so that dropping columns below does not
# trigger a chained-assignment warning
coreplacesWY = statecoreplaces[statecoreplaces.region == 'WY'].copy()
print(coreplacesWY.shape)
# drop columns that are not needed for the analysis
coreplacesWY.drop(columns=['brands', 'latitude', 'longitude', 'street_address', 'iso_country_code',
                           'phone_number', 'open_hours', 'category_tags', 'opened_on', 'closed_on',
                           'tracking_closed_since', 'geometry_type'], inplace=True)

# number of places by NAICS code
coreplacesWY.groupby('naics_code').size()
import numpy as np

# coreWY: keep only full-service restaurants (NAICS 722511), limited-service restaurants (722513),
# and grocery stores (445110)
coreWY = coreplacesWY.loc[coreplacesWY['naics_code'].isin([722511, 722513, 445110])].copy()
# bizgroup = 1 for full-/limited-service restaurants, 0 for grocery stores
coreWY['bizgroup'] = np.where(coreWY['naics_code'].isin([722511, 722513]), 1, 0)
coreWY.groupby('bizgroup').size()
# count places by city and NAICS code, then keep the ten cities with the most core places
table = coreplacesWY.groupby(['city', 'naics_code']).size()
top10 = table.groupby('city').sum().nlargest(10)
# restrict the restaurant/grocer sample to those ten cities
top10biz = coreWY[coreWY.city.isin(top10.index)]
# places per city, split by business group (columns: 0 = grocers, 1 = restaurants)
top10biz.groupby(['city', 'bizgroup']).size().reset_index(name='count').pivot(index='city', columns='bizgroup', values='count')
# outer merge keeps unmatched rows on both sides (for diagnostics); inner merge keeps only matched rows
longcoreout = pd.merge(wyday, coreWY, on='placekey', how='outer', suffixes=('', '_y'))
longcorein = pd.merge(wyday, coreWY, on='placekey', how='inner', suffixes=('', '_y'))
longcorein.to_csv('longcorein.csv', index=False)
print(len(longcorein))

# restaurants/grocers in coreWY with no daily-visit observations
print((longcoreout.city.isna() & longcoreout.city_y.notna()).sum())
# daily-visit observations for places not in coreWY
print((longcoreout.city.notna() & longcoreout.city_y.isna()).sum())
# NAICS composition of each business group within the top-ten cities
longcorein[longcorein.city.isin(top10.index)].groupby(['bizgroup', 'naics_code']).size()
# analysis sample: matched observations in the top-ten cities; copy so that the column
# assignments below do not trigger chained-assignment warnings
longcoreintop10 = longcorein[longcorein.city.isin(top10.index)].copy()
# average daily visits by business group
longcoreintop10.groupby('bizgroup').agg({'dailyvisits': 'mean'})
# statistical test of the difference in average daily visits between the two business groups
from scipy.stats import ttest_ind

a = longcoreintop10.loc[longcoreintop10['bizgroup'] == 1, 'dailyvisits']
b = longcoreintop10.loc[longcoreintop10['bizgroup'] == 0, 'dailyvisits']
# Welch's t-test (unequal variances), two-sided alternative
t, p = ttest_ind(a, b, equal_var=False, alternative='two-sided')
print('The t-test results are: t-statistic = {:.2f}, p-value = {:.4f}'.format(t, p))
import statsmodels.formula.api as smf

longcoreintop10['date'] = pd.to_datetime(longcoreintop10['date'])
# post0313 = 1 if the date is on or after March 13, 2020
longcoreintop10['post0313'] = np.where(longcoreintop10['date'] >= pd.to_datetime('2020-03-13'), 1, 0)
# difference-in-differences interaction: restaurant x post-March 13
longcoreintop10['bizgroup_post'] = longcoreintop10['bizgroup'] * longcoreintop10['post0313']
# weekend = 1 for Saturday and Sunday
longcoreintop10['dayofweek'] = longcoreintop10['date'].dt.dayofweek
longcoreintop10['weekend'] = np.where(longcoreintop10['dayofweek'] >= 5, 1, 0)

# OLS: regress ln(dailyvisits + 1) on bizgroup, post0313, their interaction, and weekend
results = smf.ols('np.log(dailyvisits + 1) ~ bizgroup + post0313 + bizgroup_post + weekend',
                  data=longcoreintop10).fit()
print(results.summary())
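# A minimal interpretation sketch, not part of the original output: because the outcome is
# ln(dailyvisits + 1), the interaction coefficient can be read as an approximate percentage
# change in visits for restaurants relative to grocers after March 13, 2020.
did_coef = results.params['bizgroup_post']
print('Approximate DiD effect on daily visits: {:.1f}%'.format((np.exp(did_coef) - 1) * 100))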
!pip install linearmodels
from linearmodels import PanelOLS

# PanelOLS needs a unique (entity, time) MultiIndex, so index by placekey and date
# (indexing by city would create duplicate entity-time pairs, one per place)
finaldata = longcoreintop10.copy()
finaldata = finaldata.set_index(['placekey', 'date'])

# panel regression of ln(dailyvisits + 1) with place (entity) fixed effects; bizgroup is
# constant within a place, so it is absorbed and dropped
mod = PanelOLS(np.log(finaldata['dailyvisits'] + 1),
               finaldata[['bizgroup', 'post0313', 'bizgroup_post', 'weekend']],
               entity_effects=True, time_effects=False, drop_absorbed=True)
# standard errors clustered by place
results = mod.fit(cov_type='clustered', cluster_entity=True)
print(results)
# date (time) fixed effects instead: post0313 and weekend vary only with the date, so they are absorbed
mod2 = PanelOLS(np.log(finaldata['dailyvisits'] + 1),
                finaldata[['bizgroup', 'post0313', 'bizgroup_post', 'weekend']],
                entity_effects=False, time_effects=True, drop_absorbed=True)
results2 = mod2.fit(cov_type='clustered', cluster_entity=True)
print(results2)

# two-way fixed effects (place and date); only the interaction bizgroup_post survives absorption
mod3 = PanelOLS(np.log(finaldata['dailyvisits'] + 1),
                finaldata[['bizgroup', 'post0313', 'bizgroup_post', 'weekend']],
                entity_effects=True, time_effects=True, drop_absorbed=True)
results3 = mod3.fit(cov_type='clustered', cluster_entity=True)
print(results3)
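# A possible follow-up, not in the original: linearmodels ships a compare() helper that prints the
# three specifications side by side; the model labels below are arbitrary names chosen here.
from linearmodels.panel import compare
print(compare({'Entity FE': results, 'Time FE': results2, 'Two-way FE': results3}))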