import pandas as pd
wyday= pd.read_csv('obs_wylong.csv')
statecoreplaces= pd.read_csv('statecoreplaces.csv')
coreplacesWY = statecoreplaces[statecoreplaces.region == 'WY']
print(coreplacesWY.shape)
(1747, 22)
coreplacesWY.drop(columns = ['brands', 'latitude', 'longitude', 'street_address', 'iso_country_code', 'phone_number', 'open_hours', 'category_tags',
'opened_on', 'closed_on', 'tracking_closed_since', 'geometry_type'], inplace=True)
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py:4906: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
return super().drop(
coreplacesWY.groupby('naics_code').size()
import numpy as np
# variable 'bizgroup' = business group is in full/limited service restaurant
# coreWY = only contains naics code for full/limited service restaurants and grocers
coreWY = coreplacesWY.loc[coreplacesWY['naics_code'].isin([722511, 722513, 445110])].copy()
coreWY['bizgroup'] = np.where(coreWY['naics_code'].isin([722511, 722513]),
1, 0)
coreWY.groupby('bizgroup').size()
table = coreplacesWY.groupby(['city', 'naics_code']).size()
top10 = table.groupby('city').size().nlargest(10)
top10biz = coreWY[coreWY.city.isin(top10.index)]
top10biz.groupby(['city', 'bizgroup']).size().reset_index().pivot(index='city', columns='bizgroup', values=0)
longcoreout = pd.merge(wyday, coreWY, on = 'placekey', how = 'outer', suffixes=('', '_y'))
longcorein = pd.merge(wyday, coreWY, on = 'placekey', how = 'inner', suffixes=('', '_y'))
longcorein.to_csv('longcorein.csv', index = False)
print(len(longcorein))
1188623
sum(longcoreout.city.isna() & ~longcoreout.city_y.isna())
sum(~longcoreout.city.isna() & longcoreout.city_y.isna())
longcorein[longcorein.city.isin(top10.index)].groupby(['bizgroup', 'naics_code']).size()
longcoreintop10 = longcorein[longcorein.city.isin(top10.index)]
longcoreintop10.groupby('bizgroup').agg({'dailyvisits': 'mean'})
# statistical tests on the differences in average daily visits
import scipy
from scipy.stats import ttest_ind
a = longcoreintop10.loc[longcoreintop10['bizgroup'] == 1]['dailyvisits']
b = longcoreintop10.loc[longcoreintop10['bizgroup'] == 0]['dailyvisits']
t, p = scipy.stats.ttest_ind(a, b,
equal_var=False,
alternative = 'two-sided')
print('''The t-test results are: t-statistics = {:.2f},
p-value = {:.2f}'''.format(t,p))
The t-test results are: t-statistics = -31.59,
p-value = 0.00
import statsmodels.formula.api as smf
longcoreintop10['date'] = pd.to_datetime(longcoreintop10['date'])
# dummy “Post0313” equal to one if date is at or after March 13, 2020
longcoreintop10['post0313'] = np.where(longcoreintop10['date'] > pd.to_datetime('2020-03-13'), 1, 0)
longcoreintop10['bizgroup_post'] = longcoreintop10['bizgroup'] * longcoreintop10['post0313']
longcoreintop10['dayofweek'] = longcoreintop10['date'].dt.dayofweek
longcoreintop10['weekend'] = np.where(longcoreintop10['dayofweek'] >= 5, 1, 0)
/var/folders/j2/8gptfrvd0_dfl_yxfcck_hh40000gn/T/ipykernel_55004/1259782823.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
longcoreintop10['date'] = pd.to_datetime(longcoreintop10['date'])
/var/folders/j2/8gptfrvd0_dfl_yxfcck_hh40000gn/T/ipykernel_55004/1259782823.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
longcoreintop10['post0313'] = np.where(longcoreintop10['date'] > pd.to_datetime('2020-03-13'), 1, 0)
/var/folders/j2/8gptfrvd0_dfl_yxfcck_hh40000gn/T/ipykernel_55004/1259782823.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
longcoreintop10['bizgroup_post'] = longcoreintop10['bizgroup'] * longcoreintop10['post0313']
/var/folders/j2/8gptfrvd0_dfl_yxfcck_hh40000gn/T/ipykernel_55004/1259782823.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
longcoreintop10['dayofweek'] = longcoreintop10['date'].dt.dayofweek
/var/folders/j2/8gptfrvd0_dfl_yxfcck_hh40000gn/T/ipykernel_55004/1259782823.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
longcoreintop10['weekend'] = np.where(longcoreintop10['dayofweek'] >= 5, 1, 0)
# regress ln(dailyvisits+1) on bizgroup, Post0313, bizgroup_post, and weekend
results = smf.ols('np.log(dailyvisits+1) ~ bizgroup + post0313 + bizgroup_post + weekend', data=longcoreintop10).fit()
print(results.summary())
OLS Regression Results
===================================================================================
Dep. Variable: np.log(dailyvisits + 1) R-squared: 0.004
Model: OLS Adj. R-squared: 0.004
Method: Least Squares F-statistic: 458.6
Date: Tue, 07 Jun 2022 Prob (F-statistic): 0.00
Time: 17:15:32 Log-Likelihood: -7.4506e+05
No. Observations: 496382 AIC: 1.490e+06
Df Residuals: 496377 BIC: 1.490e+06
Df Model: 4
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 1.6370 0.009 185.483 0.000 1.620 1.654
bizgroup 0.1548 0.009 16.949 0.000 0.137 0.173
post0313 0.0380 0.011 3.391 0.001 0.016 0.060
bizgroup_post -0.0415 0.012 -3.555 0.000 -0.064 -0.019
weekend -0.1233 0.003 -36.106 0.000 -0.130 -0.117
==============================================================================
Omnibus: 30491.970 Durbin-Watson: 0.634
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11076.757
Skew: 0.014 Prob(JB): 0.00
Kurtosis: 2.269 Cond. No. 20.9
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
!pip install linearmodels
from linearmodels import PanelOLS
Requirement already satisfied: linearmodels in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (4.25)
Requirement already satisfied: patsy in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (0.5.2)
Requirement already satisfied: scipy>=1.2 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (1.7.1)
Requirement already satisfied: numpy>=1.16 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (1.20.3)
Requirement already satisfied: property-cached>=1.6.3 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (1.6.4)
Requirement already satisfied: mypy-extensions>=0.4 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (0.4.3)
Requirement already satisfied: pandas>=0.24 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (1.3.4)
Requirement already satisfied: statsmodels>=0.11 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (0.12.2)
Requirement already satisfied: Cython>=0.29.21 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (0.29.24)
Requirement already satisfied: pyhdfe>=0.1 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (0.1.0)
Requirement already satisfied: formulaic in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from linearmodels) (0.2.4)
Requirement already satisfied: python-dateutil>=2.7.3 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.24->linearmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.24->linearmodels) (2021.3)
Requirement already satisfied: six>=1.5 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.24->linearmodels) (1.16.0)
Requirement already satisfied: astor in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from formulaic->linearmodels) (0.8.1)
Requirement already satisfied: wrapt in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from formulaic->linearmodels) (1.12.1)
Requirement already satisfied: interface-meta>=1.2 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from formulaic->linearmodels) (1.2.4)
finaldata = longcoreintop10.copy()
finaldata = finaldata.set_index(['city', 'date'])
# panelOLS on daily visits
mod = PanelOLS(np.log(finaldata['dailyvisits']+1), finaldata[['bizgroup', 'post0313', 'bizgroup_post', 'weekend']], entity_effects=True, time_effects=False, drop_absorbed=True)
results = mod.fit(cov_type='clustered')
print(results)
PanelOLS Estimation Summary
================================================================================
Dep. Variable: dailyvisits R-squared: 0.0031
Estimator: PanelOLS R-squared (Between): 0.0648
No. Observations: 496382 R-squared (Within): 0.0031
Date: Tue, Jun 07 2022 R-squared (Overall): 0.0449
Time: 17:17:33 Log-likelihood -7.241e+05
Cov. Estimator: Clustered
F-statistic: 388.06
Entities: 10 P-value 0.0000
Avg Obs: 4.964e+04 Distribution: F(4,496368)
Min Obs: 6546.0
Max Obs: 1.407e+05 F-statistic (robust): 348.91
P-value 0.0000
Time periods: 1096 Distribution: F(4,496368)
Avg Obs: 452.90
Min Obs: 445.00
Max Obs: 469.00
Parameter Estimates
=================================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
---------------------------------------------------------------------------------
bizgroup 0.0961 0.0112 8.5839 0.0000 0.0742 0.1180
post0313 0.0634 0.0140 4.5208 0.0000 0.0359 0.0909
bizgroup_post -0.0677 0.0143 -4.7182 0.0000 -0.0958 -0.0396
weekend -0.1233 0.0034 -36.246 0.0000 -0.1299 -0.1166
=================================================================================
F-test for Poolability: 4860.4
P-value: 0.0000
Distribution: F(9,496368)
Included effects: Entity
mod2 = PanelOLS(np.log(finaldata['dailyvisits']+1), finaldata[['bizgroup', 'post0313', 'bizgroup_post', 'weekend']], entity_effects=False, time_effects=True, drop_absorbed=True)
results2 = mod2.fit(cov_type='clustered')
print(results2)
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/linearmodels/panel/model.py:1831: AbsorbingEffectWarning:
Variables have been fully absorbed and have removed from the regression:
post0313, weekend
warnings.warn(
PanelOLS Estimation Summary
================================================================================
Dep. Variable: dailyvisits R-squared: 0.0011
Estimator: PanelOLS R-squared (Between): 0.1400
No. Observations: 496382 R-squared (Within): -0.0004
Date: Tue, Jun 07 2022 R-squared (Overall): 0.0956
Time: 17:17:42 Log-likelihood -7.345e+05
Cov. Estimator: Clustered
F-statistic: 277.02
Entities: 10 P-value 0.0000
Avg Obs: 4.964e+04 Distribution: F(2,495284)
Min Obs: 6546.0
Max Obs: 1.407e+05 F-statistic (robust): 171.40
P-value 0.0000
Time periods: 1096 Distribution: F(2,495284)
Avg Obs: 452.90
Min Obs: 445.00
Max Obs: 469.00
Parameter Estimates
=================================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
---------------------------------------------------------------------------------
bizgroup 0.1549 0.0114 13.605 0.0000 0.1326 0.1772
bizgroup_post -0.0413 0.0145 -2.8375 0.0045 -0.0698 -0.0128
=================================================================================
F-test for Poolability: 20.963
P-value: 0.0000
Distribution: F(1095,495284)
Included effects: Time
mod3 = PanelOLS(np.log(finaldata['dailyvisits']+1), finaldata[['bizgroup', 'post0313', 'bizgroup_post', 'weekend']], entity_effects=True, time_effects=True, drop_absorbed=True)
results3 = mod3.fit(cov_type='clustered')
print(results3)
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/linearmodels/panel/model.py:1831: AbsorbingEffectWarning:
Variables have been fully absorbed and have removed from the regression:
post0313, weekend
warnings.warn(
PanelOLS Estimation Summary
================================================================================
Dep. Variable: dailyvisits R-squared: 0.0003
Estimator: PanelOLS R-squared (Between): 0.0609
No. Observations: 496382 R-squared (Within): -0.0006
Date: Tue, Jun 07 2022 R-squared (Overall): 0.0414
Time: 17:18:13 Log-likelihood -7.125e+05
Cov. Estimator: Clustered
F-statistic: 70.133
Entities: 10 P-value 0.0000
Avg Obs: 4.964e+04 Distribution: F(2,495275)
Min Obs: 6546.0
Max Obs: 1.407e+05 F-statistic (robust): 41.540
P-value 0.0000
Time periods: 1096 Distribution: F(2,495275)
Avg Obs: 452.90
Min Obs: 445.00
Max Obs: 469.00
Parameter Estimates
=================================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
---------------------------------------------------------------------------------
bizgroup 0.0960 0.0112 8.5703 0.0000 0.0740 0.1179
bizgroup_post -0.0675 0.0144 -4.6956 0.0000 -0.0956 -0.0393
=================================================================================
F-test for Poolability: 64.318
P-value: 0.0000
Distribution: F(1104,495275)
Included effects: Entity, Time