import pandas as pd
import numpy as np
# Load daily POI visit data; poi_cbg forced to int64 so the census block
# group code is a single integer key for the census merge below.
wyday= pd.read_csv('longcorein.csv', dtype = {'poi_cbg': 'int64'})
# preparing the data
wyday['date'] = pd.to_datetime(wyday['date'])
# NOTE(review): 'date' is datetime64 here, so pandas parses each 'YYYY-MM'
# value to the FIRST day of that month — this keeps only Jan/Apr/Jul/Oct 1st
# of 2019-2021 (12 dates total), not the whole months. The downstream row
# counts (108 = 12 dates x 9 POIs) confirm that is what happens; verify it
# is also what was intended.
wyday2 = wyday[wyday["date"].isin(['2019-01', '2020-01', '2021-01', '2019-04', '2020-04', '2021-04', '2019-07', '2020-07', '2021-07', '2019-10', '2020-10', '2021-10'])]
# Restrict to Cheyenne POIs only.
wyday_ch = wyday2[wyday2["city"].isin(['Cheyenne'])]
# Sanity check: row count per retained date.
wyday_ch.groupby(['date']).size()
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (9) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
# import covid cases by county and merge with prev data
# County-level COVID time series (cumulative cases/deaths per county-day).
countycv = pd.read_csv('us-counties-covid-20220404.csv')
# Census block group field descriptions (reference lookup; not used below).
cbgdesc = pd.read_csv('cbg_field_descriptions.csv')
# ACS census block group attributes; key forced to int64 to match poi_cbg.
cbg = pd.read_csv('cbg.csv', dtype = {'census_block_group': 'int64'})
# Drop rows with any missing value, then attach census attributes to each POI.
wyday_ch = wyday_ch.dropna().copy()
cbg = cbg.dropna().copy()
lsg = wyday_ch.merge(cbg, left_on='poi_cbg', right_on='census_block_group', how= 'inner')
lsg.shape
lsg_pk = lsg['placekey'].unique()
lsg_pk.shape
lsg.groupby('naics_code').size()
lsg.groupby(['naics_code', 'placekey']).size()
lsg['B01001e1'].mean()
# Census block group codes are 12 digits; reading them as int64 drops any
# leading zero (states with FIPS 01-09), so zero-pad to 12 before slicing out
# the 5-digit county FIPS. For Wyoming ('56...') zfill is a no-op, but this
# makes the derivation correct for all states.
lsg['poi_cbg'] = lsg['poi_cbg'].astype(str).str.zfill(12)
lsg['fips'] = lsg['poi_cbg'].str[:5]
lsg['fips'].head()
countycv['date'] = pd.to_datetime(countycv['date'])
# 'fips' is read as float64 (the column contains NaN for unknown counties), so
# a plain astype(str) yields brittle strings like '56021.0'. Route through the
# nullable Int64 dtype to get clean codes ('56021') that match lsg['fips'].
countycv['fips'] = countycv['fips'].astype('Int64').astype(str)
# Keep Laramie County, WY (Cheyenne) only; copy to avoid chained-assignment
# warnings on later use.
countycov = countycv[countycv["fips"].isin(['56021'])].copy()
countycov.shape
countycov['date'].min()
countycov['date'].max()
# Averages over the county's covered date range (cumulative counts).
casemean = countycov['cases'].mean()
deathmean = countycov['deaths'].mean()
print(f'Avg Cases: {casemean}')
print(f'Avg Deaths: {deathmean}')
Avg Cases: 9252.153538050734
Avg Deaths: 103.72363150867824
# merge data for final analysis
# Left-join county COVID counts onto each POI-day; dates with no county
# record (pre-pandemic) come back as NaN and are treated as zero below.
localmerge = lsg.merge(countycov, on='date', how='left')
localmerge = localmerge.drop(columns=['fips_x', 'fips_y', 'city_y', 'region_y'])
# How many POI-days had no matching county record?
localmerge['cases'].isna().sum()
localmerge['deaths'].isna().sum()
covid_cols = ['cases', 'deaths']
localmerge[covid_cols] = localmerge[covid_cols].fillna(0)
avgcase = localmerge['cases'].mean()
avgdeath = localmerge['deaths'].mean()
print(f'Avg Cases: {avgcase}')
print(f'Avg Deaths: {avgdeath}')
Avg Cases: 3510.3118279569894
Avg Deaths: 39.236559139784944
# Drop remaining incomplete rows, then build 0/1 policy-period indicators
# (both window endpoints inclusive).
localmerge = localmerge.dropna()
localmerge['date'] = pd.to_datetime(localmerge['date'])
# NOTE(review): 2020-05-15 falls inside both 'restclose' and 'limcap'
# (restaurant closures end the day capacity limits begin) — confirm that
# overlap is intended.
localmerge['maskmandate'] = localmerge['date'].between('2020-12-07', '2021-03-16').astype(int)
localmerge['restclose'] = localmerge['date'].between('2020-03-20', '2020-05-15').astype(int)
localmerge['limcap'] = localmerge['date'].between('2020-05-15', '2021-03-16').astype(int)
# Monday=0 ... Sunday=6; 'weekend' flags Saturday and Sunday.
localmerge['dayofweek'] = localmerge['date'].dt.dayofweek
localmerge['weekend'] = (localmerge['dayofweek'] >= 5).astype(int)
# Assemble the design matrix: policy dummies, COVID counts, NAICS industry
# code, and ACS demographic covariates; target is log(1 + daily visits).
feature_cols = ['weekend', 'maskmandate', 'dayofweek', 'limcap', 'restclose',
                'cases', 'deaths', 'naics_code', 'B01001e1', 'B19013e1',
                'B19025e1', 'B19301e1', 'B01001e2', 'B01002e1', 'B01002e2',
                'B01002e3', 'B02001e2', 'B02001e3', 'B02001e5']
X = localmerge.loc[:, feature_cols]
Y = np.log(localmerge.loc[:, 'dailyvisits'] + 1)
# Treat industry code and day indicators as categoricals so get_dummies
# one-hot encodes them.
X = X.astype({'naics_code': str, 'weekend': str, 'dayofweek': str})
X.describe(include=[np.number])
X.describe(exclude=[np.number])
X = pd.get_dummies(data = X, drop_first = True)
# use sklearn to get training set
# Hold out 25% (sklearn default) with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 1)
# Scale features to [0, 1]; fit on the training split only to avoid leakage.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape
X_test_scaled.shape
# Compare the case-count distribution across the two splits.
print(X_train['cases'].describe())
print(X_test['cases'].describe())
count 81.000000
mean 5867.802469
std 5134.381466
min 35.000000
25% 270.000000
50% 7045.000000
75% 10533.000000
max 14555.000000
Name: cases, dtype: float64
count 27.000000
mean 6578.740741
std 6117.541918
min 35.000000
25% 270.000000
50% 7045.000000
75% 12544.000000
max 14555.000000
Name: cases, dtype: float64
# Peek at the first rows within each mask-mandate group in each split.
X_train.groupby('maskmandate').head()
X_test.groupby('maskmandate').head()
# OLS regression
# Fit ordinary least squares on the (unscaled) design matrix plus intercept.
import statsmodels.api as sm

# add_constant prepends a 'const' column of ones for the intercept term.
X2_train = sm.add_constant(X_train)
# get_dummies columns may be uint8/bool, so cast everything to float for OLS.
ols_model = sm.OLS(y_train, X2_train.astype(float))
results = ols_model.fit()
results.summary()
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/tsatools.py:142: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
x = pd.concat(x[::order], 1)
# Evaluate OLS out of sample: add the intercept column, predict, report R^2.
from sklearn.metrics import r2_score

X2_test = sm.add_constant(X_test)
y_pred = results.predict(X2_test)
print(r2_score(y_test, y_pred))
0.6452068307445257
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/tsatools.py:142: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
x = pd.concat(x[::order], 1)
# lasso for linear regression
# L1-regularized fit on the scaled features; this small alpha keeps most
# features but zeroes out the weakest ones.
from sklearn.linear_model import Lasso

linlasso = Lasso(alpha=0.0001, max_iter = 10000).fit(X_train_scaled, y_train)
print('Non-zero features: {}'.format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'.format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}\n'.format(linlasso.score(X_test_scaled, y_test)))

# Pair each column name with its coefficient, largest magnitude first;
# compute the ranking once and reuse it for both report sections.
ranked_coefs = sorted(zip(X.columns, linlasso.coef_), key=lambda pair: -abs(pair[1]))
print('Features with non-zero weight (sorted by absolute magnitude):')
for name, weight in ranked_coefs:
    if weight != 0:
        print('\t{}, {:.3f}'.format(name, weight))
print('Features with zero weight (sorted by absolute magnitude):')
for name, weight in ranked_coefs:
    if weight == 0:
        print('\t{}, {:.3f}'.format(name, weight))
Non-zero features: 15
R-squared score (training): 0.528
R-squared score (test): 0.646
Features with non-zero weight (sorted by absolute magnitude):
B02001e2, -6.396
B02001e3, 3.615
B02001e5, 2.692
B01002e3, -2.066
B19301e1, -1.696
restclose, -1.455
naics_code_722511, 1.103
naics_code_721110, 1.073
limcap, -0.743
naics_code_722513, 0.670
cases, -0.486
dayofweek_3, -0.414
dayofweek_4, -0.346
naics_code_445120, 0.338
maskmandate, 0.279
Features with zero weight (sorted by absolute magnitude):
deaths, 0.000
B01001e1, -0.000
B19013e1, -0.000
B19025e1, 0.000
B01001e2, 0.000
B01002e1, 0.000
B01002e2, -0.000
# Regularization path: sweep alpha and track sparsity vs. fit quality.
# (alpha=0 is unregularized and sklearn warns it converges poorly.)
for alpha in [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 5]:
    linlasso2 = Lasso(alpha=alpha, max_iter = 10000).fit(X_train_scaled, y_train)
    r2_train = linlasso2.score(X_train_scaled, y_train)
    r2_test = linlasso2.score(X_test_scaled, y_test)
    print(f'Alpha = {alpha}\nFeatures kept: {np.sum(linlasso2.coef_ != 0)}, '
          f'r-squared training: {r2_train:.3f}, r-squared test: {r2_test:.3f}\n')
Alpha = 0
Features kept: 22, r-squared training: 0.528, r-squared test: 0.645
Alpha = 1e-05
Features kept: 18, r-squared training: 0.528, r-squared test: 0.645
Alpha = 0.0001
Features kept: 15, r-squared training: 0.528, r-squared test: 0.646
Alpha = 0.001
Features kept: 14, r-squared training: 0.526, r-squared test: 0.642
Alpha = 0.01
Features kept: 13, r-squared training: 0.441, r-squared test: 0.445
Alpha = 0.1
Features kept: 1, r-squared training: 0.142, r-squared test: 0.149
Alpha = 1
Features kept: 0, r-squared training: 0.000, r-squared test: -0.038
Alpha = 5
Features kept: 0, r-squared training: 0.000, r-squared test: -0.038
/var/folders/j2/8gptfrvd0_dfl_yxfcck_hh40000gn/T/ipykernel_55037/1320401982.py:2: UserWarning: With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator
linlasso2 = Lasso(alpha=alpha, max_iter = 10000).fit(X_train_scaled, y_train)
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:530: UserWarning: Coordinate descent with no regularization may lead to unexpected results and is discouraged.
model = cd_fast.enet_coordinate_descent(
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 26.345047390860724, tolerance: 0.011167289210375351
model = cd_fast.enet_coordinate_descent(
/Users/irisyu/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.3733204291088299, tolerance: 0.011167289210375351
model = cd_fast.enet_coordinate_descent(
pip install eli5
Requirement already satisfied: eli5 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (0.11.0)
Requirement already satisfied: tabulate>=0.7.7 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (0.8.9)
Requirement already satisfied: numpy>=1.9.0 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (1.20.3)
Requirement already satisfied: jinja2 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (2.11.3)
Requirement already satisfied: graphviz in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (0.20)
Requirement already satisfied: attrs>16.0.0 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (21.2.0)
Requirement already satisfied: six in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (1.16.0)
Requirement already satisfied: scikit-learn>=0.20 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (0.24.2)
Requirement already satisfied: scipy in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from eli5) (1.7.1)
Requirement already satisfied: joblib>=0.11 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn>=0.20->eli5) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn>=0.20->eli5) (2.2.0)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/irisyu/opt/anaconda3/lib/python3.9/site-packages (from jinja2->eli5) (1.1.1)
Note: you may need to restart the kernel to use updated packages.
import eli5
from eli5.sklearn import PermutationImportance
# Permutation importance on the TEST split: how much does shuffling each
# (scaled) feature degrade the fitted lasso's score?
permlas = PermutationImportance(linlasso).fit(X_test_scaled, y_test)
# feature_names restores readable column labels for the scaled ndarray.
eli5.show_weights(permlas, feature_names=list(X_train.columns))
# ridge for linear regression
# L2-regularized fit; ridge shrinks coefficients but never zeroes them out,
# so every feature keeps a non-zero weight.
from sklearn.linear_model import Ridge

linridge = Ridge(alpha=0.1).fit(X_train_scaled, y_train)
print(f'Ridge regression coef:\n{linridge.coef_}')
print(f'R-squared score (training): {linridge.score(X_train_scaled, y_train):.3f}')
print(f'R-squared score (test): {linridge.score(X_test_scaled, y_test):.3f}')
print(f'Number of non-zero features: {np.sum(linridge.coef_ != 0)}')
Ridge regression coef:
[ 0.16077865 -0.33152571 -1.01157904 -0.22991763 0.30691198 -1.87722814
0.22103322 0.06129274 -1.10851407 1.31410264 0.33459456 -0.28522013
-2.35402976 -3.91970572 2.19378079 2.2729288 -0.39503833 -0.43370117
0.1605328 0.8605944 0.88630594 0.49685944]
R-squared score (training): 0.519
R-squared score (test): 0.617
Number of non-zero features: 22
# Much weaker regularization: a nearly-OLS ridge fit for comparison.
linridge2 = Ridge(alpha=0.001).fit(X_train_scaled, y_train)
print(f'Ridge regression coef:\n{linridge2.coef_}')
print(f'R-squared score (training): {linridge2.score(X_train_scaled, y_train):.3f}')
print(f'R-squared score (test): {linridge2.score(X_test_scaled, y_test):.3f}')
print(f'Number of non-zero features: {np.sum(linridge2.coef_ != 0)}')
print(f'Number of zero features: {np.sum(linridge2.coef_ == 0)}')
Ridge regression coef:
[ 0.30338522 -0.70791049 -1.42349264 -0.77293414 0.32509409 -2.30413574
0.23023336 0.07214424 -1.39517651 1.69620422 0.5993956 -0.18948324
-2.75125939 -4.81607997 2.82017242 2.67193474 -0.41394346 -0.35401676
0.34619031 1.07872763 1.10846795 0.67547529]
R-squared score (training): 0.528
R-squared score (test): 0.645
Number of non-zero features: 22
Number of zero features: 0
# Stronger regularization: alpha=1 underfits relative to the smaller alphas.
linridge3 = Ridge(alpha=1).fit(X_train_scaled, y_train)
print(f'Ridge regression coef:\n{linridge3.coef_}')
print(f'R-squared score (training): {linridge3.score(X_train_scaled, y_train):.3f}')
print(f'R-squared score (test): {linridge3.score(X_test_scaled, y_test):.3f}')
print(f'Number of non-zero features: {np.sum(linridge3.coef_ != 0)}')
Ridge regression coef:
[-0.08956617 -0.02972191 -0.5886898 0.08131951 0.23893256 -0.72188657
0.11874237 0.00542386 -0.42045806 0.34655778 -0.26422782 -0.44416801
-1.21719172 -1.50267832 0.57310913 1.18835203 -0.26552595 -0.19743078
-0.28003242 0.30607924 0.3126211 0.03044751]
R-squared score (training): 0.417
R-squared score (test): 0.424
Number of non-zero features: 22
# List the near-OLS ridge (linridge2) coefficients from most negative to
# most positive, one "name value" line per feature.
coef = linridge2.coef_
coef_sorted = sorted(zip(X.columns, coef), key=lambda pair: pair[1])
for feat_name, feat_coef in coef_sorted:
    print(f'{feat_name} {feat_coef}')
B02001e2 -4.816079971664496
B01002e3 -2.7512593864163652
B01001e1 -2.3041357374115887
restclose -1.423492635765304
B19301e1 -1.395176509493562
cases -0.7729341446240525
limcap -0.707910493208603
dayofweek_3 -0.41394346217379646
dayofweek_4 -0.3540167560579346
B01002e2 -0.18948323769886066
B19025e1 0.07214423636450355
B19013e1 0.23023335751328242
maskmandate 0.30338522251929134
deaths 0.3250940937489878
naics_code_445120 0.3461903087571737
B01002e1 0.5993955983430187
naics_code_722513 0.6754752889063014
naics_code_721110 1.0787276328463136
naics_code_722511 1.10846795019107
B01001e2 1.6962042177593564
B02001e5 2.6719347385524648
B02001e3 2.8201724152229386
# Permutation importance for the near-OLS ridge model on the test split.
permri = PermutationImportance(linridge2).fit(X_test_scaled, y_test)
eli5.show_weights(permri, feature_names = list(X_train.columns))