import pandas as pd
import numpy as np
# --- Load mobility data; restrict to study months and Cheyenne ---
wyday = pd.read_csv('longcorein.csv', dtype={'poi_cbg': 'int64'})
wyday['date'] = pd.to_datetime(wyday['date'])
# BUG FIX: after to_datetime, `isin(['2019-01', ...])` compares full
# timestamps against month strings, which matches (at best) only rows that
# coerce to the 1st of each month. Filter on the year-month instead so every
# day inside the target months is kept.
study_months = ['2019-01', '2020-01', '2021-01', '2019-04', '2020-04', '2021-04',
                '2019-07', '2020-07', '2021-07', '2019-10', '2020-10', '2021-10']
wyday2 = wyday[wyday['date'].dt.strftime('%Y-%m').isin(study_months)]
wyday_ch = wyday2[wyday2['city'].isin(['Cheyenne'])]
# Bare expression was a notebook display; print it so the script shows it too.
print(wyday_ch.groupby(['date']).size())
# --- Load COVID case counts and CBG census data; merge census onto mobility ---
countycv = pd.read_csv('us-counties-covid-20220404.csv')
cbgdesc = pd.read_csv('cbg_field_descriptions.csv')
cbg = pd.read_csv('cbg.csv', dtype={'census_block_group': 'int64'})

# Drop rows with any missing value; .copy() avoids SettingWithCopyWarning
# on later column assignments. (dropna() is the idiomatic equivalent of the
# original ~isna().any(axis=1) mask.)
wyday_ch = wyday_ch.dropna().copy()
cbg = cbg.dropna().copy()

# Attach census attributes to each POI via its census block group.
lsg = wyday_ch.merge(cbg, left_on='poi_cbg', right_on='census_block_group', how='inner')
print(lsg.shape)
lsg_pk = lsg['placekey'].unique()
print(lsg_pk.shape)
print(lsg.groupby('naics_code').size())
print(lsg.groupby(['naics_code', 'placekey']).size())
print(lsg['B01001e1'].mean())

# The first 5 digits of the 12-digit CBG id are the county FIPS code.
lsg['poi_cbg'] = lsg['poi_cbg'].astype(str)
lsg['fips'] = lsg['poi_cbg'].str[:5]
print(lsg['fips'].head())
# --- Restrict COVID data to Laramie County, WY (FIPS 56021) ---
countycv['date'] = pd.to_datetime(countycv['date'])
# BUG-PRONE original: `fips` loads as float (the column has NaNs), so the
# filter had to string-match '56021.0'. Compare numerically instead — this is
# robust regardless of whether pandas reads the column as int or float.
countycov = countycv[countycv['fips'] == 56021]
print(countycov.shape)
print(countycov['date'].min())
print(countycov['date'].max())
casemean = countycov['cases'].mean()
deathmean = countycov['deaths'].mean()
print(f'Avg Cases: {casemean}')
print(f'Avg Deaths: {deathmean}')
# --- Merge mobility+census with county COVID counts; build policy dummies ---
localmerge = pd.merge(lsg, countycov, on='date', how='left')
# Drop join artifacts: fips/city/region exist in both frames after the merge.
localmerge.drop(columns=['fips_x', 'fips_y', 'city_y', 'region_y'], inplace=True)

# Dates before the first reported case get no COVID row from the left join;
# report how many, then treat them as zero cases/deaths.
print(localmerge.cases.isnull().sum())
print(localmerge.deaths.isnull().sum())
cols = ['cases', 'deaths']
localmerge[cols] = localmerge[cols].fillna(0)
avgcase = localmerge['cases'].mean()
avgdeath = localmerge['deaths'].mean()
print(f'Avg Cases: {avgcase}')
print(f'Avg Deaths: {avgdeath}')
localmerge = localmerge.dropna()

# NOTE: 'date' is already datetime from the earlier to_datetime calls, so the
# original redundant re-conversion was removed.
# Policy-period indicators (Wyoming/Cheyenne COVID measures — dates as in the
# original analysis).
localmerge['maskmandate'] = np.where((localmerge['date'] >= '2020-12-07') & (localmerge['date'] <= '2021-03-16'), 1, 0)
localmerge['restclose'] = np.where((localmerge['date'] >= '2020-03-20') & (localmerge['date'] <= '2020-05-15'), 1, 0)
localmerge['limcap'] = np.where((localmerge['date'] >= '2020-05-15') & (localmerge['date'] <= '2021-03-16'), 1, 0)

# Calendar features: Monday=0 ... Sunday=6; weekend = Sat/Sun.
localmerge['dayofweek'] = localmerge['date'].dt.dayofweek
localmerge['weekend'] = np.where(localmerge['dayofweek'] >= 5, 1, 0)
# --- Build feature matrix X and log-visits target Y ---
feature_cols = ['weekend', 'maskmandate', 'dayofweek', 'limcap', 'restclose', 'cases', 'deaths',
                'naics_code', 'B01001e1', 'B19013e1', 'B19025e1', 'B19301e1', 'B01001e2', 'B01002e1',
                'B01002e2', 'B01002e3', 'B02001e2', 'B02001e3', 'B02001e5']
# .copy() so the astype assignments below don't trigger SettingWithCopyWarning.
X = localmerge.loc[:, feature_cols].copy()
# log1p(x) == log(x + 1) but is the purpose-built (and numerically safer) form.
Y = np.log1p(localmerge.loc[:, 'dailyvisits'])

# Treat industry code and calendar fields as categoricals for dummy encoding.
X['naics_code'] = X['naics_code'].astype(str)
X['weekend'] = X['weekend'].astype(str)
X['dayofweek'] = X['dayofweek'].astype(str)
print(X.describe(include=[np.number]))
print(X.describe(exclude=[np.number]))
# drop_first avoids the dummy-variable trap in the linear models below.
X = pd.get_dummies(data=X, drop_first=True)
# --- Train/test split and min-max scaling ---
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on training data only, then apply to test — no leakage.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_train_scaled.shape)
print(X_test_scaled.shape)

# Sanity checks: case distribution and mask-mandate rows comparable across split.
print(X_train.cases.describe())
print(X_test.cases.describe())
print(X_train.groupby('maskmandate').head())
print(X_test.groupby('maskmandate').head())
# --- OLS baseline (statsmodels, for the full coefficient table) ---
import statsmodels.api as sm
X2_train = sm.add_constant(X_train)
# astype(float): get_dummies can yield bool columns that sm.OLS rejects.
results = sm.OLS(y_train, X2_train.astype(float)).fit()
print(results.summary())

X2_test = sm.add_constant(X_test)
# FIX: cast the test design matrix to float too, matching the training cast —
# otherwise predict can fail on the bool dummy columns.
y_pred = results.predict(X2_test.astype(float))
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))
# --- Lasso: L1-penalized linear regression (feature selection) ---
from sklearn.linear_model import Lasso
linlasso = Lasso(alpha=0.0001, max_iter=10000).fit(X_train_scaled, y_train)
print('Non-zero features: {}'.format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'.format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}\n'.format(linlasso.score(X_test_scaled, y_test)))

# Pair each feature name with its weight once and reuse for both listings
# (the original sorted the same zip twice).
weighted = sorted(zip(list(X), linlasso.coef_), key=lambda e: -abs(e[1]))
print('Features with non-zero weight (sorted by absolute magnitude):')
for name, w in weighted:
    if w != 0:
        print('\t{}, {:.3f}'.format(name, w))
print('Features with zero weight (sorted by absolute magnitude):')
for name, w in weighted:
    if w == 0:
        print('\t{}, {:.3f}'.format(name, w))

# Regularization path. FIX: alpha=0 removed from the sweep — sklearn's Lasso
# docs state alpha=0 is not supported (use LinearRegression instead) and the
# coordinate-descent solver is numerically unstable there.
for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 5]:
    linlasso2 = Lasso(alpha=alpha, max_iter=10000).fit(X_train_scaled, y_train)
    r2_train = linlasso2.score(X_train_scaled, y_train)
    r2_test = linlasso2.score(X_test_scaled, y_test)
    print('Alpha = {}\nFeatures kept: {}, r-squared training: {:.3f}, \
r-squared test: {:.3f}\n'
          .format(alpha, np.sum(linlasso2.coef_ != 0), r2_train, r2_test))
# --- Permutation importance for the Lasso model (eli5) ---
# FIX: `pip install eli5` is shell syntax and was a SyntaxError in plain
# Python (it only works as a notebook magic). Install from a terminal:
#   pip install eli5
import eli5
from eli5.sklearn import PermutationImportance
permlas = PermutationImportance(linlasso).fit(X_test_scaled, y_test)
# NOTE: show_weights returns an HTML object that renders only in a notebook;
# in a plain script this line produces no visible output.
eli5.show_weights(permlas, feature_names=list(X_train.columns))
# --- Ridge: L2-penalized linear regression across several alphas ---
from sklearn.linear_model import Ridge

def _report_ridge(alpha, show_zero=False):
    """Fit Ridge(alpha) on the scaled training data, print the standard
    diagnostics (same lines the original printed per model), and return
    the fitted model. show_zero additionally prints the zero-coef count."""
    model = Ridge(alpha=alpha).fit(X_train_scaled, y_train)
    print('Ridge regression coef:\n{}'.format(model.coef_))
    print('R-squared score (training): {:.3f}'.format(model.score(X_train_scaled, y_train)))
    print('R-squared score (test): {:.3f}'.format(model.score(X_test_scaled, y_test)))
    print('Number of non-zero features: {}'.format(np.sum(model.coef_ != 0)))
    if show_zero:
        print('Number of zero features: {}'.format(np.sum(model.coef_ == 0)))
    return model

linridge = _report_ridge(0.1)
linridge2 = _report_ridge(0.001, show_zero=True)
linridge3 = _report_ridge(1)

# Coefficients of the lightly-penalized model, smallest to largest.
coef = linridge2.coef_
coef_sorted = sorted(zip(X.columns, coef), key=lambda x: x[1])
for i, j in coef_sorted:
    print('{var} {coef}'.format(var=i, coef=j))

# Permutation importance for the ridge model (HTML — renders in notebooks only).
permri = PermutationImportance(linridge2).fit(X_test_scaled, y_test)
eli5.show_weights(permri, feature_names=list(X_train.columns))