Exploring the Relationship Between State Vaccination Rates & Political Affiliations
Jame Zou, 6/6/2021
Data preparation
import pandas as pd
# Read the three input tables (state names, vaccination records, 2016 election
# results) from CSV files in the working directory, in that order.
df_abbr, df_vacc, df_elec = (
    pd.read_csv(csv_path)
    for csv_path in (
        "US_Abbreviations.csv",
        "US_Vaccinations.csv",
        "2016_Elections.csv",
    )
)

# Convert the 'date' column to datetimes so it can later serve as a time index.
df_vacc['date'] = pd.to_datetime(df_vacc['date'])

# Notebook display: inspect the resulting column dtypes.
df_vacc.dtypes
State and column example
# Example inputs used by the single-state walkthrough below.
state, column = 'California', 'people_vaccinated_per_hundred'
def state_data(state, column, df=None):
    """Return one state's values for ``column`` as a date-indexed Series.

    Parameters
    ----------
    state : value matched against the ``location`` column.
    column : name of the column to extract.
    df : optional DataFrame holding ``location``, ``date`` and ``column``.
        When omitted, the module-level ``df_vacc`` table is used.

    Returns
    -------
    pandas.Series indexed by ``date``. Missing values are replaced by the
    previous observation; values before the first observation stay NaN.
    The Series is empty when no row matches ``state``.
    """
    source = df_vacc if df is None else df
    state_rows = source[source['location'] == state]
    # Series.ffill() is the supported spelling; fillna(method='ffill') is
    # deprecated in current pandas releases.
    return state_rows.set_index('date')[column].ffill()
# Pull the example state's series; the bare name displays it in the notebook.
state_sr = state_data(state=state, column=column)
state_sr
import matplotlib.pyplot as plt
def state_plot(state, series, startDate):
    """Draw a line plot of ``series`` and show it.

    Parameters
    ----------
    state : label interpolated into the plot title.
    series : object whose ``.index`` supplies the x values and ``.values``
        the y values.
    startDate : accepted for call compatibility; the plot does not use it.
    """
    # Labels and tick styling are set first, then the line is drawn, keeping
    # the same pyplot call order as before.
    plt.xlabel("Date")
    plt.ylabel("% of population vaccinated")
    plt.title(f"{state} Vaccinations")
    plt.xticks(rotation=45, ha='right')
    plt.plot(series.index, series.values)
    plt.show()
# First index label of the example series, passed along to the plot helpers.
startDate = state_sr.index[0]
state_plot(state=state, series=state_sr, startDate=startDate)
Fitting a logistic model
import numpy as np
from scipy.optimize import curve_fit
from scipy.special import expit
def logistic_curve(x, β0, β1, β2):
    """Evaluate the logistic function β0 / (1 + exp(β1 * (β2 - x))).

    Parameters
    ----------
    x : scalar or array of positions at which to evaluate the curve.
    β0 : upper asymptote (the value approached as x grows, for β1 > 0).
    β1 : steepness of the curve.
    β2 : midpoint; the curve equals β0 / 2 at x == β2.

    Returns
    -------
    Curve value(s), with the same shape as ``x``.
    """
    # expit(z) == 1 / (1 + exp(-z)) but does not overflow for large |z|, which
    # the optimiser can reach while exploring parameter values.
    return β0 * expit(β1 * (x - β2))
def logistic_betas(series):
    """Fit ``logistic_curve`` to ``series`` and return its three parameters.

    The x values are the positions 0, 1, ..., len(series) - 1. The starting
    guess is (largest value, 1, half the series length).

    Parameters
    ----------
    series : sequence of numeric observations (e.g. a pandas Series).

    Returns
    -------
    Tuple ``(β0, β1, β2)`` of fitted parameters, or three NaNs when the data
    cannot be fitted (non-numeric, fewer than three points, any non-finite
    value, or the optimiser fails).
    """
    # np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
    no_fit = (np.nan, np.nan, np.nan)

    try:
        ys = np.asarray(series, dtype=float)
    except (TypeError, ValueError):
        return no_fit

    # curve_fit rejects non-finite data and needs at least as many points as
    # parameters, so report "no fit" for those inputs up front.
    if ys.size < 3 or not np.isfinite(ys).all():
        return no_fit

    xs = np.arange(ys.size)
    guessed_betas = [ys.max(), 1, ys.size / 2]
    try:
        found_betas, _ = curve_fit(logistic_curve, xs, ys, p0=guessed_betas)
    except (RuntimeError, ValueError, TypeError):
        # RuntimeError: the least-squares search did not converge;
        # ValueError / TypeError: curve_fit refused the input.
        return no_fit

    β0, β1, β2 = found_betas
    return β0, β1, β2
# Fit the example state's series; the tuple on the last line displays the
# three fitted parameters in the notebook.
β0, β1, β2 = logistic_betas(series=state_sr)
(β0, β1, β2)
def logistic_model(series, β0, β1, β2, startDate, state):
    """Plot ``series`` together with the logistic curve for the given betas.

    Parameters
    ----------
    series : observations; ``.values`` is plotted against 0..len(series)-1 and
        ``.index.min()`` is shown in the x-axis label.
    β0, β1, β2 : parameters passed to ``logistic_curve``.
    startDate : accepted for call compatibility; the plot does not use it.
    state : label interpolated into the plot title.
    """
    day_numbers = np.arange(len(series))
    observed = series.values

    plt.plot(day_numbers, observed, label='data')
    plt.plot(day_numbers, logistic_curve(day_numbers, β0, β1, β2), label='model')
    plt.legend()
    plt.xlabel(f"Days since {series.index.min()}")
    plt.ylabel("% of population vaccinated")
    plt.title(f"{state} Vaccinations and Logistic Model")
    plt.show()
# Overlay the fitted curve on the example state's data.
logistic_model(
    series=state_sr,
    β0=β0,
    β1=β1,
    β2=β2,
    startDate=startDate,
    state=state,
)
Finding βs for all states
def us_betas(state):
    """Return the logistic-fit parameters for one state.

    Looks up the state's 'people_vaccinated_per_hundred' series with
    ``state_data`` and passes it to ``logistic_betas``; the result is whatever
    ``logistic_betas`` returns for that series.
    """
    return logistic_betas(state_data(state, 'people_vaccinated_per_hundred'))
# Fit every state named in the abbreviation table; each element of all_betas
# is the 3-tuple returned by us_betas.
all_betas = df_abbr['US STATE'].apply(us_betas)
all_betas.head()

# Start from the election columns and put the state name first.
# NOTE(review): insert() aligns on the row index, so this presumably relies on
# df_elec and df_abbr listing states in the same row order. Confirm.
all_states = df_elec.loc[:, ['State', 'Clinton_Percent', 'Trump_Percent']].copy()
all_states.insert(0, 'State Name', df_abbr['US STATE'])

# One column per fitted parameter, in (β0, β1, β2) order.
beta_labels = (
    'Maximum Vaccinations',
    'Rate of Vaccination',
    'Time of Maximum Increase',
)
for position, label in enumerate(beta_labels):
    all_states[label] = all_betas.apply(lambda betas, i=position: betas[i])
all_states.head()
Visualization
import seaborn as sns
# Scatter-matrix of the combined table.
sns.pairplot(all_states)
plt.tight_layout()
plt.show()

# Correlation matrix of the election shares and fitted parameters, using only
# rows where all five values are present.
corr_columns = [
    'Clinton_Percent',
    'Trump_Percent',
    'Maximum Vaccinations',
    'Rate of Vaccination',
    'Time of Maximum Increase',
]
numeric_only = all_states[corr_columns].dropna()
corr_coeff = np.corrcoef(numeric_only, rowvar=False)  # columns are the variables

sns.heatmap(corr_coeff, annot=True)
# Place each label at the centre of its heatmap cell.
cell_centres = np.arange(5) + 0.5
plt.xticks(cell_centres, numeric_only.columns, rotation=20, ha='right')
plt.yticks(cell_centres, numeric_only.columns, rotation=0)
plt.show()
Hypothesis test
import scipy.stats as stats
# Significance level for the test.
alpha = 0.05

# Split states by which candidate had the larger share; ties fall in neither group.
clinton_led = all_states['Clinton_Percent'] > all_states['Trump_Percent']
trump_led = all_states['Trump_Percent'] > all_states['Clinton_Percent']

# Compare the fitted midpoint (β2) between the groups, skipping failed fits.
blue = all_states[clinton_led]['Time of Maximum Increase'].dropna()
red = all_states[trump_led]['Time of Maximum Increase'].dropna()

# equal_var=False selects Welch's t-test (no equal-variance assumption).
t_stat, p_value = stats.ttest_ind(blue, red, equal_var=False)
reject_H0 = p_value < alpha
alpha, p_value, reject_H0