Vaccinations in the U.S. - Anusha and Taylor
Importing the Data
import pandas as pd
# Load the three input tables in one pass: the vaccination time series,
# the state abbreviation table, and the election table.
vac_df, state_code_df, election_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'us_state_vaccinations.csv',
        'abbreviations.csv',
        'election.csv',
    )
)

# Preview the first rows of each table, then list the vaccination columns
# with their non-null counts and dtypes.
vac_df.head()
state_code_df.head()
election_df.head()
vac_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8668 entries, 0 to 8667
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 8668 non-null object
1 location 8668 non-null object
2 total_vaccinations 8398 non-null float64
3 total_distributed 8259 non-null float64
4 people_vaccinated 8247 non-null float64
5 people_fully_vaccinated_per_hundred 7748 non-null float64
6 total_vaccinations_per_hundred 7850 non-null float64
7 people_fully_vaccinated 8165 non-null float64
8 people_vaccinated_per_hundred 7825 non-null float64
9 distributed_per_hundred 7837 non-null float64
10 daily_vaccinations_raw 8603 non-null float64
11 daily_vaccinations 8603 non-null float64
12 daily_vaccinations_per_million 8040 non-null float64
13 share_doses_used 8259 non-null float64
dtypes: float64(12), object(2)
memory usage: 948.2+ KB
# The info() listing shows 'date' loaded with object dtype; convert it to
# datetimes so it can serve as a proper time index later.
vac_df['date'] = pd.to_datetime( vac_df['date'] )
Collect By State
def extract_data(state, column):
    """Return one column of ``vac_df`` for a single location, indexed by date.

    Args:
        state: value to match exactly against ``vac_df['location']``.
        column: name of the column to return.

    Returns:
        The requested column for the matching rows, with the rows' ``date``
        values as the index. Empty if no row matches ``state``.
    """
    is_requested_state = vac_df['location'] == state
    state_rows = vac_df.loc[is_requested_state]
    # 'location' is constant after filtering, so it is dropped before
    # re-indexing by date.
    indexed_by_date = state_rows.drop(columns='location').set_index('date')
    return indexed_by_date[column]
# Example: the people_vaccinated_per_hundred column for the rows whose
# location is 'New York State', indexed by date.
extract_data ('New York State', 'people_vaccinated_per_hundred')
Plotting by State
import matplotlib.pyplot as plt
import numpy as np
# Plot New York State's people_vaccinated_per_hundred against the day count.
# The date index is replaced by positions before missing values are dropped.
series = (
    extract_data('New York State', 'people_vaccinated_per_hundred')
    .reset_index(drop=True)
    .dropna()
)
plt.plot(np.arange(len(series)), series)

# Format Plot
plt.title('People Vaccinated Per Hundred by Day in New York State')
plt.xlabel('Days')
plt.ylabel('People Vaccinated Per Hundred')
plt.show()
Logistic Curve
def logistic_curve(x, β0, β1, β2):
    """Evaluate the three-parameter logistic function β0 / (1 + e^(β1·(β2 − x))).

    Args:
        x: scalar or array of input values.
        β0: numerator of the curve; the value approached as the exponent
            goes to minus infinity.
        β1: multiplier applied to (β2 − x) inside the exponential.
        β2: the x at which the curve equals β0 / 2.

    Returns:
        The curve evaluated at ``x`` (same shape as ``x``).
    """
    exponent = β1 * (-x + β2)
    denominator = 1 + np.exp(exponent)
    return β0 / denominator
from scipy.optimize import curve_fit
# Fit the logistic curve to New York State's series and overlay it on the data.
series = (
    extract_data('New York State', 'people_vaccinated_per_hundred')
    .reset_index(drop=True)
    .dropna()
)
xs = np.arange(len(series))
ys = series

# Starting point for the optimizer: β0 = observed maximum, β1 = 1,
# β2 = middle of the observed range of positions.
my_guessed_betas = [series.max(), 1, len(xs) / 2]
found_betas, covariance = curve_fit(logistic_curve, xs, ys, p0=my_guessed_betas)
β0, β1, β2 = found_betas
β0, β1, β2


def fit_model(x):
    """Evaluate the logistic curve at x using the fitted β0, β1, β2."""
    return logistic_curve(x, β0, β1, β2)


plt.plot(xs, ys)             # data
plt.plot(xs, fit_model(xs))  # model
plt.legend(['data', 'model'])
plt.title('Fitted Model: People Vaccinated Per Hundred in New York State')
plt.xlabel('Days')
plt.ylabel('People Vaccinated Per Hundred')
plt.show()
from scipy.optimize import curve_fit
def find_betas(series):
    """Fit ``logistic_curve`` to ``series`` and return its three parameters.

    The x-values are the positions 0 .. len(series) - 1. The optimizer starts
    from β0 = series maximum, β1 = 1, β2 = half the series length.

    Unlike the earlier version, this function works on local variables only
    and no longer overwrites the notebook-level ``xs`` / ``ys``.

    Args:
        series: observed values; must support ``len()`` and ``.max()``
            (for example a pandas Series or a NumPy array).

    Returns:
        Tuple ``(β0, β1, β2)``. ``(nan, nan, nan)`` when the series has fewer
        points than parameters or when the fit fails.
    """
    n_params = 3
    no_fit = (np.nan, np.nan, np.nan)

    # Three parameters cannot be determined from fewer than three points.
    if len(series) < n_params:
        return no_fit

    positions = np.arange(len(series))
    initial_guess = [series.max(), 1, len(series) / 2]
    try:
        found_betas, _covariance = curve_fit(
            logistic_curve, positions, series, p0=initial_guess
        )
    except (RuntimeError, ValueError, TypeError):
        # curve_fit raises RuntimeError when the minimization does not
        # converge and ValueError for NaN/inf input; TypeError covers
        # malformed input. Anything else (e.g. KeyboardInterrupt) propagates.
        return no_fit

    β0, β1, β2 = found_betas
    return β0, β1, β2
# Sanity check: run the reusable fitter on the `series` fitted by hand above.
find_betas(series)
Table With State and β Values
# One row per distinct location, in the order the locations first appear
# in vac_df.
unique_list_of_locations = vac_df['location'].unique().tolist()
table = pd.DataFrame({'state': unique_list_of_locations}, columns=['state'])
def generate_series(state):
    """Fit the logistic model to one location's people_vaccinated_per_hundred.

    Args:
        state: location name as it appears in ``vac_df['location']``.

    Returns:
        Whatever ``find_betas`` returns for that location's series.
    """
    by_date = extract_data(state, 'people_vaccinated_per_hundred')
    # Switch from dates to positions first, then drop the missing values.
    by_position = by_date.reset_index(drop=True)
    observed = by_position.dropna()
    return find_betas(observed)
# Run the fit for every location in table['state']. The result keeps
# table's index, and each element is the value generate_series returned
# for the location on that row.
all_states_betas = table['state'].apply( generate_series )
all_states_betas
# Per-location parameter table, sorted by name.
#
# all_states_betas was produced by table['state'].apply(...), so it is in the
# same row order as table['state']. Take the names from there and sort the
# assembled rows afterwards. Sorting the names on their own (as before) pairs
# each name with another location's parameters whenever vac_df is not already
# in alphabetical order.
states = pd.DataFrame({
    'state': table['state'],
    'maximum number of people vaccinated per hundred': all_states_betas.apply(lambda x: x[0]),
    'rate of vaccination': all_states_betas.apply(lambda x: x[1]),
    'time of maximum increase': all_states_betas.apply(lambda x: x[2]),
})
states = states.sort_values('state').reset_index(drop=True)
states
Adding State Abbreviations
# Join the abbreviation table onto the fitted parameters. No key is given, so
# pandas joins on the column names the two frames share; the default inner
# join keeps only rows present in both.
states_abb = pd.merge(states, state_code_df)
states_abb
Adding Political Affiliations of Each State
# Join the election table as well, again on the shared column names with the
# default inner join.
states_abb_elec = pd.merge(states_abb, election_df)
states_abb_elec
Heatmap
import seaborn as sns
# Correlation heatmap over every column except the two label columns.
numeric_columns_only = states_abb_elec.drop(['state', 'abbreviation'], axis=1)

# DataFrame.corr() skips missing values pair by pair. np.corrcoef would turn
# every coefficient that involves a column containing a NaN (e.g. from a
# failed fit) into NaN.
correlation_table = numeric_columns_only.corr()
correlation_coefficients = correlation_table.to_numpy()
sns.heatmap(correlation_coefficients, annot=True, cmap='BuPu')

# Centre one tick on each cell; the count follows the data instead of being
# hard-coded to five columns.
tick_labels = correlation_table.columns
tick_positions = np.arange(len(tick_labels)) + 0.5
plt.xticks(tick_positions, tick_labels, rotation=90)
plt.yticks(tick_positions, tick_labels, rotation=0)
plt.title('Correlation Coefficient Heatmap')
plt.show()
Hypothesis Test
import scipy.stats as stats
# Welch's t-test (equal_var=False) comparing the fitted maximum between
# locations with 'Trump percent' <= 40 and those with 'Trump percent' >= 60.
α = 0.05
outcome_column = 'maximum number of people vaccinated per hundred'
trump_share = states_abb_elec['Trump percent']

# find_betas returns NaN when a fit fails. Drop those rows: a single NaN
# would make p_value NaN, and `nan < α` would silently report "do not reject".
reps = states_abb_elec.loc[trump_share >= 60, outcome_column].dropna()
dems = states_abb_elec.loc[trump_share <= 40, outcome_column].dropna()

t_statistics, p_value = stats.ttest_ind(dems, reps, equal_var=False)
reject_H0 = p_value < α
α, p_value, reject_H0