# Analyzing the Relationship Between Vaccinations and Political Standing
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
# directory holding every input file; defined once so the location is changed in one place
DATA_DIR = "C:/Users/Claire Mezzina/Documents/Data Science"
# read in vaccination data
df_vax = pd.read_csv(DATA_DIR + "/us_state_vaccinations.csv")
# drop rows with any missing value (the earlier code called dropna on the undefined name `df`)
df_vax = df_vax.dropna()
# make dates column into datetime object
df_vax["date"] = pd.to_datetime(df_vax["date"])
# earliest date, latest date, and number of distinct dates left after dropna
print(df_vax["date"].min())
print(df_vax["date"].max())
print(df_vax["date"].nunique())
# convert dates into numerical range of days starting from first date to last date
# NOTE(review): the upper bound 159 is hard-coded and nothing visible in this file reads
# dates_new; confirm it still matches the data before relying on it.
dates_new = np.arange(1, 159)
# read in state abbreviation data and state voting data
df_state = pd.read_excel(DATA_DIR + "/US_state_abbreviations.xlsx")
df_voting = pd.read_excel(DATA_DIR + "/voting.xlsx")
2021-01-12 00:00:00
2021-05-26 00:00:00
130
# function to take a state and look at any column of data for that state
def by_state(state, column):
    """Return one column of ``df_vax`` for a single location, indexed by date.

    Parameters
    ----------
    state
        Value matched against the "location" column of ``df_vax``.
    column
        Label of the column to return.

    Returns
    -------
    The selected column for the matching rows, with the "date" column as index.
    """
    # keep only the rows that belong to the requested location
    state_rows = df_vax[df_vax["location"] == state]
    # "location" is constant after the filter, so it is dropped before re-indexing
    without_location = state_rows.drop(columns="location")
    dated = without_location.set_index("date")
    return dated[column]
# function for logistic model
def logistic_curve(x, β0, β1, β2):
    """Evaluate the logistic curve ``β0 / (1 + exp(-β1 * (x - β2)))``.

    Parameters
    ----------
    x
        Scalar or array of positions at which to evaluate the curve.
    β0
        Numerator of the curve; for positive β1 it is the value approached as x grows.
    β1
        Multiplier on the distance from β2 inside the exponential.
    β2
        Position at which the curve equals β0 / 2.

    Returns
    -------
    The curve value(s), with the same shape as ``x``.
    """
    exponent = -β1 * (x - β2)
    # A large positive exponent makes np.exp overflow to inf. The quotient is then 0,
    # which is the correct limit, so only the overflow warning is silenced here;
    # the returned numbers are unchanged.
    with np.errstate(over="ignore"):
        return β0 / (1 + np.exp(exponent))
# function to produce vaccination metrics with logistic model for any given state
def state_metrics(state):
    """Fit ``logistic_curve`` to one location's "people_vaccinated_per_hundred" series.

    Parameters
    ----------
    state
        Value matched against the "location" column of ``df_vax``.

    Returns
    -------
    tuple
        ``(β0, β1, β2)`` as found by ``curve_fit``.

    Raises
    ------
    ValueError
        If ``df_vax`` has no rows for ``state``.
    """
    # filter once and reuse the result instead of repeating the same boolean mask
    ys = df_vax.loc[df_vax["location"] == state, "people_vaccinated_per_hundred"]
    if ys.empty:
        raise ValueError("no vaccination rows found for location {!r}".format(state))
    # x is the row position within the filtered series (0, 1, 2, ...)
    # NOTE(review): rows removed earlier (e.g. by dropna) make this differ from a
    # calendar-day offset; confirm that is acceptable for the analysis.
    xs = np.arange(len(ys))
    # guesses for beta values: observed maximum, unit slope, middle of the series
    my_guessed_betas = [ys.max(), 1, len(ys) / 2]
    # curve_fit function; the covariance estimate is not used
    found_betas, _covariance = curve_fit(logistic_curve, xs, ys, p0=my_guessed_betas)
    β0, β1, β2 = found_betas
    return β0, β1, β2
# x and y for plot
ys = by_state("Massachusetts", "people_vaccinated_per_hundred")
xs = np.arange(len(ys))
# plot of people vaccinated per hundred in Massachusetts across time
plt.plot(xs, ys)
plt.title("People vaccinated in MA across time")
plt.xlabel("Number of days (since 1/12/2021)")
plt.ylabel("Number of people vaccinated per hundred")
plt.grid(alpha=0.3)
plt.show()
# find the vaccination metrics for Massachusetts and keep them: the fitted curve
# below needs these three values, so the return value must not be discarded
β0, β1, β2 = state_metrics("Massachusetts")
# evenly spaced sample points covering the same x range as the data, so the fitted
# curve is drawn across the whole scatter rather than a short sub-interval
sample_xs = np.linspace(0, len(ys) - 1, 100)


# create plot of model fit
def fit_model(x):
    """Evaluate the logistic curve fitted for Massachusetts at x."""
    return logistic_curve(x, β0, β1, β2)


plt.scatter(xs, ys)
plt.plot(sample_xs, fit_model(sample_xs))
plt.title("Model fit to people vaccinated in MA across time")
plt.xlabel("Number of days (since 1/12/2021)")
plt.ylabel("Number of people vaccinated per hundred")
plt.grid(alpha=0.3)
plt.show()
# list of locations in vaccination data that are not states for voting data
not_incl = ["Puerto Rico", "Marshall Islands", "Federated States of Micronesia", "Indian Health Svc", "Guam", "American Samoa", "United States", "Northern Mariana Islands", "Virgin Islands", "Republic of Palau"]
# get list of just US states, excluding extraneous locations as listed in not_incl
# Series.unique() keeps first-appearance order, so the rows come out in the same order
# on every run (iterating a set of strings does not guarantee that)
states = [state for state in df_vax["location"].unique() if state not in not_incl]
# fit the logistic model once per state; each result is (β0, β1, β2)
fits = [state_metrics(state) for state in states]
B_0 = [fit[0] for fit in fits]
B_1 = [fit[1] for fit in fits]
B_2 = [fit[2] for fit in fits]
# create new dataframe
dict_new = {"state": states, "max_vaccinations": B_0, "vaccination_rate": B_1, "max_increase_day": B_2}
df_new = pd.DataFrame(dict_new)
# change "New York State" cell to "New York"
# matched by value rather than by a fixed row number, and assigned to the whole column
# so pandas does not write to a temporary copy (SettingWithCopyWarning)
df_new["state"] = df_new["state"].replace("New York State", "New York")
print(df_new)
state max_vaccinations vaccination_rate max_increase_day
0 Mississippi 34.668860 0.040996 49.875880
1 Maryland 62.631869 0.038292 71.458327
2 Massachusetts 75.497051 0.036684 75.038230
3 North Dakota 42.766528 0.043423 45.950711
4 Ohio 47.738268 0.040831 59.703020
5 Tennessee 41.455718 0.038337 60.857958
6 Utah 50.959429 0.035631 69.869221
7 West Virginia 41.467020 0.035438 46.330332
8 Idaho 38.410120 0.042945 54.290691
9 Nebraska 50.238661 0.042292 59.859773
10 Wisconsin 53.616933 0.041091 61.303224
11 Nevada 47.875222 0.038862 62.631306
12 New Jersey 64.979189 0.039090 69.120765
13 Florida 54.970953 0.034417 69.865220
14 Montana 46.515794 0.040916 54.688533
15 New Hampshire 68.795576 0.047221 66.629780
16 Louisiana 36.671207 0.039820 50.363868
17 Texas 47.793686 0.037199 66.439805
18 Virginia 60.780057 0.036463 68.582409
19 New Mexico 59.203039 0.039760 56.101005
20 Iowa 51.061935 0.042208 59.838240
21 Connecticut 71.188161 0.034603 69.963979
22 South Carolina 42.412899 0.040751 57.408731
23 Washington 63.892360 0.034459 74.852766
24 Missouri 44.701270 0.039812 62.312831
25 Wyoming 37.523014 0.040518 46.934267
26 Georgia 44.339985 0.036084 67.603738
27 Michigan 53.632320 0.036420 67.135186
28 Rhode Island 65.976023 0.038742 69.875425
29 Vermont 89.699812 0.031594 86.506374
30 Kansas 48.448134 0.045545 59.620669
31 Maine 69.241932 0.038625 70.215485
32 Delaware 59.902098 0.036913 69.676065
33 Hawaii 90.439664 0.029503 91.777736
34 District of Columbia 74.273709 0.032414 87.191519
35 Alaska 47.291552 0.034895 44.435591
36 Pennsylvania 64.361858 0.038383 72.169599
37 Oklahoma 42.725543 0.042183 49.243126
38 Colorado 60.633604 0.035772 70.990829
39 Minnesota 58.377538 0.038231 66.326053
40 South Dakota 49.179947 0.043972 50.246166
41 Arkansas 42.420943 0.036687 58.604460
42 Illinois 58.302036 0.038849 67.694760
43 North Carolina 45.509902 0.040223 58.082465
44 Kentucky 47.183262 0.042192 56.461489
45 New York 60.397543 0.038732 69.515705
46 Indiana 45.699646 0.033932 63.822595
47 Oregon 65.576539 0.031711 79.599457
48 Arizona 48.103552 0.039200 59.050567
49 California 63.444114 0.037272 72.205937
50 Alabama 38.319998 0.038746 58.687263
<ipython-input-420-a3f4697120ef>:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_new["state"][45] = "New York"
# add District of Columbia to state abbreviations data
add_dc = {"US STATE": "District of Columbia", "ABBREVIATION": "DC"}
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# The membership check keeps a re-run of this cell from adding a second DC row,
# which would duplicate DC in the merge below.
if not (df_state["US STATE"] == add_dc["US STATE"]).any():
    df_state = pd.concat([df_state, pd.DataFrame([add_dc])], ignore_index=True)
# merge state abbreviation table to new table
# (inner join: a state whose name has no match in df_state is left out of df_all)
df_all = df_new.merge(df_state, how="inner", left_on="state", right_on="US STATE")
# drop the extra state name column
df_all = df_all.drop(["US STATE"], axis=1)
# put state abbreviation column in front
column_names = ["ABBREVIATION", "state", "max_vaccinations", "vaccination_rate", "max_increase_day"]
df_all = df_all.reindex(columns=column_names)
print(df_all)
ABBREVIATION state max_vaccinations vaccination_rate \
0 MS Mississippi 34.668860 0.040996
1 MD Maryland 62.631869 0.038292
2 MA Massachusetts 75.497051 0.036684
3 ND North Dakota 42.766528 0.043423
4 OH Ohio 47.738268 0.040831
5 TN Tennessee 41.455718 0.038337
6 UT Utah 50.959429 0.035631
7 WV West Virginia 41.467020 0.035438
8 ID Idaho 38.410120 0.042945
9 NE Nebraska 50.238661 0.042292
10 WI Wisconsin 53.616933 0.041091
11 NV Nevada 47.875222 0.038862
12 NJ New Jersey 64.979189 0.039090
13 FL Florida 54.970953 0.034417
14 MT Montana 46.515794 0.040916
15 NH New Hampshire 68.795576 0.047221
16 LA Louisiana 36.671207 0.039820
17 TX Texas 47.793686 0.037199
18 VA Virginia 60.780057 0.036463
19 NM New Mexico 59.203039 0.039760
20 IA Iowa 51.061935 0.042208
21 CT Connecticut 71.188161 0.034603
22 SC South Carolina 42.412899 0.040751
23 WA Washington 63.892360 0.034459
24 MO Missouri 44.701270 0.039812
25 WY Wyoming 37.523014 0.040518
26 GA Georgia 44.339985 0.036084
27 MI Michigan 53.632320 0.036420
28 RI Rhode Island 65.976023 0.038742
29 VT Vermont 89.699812 0.031594
30 KS Kansas 48.448134 0.045545
31 ME Maine 69.241932 0.038625
32 DE Delaware 59.902098 0.036913
33 HI Hawaii 90.439664 0.029503
34 DC District of Columbia 74.273709 0.032414
35 AK Alaska 47.291552 0.034895
36 PA Pennsylvania 64.361858 0.038383
37 OK Oklahoma 42.725543 0.042183
38 CO Colorado 60.633604 0.035772
39 MN Minnesota 58.377538 0.038231
40 SD South Dakota 49.179947 0.043972
41 AR Arkansas 42.420943 0.036687
42 IL Illinois 58.302036 0.038849
43 NC North Carolina 45.509902 0.040223
44 KY Kentucky 47.183262 0.042192
45 NY New York 60.397543 0.038732
46 IN Indiana 45.699646 0.033932
47 OR Oregon 65.576539 0.031711
48 AZ Arizona 48.103552 0.039200
49 CA California 63.444114 0.037272
50 AL Alabama 38.319998 0.038746
max_increase_day
0 49.875880
1 71.458327
2 75.038230
3 45.950711
4 59.703020
5 60.857958
6 69.869221
7 46.330332
8 54.290691
9 59.859773
10 61.303224
11 62.631306
12 69.120765
13 69.865220
14 54.688533
15 66.629780
16 50.363868
17 66.439805
18 68.582409
19 56.101005
20 59.838240
21 69.963979
22 57.408731
23 74.852766
24 62.312831
25 46.934267
26 67.603738
27 67.135186
28 69.875425
29 86.506374
30 59.620669
31 70.215485
32 69.676065
33 91.777736
34 87.191519
35 44.435591
36 72.169599
37 49.243126
38 70.990829
39 66.326053
40 50.246166
41 58.604460
42 67.694760
43 58.082465
44 56.461489
45 69.515705
46 63.822595
47 79.599457
48 59.050567
49 72.205937
50 58.687263
# join the voting table onto the per-state table, matching ABBREVIATION to "State_"
merged_votes = pd.merge(df_all, df_voting, how="inner", left_on="ABBREVIATION", right_on="State_")
# after the join "State_" repeats ABBREVIATION, so it is removed
df_final = merged_votes.drop(columns=["State_"])
print(df_final)
ABBREVIATION state max_vaccinations vaccination_rate \
0 MS Mississippi 34.668860 0.040996
1 MD Maryland 62.631869 0.038292
2 MA Massachusetts 75.497051 0.036684
3 ND North Dakota 42.766528 0.043423
4 OH Ohio 47.738268 0.040831
5 TN Tennessee 41.455718 0.038337
6 UT Utah 50.959429 0.035631
7 WV West Virginia 41.467020 0.035438
8 ID Idaho 38.410120 0.042945
9 NE Nebraska 50.238661 0.042292
10 WI Wisconsin 53.616933 0.041091
11 NV Nevada 47.875222 0.038862
12 NJ New Jersey 64.979189 0.039090
13 FL Florida 54.970953 0.034417
14 MT Montana 46.515794 0.040916
15 NH New Hampshire 68.795576 0.047221
16 LA Louisiana 36.671207 0.039820
17 TX Texas 47.793686 0.037199
18 VA Virginia 60.780057 0.036463
19 NM New Mexico 59.203039 0.039760
20 IA Iowa 51.061935 0.042208
21 CT Connecticut 71.188161 0.034603
22 SC South Carolina 42.412899 0.040751
23 WA Washington 63.892360 0.034459
24 MO Missouri 44.701270 0.039812
25 WY Wyoming 37.523014 0.040518
26 GA Georgia 44.339985 0.036084
27 MI Michigan 53.632320 0.036420
28 RI Rhode Island 65.976023 0.038742
29 VT Vermont 89.699812 0.031594
30 KS Kansas 48.448134 0.045545
31 ME Maine 69.241932 0.038625
32 DE Delaware 59.902098 0.036913
33 HI Hawaii 90.439664 0.029503
34 DC District of Columbia 74.273709 0.032414
35 AK Alaska 47.291552 0.034895
36 PA Pennsylvania 64.361858 0.038383
37 OK Oklahoma 42.725543 0.042183
38 CO Colorado 60.633604 0.035772
39 MN Minnesota 58.377538 0.038231
40 SD South Dakota 49.179947 0.043972
41 AR Arkansas 42.420943 0.036687
42 IL Illinois 58.302036 0.038849
43 NC North Carolina 45.509902 0.040223
44 KY Kentucky 47.183262 0.042192
45 NY New York 60.397543 0.038732
46 IN Indiana 45.699646 0.033932
47 OR Oregon 65.576539 0.031711
48 AZ Arizona 48.103552 0.039200
49 CA California 63.444114 0.037272
50 AL Alabama 38.319998 0.038746
max_increase_day Hillary Clinton Donald Trump
0 49.875880 40 58
1 71.458327 61 35
2 75.038230 61 34
3 45.950711 28 64
4 59.703020 44 52
5 60.857958 35 61
6 69.869221 28 46
7 46.330332 26 69
8 54.290691 28 59
9 59.859773 34 60
10 61.303224 47 48
11 62.631306 48 46
12 69.120765 55 42
13 69.865220 48 49
14 54.688533 36 57
15 66.629780 48 47
16 50.363868 38 58
17 66.439805 43 53
18 68.582409 50 45
19 56.101005 48 40
20 59.838240 42 52
21 69.963979 54 41
22 57.408731 41 55
23 74.852766 54 38
24 62.312831 38 57
25 46.934267 22 70
26 67.603738 46 51
27 67.135186 47 48
28 69.875425 55 40
29 86.506374 61 33
30 59.620669 36 57
31 70.215485 48 45
32 69.676065 53 42
33 91.777736 62 30
34 87.191519 93 4
35 44.435591 38 53
36 72.169599 48 49
37 49.243126 29 65
38 70.990829 47 44
39 66.326053 47 45
40 50.246166 32 62
41 58.604460 34 60
42 67.694760 55 39
43 58.082465 47 51
44 56.461489 33 63
45 69.515705 59 37
46 63.822595 38 57
47 79.599457 52 41
48 59.050567 45 50
49 72.205937 62 33
50 58.687263 35 63
# correlate the numeric columns only: ABBREVIATION and state hold strings, which
# DataFrame.corr refuses to convert in pandas >= 2.0
corr_matrix = df_final.select_dtypes(include="number").corr()
print(corr_matrix)
# reuse the matrix computed above instead of calling corr() a second time
sns.heatmap(corr_matrix)
plt.show()
# hypothesis test: compare max_vaccinations between the rows where the
# "Donald Trump" column is larger and the rows where "Hillary Clinton" is larger
trump_share = df_final["Donald Trump"]
clinton_share = df_final["Hillary Clinton"]
max_1 = df_final.loc[trump_share > clinton_share, "max_vaccinations"]
max_2 = df_final.loc[clinton_share > trump_share, "max_vaccinations"]
# significance level
α = 0.05
# two-sample t-test on the two groups
test_result = stats.ttest_ind(max_1, max_2)
t_statistics, p_value = test_result
reject_H0 = p_value < α
α, p_value, reject_H0
# hypothesis test: compare vaccination_rate between the rows where the
# "Donald Trump" column is larger and the rows where "Hillary Clinton" is larger
trump_share = df_final["Donald Trump"]
clinton_share = df_final["Hillary Clinton"]
rate_1 = df_final.loc[trump_share > clinton_share, "vaccination_rate"]
rate_2 = df_final.loc[clinton_share > trump_share, "vaccination_rate"]
# significance level
α = 0.05
# two-sample t-test on the two groups
test_result = stats.ttest_ind(rate_1, rate_2)
t_statistics, p_value = test_result
reject_H0 = p_value < α
α, p_value, reject_H0