#import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import pearsonr
from functools import partial
import numpy as np
from scipy.optimize import curve_fit
import seaborn as sns
#import data
electiondf = pd.read_csv("npr-2016-election-data.csv")
confirmed = pd.read_csv("covid_confirmed_usafacts (1).csv")
county = pd.read_csv('covid_county_population_usafacts.csv')
#merge data -- question 3
df = county.merge(confirmed)
df.head()
df = df.merge(electiondf)
by_state_df = df.groupby('State').sum()
#drop the countyFIPS and population columns, keeping stateFIPS, the date columns, and the vote counts
by_state_df = by_state_df.iloc[:, 2:]
by_state_df.head()
#most recent cumulative case count: the last date column (the two columns after it are Clinton and Trump)
by_state_df['Total Cases'] = by_state_df.iloc[:, :-2].iloc[:, -1]
by_state_df['Total Cases']
#curve fitting
#question 4
#lists to collect the fitted logistic parameters for each state
b0 = []
b1 = []
b2 = []
#logistic model
def logistic_curve(x, β0, β1, β2):
    return β0 / (1 + np.exp(β1 * (-x + β2)))
'''
Initial guesses for curve_fit:
β0 - the number of cases observed so far (the upper asymptote)
β1 - 1 (growth rate)
β2 - half the length of the time series (the inflection point)
'''
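# Quick sanity check (illustrative only, not part of the assignment): generate
# synthetic logistic data and confirm curve_fit recovers parameters close to the
# ones used to build it, using the same style of initial guesses as above.
np.random.seed(0)
_x = np.arange(60)
_y = logistic_curve(_x, 1000, 0.2, 30) + np.random.normal(0, 5, size=len(_x))
_recovered, _ = curve_fit(logistic_curve, _x, _y, p0=[_y.max(), 1, len(_x) / 2])
print("recovered betas:", _recovered)  # expect roughly [1000, 0.2, 30]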
#function takes a state abbreviation as input and returns the series of cases as output
def cases_over_time_in_state(state_abbreviation):
    #the daily case columns sit between stateFIPS and the Clinton column in by_state_df
    date_cols = by_state_df.columns[1:by_state_df.columns.get_loc('Clinton')]
    return by_state_df.loc[state_abbreviation, date_cols]
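# Example usage of the helper above (assumes 'CA' appears in by_state_df's index):
print(cases_over_time_in_state('CA').tail())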
#fit the logistic to each state's series and collect the fitted parameters
for state, r in by_state_df.iterrows():
    ys = cases_over_time_in_state(state).astype(float)
    xs = np.arange(len(ys))
    guess_betas = [ys.max(), 1, len(ys) / 2]
    found_betas, covariance = curve_fit(logistic_curve, xs, ys, p0=guess_betas)
    β0, β1, β2 = found_betas
    b0.append(β0)
    b1.append(β1)
    b2.append(β2)
    model = lambda x: logistic_curve(x, β0, β1, β2)
    plt.scatter(xs, ys)
    plt.plot(xs, model(xs), color="red")
    plt.show()
#attach the fitted parameters, aligning on the state index so the values are not dropped as NaN
by_state_df['b0'] = pd.Series(b0, index=by_state_df.index)
by_state_df['b1'] = pd.Series(b1, index=by_state_df.index)
by_state_df['b2'] = pd.Series(b2, index=by_state_df.index)
print(by_state_df.head())
# Output (abridged): by_state_df.head() shows stateFIPS, the daily case columns
# (1/22/20 through 10/13/20), Clinton, Trump, Total Cases, and the fitted
# b0, b1, b2 columns -- [5 rows x 273 columns].
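# Optional goodness-of-fit check (illustrative, assumes 'CA' is in the index):
# compare the fitted logistic to the observed series for one state and report R^2.
obs = cases_over_time_in_state('CA').astype(float).values
fit = logistic_curve(np.arange(len(obs)),
                     by_state_df.loc['CA', 'b0'],
                     by_state_df.loc['CA', 'b1'],
                     by_state_df.loc['CA', 'b2'])
ss_res = np.sum((obs - fit) ** 2)
ss_tot = np.sum((obs - obs.mean()) ** 2)
print('R^2 for the CA logistic fit:', 1 - ss_res / ss_tot)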
#question 5
correlation = df.corr()
print(correlation)
#subset candidates (use the state-level frame, which holds Total Cases and the fitted parameters)
Clinton = by_state_df['Clinton']
Trump = by_state_df['Trump']
#define variables for correlation analysis
###########################################
#most recent number of cases
recent_total_cases = by_state_df['Total Cases']
#projected maximum number of cases (the fitted upper asymptote b0)
max_cases = by_state_df['b0']
#state-level population, aggregated from the county population file
state_population = county.groupby('State')['population'].sum()
#most recent number of cases per capita
per_capcases = recent_total_cases / state_population
#projected maximum number of cases per capita
max_percap = max_cases / state_population
#time of maximum increase: the logistic's inflection point is at x = b2
timemax = by_state_df['b2']
#rate of increase: b1 is the logistic's growth-rate parameter
roi = by_state_df['b1']
#evaluate correlations
#########################
# most recent number of cases
corr_recent_total_cases_clinton, _ = pearsonr(Clinton, recent_total_cases)
corr_recent_total_cases_trump, _ = pearsonr(Trump, recent_total_cases)
print(corr_recent_total_cases_clinton)
print(corr_recent_total_cases_trump)
# most recent number of cases per capita
corr_per_capcases_clinton, _ = pearsonr(Clinton, per_capcases)
corr_per_capcases_trump, _ = pearsonr(Trump, per_capcases)
print(corr_per_capcases_clinton)
print(corr_per_capcases_trump)
# projected maximum number of cases
corr_max_cases_clinton, _ = pearsonr(Clinton, max_cases)
corr_max_cases_trump, _ = pearsonr(Trump, max_cases)
print(corr_max_cases_clinton)
print(corr_max_cases_trump)
# projected maximum number of cases per capita
corr_max_percap_clinton, _ = pearsonr(Clinton, max_percap)
corr_max_percap_trump,_ = pearsonr(Trump, max_percap)
print(corr_max_percap_clinton)
print(corr_max_percap_trump)
# rate of increase
corr_increase_clinton, _ = pearsonr(Clinton, roi)
corr_increase_trump, _ = pearsonr(Trump, roi)
print(corr_increase_clinton)
print(corr_increase_trump)
# time of maximum increase
corr_time_max_clinton, _ = pearsonr(Clinton, timemax)
corr_time_max_trump,_ = pearsonr(Trump, timemax)
print(corr_time_max_clinton)
print(corr_time_max_trump)
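# Optional summary (illustrative): collect the correlations above into one small
# DataFrame so the Clinton and Trump columns can be compared side by side.
summary = pd.DataFrame({
    'Clinton': [corr_recent_total_cases_clinton, corr_per_capcases_clinton,
                corr_max_cases_clinton, corr_max_percap_clinton,
                corr_increase_clinton, corr_time_max_clinton],
    'Trump': [corr_recent_total_cases_trump, corr_per_capcases_trump,
              corr_max_cases_trump, corr_max_percap_trump,
              corr_increase_trump, corr_time_max_trump],
}, index=['recent cases', 'recent cases per capita', 'projected max cases',
          'projected max cases per capita', 'rate of increase', 'time of max increase'])
print(summary)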
#heatmap of correlation
ax = sns.heatmap(
correlation,
vmin=-1, vmax=1, center=0,
cmap=sns.diverging_palette(20, 220, n=200),
square=True
)
ax.set_xticklabels(
ax.get_xticklabels(),
rotation=45,
horizontalalignment='right'
);
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(correlation, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(correlation, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})