Checkpoint 1: EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Load the FEC "all candidates" summary file; it is pipe-delimited, not CSV
df = pd.read_csv("weball20.txt", delimiter = '|')
# df.to_csv("weball20.csv", index = None)
# problem: do not know column names in weball18/20 files
# we need to know # of votes received per candidate & amount of donations received per candidate
# 538 Democratic data from Github
data_dem = pd.read_csv("dem_candidates.csv")
data_dem.head()
Democrat 538 Data EDA
# Democrats: Group by candidate, x axis: candidate & y axis: primary %
data = data_dem[['Candidate', 'Primary %']].groupby('Candidate').sum() # sum the primary % received per candidate
data = data.sort_values('Primary %', ascending=False) # sort in descending order
data.head()
# Democrats: Plot histogram
# NOTE(review): these are summed percentages per candidate, so values can exceed 100
plt.hist(data['Primary %'], bins=30)
plt.title("Histogram: Frequencies of Percentage of Votes Received for Candidates")
# Data Cleaning: Remove rows with primary % at 100% or more, or 0% and lower.
# The filter is applied to the raw per-row data_dem, not the grouped sums above.
dem_final = data_dem[(data_dem['Primary %'] > 0) & (data_dem['Primary %'] < 100)]
dem_final
#Distribution of Primary % for Democratic Candidates
plt.hist(dem_final['Primary %'])
#Distribution of log(Primary %) for Democratic Candidates
# log is safe here: rows with Primary % <= 0 were just filtered out
plt.hist(np.log(dem_final['Primary %']))
# Rebind data_dem to the cleaned data; everything below uses the filtered rows
data_dem = dem_final
# Create Bar Graph of Categories Comparison
# Removed Warren because command did not recognize column name
# Some columns of Dem Data have options: [nan, 'Yes', 'No'] or ['No', 'Yes']
# Print the unique values of each Yes/No indicator column, in the same order
# the columns are used below for the bar graph.
for _col in ['Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?',
             'Obama Alum?', 'Party Support?', 'Emily Endorsed?',
             'Guns Sense Candidate?', 'Biden Endorsed?',
             # 'Warren Endorsed?' omitted: command did not recognize column name
             'Sanders Endorsed?', 'Our Revolution Endorsed?', 'Justice Dems Endorsed?',
             'PCCC Endorsed?', 'Indivisible Endorsed?', 'WFP Endorsed?',
             'VoteVets Endorsed?', 'No Labels Support?']:
    print(data_dem[_col].unique())  # some ordering of [nan, 'Yes', 'No']
['Yes' 'No' nan]
['No' 'Yes' nan]
['No' 'Yes' nan]
['No' 'Yes']
['No' 'Yes' nan]
['No' 'Yes' nan]
[nan 'Yes' 'No']
[nan 'No' 'Yes']
['No' 'Yes' nan]
[nan 'Yes' 'No']
[nan 'Yes' 'No']
[nan 'Yes' 'No']
[nan 'Yes' 'No']
[nan 'No' 'Yes']
[nan 'No' 'Yes']
[nan 'Yes' 'No']
[nan 'No' 'Yes']
[nan 'No' 'Yes']
# Create Lists for Bar graph of categories values
def _yes_no_counts(df, columns):
    """Return parallel lists of 'No' and 'Yes' counts for each column.

    The original code used value_counts(sort=True)[0] / [1], which (a) assumes
    'No' is always the most frequent answer — not guaranteed for every column —
    and (b) relies on deprecated positional fallback indexing on a string index.
    Label-based lookup is correct regardless of ordering; a missing label
    (e.g. a column with no 'Yes' answers) counts as 0.
    """
    no_counts = [df[c].value_counts().get('No', 0) for c in columns]
    yes_counts = [df[c].value_counts().get('Yes', 0) for c in columns]
    return no_counts, yes_counts

# 'Warren Endorsed?' omitted: command did not recognize the column name
_category_columns = ['Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?',
                     'Obama Alum?', 'Party Support?', 'Emily Endorsed?',
                     'Guns Sense Candidate?', 'Biden Endorsed?', 'Sanders Endorsed?',
                     'Our Revolution Endorsed?', 'Justice Dems Endorsed?',
                     'PCCC Endorsed?', 'Indivisible Endorsed?', 'WFP Endorsed?',
                     'VoteVets Endorsed?', 'No Labels Support?']
no, yes = _yes_no_counts(data_dem, _category_columns)
# Bar Graph Plot
index = ['Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?', 'Obama Alum?',
'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?', 'Biden Endorsed?', 'Sanders Endorsed?', 'Our Revolution Endorsed?',
'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?']
# NOTE(review): this rebinds `df`, shadowing the FEC dataframe loaded at the top
df = pd.DataFrame({'No': no, 'Yes': yes}, index = index)
df.plot.bar()
# NOTE(review): calling figure() *after* plot.bar() opens a new blank figure;
# it does not resize the bar chart — pass figsize to plot.bar() instead
figure(figsize=(40,60))
#Visualization for veteran status
# Split the cleaned data into veterans and non-veterans (rows with NaN fall in neither)
veterans_dem = data_dem[data_dem["Veteran?"]=="Yes"]
not_veterans_dem = data_dem[data_dem["Veteran?"]=="No"]
#Histogram of primary vote share for each group
veterans_plot = plt.hist(veterans_dem["Primary %"])
plt.title('Veterans Primary % Histogram')
plt.xlabel("Primary%")
plt.ylabel("Count")
plt.show()
not_veterans_plot = plt.hist(not_veterans_dem["Primary %"])
plt.title('Non-veterans Primary % Histogram')
plt.xlabel("Primary%")
plt.ylabel("Count")
plt.show()
# Side-by-side histograms of each covariate, faceted by veteran status,
# to eyeball covariate balance between the two groups
data_dem["STEM?"].hist(by=data_dem["Veteran?"])
plt.xlabel("Stem?")
plt.ylabel("Count")
plt.show()
data_dem["Race"].hist(by=data_dem["Veteran?"])
plt.xlabel("Race")
plt.ylabel("Count")
plt.show()
data_dem["LGBTQ?"].hist(by=data_dem["Veteran?"])
plt.xlabel("LGBTQ?")
plt.ylabel("Count")
plt.show()
data_dem["Partisan Lean"].hist(by=data_dem["Veteran?"])
plt.xlabel("Partisan Lean")
plt.ylabel("Count")
plt.show()
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
sns.set()
import pymc3 as pm
from pymc3 import glm
import statsmodels.api as sm
import statsmodels.formula.api as smf
import arviz
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
data_dem.head(2) # 2 to save space
# data cleaning: only keep the columns we need
dem_final = data_dem.drop(['District', 'Office Type', 'Race Type', 'Race Primary Election Date',
'Primary Runoff Status', 'General Status', 'Primary Status'], axis = 1)
#dataframe copy we will use for Causal Inferencing
# NOTE(review): this deep copy is immediately replaced by the column-subset
# assignment to dem_final1 below, so it only serves as a placeholder here
dem_final1 = dem_final.copy(deep=True)
dem_final1
def to_one(arr):
    """Binarize a sequence: 'Yes' and 'White' map to 1, everything else
    (including 'No', NaN, and any other value) maps to 0."""
    return [1 if item in ("Yes", "White") else 0 for item in arr]
dem_final1= dem_final[["Candidate", "State", "Partisan Lean","Primary %", "Won Primary", "Race" ,"LGBTQ?","STEM?", "Veteran?", "Elected Official?", "Self-Funder?", "Obama Alum?" ]].copy()
# Binarize every categorical indicator column in one pass (same columns and
# order as the original per-column assignments)
for _binary_col in ["Won Primary", "Race", "LGBTQ?", "STEM?",
                    "Veteran?", "Elected Official?", "Self-Funder?", "Obama Alum?"]:
    dem_final1[_binary_col] = to_one(dem_final1[_binary_col])
dem_final1
#From lab
def fit_OLS_model(df, target_variable, explanatory_variables, intercept = False):
    """Fit an ordinary-least-squares model of `target_variable` on
    `explanatory_variables` from `df`.

    Rows with missing values are dropped by statsmodels (missing="drop").
    When `intercept` is True, a constant column is prepended to the inputs.
    Returns the fitted statsmodels results object.
    """
    y = df[target_variable]
    X = df[explanatory_variables]
    if intercept:
        X = sm.add_constant(X)
    return sm.OLS(y, X, missing="drop").fit()
# OLS of Primary % on candidate attributes over the full cleaned dataset
betas_model = fit_OLS_model(dem_final1, 'Primary %', ['Race', 'STEM?',"LGBTQ?","Veteran?", "Partisan Lean"], intercept = True)
print(betas_model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Primary % R-squared: 0.082
Model: OLS Adj. R-squared: 0.076
Method: Least Squares F-statistic: 13.49
Date: Tue, 14 Dec 2021 Prob (F-statistic): 1.29e-12
Time: 01:10:31 Log-Likelihood: -3343.1
No. Observations: 758 AIC: 6698.
Df Residuals: 752 BIC: 6726.
Df Model: 5
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const 22.0734 1.203 18.350 0.000 19.712 24.435
Race 2.6558 1.485 1.788 0.074 -0.259 5.571
STEM? -4.8473 1.915 -2.531 0.012 -8.607 -1.087
LGBTQ? -1.1016 3.371 -0.327 0.744 -7.720 5.516
Veteran? 0.3910 2.022 0.193 0.847 -3.578 4.360
Partisan Lean -0.2579 0.036 -7.218 0.000 -0.328 -0.188
==============================================================================
Omnibus: 76.073 Durbin-Watson: 1.969
Prob(Omnibus): 0.000 Jarque-Bera (JB): 98.380
Skew: 0.882 Prob(JB): 4.34e-22
Kurtosis: 3.064 Cond. No. 113.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
#Dem Final dataframe with only swing states inside
dem_final_swing = dem_final1[dem_final1["State"].isin(["AZ","CO","FL","GA","IA","MI", "NV", "NH","NC","OH","PA","WI"])]
dem_final_swing
#Subsetting the swing state dataframe even further (drop State, keep model columns)
dem_final_swing_CI = dem_final_swing[["Candidate", "Partisan Lean","Primary %", "Won Primary", "Race" ,"LGBTQ?","STEM?", "Veteran?", "Elected Official?", "Self-Funder?", "Obama Alum?" ]].copy()
dem_final_swing_CI
# Re-fit the same OLS specification on swing states only, for comparison
betas_model_swing = fit_OLS_model(dem_final_swing_CI, 'Primary %', ['Race', 'STEM?',"LGBTQ?","Veteran?", "Partisan Lean"], intercept = True)
print(betas_model_swing.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Primary % R-squared: 0.106
Model: OLS Adj. R-squared: 0.086
Method: Least Squares F-statistic: 5.219
Date: Tue, 14 Dec 2021 Prob (F-statistic): 0.000150
Time: 01:10:32 Log-Likelihood: -1004.3
No. Observations: 226 AIC: 2021.
Df Residuals: 220 BIC: 2041.
Df Model: 5
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const 23.6912 2.134 11.101 0.000 19.485 27.897
Race 7.5483 2.823 2.673 0.008 1.984 13.113
STEM? -4.3046 3.680 -1.170 0.243 -11.557 2.948
LGBTQ? -1.1493 7.563 -0.152 0.879 -16.054 13.755
Veteran? -4.3058 4.276 -1.007 0.315 -12.733 4.121
Partisan Lean -0.2690 0.069 -3.900 0.000 -0.405 -0.133
==============================================================================
Omnibus: 24.007 Durbin-Watson: 2.034
Prob(Omnibus): 0.000 Jarque-Bera (JB): 29.541
Skew: 0.882 Prob(JB): 3.85e-07
Kurtosis: 2.847 Cond. No. 121.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Drop identifier columns that should not be model features
dem_final_model = dem_final1.drop(["Candidate", "State"], axis = 1)
dem_final_model
#Split data into testing and training set to compare models with RMSE. We will arbitrarily choose a 70-30 split
from sklearn.model_selection import train_test_split
dem_train, dem_test = train_test_split(dem_final_model, test_size= 0.3, random_state=42)
y_train = dem_train["Primary %"]
# Features exclude both the regression target and the Won Primary outcome
x_train = dem_train.drop(["Primary %", "Won Primary"], axis = 1)
x_test = dem_test.drop(["Primary %", "Won Primary"], axis = 1)
y_test = dem_test["Primary %"]
from sklearn.ensemble import RandomForestRegressor
#/work/updated_notebook Mil (6) (1) (1)-2.ipynb
#Setting up random forest regressor; consider up to 3 features per split
model_forest = RandomForestRegressor(max_features= 3)
model_forest.fit(x_train, y_train)
#Predictions on the held-out test set
predictions_rf = model_forest.predict(x_test)
#Calculating RMSE: mean squared error first, then square root
rmse_rf = np.mean((y_test - predictions_rf)**2)
rmse_rf = np.sqrt(rmse_rf)
rmse_rf
#Making sure training set still has normal distribution
plt.hist(np.log(dem_train['Primary %']))
#Still some semblance of normality
#setting up glm dataframe
# Copy: the original `dem_final_glm = dem_train` aliased the same object, so
# the column renames below silently mutated dem_train as well.
dem_final_glm = dem_train.copy()
dem_final_glm
def _clean_column_name(name):
    """Make a column name usable in a patsy formula: spaces -> '_', drop '?',
    'Primary_%' -> 'Primary', and 'Self-Funder' -> 'SelfFunder'.
    Replacements are applied in the same order as the original chained calls."""
    return name.replace(' ', '_').replace('?', '').replace('_%', '').replace('-F', 'F')
# Literal string replacement. The old .str.replace(...) calls relied on the
# regex=True default, which treats '?' as a regex metacharacter and emitted
# the FutureWarning about single-character patterns.
dem_final_glm.columns = [_clean_column_name(c) for c in dem_final_glm.columns]
dem_final_glm
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:5: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.
"""
#Adding log(Primary %) as column (response for the Gaussian GLM below)
dem_final_glm["log_primary"] = np.log(dem_final_glm["Primary"])
# Formula uses the cleaned (underscore, no-'?') column names
gaus_formula = "log_primary ~ Partisan_Lean + Race + LGBTQ + STEM + Veteran + Elected_Official + SelfFunder + Obama_Alum"
# Gaussian family with identity link: equivalent to OLS on log_primary
glm_gaus = smf.glm(formula=gaus_formula, data=dem_final_glm, family=sm.families.Gaussian())
reg_glm_reg = glm_gaus.fit()
print(reg_glm_reg.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: log_primary No. Observations: 530
Model: GLM Df Residuals: 521
Model Family: Gaussian Df Model: 8
Link Function: identity Scale: 0.94340
Method: IRLS Log-Likelihood: -732.06
Date: Tue, 14 Dec 2021 Deviance: 491.51
Time: 01:10:33 Pearson chi2: 492.
No. Iterations: 3 Pseudo R-squ. (CS): 0.1755
Covariance Type: nonrobust
====================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------
Intercept 2.4027 0.079 30.435 0.000 2.248 2.557
Partisan_Lean -0.0198 0.002 -8.972 0.000 -0.024 -0.015
Race 0.0976 0.087 1.128 0.259 -0.072 0.267
LGBTQ 0.0273 0.178 0.154 0.878 -0.321 0.375
STEM -0.1536 0.115 -1.337 0.181 -0.379 0.071
Veteran 0.1038 0.115 0.902 0.367 -0.122 0.329
Elected_Official 0.5822 0.121 4.795 0.000 0.344 0.820
SelfFunder 0.5000 0.188 2.656 0.008 0.131 0.869
Obama_Alum 0.6288 0.215 2.921 0.003 0.207 1.051
====================================================================================
#Getting trace from Gaussian model
# Bayesian version of the same GLM formula, sampled with NUTS.
# Only 100 draws per chain (after 1000 tuning steps) — small, as the
# sampler warning in the output notes.
with pm.Model() as gaus_model:
    glm.GLM.from_formula(formula=gaus_formula, data=dem_final_glm)
    gaus_trace = pm.sample(100, cores=4, tune = 1000, target_accept=0.95)
The glm module is deprecated and will be removed in version 4.0
We recommend to instead use Bambi https://bambinos.github.io/bambi/
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: FutureWarning: In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
after removing the cwd from sys.path.
Only 100 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sd, Obama_Alum, SelfFunder, Elected_Official, Veteran, STEM, LGBTQ, Race, Partisan_Lean, Intercept]
Sampling 4 chains for 1_000 tune and 100 draw iterations (4_000 + 400 draws total) took 56 seconds.
#Running posterior predictive check on Gaussian model
with gaus_model:
    gaussian_ppc = pm.sample_posterior_predictive(
        gaus_trace, random_seed=42
    )
#Plotting posterior predictive check on Gaussian Model. Notice observations truncate, which is an issue with the model.
arviz.plot_ppc(arviz.from_pymc3(posterior_predictive=gaussian_ppc, model=gaus_model))
plt.xlabel ('Log(Primary %)')
#Coefficients from GLM model
# Hard-coded from the fitted GLM summary above, in formula order:
# [Intercept, Partisan_Lean, Race, LGBTQ, STEM, Veteran, Elected_Official, SelfFunder, Obama_Alum]
coefficients = np.array([2.4027,-0.0198,0.0976,0.0273,-0.1536,0.1038,0.5822,0.5,0.6288])
#Function that takes in the testing df rows and manually calculates the predicted values
def glm_pred(row, coefs=None):
    """Compute the GLM linear prediction for one row of features.

    Parameters
    ----------
    row : sequence of feature values, in the same order as `coefs[1:]`.
    coefs : optional coefficient vector; `coefs[0]` is the intercept and the
        remainder align with `row`. Defaults to the module-level
        `coefficients` fitted above (the original hard-coded that global and
        used a magic slice [1:10]).

    Returns the predicted value on the log(Primary %) scale.
    """
    if coefs is None:
        coefs = coefficients  # fall back to the fitted GLM coefficients
    coefs = np.asarray(coefs)
    return np.dot(row, coefs[1:]) + coefs[0]
#Finding prediction of testing set (predictions are on the log scale)
pred_glm = np.array(x_test.apply(glm_pred, axis = 1))
#Converting to correct primary form
# The GLM was fit on log(Primary %), so exponentiate to get back to percentages
pred_glm = np.exp(pred_glm)
# RMSE of the GLM predictions on the held-out test set, comparable to rmse_rf
rmse_glm = np.mean((y_test - pred_glm)**2)
rmse_glm = np.sqrt(rmse_glm)
rmse_glm