# Checkpoint 1: EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# FEC candidate summary file is pipe-delimited.
df = pd.read_csv("weball20.txt", sep='|')
# df.to_csv("weball20.csv", index = None)
# Problem: the weball18/20 files ship without column names.
# We need the number of votes received per candidate and the amount of
# donations received per candidate.
# 538 Democratic candidate data (from GitHub).
data_dem = pd.read_csv("dem_candidates.csv")
data_dem.head()
# Democrat 538 Data EDA
# Democrats: aggregate total primary % per candidate and rank descending.
data = (
    data_dem[['Candidate', 'Primary %']]
    .groupby('Candidate')
    .sum()
    .sort_values('Primary %', ascending=False)
)
data.head()
# Democrats: histogram of total primary % per candidate.
plt.hist(data['Primary %'], bins=30)
plt.title("Histogram: Frequencies of Percentage of Votes Received for Candidates")

# Data cleaning: keep only rows whose primary % lies strictly between 0 and 100.
in_range = (data_dem['Primary %'] > 0) & (data_dem['Primary %'] < 100)
dem_final = data_dem[in_range]
dem_final
# Distribution of Primary % for Democratic candidates.
plt.hist(dem_final['Primary %'])
# Distribution of log(Primary %) for Democratic candidates.
plt.hist(np.log(dem_final['Primary %']))
data_dem = dem_final
# Create Bar Graph of Categories Comparison.
# Removed 'Warren Endorsed?' because the command did not recognize the column name.
# Each Yes/No column of the Dem data holds values from [nan, 'Yes', 'No'].
# One loop replaces 18 copy-pasted print statements; output order is unchanged.
yes_no_columns = [
    'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?',
    'Obama Alum?', 'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?',
    'Biden Endorsed?', 'Sanders Endorsed?', 'Our Revolution Endorsed?',
    'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?',
    'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?',
]
for col in yes_no_columns:
    print(data_dem[col].unique())  # expect [nan, 'Yes', 'No'] (order may vary)
# Build the No/Yes counts for the categorical bar graph.
# Columns shown on the x axis, in plotting order ('Warren Endorsed?' excluded
# because the column name was not recognized).
index = ['Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?', 'STEM?', 'Obama Alum?',
         'Party Support?', 'Emily Endorsed?', 'Guns Sense Candidate?', 'Biden Endorsed?',
         'Sanders Endorsed?', 'Our Revolution Endorsed?', 'Justice Dems Endorsed?',
         'PCCC Endorsed?', 'Indivisible Endorsed?', 'WFP Endorsed?', 'VoteVets Endorsed?',
         'No Labels Support?']
# BUG FIX: the original used value_counts(sort=True)[0]/[1], which picks the
# most/second-most frequent value by POSITION — that is only 'No'/'Yes' as long
# as 'No' happens to be more common in every column. Count the labels
# explicitly instead so the lists cannot be silently swapped. (NaN counts in
# neither list, matching value_counts' default dropna behavior.)
no = [int((data_dem[col] == 'No').sum()) for col in index]
yes = [int((data_dem[col] == 'Yes').sum()) for col in index]
# Bar graph plot. figsize is passed to the plot itself — the original called
# figure(figsize=(40, 60)) AFTER plotting, which only opens a new empty figure.
df = pd.DataFrame({'No': no, 'Yes': yes}, index=index)
df.plot.bar(figsize=(40, 60))
# Visualization for veteran status.
veterans_dem = data_dem[data_dem["Veteran?"] == "Yes"]
not_veterans_dem = data_dem[data_dem["Veteran?"] == "No"]
# Primary % histogram for each veteran-status subset (the original repeated
# this block twice and bound the return value to unused locals).
for subset, title in ((veterans_dem, 'Veterans Primary % Histogram'),
                      (not_veterans_dem, 'Non-veterans Primary % Histogram')):
    plt.hist(subset["Primary %"])
    plt.title(title)
    plt.xlabel("Primary%")
    plt.ylabel("Count")
    plt.show()
# Per-column histograms split by veteran status — one loop instead of four
# copy-pasted blocks. Also fixes the x label for STEM? (was "Stem?", which
# did not match the column name).
for col in ("STEM?", "Race", "LGBTQ?", "Partisan Lean"):
    data_dem[col].hist(by=data_dem["Veteran?"])
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()
%matplotlib inline
# ^ IPython magic (notebook-only; not valid plain Python): render figures inline.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
sns.set()  # apply seaborn's default plot styling globally
import pymc3 as pm
from pymc3 import glm
import statsmodels.api as sm
import statsmodels.formula.api as smf
import arviz
data_dem.head(2) # 2 to save space
# Data cleaning: drop race-administration columns we do not model on.
unused_columns = ['District', 'Office Type', 'Race Type', 'Race Primary Election Date',
                  'Primary Runoff Status', 'General Status', 'Primary Status']
dem_final = data_dem.drop(unused_columns, axis=1)
# Independent deep copy reserved for the causal-inference work below.
dem_final1 = dem_final.copy(deep=True)
dem_final1
def to_one(arr):
    """Binary-encode an iterable of categorical answers.

    Returns a list with 1 for every element equal to "Yes" or "White"
    and 0 for everything else (including NaN/None and "No").
    Matching is exact and case-sensitive, as in the original loop.
    """
    return [1 if value in ("Yes", "White") else 0 for value in arr]
# Keep the modelling columns and binary-encode every Yes/No (and Race) column.
# One loop replaces eight copy-pasted to_one assignments.
keep_columns = ["Candidate", "State", "Partisan Lean", "Primary %", "Won Primary",
                "Race", "LGBTQ?", "STEM?", "Veteran?", "Elected Official?",
                "Self-Funder?", "Obama Alum?"]
dem_final1 = dem_final[keep_columns].copy()
binary_columns = ["Won Primary", "Race", "LGBTQ?", "STEM?", "Veteran?",
                  "Elected Official?", "Self-Funder?", "Obama Alum?"]
for col in binary_columns:
    dem_final1[col] = to_one(dem_final1[col])
dem_final1
# From lab.
def fit_OLS_model(df, target_variable, explanatory_variables, intercept = False):
    """Fit an OLS regression of ``target_variable`` on ``explanatory_variables``.

    Rows with missing values are dropped by statsmodels (missing="drop").
    When ``intercept`` is True, a constant column is prepended to the design
    matrix. Returns the fitted statsmodels results object.
    """
    y = df[target_variable]
    X = df[explanatory_variables]
    if intercept:
        X = sm.add_constant(X)
    return sm.OLS(y, X, missing="drop").fit()
# OLS on the full cleaned dataset.
betas_model = fit_OLS_model(
    dem_final1,
    'Primary %',
    ['Race', 'STEM?', "LGBTQ?", "Veteran?", "Partisan Lean"],
    intercept=True,
)
print(betas_model.summary())
# Dem final dataframe restricted to swing states.
swing_states = ["AZ", "CO", "FL", "GA", "IA", "MI", "NV", "NH", "NC", "OH", "PA", "WI"]
dem_final_swing = dem_final1[dem_final1["State"].isin(swing_states)]
dem_final_swing
# Subset the swing-state dataframe to the causal-inference columns.
ci_columns = ["Candidate", "Partisan Lean", "Primary %", "Won Primary", "Race",
              "LGBTQ?", "STEM?", "Veteran?", "Elected Official?", "Self-Funder?",
              "Obama Alum?"]
dem_final_swing_CI = dem_final_swing[ci_columns].copy()
dem_final_swing_CI
# Same OLS specification on the swing-state subset.
betas_model_swing = fit_OLS_model(
    dem_final_swing_CI,
    'Primary %',
    ['Race', 'STEM?', "LGBTQ?", "Veteran?", "Partisan Lean"],
    intercept=True,
)
print(betas_model_swing.summary())
# Drop identifier columns before fitting predictive models.
dem_final_model = dem_final1.drop(["Candidate", "State"], axis=1)
dem_final_model
# Split data into testing and training sets to compare models with RMSE.
# We arbitrarily choose a 70-30 split.
from sklearn.model_selection import train_test_split

dem_train, dem_test = train_test_split(dem_final_model, test_size=0.3, random_state=42)
y_train = dem_train["Primary %"]
x_train = dem_train.drop(["Primary %", "Won Primary"], axis=1)
x_test = dem_test.drop(["Primary %", "Won Primary"], axis=1)
y_test = dem_test["Primary %"]

from sklearn.ensemble import RandomForestRegressor

# Random forest baseline. random_state is pinned so the reported RMSE is
# reproducible — the split above is already seeded, but the original forest
# was not, so rmse_rf changed on every run.
model_forest = RandomForestRegressor(max_features=3, random_state=42)
model_forest.fit(x_train, y_train)
# Predictions on the held-out set.
predictions_rf = model_forest.predict(x_test)
# RMSE in one step (sqrt of mean squared error).
rmse_rf = np.sqrt(np.mean((y_test - predictions_rf) ** 2))
rmse_rf
# Making sure the training target still looks log-normal-ish.
plt.hist(np.log(dem_train['Primary %']))
# Still some semblance of normality.

# GLM dataframe. BUG FIX: copy() — the original assigned dem_final_glm = dem_train,
# so the column renames below mutated dem_train itself through the shared object.
dem_final_glm = dem_train.copy()
dem_final_glm
# Make column names formula-friendly: spaces -> '_', then strip '?', the '_%'
# suffix (Primary_% -> Primary), and the '-' in Self-Funder.
# regex=False throughout: '?' is a regex metacharacter, so the original calls
# break on pandas versions where str.replace defaults to regex=True.
dem_final_glm.columns = (
    dem_final_glm.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('?', '', regex=False)
    .str.replace('_%', '', regex=False)
    .str.replace('-F', 'F', regex=False)
)
dem_final_glm
# Add log(Primary %) as the response column and fit a Gaussian GLM.
dem_final_glm["log_primary"] = np.log(dem_final_glm["Primary"])
gaus_formula = "log_primary ~ Partisan_Lean + Race + LGBTQ + STEM + Veteran + Elected_Official + SelfFunder + Obama_Alum"
glm_gaus = smf.glm(formula=gaus_formula, data=dem_final_glm, family=sm.families.Gaussian())
reg_glm_reg = glm_gaus.fit()
print(reg_glm_reg.summary())
# Getting the posterior trace from the Gaussian model: Bayesian fit of the
# same formula/data via pymc3's GLM wrapper.
with pm.Model() as gaus_model:
    glm.GLM.from_formula(formula=gaus_formula, data=dem_final_glm)
    # NOTE(review): only 100 draws after 1000 tuning steps — presumably kept
    # small for runtime; confirm this is enough for stable posterior summaries.
    gaus_trace = pm.sample(100, cores=4, tune = 1000, target_accept=0.95)
# Running a posterior predictive check on the Gaussian model (seeded for
# reproducibility).
with gaus_model:
    gaussian_ppc = pm.sample_posterior_predictive(
        gaus_trace, random_seed=42
    )
# Plotting the posterior predictive check on the Gaussian model. Notice the
# observed distribution truncates, which is an issue with this model.
arviz.plot_ppc(arviz.from_pymc3(posterior_predictive=gaussian_ppc, model=gaus_model))
plt.xlabel ('Log(Primary %)')
# Coefficients from the GLM model summary: intercept first, then the eight
# predictors in formula order (Partisan_Lean, Race, LGBTQ, STEM, Veteran,
# Elected_Official, SelfFunder, Obama_Alum).
# NOTE(review): these are hand-copied rounded values — regenerate from
# reg_glm_reg.params if the model is ever refit.
coefficients = np.array([2.4027, -0.0198, 0.0976, 0.0273, -0.1536, 0.1038, 0.5822, 0.5, 0.6288])

def glm_pred(row):
    """Manually compute the GLM linear predictor for one test-set row.

    ``row`` holds the eight predictor values in formula order; returns
    intercept + row . slopes (i.e. the predicted log primary %).
    The original sliced coefficients[1:10]; [1:] is equivalent and does not
    silently truncate if the coefficient vector grows.
    """
    return coefficients[0] + np.dot(row, coefficients[1:])
# Predict each held-out row with the hand-rolled GLM coefficients.
pred_glm = np.array(x_test.apply(glm_pred, axis=1))
# Back-transform from log space to the primary-percentage scale.
pred_glm = np.exp(pred_glm)
# RMSE of the GLM predictions on the test set.
rmse_glm = np.sqrt(np.mean(np.square(y_test - pred_glm)))
rmse_glm