# Checkpoint 1: EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# --- Load the raw inputs ---------------------------------------------------
# weball20.txt is a pipe-separated text file.
# NOTE(review): the author could not identify the column names of the
# weball18/20 files. read_csv treats the first line as the header by default,
# so confirm whether this file really has a header row.
df = pd.read_csv("weball20.txt", sep="|")
# df.to_csv("weball20.csv", index = None)
# Still needed from this source: number of votes received per candidate and
# amount of donations received per candidate.

# 538 candidate tables from GitHub: Republicans first, then Democrats.
data_rep = pd.read_csv("rep_candidates.csv")
data_rep.head()
data_dem = pd.read_csv("dem_candidates.csv")
data_dem.head()
# Republican 538 Data EDA
# Republicans: total 'Primary %' per candidate (one row per candidate name),
# ordered from the largest total down to the smallest.
rep_totals = data_rep.groupby('Candidate')[['Primary %']].sum()
data = rep_totals.sort_values(by='Primary %', ascending=False)
data.head()

# Republicans: histogram of the per-candidate totals. This is drawn from the
# unfiltered totals; the cleaning step below only affects `data` afterwards.
plt.hist(data['Primary %'], bins=30)
plt.title("Histogram: Frequencies of Percentage of Votes Received for Candidates")

# Data cleaning: keep only totals strictly greater than 0 and strictly below 100.
rep_pct = data['Primary %']
data = data[(rep_pct > 0) & (rep_pct < 100)]
data.head()
# Bar-graph preparation: print the distinct values found in each Republican
# endorsement/support column. The author observed [nan, 'Yes', 'No'] for
# every column listed here.
for rep_column in (
    'Rep Party Support?',
    'Trump Endorsed?',
    'Bannon Endorsed?',
    'Great America Endorsed?',
    'NRA Endorsed?',
    'Right to Life Endorsed?',
    'Susan B. Anthony Endorsed?',
    'Club for Growth Endorsed?',
    'Koch Support?',
    'House Freedom Support?',
    'Tea Party Endorsed?',
    'Main Street Endorsed?',
    'Chamber Endorsed?',
    'No Labels Support?',
):
    print(data_rep[rep_column].unique())
# Bar graph: number of 'No' vs 'Yes' answers in each Republican
# endorsement/support column.
index = [
    'Rep Party Support?',
    'Trump Endorsed?',
    'Bannon Endorsed?',
    'Great America Endorsed?',
    'NRA Endorsed?',
    'Right to Life Endorsed?',
    'Susan B. Anthony Endorsed?',
    'Club for Growth Endorsed?',
    'Koch Support?',
    'House Freedom Support?',
    'Tea Party Endorsed?',
    'Main Street Endorsed?',
    'Chamber Endorsed?',
    'No Labels Support?',
]

# Look the counts up by label ('No' / 'Yes'), not by position. value_counts()
# orders its result by frequency, so taking element 0 as "No" and element 1 as
# "Yes" silently swaps the two whenever 'Yes' is the more common answer, and
# fails outright when a column contains only one of the answers. Positional
# integer lookup on a label-indexed Series is also deprecated in recent pandas.
#
# Counts recorded by the original author (No / Yes), for cross-checking:
#   Rep Party Support? 75/63, Trump 54/17, Bannon 35/8, Great America 29/7,
#   NRA 55/14, Right to Life 227/74, Susan B. Anthony 107/23,
#   Club for Growth 104/21, Koch 90/21, House Freedom 79/14,
#   Tea Party 84/16, Main Street 81/17, Chamber 19/8, No Labels 22/2.
no = []
yes = []
for column in index:
    counts = data_rep[column].value_counts()  # missing values are not counted
    no.append(int(counts.get('No', 0)))       # 0 when the answer never occurs
    yes.append(int(counts.get('Yes', 0)))
print(len(no), no)
print(len(yes), yes)

# One row per column name, with a 'No' bar and a 'Yes' bar for each.
df = pd.DataFrame({'No': no, 'Yes': yes}, index=index)
# The size must be given to the plotting call itself: calling
# figure(figsize=...) after plotting only opens a second, empty figure and
# leaves the bar chart at the default size.
df.plot.bar(figsize=(40, 60))
plt.show()  # finish this chart so later plt.* calls start on a fresh figure
# Democrat 538 Data EDA
# Democrats: total 'Primary %' per candidate (one row per candidate name),
# ordered from the largest total down to the smallest.
dem_totals = data_dem.groupby('Candidate')[['Primary %']].sum()
data = dem_totals.sort_values(by='Primary %', ascending=False)
data.head()

# Democrats: histogram of the per-candidate totals. This is drawn from the
# unfiltered totals; the cleaning step below only affects `data` afterwards.
plt.hist(data['Primary %'], bins=30)
plt.title("Histogram: Frequencies of Percentage of Votes Received for Candidates")

# Data cleaning: keep only totals strictly greater than 0 and strictly below 100.
dem_pct = data['Primary %']
data = data[(dem_pct > 0) & (dem_pct < 100)]
data.head()
data_dem.head()
# Bar-graph preparation: print the distinct values found in each Democratic
# attribute/endorsement column. According to the author, the columns hold
# either [nan, 'Yes', 'No'] or just ['No', 'Yes'].
#
# 'Warren Endorsed?' is left out because the lookup did not recognize that
# column name.
# NOTE(review): the real header may differ slightly (e.g. whitespace) -
# check data_dem.columns before re-adding it.
for dem_column in (
    'Veteran?',
    'LGBTQ?',
    'Elected Official?',
    'Self-Funder?',
    'STEM?',
    'Obama Alum?',
    'Party Support?',
    'Emily Endorsed?',
    'Guns Sense Candidate?',
    'Biden Endorsed?',
    'Sanders Endorsed?',
    'Our Revolution Endorsed?',
    'Justice Dems Endorsed?',
    'PCCC Endorsed?',
    'Indivisible Endorsed?',
    'WFP Endorsed?',
    'VoteVets Endorsed?',
    'No Labels Support?',
):
    print(data_dem[dem_column].unique())
# Bar graph: number of 'No' vs 'Yes' answers in each Democratic
# attribute/endorsement column. 'Warren Endorsed?' is deliberately omitted
# (it was commented out of the original tally).
index = [
    'Veteran?',
    'LGBTQ?',
    'Elected Official?',
    'Self-Funder?',
    'STEM?',
    'Obama Alum?',
    'Party Support?',
    'Emily Endorsed?',
    'Guns Sense Candidate?',
    'Biden Endorsed?',
    'Sanders Endorsed?',
    'Our Revolution Endorsed?',
    'Justice Dems Endorsed?',
    'PCCC Endorsed?',
    'Indivisible Endorsed?',
    'WFP Endorsed?',
    'VoteVets Endorsed?',
    'No Labels Support?',
]

# Look the counts up by label ('No' / 'Yes'), not by position. value_counts()
# orders its result by frequency, so taking element 0 as "No" and element 1 as
# "Yes" silently swaps the two whenever 'Yes' is the more common answer, and
# fails outright when a column contains only one of the answers. Positional
# integer lookup on a label-indexed Series is also deprecated in recent pandas.
#
# The author noted 667 for 'Elected Official?' and 768 for 'Self-Funder?'
# (presumably the 'No' counts - confirm against a fresh run).
no = []
yes = []
for column in index:
    counts = data_dem[column].value_counts()  # missing values are not counted
    no.append(int(counts.get('No', 0)))       # 0 when the answer never occurs
    yes.append(int(counts.get('Yes', 0)))

# One row per column name, with a 'No' bar and a 'Yes' bar for each.
df = pd.DataFrame({'No': no, 'Yes': yes}, index=index)
# The size must be given to the plotting call itself: calling
# figure(figsize=...) after plotting only opens a second, empty figure and
# leaves the bar chart at the default size.
df.plot.bar(figsize=(40, 60))
plt.show()  # finish this chart so later plt.* calls start on a fresh figure
data_dem

# Veteran status: split the Democratic candidates on the 'Veteran?' flag.
# Rows whose flag is neither "Yes" nor "No" end up in neither group.
veteran_flag = data_dem["Veteran?"]
veterans_dem = data_dem.loc[veteran_flag == "Yes"]
not_veterans_dem = data_dem.loc[veteran_flag == "No"]

# Histogram of 'Primary %' for each group, shown one after the other.
# plt.hist returns (bin counts, bin edges, patches); the result is kept
# under the names below.
veterans_plot = plt.hist(veterans_dem["Primary %"])
plt.show()
not_veterans_plot = plt.hist(not_veterans_dem["Primary %"])
plt.show()
input_1