import pandas as pd
df = pd.read_csv('rawdata.csv')
len(df.index)
dup_cols = [col for col in df.columns if col not in ['Submission ID', 'Submitted at']]
df_clean = df.drop_duplicates(subset=dup_cols)
len(df_clean)
df_response = df_clean.groupby('PGY year?').agg(
count = ('Submission ID', 'nunique')
)
df_response
(df_clean['Did you vote in the 2022 Midterm Election?'] == 'I am not eligible to vote (i.e. non-US citizen)').sum()
df_eligible = df_clean[df_clean['Did you vote in the 2022 Midterm Election?'] != 'I am not eligible to vote (i.e. non-US citizen)']
len(df_eligible.index)
def did_vote(response):
if response == 'Yes':
return 1
else:
return 0
df_eligible['did_vote'] = df_eligible.apply(
lambda x:
did_vote(x['Did you vote in the 2022 Midterm Election?']),
axis=1
)
df_samestate = df_eligible.groupby('Are you registered to vote in the same state as your institution?').agg(
count = ('Respondent ID', 'nunique')
)
df_samestate
df_state = df_eligible.groupby('In which state are you registered?').agg(
count = ('Respondent ID', 'nunique')
)
df_state
df_eligible['did_vote'].sum()
turnout = df_eligible['did_vote'].sum()/len(df_eligible.index)
turnout
df_turnout = df_eligible.groupby('PGY year?').agg(
vote = ('did_vote', 'sum'),
count = ('Respondent ID', 'nunique')
)
df_turnout.reset_index(inplace=True)
df_turnout['pct_voting'] = df_turnout['vote']/df_turnout['count']
df_turnout
df_not_vote = df_eligible[df_eligible['did_vote'] != 1]
len(df_not_vote.index)
df_not_vote.columns = df_not_vote.columns.str.replace(r'What reasons contributed to your decision not to vote\?', '')
df_not_vote.columns = df_not_vote.columns.str.replace(r'\n\(Select all that apply\)', '')
df_not_vote.columns = df_not_vote.columns.str.replace(r'\(', '')
df_not_vote.columns = df_not_vote.columns.str.replace(r'\)', '')
barrier_cols = df_not_vote.columns[13:22].to_list()
barrier_cols
df_melt = df_not_vote.melt(
id_vars = ['Respondent ID'],
value_vars = barrier_cols,
var_name='barrier'
)
df_melt['value'] = df_melt['value'].replace({True: 1, False: 0})
df_melt.dropna(inplace=True)
df_barriers = df_melt.groupby('barrier').agg(
count = ('value', 'sum'),
total = ('Respondent ID', 'nunique')
)
df_barriers.reset_index(inplace=True)
df_barriers
df_barriers['count'] = df_barriers['count'].replace(6, 7)
df_barriers = df_barriers[~df_barriers['barrier'].isin(['Other', 'Please describe...'])]
df_barriers.sort_values(by='count', ascending=False, inplace=True)
df_barriers['pct'] = df_barriers['count']/df_barriers['total']
df_barriers