import pandas as pd
from tqdm import tqdm
from xone import utils, cache
# Download every HTML table found on the Wikipedia page; only the first
# table (states[0]) is used below.
states = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
)
# Notebook-style peek at the raw values of the first column.
states[0].iloc[:, 0].values
# Turn the first column into the slugs used later in the per-state URLs:
# strip the literal "[E]" footnote marker and replace spaces with hyphens.
# Both replacements are plain-text substitutions, so regex=False is passed
# explicitly: the result no longer depends on the pandas version's default
# for `regex`, and no backslash-escaped pattern is needed.
all_states = (
    states[0].iloc[:, 0]
    .str.replace('[E]', '', regex=False)
    .str.replace(' ', '-', regex=False)
    .values
)
all_states
@cache.with_cache(data_path='States/AllCounties', file_fmt='[[today]] {state}.pkl', update_freq='2D')
def state_poll_data(state: str) -> pd.DataFrame:
    """
    Fetch the results table for one state from Sky News.

    Args:
        state: state slug appended to the Sky News election URL.

    Returns:
        The first HTML table on the state's page, read with its first row
        as the header, plus a ``State`` column holding ``state``.

    NOTE(review): the ``cache.with_cache`` decorator comes from ``xone``;
    presumably it stores the result per state and refreshes it
    periodically - confirm against the xone documentation.
    """
    page_url = f'https://election.news.sky.com/us-election/{state}'
    page_tables = pd.read_html(page_url, header=0)
    # Only the first table on the page is used.
    results = page_tables[0]
    return results.assign(State=state)
# Try the scraper on a single state first.
state_poll_data(state='Pennsylvania')


def _to_numeric_if_possible(column: pd.Series) -> pd.Series:
    """
    Convert a column to a numeric dtype, leaving it untouched on failure.

    Explicit replacement for ``pd.to_numeric(..., errors='ignore')``, which
    is deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Stack the tables of every state, drop the per-state "Total" rows, convert
# each column that can be parsed as numbers, and renumber the rows.
data = (
    pd.concat(
        map(state_poll_data, tqdm(all_states)),
        sort=False,
    )
    .query('County != "Total"')
    .apply(_to_numeric_if_possible)
    .reset_index(drop=True)
)
data[['State', 'County', 'Biden', 'Trump', 'Other']]
# Progress-bar output captured from the original notebook run (kept for
# reference; as a bare line it is not valid Python):
# 100%|██████████| 50/50 [00:01<00:00, 29.27it/s]
# Rows where either main candidate has a zero count.
data.query('(Biden == 0) or (Trump == 0)')
def first(sub: pd.Series) -> pd.Series:
    """
    Count first number frequency

    Each value is converted to its string form and its first character is
    read as a digit. The result counts how often every non-zero leading
    digit occurs, ordered by digit.

    Args:
        sub: values to analyse; missing values are skipped.

    Returns:
        Series indexed by leading digit (zero excluded) holding the number
        of values that start with that digit.
    """
    leading = (
        sub
        .dropna()  # 'nan' / 'None' have no leading digit to parse
        .astype(str)
        .str[0]
        .astype(int)
    )
    # Drop the zero bucket by value rather than by position: slicing off the
    # first row would discard the count for digit 1 whenever no value
    # starts with 0.
    return (
        leading[leading != 0]
        .value_counts()
        .sort_index()
    )
# Leading-digit counts for one candidate (notebook-style display).
first(data.Biden)

# Leading-digit counts for every candidate column, drawn as a bar chart
# with one colour per candidate.
candidate_votes = data[['Biden', 'Trump', 'Other']]
digit_counts = candidate_votes.apply(first, axis=0)
digit_axes = digit_counts.plot(
    kind='bar',
    figsize=(14, 6),
    color=['#019BD8', '#D81C28', '#FFCC00'],
    linewidth=0,
    width=.85,
)
digit_axes.legend(fontsize='x-large')
from kit import xml_parse
# Zipped detail XML report for Allegheny, PA on the Clarity results site.
detail_zip_url = 'https://results.enr.clarityelections.com//PA/Allegheny/106267/270118/reports/detailxml.zip'

# Read the archive with the project's XML reader, passing the DEM and REP
# party codes through to it.
dist_data = utils.read_zip(
    zip_url=detail_zip_url,
    read_func=xml_parse.read_xml,
    parties=['DEM', 'REP'],
)
dist_data
xml_parse.freq_plot(dist_data)
def group_dist(dist_name: str) -> str:
    """
    Reduce a district name to the label used for grouping.

    Returns the text before the first ' DIST ' marker, or the whole name
    when the marker does not occur.
    """
    prefix, _marker, _remainder = dist_name.partition(' DIST ')
    return prefix
def agg_county(cnt_data: pd.DataFrame) -> pd.DataFrame:
    """
    Sum the rows of a county table that share a grouping label.

    Args:
        cnt_data: table whose index holds district names.

    Returns:
        Table indexed by the ``group_dist`` label of each original row,
        with the columns summed over all rows that share a label.
    """
    # Passed as an iterator on purpose: set_index treats a plain list as a
    # list of column keys, not as the new index values.
    group_labels = (group_dist(name) for name in cnt_data.index)
    relabelled = cnt_data.set_index(group_labels)
    return relabelled.pivot_table(index=relabelled.index, aggfunc='sum')
# Notebook-style display of the district table after rows sharing a
# group_dist label have been summed together.
agg_county(dist_data)
# Pass the aggregated table to the same freq_plot helper that was applied to
# the raw district data above.
xml_parse.freq_plot(agg_county(dist_data))