import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
# Load the survey features and the vaccination labels, both indexed by
# respondent, and combine them into one frame for the grouped plots below.
train_features = pd.read_csv("data/training_set_features.csv", index_col="respondent_id")
train_labels = pd.read_csv("data/training_set_labels.csv", index_col="respondent_id")
df = train_features.join(train_labels)

# Percentage of missing values per feature, the 15 worst columns first.
# Printed explicitly: a bare expression is only echoed by an interactive
# session and is silently discarded when the file runs as a script.
missing_pct = train_features.isnull().sum() * 100 / len(train_features)
print(missing_pct.sort_values(ascending=False).head(15))
print(f"Number of Respondents is {train_features.shape[0]} and there are {train_features.shape[1]} attributes.")

# Share of respondents per label value for each vaccine, drawn as two stacked
# panels that share the x-axis so the proportions are directly comparable.
plt.style.use('fivethirtyeight')
n_obs = train_labels.shape[0]
fig, ax = plt.subplots(2, 1, sharex=True)

# Compute each proportion table once and derive both positions and widths
# from it, so the two lists are guaranteed to be aligned.
h1n1_share = train_labels["h1n1_vaccine"].value_counts().div(n_obs)
x1 = h1n1_share.index.tolist()
y1 = h1n1_share.values.tolist()
ax[0].barh(x1, y1)
ax[0].set_yticks([0, 1])
ax[0].set_ylabel("H1N1")

seasonal_share = train_labels["seasonal_vaccine"].value_counts().div(n_obs)
x2 = seasonal_share.index.tolist()
y2 = seasonal_share.values.tolist()
ax[1].barh(x2, y2)
ax[1].set_yticks([0, 1])
ax[1].set_ylabel("Seasonal")

plt.suptitle('Proportion of Vaccines')
fig.tight_layout()
def _vaccine_shares(target, col):
    """Return, per value of ``col``, the share of each ``target`` label in ``df``.

    The result is indexed by the sorted distinct values of ``col`` and has one
    column per ``target`` label; each row sums to 1.
    """
    counts = (
        df[[target, col]]
        .groupby([target, col])
        .size()
        # fill_value=0: a category with no respondents for one label must
        # count as zero, not NaN, or its stacked bar cannot be drawn.
        .unstack(target, fill_value=0)
    )
    return counts.div(counts.sum(axis="columns"), axis="index")


def plot(col, title, labels=None):
    """Plot vaccination shares for every value of one survey column.

    Creates a figure with two side-by-side panels sharing the y-axis, one for
    ``h1n1_vaccine`` and one for ``seasonal_vaccine``. For each distinct value
    of ``col`` in the module-level ``df``, a horizontal bar is stacked from
    the share of respondents with label 0 ("No") and label 1 ("Yes").

    Args:
        col: Name of the column in ``df`` to group respondents by.
        title: Figure-level title.
        labels: Optional tick labels, one per distinct value of ``col``.
            They are applied positionally to the group values in sorted
            order (groupby sorts its keys by default), so the list must be
            given in that order. When omitted, the raw values are shown.
    """
    plt.style.use('fivethirtyeight')
    fig, axes = plt.subplots(1, 2, sharey=True, figsize=(12, 6))

    panels = (
        ("h1n1_vaccine", "H1N1 Vaccine"),
        ("seasonal_vaccine", "Seasonal Vaccine"),
    )
    categories = None
    for axis, (target, panel_title) in zip(axes, panels):
        shares = _vaccine_shares(target, col)
        positions = shares.index.tolist()
        if categories is None:
            # Tick positions are taken from the first (H1N1) panel.
            categories = positions
        no_share = shares[0].tolist()
        yes_share = shares[1].tolist()
        axis.barh(positions, no_share, label="No")
        # Stack the "Yes" share to the right of the "No" share.
        axis.barh(positions, yes_share, left=no_share, label="Yes")
        axis.set_title(panel_title)
        axis.legend(loc="upper center")

    plt.suptitle(title)
    # pyplot acts on the current axes; with sharey=True the tick setup is
    # expected to carry over to the other panel as well.
    plt.yticks(categories, labels)
    plt.tight_layout()
# plot() applies `labels` positionally to the *sorted* values of the column,
# so every list below is written in ascending code order (0, 1, ... or
# 1, 2, ...), not in order of frequency.
# NOTE(review): the meaning of each ordinal code is assumed from the survey's
# data dictionary (e.g. 3 = "Don't know" on the 1-5 opinion scales) - confirm.
NO_YES_LABELS = ["No", "Yes"]
CONCERN_LABELS = ["Not at all concerned", "Not very concerned", "Somewhat concerned", "Very concerned"]
KNOWLEDGE_LABELS = ["No knowledge", "A little knowledge", "A lot of knowledge"]
EFFECTIVENESS_LABELS = ["Not at all effective", "Not very effective", "Don't know", "Somewhat effective", "Very effective"]
RISK_LABELS = ["Very low", "Somewhat low", "Don't know", "Somewhat high", "Very high"]
WORRY_LABELS = ["Not at all worried", "Not very worried", "Don't know", "Somewhat worried", "Very worried"]

# Ordinal H1N1 awareness questions.
plot(col="h1n1_concern", labels=CONCERN_LABELS, title="Level of concern about the H1N1 flu")
plot(col="h1n1_knowledge", labels=KNOWLEDGE_LABELS, title="Level of knowledge about H1N1 flu")

# Binary (0 = No, 1 = Yes) behavioural and background questions.
plot(col="behavioral_antiviral_meds", labels=NO_YES_LABELS, title="Has taken antiviral medications")
plot(col="behavioral_avoidance", labels=NO_YES_LABELS, title="Has avoided close contact with others with flu-like symptoms")
plot(col="behavioral_face_mask", labels=NO_YES_LABELS, title="Has bought a face mask")
plot(col="behavioral_wash_hands", labels=NO_YES_LABELS, title="Has frequently washed hands or used hand sanitizer")
plot(col="behavioral_large_gatherings", labels=NO_YES_LABELS, title="Has reduced time at large gatherings")
plot(col="behavioral_outside_home", labels=NO_YES_LABELS, title="Has reduced contact with people outside of own household")
plot(col="doctor_recc_h1n1", labels=NO_YES_LABELS, title="H1N1 flu vaccine was recommended by doctor")
plot(col="doctor_recc_seasonal", labels=NO_YES_LABELS, title="Seasonal flu vaccine was recommended by doctor")
plot(col="chronic_med_condition", labels=NO_YES_LABELS, title="Has chronic medical conditions")
plot(col="child_under_6_months", labels=NO_YES_LABELS, title="Has regular close contact with a child under the age of six months")
plot(col="health_worker", labels=NO_YES_LABELS, title="Is a healthcare worker")
plot(col="health_insurance", labels=NO_YES_LABELS, title="Has health insurance")

# Five-point opinion scales.
plot(col="opinion_h1n1_vacc_effective", labels=EFFECTIVENESS_LABELS,
     title="Respondent's opinion about H1N1 vaccine effectiveness")
plot(col="opinion_h1n1_risk", labels=RISK_LABELS,
     title="Respondent's opinion about risk of getting sick with H1N1 flu without vaccine")
plot(col="opinion_h1n1_sick_from_vacc", labels=WORRY_LABELS,
     title="Respondent's worry of getting sick from taking H1N1 vaccine")
plot(col="opinion_seas_vacc_effective", labels=EFFECTIVENESS_LABELS,
     title="Respondent's opinion about seasonal flu vaccine effectiveness")
plot(col="opinion_seas_risk", labels=RISK_LABELS,
     title="Respondent's opinion about risk of getting sick with seasonal flu without vaccine")
plot(col="opinion_seas_sick_from_vacc", labels=WORRY_LABELS,
     title="Respondent's worry of getting sick from taking seasonal flu vaccine")

# Demographic and household columns: no custom labels, raw values are shown.
plot(col="age_group", title="Age group of respondent")
plot(col="education", title="Self-reported education level")
plot(col="race", title="Race of respondent")
plot(col="sex", title="Sex of respondent")
plot(col="income_poverty", title="Household annual income of respondent with respect to 2008 Census poverty thresholds")
plot(col="marital_status", title="Marital status of respondent")
plot(col="rent_or_own", title="Housing situation of respondent")
plot(col="employment_status", title="Employment status of respondent")
plot(col="hhs_geo_region", title="Respondent's residence using a 10-region geographic classification")
plot(col="census_msa", title="Respondent's residence within metropolitan statistical areas (MSA) as defined by the U.S. Census")
plot(col="household_adults", title="Number of other adults in household")
plot(col="household_children", title="Number of children in household")
plot(col="employment_industry", title="Type of industry respondent is employed in")
plot(col="employment_occupation", title="Type of occupation of respondent")