# import libraries
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# object to hold our dataframes
dfs = {}
# read in ap_results and print tabular information
dfs["ap_results"] = pd.read_csv("ap_results.csv")
dfs["ap_results"].describe(include="all")
# read in class_size and print tabular information
dfs["class_size"] = pd.read_csv("class_size.csv")
dfs["class_size"].describe(include="all")
# read in demographics and print tabular information
dfs["demographics"] = pd.read_csv("demographics.csv")
dfs["demographics"].describe(include="all")
# read in grad_outcomes and print tabular information
dfs["grad_outcomes"] = pd.read_csv("grad_outcomes.csv")
dfs["grad_outcomes"].describe(include="all")
# read in sat_results and print tabular information
dfs["sat_results"] = pd.read_csv("sat_results.csv")
dfs["sat_results"].describe(include="all")
# concatenate the surveys
dfs["survey_all"] = pd.read_csv("survey_all.txt", delimiter="\t", encoding="windows-1252")
dfs["survey_d75"] = pd.read_csv("survey_d75.txt", delimiter="\t", encoding="windows-1252")
dfs["survey"] = pd.concat([dfs["survey_all"], dfs["survey_d75"]], axis=0)
# filter by these columns
dfs["survey"] = dfs["survey"][[
"dbn",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]]
# print summary information
dfs["survey"].describe(include="all")
# uppercase the dbn column in the "survey" dataset so that it's consistent
dfs["survey"] = dfs["survey"].rename({"dbn": "DBN"}, axis=1)
# create a DBN column out of columns in the "class_size" dataset
dfs["class_size"]["padded_csd"] = dfs["class_size"]["CSD"].apply(lambda x : str(x) if len(str(x)) > 1 else str(x).zfill(2))
dfs["class_size"]["DBN"] = dfs["class_size"]["padded_csd"] + dfs["class_size"]["SCHOOL CODE"]
# convert relevant columns from object to float
for column in ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']:
dfs["sat_results"][column] = pd.to_numeric(dfs["sat_results"][column], errors="coerce")
# sum those columns to get a new total column
dfs['sat_results']['sat_score'] = dfs['sat_results']['SAT Math Avg. Score'] + dfs['sat_results']['SAT Critical Reading Avg. Score'] + dfs['sat_results']['SAT Writing Avg. Score']
# filter class_size dataset
dfs["class_size"] = dfs["class_size"][dfs["class_size"]["GRADE "] == "09-12"]
dfs["class_size"] = dfs["class_size"][dfs["class_size"]["PROGRAM TYPE"] == "GEN ED"]
dfs["class_size"] = dfs["class_size"].groupby("DBN").agg(np.mean)
dfs["class_size"] = dfs["class_size"].reset_index()
# filter demographics dataset
dfs["demographics"] = dfs["demographics"][dfs["demographics"]["schoolyear"] == 20112012]
# filter grad_outcomes dataset
dfs["grad_outcomes"] = dfs["grad_outcomes"][dfs["grad_outcomes"]["Cohort"] == "2006"]
dfs["grad_outcomes"] = dfs["grad_outcomes"][dfs["grad_outcomes"]["Demographic"] == "Total Cohort"]
# add the sat_results as the left-most dataset
combined_df = dfs["sat_results"]
# combine the ap_results and grad_outcomes dataset using a LEFT JOIN
combined_df = combined_df.merge(dfs["ap_results"], on="DBN", how="left")
combined_df = combined_df.merge(dfs["grad_outcomes"], on="DBN", how="left")
# combine the remaining datasets using an INNER JOIN
for dataset in ["class_size", "demographics", "survey"]:
combined_df = combined_df.merge(dfs[dataset], on="DBN", how="inner")
# use the average of the columns to fill in missing data caused by the joins
combined_df = combined_df.fillna(combined_df.mean())
# fill remaining missing data with 0 (note, if a column is missing the entirety of it's values, the fillna trick used above doesn't work)
combined_df = combined_df.fillna(0)
combined_df["school_district"] = combined_df["DBN"].apply(lambda x: x[:2])
# create a scatter plot and set the figure size
ax = combined_df.plot.scatter("saf_t_11", "sat_score", figsize=(8, 5))
# set x,y labels and titles of chart
ax.set_xlabel("Teacher Perception of School Safety")
ax.set_ylabel("SAT Score")
plt.title("Teacher Perception of School Safety And Its Impact on SAT Scores")
# display the chart
plt.show()
# create a scatter plot and set the figure size
ax = combined_df.plot.scatter("saf_s_11", "sat_score", figsize=(8, 5))
# set x,y labels and titles of chart
ax.set_xlabel("Student Perception of School Safety")
ax.set_ylabel("SAT Score")
plt.title("Student Perception of School Safety And Its Impact on SAT Scores")
# display the chart
plt.show()
# group by school district, calculate averages and then scatter plot
ax = combined_df.groupby("school_district").agg(np.mean).plot.scatter("saf_t_11", "sat_score", figsize=(8, 5))
# set x,y labels and titles of chart
ax.set_xlabel("Teacher Perception of District Safety")
ax.set_ylabel("SAT Score")
plt.title("Teacher Perception of District Safety And Its Impact on SAT Scores")
# show plot
plt.show()
# group by school district, calculate averages and then scatter plot
ax = combined_df.groupby("school_district").agg(np.mean).plot.scatter("saf_s_11", "sat_score", figsize=(8, 5))
# set x,y labels and titles of chart
ax.set_xlabel("Student Perception of District Safety")
ax.set_ylabel("SAT Score")
plt.title("Student Perception of District Safety And Its Impact on SAT Scores")
# display the chart
plt.show()
# plot correlations between sat_score and race
ax = combined_df.corr()["sat_score"][["white_per", "asian_per", "black_per", "hispanic_per"]].plot(kind="bar")
# label chart
ax.set_ylabel("Correlation")
ax.set_xlabel("Race")
plt.title("Correlation of Race and SAT Scores")
# show chart
plt.show()
# create scatter plot between hispanic_per and sat_score
ax = combined_df.plot.scatter("hispanic_per", "sat_score", figsize=(8, 5))
# label chart
ax.set_xlabel("Percentage of Hispanic Students")
ax.set_ylabel("SAT Score")
plt.title("Percentage of Hispanic Students and SAT Scores")
# show chart
plt.show()
combined_df[combined_df["hispanic_per"] > 95]["SCHOOL NAME"]
combined_df[(combined_df["hispanic_per"] < 10) & (combined_df["sat_score"] > 1800)]["SCHOOL NAME"]
# plot correlation with sat_score and gneder
ax = combined_df.corr()["sat_score"][["male_per", "female_per"]].plot(kind="bar", figsize=(8, 5))
# label chart
ax.set_ylabel("Correlation")
ax.set_xlabel("Gender")
plt.title("Correlation of Gender and SAT Score")
# plot chart
plt.show()
# create scatter plot with female percentage and sat scores
ax = combined_df.plot.scatter("female_per", "sat_score", figsize=(8, 5))
# label chart
ax.set_ylabel("SAT Score")
ax.set_xlabel("Percentage of Females")
plt.title("Percentage of Female Students and SAT Score")
# show chart
plt.show()
# create new column with percentage of ap test takers from a school
combined_df["ap_percentage"] = combined_df["AP Test Takers "] / combined_df["total_enrollment"]
# create a scatter plot with ap_percentage and sat_score
ax = combined_df.plot.scatter("ap_percentage", "sat_score", figsize=(8, 5))
# label chart
ax.set_ylabel("SAT Score")
ax.set_xlabel("Percentage of AP Test Takers")
plt.title("Percentage of AP Test Takers in a School and SAT Scores")
# show chart
ax.set_xlim(0, 1)
plt.show()