import pandas as pd
# these options allow for better viewability of data when performing EDA in jupyter notebook
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', df.shape[1]+1)
file = '.\data\Most-Recent-Cohorts-All-Data-Elements.csv'
df = pd.read_csv(file, low_memory=False)
df.shape
df.head()
for col in df.columns:
print (col)
features = ['INSTNM', 'CITY', 'STABBR', 'PREDDEG', 'LOCALE', 'ADM_RATE', 'SATVR75', 'SATMT75', 'UGDS', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'C150_4', 'RET_FT4', 'C100_4', 'ICLEVEL', 'BOOKSUPPLY', 'ROOMBOARD_ON', 'OTHEREXPENSE_ON', 'ROOMBOARD_OFF', 'OTHEREXPENSE_OFF']
pd.set_option('display.max_rows', 30)
df_features = df[features]
df_features
df_4yr = df_features.loc[(df_features['ICLEVEL'] == 1) & (df_features['PREDDEG'] == 3)]
df_4yr
df_states = df_4yr.loc[(df_features['STABBR'] == 'MD') | (df_features['STABBR'] == 'DC') | (df_features['STABBR'] == 'VA') | (df_features['STABBR'] == 'PA') | (df_features['STABBR'] == 'WV') | (df_features['STABBR'] == 'DE')]
df_states
df_states.isnull().sum()
df_clean_adm = df_states.dropna(subset=['ADM_RATE'])
df_clean_adm = df_clean_adm[df_clean_adm['ADM_RATE'] != 1]
df_clean_adm.isnull().sum()
df_offcampus_na = df_clean_adm[df_clean_adm['ROOMBOARD_OFF'].isnull() & df_clean_adm['OTHEREXPENSE_OFF'].isnull()]
df_clean_offcampus = df_clean_adm
# means based off city data
for row in df_offcampus_na['CITY']:
roomboard = df_clean_adm[df_clean_adm['CITY'] == row]['ROOMBOARD_OFF'].mean()
otherexpense = df_clean_adm[df_clean_adm['CITY'] == row]['OTHEREXPENSE_OFF'].mean()
df_clean_offcampus.loc[(df_clean_offcampus['CITY'] == row) & (df_clean_offcampus['ROOMBOARD_OFF'].isnull()) , 'ROOMBOARD_OFF'] = round(roomboard, 2)
df_clean_offcampus.loc[(df_clean_offcampus['CITY'] == row) & (df_clean_offcampus['OTHEREXPENSE_OFF'].isnull()) , 'OTHEREXPENSE_OFF'] = round(otherexpense, 2)
# if city data not available means based off state data
for row in df_offcampus_na['STABBR']:
roomboard = df_clean_adm[df_clean_adm['STABBR'] == row]['ROOMBOARD_OFF'].mean()
otherexpense = df_clean_adm[df_clean_adm['STABBR'] == row]['OTHEREXPENSE_OFF'].mean()
df_clean_offcampus.loc[(df_clean_offcampus['STABBR'] == row) & (df_clean_offcampus['ROOMBOARD_OFF'].isnull()) , 'ROOMBOARD_OFF'] = round(roomboard, 2)
df_clean_offcampus.loc[(df_clean_offcampus['STABBR'] == row) & (df_clean_offcampus['OTHEREXPENSE_OFF'].isnull()) , 'OTHEREXPENSE_OFF'] = round(otherexpense, 2)
df_clean_offcampus.isnull().sum()
satbox = df_clean_offcampus.boxplot(column=['SATVR75', 'SATMT75'], figsize=(10,7))
satbox.set_ylabel('score')
satbox.set_xticklabels(['Verbal Section', 'Math Section'])
satbox.set_title('Distribution of 75th percentile SAT Scores from 4 Year Institutions in Maryland and Surrounding States')
satbox;
df_sat_na = df_clean_offcampus[df_clean_offcampus['SATVR75'].isnull() & df_clean_offcampus['SATMT75'].isnull()]
df_clean_sat = df_clean_offcampus
satmt = round(df_clean_sat['SATMT75'].mean(),)
satvr = round(df_clean_sat['SATVR75'].mean())
df_clean_sat.loc[df_clean_sat['SATMT75'].isnull() , 'SATMT75'] = satmt
df_clean_sat.loc[df_clean_sat['SATVR75'].isnull() , 'SATVR75'] = satvr
df_clean_sat.isnull().sum()
df_clean_sat[df_clean_sat['TUITIONFEE_IN'].isnull()]
df_clean = df_clean_sat
df_clean.fillna(0, inplace= True)
df_clean
state_hist = df_clean['STABBR'].hist();
state_hist.set_title('Number of Colleges per State')
df_clean[df_clean['STABBR'] != 'MD'].hist(column='TUITIONFEE_OUT', by='STABBR', figsize=(20,10));
df_clean[df_clean['STABBR'] == 'MD'].hist(column='TUITIONFEE_OUT', by='STABBR', figsize=(10,7));