Datathon project

import pandas as pd import scipy as stats from scipy.stats import linregress import numpy as np import matplotlib.pyplot as plt import wbgapi as wb import seaborn as sns

# Query wbgapi (imported as ``wb``) for its catalogue of data sources.
# NOTE(review): the return value is not stored or printed, so this is
# presumably meant to be displayed as a notebook cell result -- confirm.
wb.source.info()

# Query wbgapi for its catalogue of series (indicators); result likewise unused.
wb.series.info()

# Load the HCI table from the local CSV file.
HCI_data = pd.read_csv('HCI.csv')

# Show the available column names so later cells can reference them.
print(HCI_data.columns.tolist())

# Bare expression: shows the table when this is the last line of a notebook cell.
HCI_data

def linreg(table, x, y):
    """Fit a least-squares line of column ``y`` on column ``x`` and draw it.

    Prints the correlation coefficient and p-value reported by
    ``scipy.stats.linregress``, labels the current matplotlib axes with the
    two column names, and plots the fitted line in red on those axes.

    Args:
        table: DataFrame containing both columns.
        x: Name of the column used as the independent variable.
        y: Name of the column used as the dependent variable.
    """
    # Discard only the rows where x or y is missing.  A bare
    # ``table.dropna()`` also throws away rows that have a gap in any
    # unrelated column, which shrinks the sample and changes the statistics.
    complete = table.dropna(subset=[x, y])
    res = linregress(complete[x], complete[y])
    print('Correlation:', res.rvalue)
    print('P-Value:', res.pvalue)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.plot(table[x], res.intercept + res.slope * table[x], 'r',
             label='fitted line')


# Scatter plot of adult survival rate against harmonized test scores,
# with the fitted regression line overlaid.
HCI_data.plot(kind='scatter', x='Harmonized Test Scores',
              y='Adult Survival Rate', figsize=(10, 6), color='darkblue')
plt.title('Correlation between Adult Survival Rate and Harmonized Test Scores')
plt.xlabel('Harmonized Test Scores')
plt.ylabel('Adult Survival Rate')
linreg(HCI_data, 'Harmonized Test Scores', 'Adult Survival Rate')
plt.show()

# Download World Bank series NY.ADJ.AEDU.GN.ZS (stored below as
# 'GNI Spending on Education').
edu_spending = wb.data.DataFrame('NY.ADJ.AEDU.GN.ZS')
#HCI_data.merge('region', 'NY.ADJ.AEDU.GN.ZS')
country_code = HCI_data['WB Code'].fillna(0)

# Attach the 2010 value to each HCI row by matching its country code against
# the index of the downloaded frame.  Slicing the downloaded column to the
# length of HCI_data pairs the two tables purely by row position, which puts
# values on the wrong countries whenever the two orderings differ.
# Codes with no match (including the 0 placeholder for missing codes) get NaN.
# NOTE(review): assumes edu_spending is indexed by the same codes as
# HCI_data['WB Code'] -- confirm against the wbgapi output.
percent_gni_spending = country_code.map(edu_spending['YR2010'])
HCI_data.insert(3, 'GNI Spending on Education', list(percent_gni_spending))

# Bare expression: shows the table when this is the last line of a notebook cell.
HCI_data

# Build wbgapi's DataFrame of economies.
# NOTE(review): the result is not assigned, so this is presumably an
# exploratory notebook cell kept for reference -- confirm it is still needed.
wb.economy.DataFrame()

# Scatter plot of GNI education spending against harmonized test scores,
# with the fitted regression line overlaid.
HCI_data.plot(kind='scatter', x='Harmonized Test Scores',
              y='GNI Spending on Education', figsize=(10, 6), color='darkblue')
plt.title('Test Scores vs GNI Spending on Education')
# Label the x-axis with the column that is actually plotted on it.
plt.xlabel('Harmonized Test Scores')
plt.ylabel('GNI Spending on Education')
linreg(HCI_data, 'Harmonized Test Scores', 'GNI Spending on Education')
plt.show()

# Scatter plot of GNI education spending against learning-adjusted years of
# school, with the fitted regression line overlaid.
HCI_data.plot.scatter(
    x='Learning-Adjusted Years of School',
    y='GNI Spending on Education',
    figsize=(10, 6),
    color='darkblue',
)
plt.title(
    'Does GNI Spending Proportion Affect How Many Years Children Spend in School?'
)
plt.xlabel('Learning-Adjusted Years of School')
plt.ylabel('GNI Spending on Education')
linreg(
    HCI_data,
    'Learning-Adjusted Years of School',
    'GNI Spending on Education',
)
plt.show()

http://nces.ed.gov/programs/coe/indicator/cmd#3

# Load the spending spreadsheet from the local Excel file.
edu_test = pd.read_excel('edu_spending.xlsx')

# Bare expression: shows the table when this is the last line of a notebook cell.
edu_test

# Give the two unnamed spreadsheet columns used later meaningful names:
# the country code (the merge key) and the per-student spending figure.
edu2 = edu_test.rename(
    {
        'Unnamed: 1': 'WB Code',
        'Unnamed: 4': 'Spending per Student (thousands USD)',
    },
    axis='columns',
)

# Bare expression: shows the table when this is the last line of a notebook cell.
edu2

# Join the HCI table with the spending table on country code, drop the row
# whose spending cell holds the text 'SE.XPD.PRIM.PC.ZS' instead of a number,
# then cast the spending column to float so it can be plotted and regressed.
merged_table = (
    HCI_data.merge(edu2, on=['WB Code'])
    .loc[lambda frame: frame['Spending per Student (thousands USD)'] != 'SE.XPD.PRIM.PC.ZS']
    .astype({'Spending per Student (thousands USD)': float})
)

# Bare expression: shows the table when this is the last line of a notebook cell.
merged_table
#merged_table_v2=merged_table.insert(0, "Spending per student(US Dollars)", new_lst, True)

# Scatter plot of per-student spending against harmonized test scores,
# with the fitted regression line overlaid.
merged_table.plot.scatter(
    x='Harmonized Test Scores',
    y='Spending per Student (thousands USD)',
    figsize=(10, 6),
    color='darkblue',
)
plt.title('Real Amount Spent on Each Student vs Test Scores')
plt.xlabel('Harmonized Test Scores')
plt.ylabel('Spending per Student (thousands USD)')
linreg(
    merged_table,
    'Harmonized Test Scores',
    'Spending per Student (thousands USD)',
)
plt.show()

# Scatter plot of per-student spending against learning-adjusted years of
# school, with the fitted regression line overlaid.
merged_table.plot.scatter(
    x='Learning-Adjusted Years of School',
    y='Spending per Student (thousands USD)',
    figsize=(10, 6),
    color='darkblue',
)
plt.title('Real Amount Spent on Each Student vs Years in School')
plt.xlabel('Learning-Adjusted Years of School')
plt.ylabel('Spending per Student (thousands USD)')
linreg(
    merged_table,
    'Learning-Adjusted Years of School',
    'Spending per Student (thousands USD)',
)
plt.show()