import pandas as pd
import scipy as stats
from scipy.stats import linregress
import numpy as np
import matplotlib.pyplot as plt
import wbgapi as wb
import seaborn as sns
# Browse what wbgapi exposes: first the available databases, then the
# indicator series of the default database.
wb.source.info()
wb.series.info()

# Load the Human Capital Index table from disk and list its column names.
HCI_data = pd.read_csv('HCI.csv')
print(HCI_data.columns.tolist())
HCI_data
# Output: ['Country Name', 'WB Code', 'Region', 'Income Group', 'Probability of Survival to Age 5', 'Expected Years of School', 'Harmonized Test Scores', 'Learning-Adjusted Years of School', 'Fraction of Children Under 5 Not Stunted', 'Adult Survival Rate', 'HUMAN CAPITAL INDEX 2020 (LOWER BOUND)', 'HUMAN CAPITAL INDEX 2020', 'HUMAN CAPITAL INDEX 2020 (UPPER BOUND)']
def linreg(table, x, y):
    """Regress ``table[y]`` on ``table[x]`` and overlay the fitted line.

    Prints the correlation coefficient and the p-value of the fit, labels
    the current matplotlib axes with the two column names, and draws the
    least-squares line in red on top of whatever is already plotted.

    Args:
        table: DataFrame containing both columns.
        x: Name of the column used as the independent variable.
        y: Name of the column used as the dependent variable.

    Returns:
        The result object produced by ``scipy.stats.linregress``.
    """
    # Discard only the rows that lack x or y. Dropping on every column would
    # throw away valid (x, y) pairs just because an unrelated column is empty.
    pairs = table.dropna(subset=[x, y])
    res = linregress(pairs[x], pairs[y])
    print('Correlation:', res.rvalue)
    print('P-Value:', res.pvalue)
    plt.xlabel(x)
    plt.ylabel(y)
    # Draw the line over the same observations that were used for the fit.
    plt.plot(pairs[x], res.intercept + res.slope * pairs[x], 'r', label='fitted line')
    return res
# Scatter adult survival rate against harmonized test scores, then let
# linreg() print the fit statistics and add the regression line.
HCI_data.plot.scatter(x='Harmonized Test Scores', y='Adult Survival Rate', figsize=(10, 6), color='darkblue')
plt.gca().set(
    title='Correlation between Adult Survival Rate and Harmonized Test Scores',
    xlabel='Harmonized Test Scores',
    ylabel='Adult Survival Rate',
)
linreg(HCI_data, 'Harmonized Test Scores', 'Adult Survival Rate')
plt.show()
# Output recorded from the notebook run:
#   Correlation: 0.6153470804816563
#   P-Value: 1.648997379615839e-19
# Download the 'NY.ADJ.AEDU.GN.ZS' indicator and keep its 2010 column.
# NOTE(review): wbgapi is expected to index this frame by economy code, one
# row per economy — confirm, since the lookup below relies on it.
edu_spending = wb.data.DataFrame('NY.ADJ.AEDU.GN.ZS')
percent_gni_spending = edu_spending['YR2010']

# Missing codes become 0, which matches no economy and therefore maps to NaN.
country_code = HCI_data['WB Code'].fillna(0)

# Look each country's value up by its WB code. Copying the first
# len(HCI_data) values by position would attach spending figures to the
# wrong countries, because the two tables are not in the same row order.
percent_gni_spending = country_code.map(percent_gni_spending)
HCI_data.insert(3, 'GNI Spending on Education', percent_gni_spending)
HCI_data
wb.economy.DataFrame()
# Scatter GNI education spending against harmonized test scores, then let
# linreg() print the fit statistics and add the regression line.
HCI_data.plot(kind='scatter', x='Harmonized Test Scores', y='GNI Spending on Education', figsize=(10, 6), color='darkblue')
plt.title('Test Scores vs GNI Spending on Education')
# The x-axis holds test scores, so the label must name that column rather
# than 'Learning-Adjusted Years of School'.
plt.xlabel('Harmonized Test Scores')
plt.ylabel('GNI Spending on Education')
linreg(HCI_data, 'Harmonized Test Scores', 'GNI Spending on Education')
plt.show()
# Output recorded from the notebook run:
#   Correlation: 0.10395515637604857
#   P-Value: 0.18802067905666314
# Scatter GNI education spending against learning-adjusted years of school,
# then let linreg() print the fit statistics and add the regression line.
HCI_data.plot.scatter(x='Learning-Adjusted Years of School', y='GNI Spending on Education', figsize=(10, 6), color='darkblue')
plt.gca().set(
    title='Does GNI Spending Proportion Affect How Many Years Children Spend in School?',
    xlabel='Learning-Adjusted Years of School',
    ylabel='GNI Spending on Education',
)
linreg(HCI_data, 'Learning-Adjusted Years of School', 'GNI Spending on Education')
plt.show()
# Output recorded from the notebook run:
#   Correlation: 0.1152402710161861
#   P-Value: 0.1442129164997901
# Reference link from the notebook: http://nces.ed.gov/programs/coe/indicator/cmd#3
# Read the per-student spending spreadsheet as exported (unnamed columns).
edu_test = pd.read_excel('edu_spending.xlsx')
edu_test

# Give the two columns used below meaningful names.
# NOTE(review): 'SE.XPD.PRIM.PC.ZS' looks like the World Bank code for primary
# per-student expenditure as a share of GDP per capita, so the
# "thousands USD" unit in this label may be wrong — confirm against the sheet.
edu2 = edu_test.rename(
    columns={
        'Unnamed: 1': 'WB Code',
        'Unnamed: 4': 'Spending per Student (thousands USD)',
    }
)
edu2

# Join on country code, drop the row that still carries the indicator code
# instead of a value, and convert the spending column to float.
merged_table = (
    HCI_data.merge(edu2, on=['WB Code'])
    .loc[lambda frame: frame['Spending per Student (thousands USD)'] != 'SE.XPD.PRIM.PC.ZS']
    .astype({'Spending per Student (thousands USD)': float})
)
merged_table
# Scatter per-student spending against harmonized test scores, then let
# linreg() print the fit statistics and add the regression line.
merged_table.plot.scatter(x='Harmonized Test Scores', y='Spending per Student (thousands USD)', figsize=(10, 6), color='darkblue')
plt.gca().set(
    title='Real Amount Spent on Each Student vs Test Scores',
    xlabel='Harmonized Test Scores',
    ylabel='Spending per Student (thousands USD)',
)
linreg(merged_table, 'Harmonized Test Scores', 'Spending per Student (thousands USD)')
plt.show()
# Output recorded from the notebook run:
#   Correlation: 0.4765811784750939
#   P-Value: 3.0192584012384235e-09
# Scatter per-student spending against learning-adjusted years of school,
# then let linreg() print the fit statistics and add the regression line.
merged_table.plot.scatter(x='Learning-Adjusted Years of School', y='Spending per Student (thousands USD)', figsize=(10, 6), color='darkblue')
plt.gca().set(
    title='Real Amount Spent on Each Student vs Years in School',
    xlabel='Learning-Adjusted Years of School',
    ylabel='Spending per Student (thousands USD)',
)
linreg(merged_table, 'Learning-Adjusted Years of School', 'Spending per Student (thousands USD)')
plt.show()
# Output recorded from the notebook run:
#   Correlation: 0.4878141235904284
#   P-Value: 1.1230901902525408e-09