Predicting global happiness
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import probplot
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
sns.set_style("whitegrid")
Abstract
Introduction
Data
Our dataset
# Load 'WHR20_DataForFigure2.1.csv' and keep only the columns used for the
# analysis: the uncertainty bounds, the Dystopia baseline and the
# pre-computed "Explained by" columns are removed.
# NOTE(review): later cells reference underscore-style names such as
# `Ladder_score`, while the names dropped here contain spaces -- confirm the
# CSV header (or a rename step outside this view) makes both valid.
columns_to_discard = [
    'Standard error of ladder score',
    'upperwhisker',
    'lowerwhisker',
    'Ladder score in Dystopia',
    'Explained by: Log GDP per capita',
    'Explained by: Social support',
    'Explained by: Healthy life expectancy',
    'Explained by: Freedom to make life choices',
    'Explained by: Generosity',
    'Explained by: Perceptions of corruption',
    'Dystopia + residual',
]
happiness = pd.read_csv('WHR20_DataForFigure2.1.csv')
data = happiness.drop(columns=columns_to_discard)
# Summary statistics of the remaining columns (displayed as notebook cell output).
data.describe()
Response variable
# Histogram of the response variable with a kernel density overlay.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `kde=True` and `stat='density'` is the axes-level replacement and keeps the
# density-scaled y axis that `distplot` produced.
sns.histplot(data['Ladder_score'], kde=True, stat='density')
plt.xlabel('Ladder score')
plt.title('Distribution of ladder scores')
plt.show()
print("Figure 2: A histogram of the ladder scores")
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Figure 2: A histogram of the ladder scores
Explanatory variables
Population
The regression model
# Multiple linear regression of the ladder score on five explanatory
# variables, written as a patsy-style formula (response ~ predictors).
formula_string = (
    "Ladder_score ~ "
    "Logged_GDP_per_capita + "
    "Social_support + "
    "Healthy_life_expectancy + "
    "Freedom_to_make_life_choices + "
    "Perceptions_of_corruption"
)
# Build the OLS model from the formula, fit it, and print the summary table.
model = sm.formula.ols(formula=formula_string, data=data)
model_fitted = model.fit()
print(model_fitted.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Ladder_score R-squared: 0.746
Model: OLS Adj. R-squared: 0.737
Method: Least Squares F-statistic: 86.25
Date: Sat, 29 May 2021 Prob (F-statistic): 6.06e-42
Time: 14:44:08 Log-Likelihood: -128.11
No. Observations: 153 AIC: 268.2
Df Residuals: 147 BIC: 286.4
Df Model: 5
Covariance Type: nonrobust
================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------
Intercept -1.9389 0.633 -3.062 0.003 -3.190 -0.688
Logged_GDP_per_capita 0.2137 0.081 2.631 0.009 0.053 0.374
Social_support 2.7419 0.662 4.141 0.000 1.433 4.050
Healthy_life_expectancy 0.0347 0.013 2.672 0.008 0.009 0.060
Freedom_to_make_life_choices 1.9220 0.484 3.972 0.000 0.966 2.878
Perceptions_of_corruption -0.7275 0.305 -2.389 0.018 -1.329 -0.126
==============================================================================
Omnibus: 8.619 Durbin-Watson: 1.452
Prob(Omnibus): 0.013 Jarque-Bera (JB): 8.441
Skew: -0.524 Prob(JB): 0.0147
Kurtosis: 3.477 Cond. No. 1.12e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.12e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Checking assumptions for regression
# Diagnostic plots for the multiple regression model.
# Take the fitted values and residuals directly from the fitted results
# object rather than rebuilding the linear predictor by hand, so they cannot
# drift out of sync with the model formula.
predicted_values = model_fitted.fittedvalues
residual_scores = model_fitted.resid

plt.figure(figsize=(20,20))

# Absolute residuals vs. predictions: a visual check for constant variance.
plt.subplot(331)
sns.scatterplot(x=predicted_values, y=residual_scores.abs())
plt.title("Absolute residuals")
plt.xlabel('Predicted ladder score')
plt.ylabel('Residual scores')

# Residual distribution. `sns.distplot` is deprecated; `histplot` with a KDE
# overlay on a density scale is its axes-level replacement.
plt.subplot(332)
sns.histplot(residual_scores, kde=True, stat='density')
plt.xlabel('Residual scores')
plt.title('Histogram of residual scores')

# Q-Q plot of the residuals against a normal distribution.
plt.subplot(333)
stats.probplot(x=residual_scores, plot=plt)
plt.title('Q-Q plot of residual scores')

plt.show()
print("Figure 3: A scatter plot of the absolute residuals against the predicted ladder scores, a histogram of the residuals, and a Q-Q plot of the residuals")
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Figure 3: A scatter plot of the absolute residuals against the predicted ladder scores, a histogram of the residuals, and a Q-Q plot of the residuals
# Scatter plots of the residuals against row order and against each
# explanatory variable, drawn in grid positions 4-9 of a 3x3 layout.
plt.figure(figsize=(20,20))

# Residuals against row position in the data set. x and y are passed as
# keywords: positional use is deprecated in seaborn and becomes an error
# from version 0.12.
plt.subplot(334)
sns.scatterplot(x=np.arange(0, len(residual_scores)), y=residual_scores)
plt.title('Residuals vs. order of collection')
plt.xlabel('Order of collection')
plt.ylabel('Residuals')

# (grid position, column name, plot title, x-axis label) for each predictor.
residual_panels = [
    (5, 'Logged_GDP_per_capita', 'Residuals by logged GDP per capita', 'Logged GDP per capita'),
    (6, 'Social_support', 'Residuals by social support', 'Social support'),
    (7, 'Healthy_life_expectancy', 'Residuals by healthy life expectancy', 'Healthy life expectancy'),
    (8, 'Freedom_to_make_life_choices', 'Residuals by freedom to make life choices', 'Freedom to make life choices'),
    (9, 'Perceptions_of_corruption', 'Residuals by perceptions of corruption', 'Perception of corruption'),
]
for grid_position, column_name, panel_title, x_label in residual_panels:
    plt.subplot(3, 3, grid_position)
    sns.scatterplot(x=data[column_name], y=residual_scores)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Residuals')

plt.show()
print("Figure 4: Scatter plots of the residuals against the order of collection and each explanatory variable.")
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
Figure 4: Scatter plots of the residuals against the order of collection and each explanatory variable.
Collinearity
# Variance inflation factor (VIF) for each explanatory variable.
explan_variables = data[['Logged_GDP_per_capita', 'Social_support', 'Healthy_life_expectancy', 'Freedom_to_make_life_choices', 'Perceptions_of_corruption']]

# `variance_inflation_factor` regresses one column of the matrix on all the
# others and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin, which inflates the
# VIFs of variables whose mean is far from zero. Add the constant explicitly
# (`has_constant='add'` guarantees it is column 0) and skip it when reporting.
design_matrix = sm.add_constant(explan_variables, has_constant='add')

vif = pd.DataFrame()
vif['Variance Inflation Factor'] = [
    variance_inflation_factor(design_matrix.values, column_index)
    for column_index in range(1, design_matrix.shape[1])
]
vif['Features'] = explan_variables.columns
print(vif)
Variance Inflation Factor Features
0 269.715541 Logged_GDP_per_capita
1 131.596085 Social_support
2 280.048604 Healthy_life_expectancy
3 54.933133 Freedom_to_make_life_choices
4 12.347601 Perceptions_of_corruption
Sub-research questions
# Correlation coefficient between the ladder score and each explanatory
# variable, taken from the off-diagonal entry of the 2x2 matrix returned by
# `np.corrcoef`.
ladder_scores = data['Ladder_score']
gdp, ss, hle, ftmlc, poc = (
    np.corrcoef(ladder_scores, data[column])[0, 1]
    for column in (
        'Logged_GDP_per_capita',
        'Social_support',
        'Healthy_life_expectancy',
        'Freedom_to_make_life_choices',
        'Perceptions_of_corruption',
    )
)

# Report each coefficient next to its label.
for label, coefficient in (
    ('Logged GDP per capita:', gdp),
    ('Social support:', ss),
    ('Healthy life expectancy: ', hle),
    ('Freedom to make life choices: ', ftmlc),
    ('Perceptions of corruption: ', poc),
):
    print(label, coefficient)
Logged GDP per capita: 0.7753744007526838
Social support: 0.7650007567161378
Healthy life expectancy: 0.7703162898718493
Freedom to make life choices: 0.590596782922588
Perceptions of corruption: -0.4183050872559095
# Simple linear regression of the ladder score on logged GDP per capita only.
formula_string = "Ladder_score ~ Logged_GDP_per_capita"
model2 = sm.formula.ols(formula=formula_string, data=data)
model_fitted2 = model2.fit()
print(model_fitted2.summary())

# Evaluate the fitted line at every observed GDP value.
log_gdp = data['Logged_GDP_per_capita']
fitted_line = model_fitted2.params.Intercept + model_fitted2.params.Logged_GDP_per_capita * log_gdp

# Observed points with the fitted regression line drawn over them.
plt.figure(figsize=(8,8))
plt.scatter(x=log_gdp, y=data['Ladder_score'])
plt.title('Ladder score vs. logged GDP per capita')
plt.xlabel('Logged GDP per capita')
plt.ylabel('Ladder score')
sns.lineplot(x=log_gdp, y=fitted_line)
plt.show()
print("Figure 5: Linear regression model of the ladder score on the logged GDP per capita.")
OLS Regression Results
==============================================================================
Dep. Variable: Ladder_score R-squared: 0.601
Model: OLS Adj. R-squared: 0.599
Method: Least Squares F-statistic: 227.6
Date: Sat, 29 May 2021 Prob (F-statistic): 5.98e-32
Time: 14:44:41 Log-Likelihood: -162.55
No. Observations: 153 AIC: 329.1
Df Residuals: 151 BIC: 335.2
Df Model: 1
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------
Intercept -1.1986 0.446 -2.688 0.008 -2.080 -0.318
Logged_GDP_per_capita 0.7177 0.048 15.088 0.000 0.624 0.812
==============================================================================
Omnibus: 2.646 Durbin-Watson: 1.113
Prob(Omnibus): 0.266 Jarque-Bera (JB): 2.697
Skew: -0.297 Prob(JB): 0.260
Kurtosis: 2.737 Cond. No. 74.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Figure 5: Linear regression model of the ladder score on the logged GDP per capita.
# Residual diagnostics for the simple model (ladder score on logged GDP per
# capita). The residuals must come from `model_fitted2`; using the
# coefficients of the five-predictor `model_fitted` here would plot
# residuals of a model that was never fitted.
residual_scores = model_fitted2.resid

plt.figure(figsize=(20,8))

# Residual distribution. `sns.distplot` is deprecated; `histplot` with a KDE
# overlay on a density scale is its axes-level replacement.
plt.subplot(121)
sns.histplot(residual_scores, kde=True, stat='density')
plt.xlabel('Residual scores')

# Probability (Q-Q) plot of the residuals against a normal distribution.
plt.subplot(122)
stats.probplot(x=residual_scores, plot=plt)

# A single show() renders both panels in one figure.
plt.show()
print("Figure 6: A histogram and Probability plot of the residual scores of the model of ladder score on logged GDP per capita.")
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Figure 6: A histogram and Probability plot of the residual scores of the model of ladder score on logged GDP per capita.