Final Assignment - Yochan Khoi & Victor Lequertier
This notebook contains the code used to produce the outputs for our presentation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
1. Data Exploration
# Load the schools dataset from a space-separated text file.
# Columns used below: score, class_size, migration, scholarship — TODO confirm against the file.
df = pd.read_csv("Schools.txt", sep=" ")
# Display the full frame (notebook rich output).
df
# First rows, rounded to 2 decimals, for a quick look.
df.round(2).head()
# Count missing values per column.
df.isna().sum()
# Summary statistics (count, mean, std, quartiles) per column.
df.describe()
# One seaborn box plot per column, laid out side by side on a single row.
fig, ax = plt.subplots(1, 4, figsize=(25, 7), constrained_layout=True)
for axis, column in zip(ax.flatten(), df.columns):
    sns.boxplot(y=df[column], ax=axis)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Interactive version of the box plots: one notched box per variable,
# arranged in a 1x4 grid, each with its own colour.
variables = ["score", "class_size", "migration", "scholarship"]
colors = ['#00afb9', '#FDD5AF', '#f07167', '#B185DB']
fig = make_subplots(rows=1, cols=4)
for col_idx, (variable, color) in enumerate(zip(variables, colors), start=1):
    fig.add_trace(
        go.Box(y=df[variable], marker_color=color, notched=True, name=""),
        row=1,
        col=col_idx,
    )
fig.update_layout(height=500,
                  width=1200,
                  title_text="Box plot of each continuous variable",
                  showlegend=False)
fig.layout.template = 'plotly_white'
fig.show()
2. Class Size Bins Analysis
# Discretise the continuous variables into Low / Medium / High categories.
# migration: rough thirds of a 0-100 percentage (upper edge 200 as a safety margin).
df['migration_bin'] = pd.cut(df.migration, bins=[0, 33, 66, 200], labels=['Low', 'Medium', 'High'])
# BUG FIX: the class-size bins must be cut from class_size, not migration —
# the original reused df.migration with the class-size cut points (0/10/20/100).
df['class_size_bin'] = pd.cut(df.class_size, bins=[0, 10, 20, 100], labels=['Low', 'Medium', 'High'])
# Sanity check: number of schools in each migration bin.
df['migration_bin'].value_counts()
import plotly.figure_factory as ff

# Kernel-density curves of the score, one per class-size category.
group_labels = [
    "small class",
    "medium class",
    "large class"]
score_by_bin = [df.loc[df["class_size_bin"] == level, "score"]
                for level in ("Low", "Medium", "High")]
fig = ff.create_distplot(score_by_bin,
                         group_labels=group_labels,
                         show_hist=False)
fig.layout.template = 'plotly_white'
fig.show()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Same comparison as the density plot above, but as three stacked
# horizontal box plots — one row per class-size category.
group_labels = [
    "small class",
    "medium class",
    "large class"]
bins = ["Low", "Medium", "High"]
fig = make_subplots(rows=3, cols=1)
for row_idx, level in enumerate(bins, start=1):
    scores = df[df['class_size_bin'] == level]['score']
    fig.add_trace(go.Box(x=scores, notched=True, name=""),
                  row=row_idx,
                  col=1)
fig.update_layout(height=350,
                  width=1200,
                  showlegend=False)
fig.layout.template = 'plotly_white'
fig.show()
df.head()
3. Univariate data analysis
import plotly.express as px
# Univariate distributions: one histogram per continuous variable.
fig = px.histogram(df["score"])
fig.show()
fig = px.histogram(df["scholarship"])
fig.show()
fig = px.histogram(df["class_size"])
fig.show()
fig = px.histogram(df["migration"])
fig.show()
# Pairwise Pearson correlations between the numeric columns.
# ROBUSTNESS FIX: restrict to numeric dtypes explicitly — the categorical
# *_bin columns added above make a bare df.corr() raise on pandas >= 2.0
# (older pandas silently dropped non-numeric columns, as the printed
# output below shows).
df_corr = df.select_dtypes(include="number").corr()
print(df_corr)
score class_size migration scholarship
score 1.000000 -0.226363 -0.644124 -0.868772
class_size -0.226363 1.000000 0.187642 0.135203
migration -0.644124 0.187642 1.000000 0.653061
scholarship -0.868772 0.135203 0.653061 1.000000
4. Multivariate Data Analysis
We start by looking at variables correlations with a correlation heatmap
# Render matplotlib figures at retina resolution in the notebook.
%config InlineBackend.figure_format='retina'
# Correlation heatmap (seaborn) of the numeric variables.
fig, ax = plt.subplots(figsize = (7, 5))
sns.heatmap(df_corr, annot = True, cmap="Blues")
import plotly.express as px
# Interactive (plotly) version of the same correlation heatmap.
px.imshow(df_corr, color_continuous_scale="blues")
# Let's plot a pairplot: scatter plots above the diagonal, histograms
# on it, and 2-D kernel-density estimates below it.
%matplotlib inline
sns.set_context('paper')
pairplot = sns.PairGrid(df)
pairplot.map_upper(sns.scatterplot, linewidths = 1, edgecolor = 'w', s = 10)
pairplot.map_diag(plt.hist)
pairplot.map_lower(sns.kdeplot)
5. Linear regression
# OLS: regress score on class size, scholarship share and migration.
# FIX: the original formula listed class_size twice
# ("class_size + class_size + ..."); patsy de-duplicates the term
# (the fitted model is unchanged — Df Model is 3 in the output below),
# but the redundant term is removed here for clarity.
model1 = smf.ols("score ~ class_size + scholarship + migration", data=df).fit()
print(model1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: score R-squared: 0.775
Model: OLS Adj. R-squared: 0.773
Method: Least Squares F-statistic: 476.3
Date: Mon, 18 Oct 2021 Prob (F-statistic): 4.03e-134
Time: 13:23:41 Log-Likelihood: -1520.5
No. Observations: 420 AIC: 3049.
Df Residuals: 416 BIC: 3065.
Df Model: 3
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept 700.1500 4.686 149.423 0.000 690.939 709.361
class_size -0.9983 0.239 -4.181 0.000 -1.468 -0.529
scholarship -0.5473 0.022 -25.341 0.000 -0.590 -0.505
migration -0.1216 0.032 -3.762 0.000 -0.185 -0.058
==============================================================================
Omnibus: 6.998 Durbin-Watson: 1.438
Prob(Omnibus): 0.030 Jarque-Bera (JB): 10.626
Skew: -0.024 Prob(JB): 0.00493
Kurtosis: 3.778 Cond. No. 621.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Second model: add a quadratic class-size term to test for curvature in
# the class-size effect (I() protects ** from patsy's formula syntax).
model2 = smf.ols("score ~ class_size + I(class_size**2)+ scholarship + migration", data=df).fit()
print(model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: score R-squared: 0.776
Model: OLS Adj. R-squared: 0.773
Method: Least Squares F-statistic: 358.5
Date: Mon, 18 Oct 2021 Prob (F-statistic): 3.69e-133
Time: 13:23:41 Log-Likelihood: -1519.5
No. Observations: 420 AIC: 3049.
Df Residuals: 415 BIC: 3069.
Df Model: 4
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 740.8142 29.787 24.871 0.000 682.262 799.366
class_size -5.1776 3.033 -1.707 0.089 -11.139 0.784
I(class_size ** 2) 0.1064 0.077 1.382 0.168 -0.045 0.258
scholarship -0.5491 0.022 -25.407 0.000 -0.592 -0.507
migration -0.1179 0.032 -3.641 0.000 -0.182 -0.054
==============================================================================
Omnibus: 8.215 Durbin-Watson: 1.441
Prob(Omnibus): 0.016 Jarque-Bera (JB): 13.057
Skew: -0.062 Prob(JB): 0.00146
Kurtosis: 3.855 Cond. No. 2.70e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
6. Visual representation of the model
To assess our model's quality and describe it, let's start by plotting a visual representation of each variable's impact:
# Horizontal bar chart of the fitted coefficients (intercept excluded),
# annotated with their values rounded to 3 decimals.
fig = px.bar(model1.params[1:],
orientation="h",
text=round(model1.params[1:], 3)
)
fig.layout.template = 'plotly_white'
fig.show()
%config InlineBackend.figure_format='retina'
# Coefficient confidence interval
centers = model1.params[1:]
conf_ints = model1.conf_int().iloc[1:, :]
sort_index = np.argsort(centers.values)
centers = centers[sort_index]
conf_ints = conf_ints.iloc[sort_index]
plt.barh(y = range(len(centers)),
left = conf_ints[0],
width = conf_ints[1] - conf_ints[0],
height = 0.2,
color= "#1460A8")
plt.yticks(range(len(centers)), conf_ints.index)
plt.plot(centers, range(len(centers)), 'ro')
plt.plot([0, 0], [-0.5, len(centers) - 0.5],
color = 'gray',
linestyle = 'dashed')
plt.xlim((-2, 2))
plt.ylim((-0.5, len(centers) - 0.5))
plt.title('Confidence interval explanatory variables')
# Residual diagnostics for model1.
residuals = model1.resid
residuals
# Residuals vs fitted values: look for non-random structure
# (heteroscedasticity or non-linearity).
fig = px.scatter(x=model1.fittedvalues, y=residuals)
fig.show()
# The residuals should be roughly centred on zero and bell-shaped.
fig = px.histogram(residuals, title="Distribution of the residuals")
fig.layout.template = 'plotly_white'
fig.show()