Final Assignment - Yochan Khoi & Victor Lequertier
This notebook contains the code used to produce the outputs for our presentation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
1. Data Exploration
# Load the schools dataset from a space-separated text file.
# Columns used below: score, class_size, migration, scholarship — TODO confirm against the file.
df = pd.read_csv("Schools.txt", sep=" ")
# Display the full frame (notebook rich output).
df
# First rows, rounded to 2 decimals, for a quick look.
df.round(2).head()
# Count missing values per column.
df.isna().sum()
# Summary statistics (count, mean, std, quartiles) per column.
df.describe()
# One seaborn box plot per column, laid out side by side on a single row.
fig, ax = plt.subplots(1, 4, figsize=(25, 7), constrained_layout=True)
for axis, column in zip(ax.flatten(), df.columns):
    sns.boxplot(y=df[column], ax=axis)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Interactive version of the box plots: one notched box per variable,
# arranged in a 1x4 grid, each with its own colour.
variables = ["score", "class_size", "migration", "scholarship"]
colors = ['#00afb9', '#FDD5AF', '#f07167', '#B185DB']
fig = make_subplots(rows=1, cols=4)
for col_idx, (variable, color) in enumerate(zip(variables, colors), start=1):
    fig.add_trace(
        go.Box(y=df[variable], marker_color=color, notched=True, name=""),
        row=1,
        col=col_idx,
    )
fig.update_layout(height=500,
                  width=1200,
                  title_text="Box plot of each continuous variable",
                  showlegend=False)
fig.layout.template = 'plotly_white'
fig.show()
2. Class Size Bins Analysis
# Discretise the continuous variables into Low / Medium / High categories.
# migration: rough thirds of a 0-100 percentage (upper edge 200 as a safety margin).
df['migration_bin'] = pd.cut(df.migration, bins=[0, 33, 66, 200], labels=['Low', 'Medium', 'High'])
# BUG FIX: the class-size bins must be cut from class_size, not migration —
# the original reused df.migration with the class-size cut points (0/10/20/100).
df['class_size_bin'] = pd.cut(df.class_size, bins=[0, 10, 20, 100], labels=['Low', 'Medium', 'High'])
# Sanity check: number of schools in each migration bin.
df['migration_bin'].value_counts()
import plotly.figure_factory as ff

# Kernel-density curves of the score, one per class-size category.
group_labels = [
    "small class",
    "medium class",
    "large class"]
score_by_bin = [df.loc[df["class_size_bin"] == level, "score"]
                for level in ("Low", "Medium", "High")]
fig = ff.create_distplot(score_by_bin,
                         group_labels=group_labels,
                         show_hist=False)
fig.layout.template = 'plotly_white'
fig.show()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Same comparison as the density plot above, but as three stacked
# horizontal box plots — one row per class-size category.
group_labels = [
    "small class",
    "medium class",
    "large class"]
bins = ["Low", "Medium", "High"]
fig = make_subplots(rows=3, cols=1)
for row_idx, level in enumerate(bins, start=1):
    scores = df[df['class_size_bin'] == level]['score']
    fig.add_trace(go.Box(x=scores, notched=True, name=""),
                  row=row_idx,
                  col=1)
fig.update_layout(height=350,
                  width=1200,
                  showlegend=False)
fig.layout.template = 'plotly_white'
fig.show()
df.head()
3. Univariate data analysis
import plotly.express as px
# Univariate distributions: one histogram per continuous variable.
fig = px.histogram(df["score"])
fig.show()
fig = px.histogram(df["scholarship"])
fig.show()
fig = px.histogram(df["class_size"])
fig.show()
fig = px.histogram(df["migration"])
fig.show()
# Pairwise Pearson correlations between the numeric columns.
# ROBUSTNESS FIX: restrict to numeric dtypes explicitly — the categorical
# *_bin columns added above make a bare df.corr() raise on pandas >= 2.0
# (older pandas silently dropped non-numeric columns, as the printed
# output below shows).
df_corr = df.select_dtypes(include="number").corr()
print(df_corr)
score class_size migration scholarship
score 1.000000 -0.226363 -0.644124 -0.868772
class_size -0.226363 1.000000 0.187642 0.135203
migration -0.644124 0.187642 1.000000 0.653061
scholarship -0.868772 0.135203 0.653061 1.000000
4. Multivariate Data Analysis
We start by looking at variables correlations with a correlation heatmap
# Render matplotlib figures at retina resolution in the notebook.
%config InlineBackend.figure_format='retina'
# Correlation heatmap (seaborn) of the numeric variables.
fig, ax = plt.subplots(figsize = (7, 5))
sns.heatmap(df_corr, annot = True, cmap="Blues")
import plotly.express as px
# Interactive (plotly) version of the same correlation heatmap.
px.imshow(df_corr, color_continuous_scale="blues")
# Let's plot a pairplot: scatter plots above the diagonal, histograms
# on it, and 2-D kernel-density estimates below it.
%matplotlib inline
sns.set_context('paper')
pairplot = sns.PairGrid(df)
pairplot.map_upper(sns.scatterplot, linewidths = 1, edgecolor = 'w', s = 10)
pairplot.map_diag(plt.hist)
pairplot.map_lower(sns.kdeplot)
5. Linear regression
# OLS: regress score on class size, scholarship share and migration.
# FIX: the original formula listed class_size twice
# ("class_size + class_size + ..."); patsy de-duplicates the term
# (the fitted model is unchanged — Df Model is 3 in the output below),
# but the redundant term is removed here for clarity.
model1 = smf.ols("score ~ class_size + scholarship + migration", data=df).fit()
print(model1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: score R-squared: 0.775
Model: OLS Adj. R-squared: 0.773
Method: Least Squares F-statistic: 476.3
Date: Mon, 18 Oct 2021 Prob (F-statistic): 4.03e-134
Time: 13:23:41 Log-Likelihood: -1520.5
No. Observations: 420 AIC: 3049.
Df Residuals: 416 BIC: 3065.
Df Model: 3
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept 700.1500 4.686 149.423 0.000 690.939 709.361
class_size -0.9983 0.239 -4.181 0.000 -1.468 -0.529
scholarship -0.5473 0.022 -25.341 0.000 -0.590 -0.505
migration -0.1216 0.032 -3.762 0.000 -0.185 -0.058
==============================================================================
Omnibus: 6.998 Durbin-Watson: 1.438
Prob(Omnibus): 0.030 Jarque-Bera (JB): 10.626
Skew: -0.024 Prob(JB): 0.00493
Kurtosis: 3.778 Cond. No. 621.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Second model: add a quadratic class-size term to test for curvature in
# the class-size effect (I() protects ** from patsy's formula syntax).
model2 = smf.ols("score ~ class_size + I(class_size**2)+ scholarship + migration", data=df).fit()
print(model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: score R-squared: 0.776
Model: OLS Adj. R-squared: 0.773
Method: Least Squares F-statistic: 358.5
Date: Mon, 18 Oct 2021 Prob (F-statistic): 3.69e-133
Time: 13:23:41 Log-Likelihood: -1519.5
No. Observations: 420 AIC: 3049.
Df Residuals: 415 BIC: 3069.
Df Model: 4
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 740.8142 29.787 24.871 0.000 682.262 799.366
class_size -5.1776 3.033 -1.707 0.089 -11.139 0.784
I(class_size ** 2) 0.1064 0.077 1.382 0.168 -0.045 0.258
scholarship -0.5491 0.022 -25.407 0.000 -0.592 -0.507
migration -0.1179 0.032 -3.641 0.000 -0.182 -0.054
==============================================================================
Omnibus: 8.215 Durbin-Watson: 1.441
Prob(Omnibus): 0.016 Jarque-Bera (JB): 13.057
Skew: -0.062 Prob(JB): 0.00146
Kurtosis: 3.855 Cond. No. 2.70e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
6. Visual representation of the model
To assess our model's quality and describe it, let's start by plotting a visual representation of each variable's impact:
# Horizontal bar chart of the fitted coefficients (intercept excluded),
# annotated with their values rounded to 3 decimals.
fig = px.bar(model1.params[1:],
orientation="h",
text=round(model1.params[1:], 3)
)
fig.layout.template = 'plotly_white'
fig.show()
%config InlineBackend.figure_format='retina'
# Coefficient confidence interval
centers = model1.params[1:]
conf_ints = model1.conf_int().iloc[1:, :]
sort_index = np.argsort(centers.values)
centers = centers[sort_index]
conf_ints = conf_ints.iloc[sort_index]
plt.barh(y = range(len(centers)),
left = conf_ints[0],
width = conf_ints[1] - conf_ints[0],
height = 0.2,
color= "#1460A8")
plt.yticks(range(len(centers)), conf_ints.index)
plt.plot(centers, range(len(centers)), 'ro')
plt.plot([0, 0], [-0.5, len(centers) - 0.5],
color = 'gray',
linestyle = 'dashed')
plt.xlim((-2, 2))
plt.ylim((-0.5, len(centers) - 0.5))
plt.title('Confidence interval explanatory variables')
# Residual diagnostics for model1.
residuals = model1.resid
residuals
# Residuals vs fitted values: look for non-random structure
# (heteroscedasticity or non-linearity).
fig = px.scatter(x=model1.fittedvalues, y=residuals)
fig.show()
# The residuals should be roughly centred on zero and bell-shaped.
fig = px.histogram(residuals, title="Distribution of the residuals")
fig.layout.template = 'plotly_white'
fig.show()