Final Assignment - Yochan Khoi & Victor Lequertier
This notebook was used to produce the outputs shown in our presentation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
1. Data Exploration
# Load the schools dataset from a space-delimited text file.
df = pd.read_csv("Schools.txt", delimiter=" ")
# Show the whole table.
df
# Preview of the first rows, rounded to two decimals.
df.head().round(2)
# Number of missing values in each column.
df.isnull().sum()
# Summary statistics of the columns.
df.describe()
# One static box plot per column, drawn side by side on a 1x4 grid.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(25, 7), constrained_layout=True)
for variable, subplot in zip(df.columns, ax.flat):
    sns.boxplot(y=df[variable], ax=subplot)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Interactive box plots: one notched box per variable, each in its own
# subplot column and drawn in its own colour.
variables = ["score", "class_size", "migration", "scholarship"]
colors = ['#00afb9', '#FDD5AF', '#f07167', '#B185DB']
fig = make_subplots(rows=1, cols=len(variables))
for column_number, (variable, color) in enumerate(zip(variables, colors), start=1):
    trace = go.Box(y=df[variable], marker_color=color, notched=True, name="")
    fig.add_trace(trace, row=1, col=column_number)
fig.update_layout(
    height=500,
    width=1200,
    title_text="Box plot of each continuous variable",
    showlegend=False,
    template='plotly_white',
)
fig.show()
2. Class Size Bins Analysis
# Discretise migration into three ordered levels. pd.cut uses right-closed
# intervals by default, i.e. (0, 33], (33, 66] and (66, 200]; values outside
# these intervals become NaN.
df['migration_bin'] = pd.cut(
    df.migration,
    bins=[0, 33, 66, 200],
    labels=['Low', 'Medium', 'High'],
)
# Discretise class size into three ordered levels: (0, 10], (10, 20], (20, 100].
# The bins must be built from the class_size column (the original cut
# df.migration here, so the "class size" bins were really migration bins).
df['class_size_bin'] = pd.cut(
    df.class_size,
    bins=[0, 10, 20, 100],
    labels=['Low', 'Medium', 'High'],
)
# Number of observations in each migration level.
df['migration_bin'].value_counts()
import plotly.figure_factory as ff
# Smoothed score distribution for each class-size level (no histogram bars).
group_labels = ["small class", "medium class", "large class"]
scores_by_level = [
    df.loc[df["class_size_bin"] == level, "score"]
    for level in ("Low", "Medium", "High")
]
fig = ff.create_distplot(scores_by_level, group_labels=group_labels, show_hist=False)
fig.update_layout(template='plotly_white')
fig.show()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Horizontal notched box plots of the score, one subplot row per
# class-size level.
group_labels = ["small class", "medium class", "large class"]
bins = ["Low", "Medium", "High"]
fig = make_subplots(rows=len(bins), cols=1)
for row_number, mybin in enumerate(bins, start=1):
    scores_in_bin = df.loc[df['class_size_bin'] == mybin, 'score']
    fig.add_trace(
        go.Box(x=scores_in_bin, notched=True, name=""),
        row=row_number,
        col=1,
    )
fig.update_layout(height=350, width=1200, showlegend=False, template='plotly_white')
fig.show()
# Preview the table, which now includes the bin columns.
df.head()
3. Univariate data analysis
import plotly.express as px
# Histogram of each variable, shown one after the other.
for column in ("score", "scholarship", "class_size", "migration"):
    fig = px.histogram(df[column])
    fig.show()
# Pairwise correlations between the numeric columns only. The categorical
# bin columns added earlier in the notebook cannot be converted to floats,
# and recent pandas versions raise instead of silently dropping them.
df_corr = df.select_dtypes(include="number").corr()
print(df_corr)
4. Multivariate Data Analysis
We start by examining the correlations between the variables with a correlation heatmap.
%config InlineBackend.figure_format='retina'
# Static heatmap of the correlation matrix with the coefficients annotated.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(data=df_corr, annot=True, cmap="Blues", ax=ax)
import plotly.express as px
# Interactive version of the same heatmap.
px.imshow(img=df_corr, color_continuous_scale="blues")
# Let's draw a pair plot of the variables
%matplotlib inline
# Pair grid: scatter plots above the diagonal, histograms on it,
# and density contours below it.
sns.set_context("paper")
pairplot = sns.PairGrid(data=df)
pairplot.map_upper(sns.scatterplot, s=10, edgecolor="w", linewidths=1)
pairplot.map_diag(plt.hist)
pairplot.map_lower(sns.kdeplot)
5. Linear regression
# Model 1: score explained linearly by class size, scholarship and migration.
# (The original formula listed class_size twice; the duplicate was a typo and
# each regressor should appear exactly once.)
model1 = smf.ols("score ~ class_size + scholarship + migration", data=df).fit()
print(model1.summary())
# Model 2: same regressors plus a squared class-size term.
model2 = smf.ols("score ~ class_size + I(class_size**2)+ scholarship + migration", data=df).fit()
print(model2.summary())
6. Visual representation of the model
To assess the quality of our model and describe it, let's start by plotting the estimated effect of each explanatory variable:
# Horizontal bar chart of the fitted coefficients of model1, skipping the
# first parameter, with the values rounded to three decimals as bar labels.
slopes = model1.params.iloc[1:]
fig = px.bar(slopes, orientation="h", text=slopes.round(3))
fig.update_layout(template='plotly_white')
fig.show()
%config InlineBackend.figure_format='retina'
# Coefficient confidence intervals of model1: one horizontal bar per
# coefficient spanning its interval, with a red dot at the point estimate.
# The first parameter is skipped in both the estimates and the intervals.
centers = model1.params.iloc[1:]
conf_ints = model1.conf_int().iloc[1:, :]

# Order the coefficients from smallest to largest estimate. The sort order
# holds positions, so both objects are indexed with .iloc; plain [] on a
# string-labelled Series treats integers as labels in newer pandas versions.
sort_index = np.argsort(centers.values)
centers = centers.iloc[sort_index]
conf_ints = conf_ints.iloc[sort_index]

positions = range(len(centers))
# Columns 0 and 1 of conf_ints hold the lower and upper interval bounds.
plt.barh(y=positions,
         left=conf_ints[0],
         width=conf_ints[1] - conf_ints[0],
         height=0.2,
         color="#1460A8")
plt.yticks(positions, conf_ints.index)
plt.plot(centers, positions, 'ro')
# Dashed vertical reference line at zero.
plt.plot([0, 0], [-0.5, len(centers) - 0.5],
         color='gray',
         linestyle='dashed')
# NOTE(review): the x-range is fixed to [-2, 2]; intervals outside it would be clipped.
plt.xlim((-2, 2))
plt.ylim((-0.5, len(centers) - 0.5))
plt.title('Confidence interval explanatory variables')
# Residuals of model1.
residuals = model1.resid
residuals
# Residuals plotted against the fitted values.
fig = px.scatter(x=model1.fittedvalues, y=residuals)
fig.show()
# Histogram of the residuals.
fig = px.histogram(residuals, title="Distribution of the residuals")
fig.update_layout(template='plotly_white')
fig.show()