Final Assignment - Yochan Khoi & Victor Lequertier

This notebook was used to produce the outputs for our presentation.

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import statsmodels.api as sm import statsmodels.formula.api as smf %matplotlib inline

1. Data Exploration

# Read the space-separated schools file; the bare `df` echoes the frame
# when run as a notebook cell.
df = pd.read_csv("Schools.txt", delimiter=" ")
df

# First rows, rounded to two decimals.
df.head().round(2)

# Number of missing values in each column.
df.isnull().sum()

# Summary statistics of the columns.
df.describe()

# One seaborn box plot per column, side by side on a 1x4 grid of axes.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(25, 7), constrained_layout=True)
for subplot, variable in zip(ax.ravel(), df.columns):
    sns.boxplot(y=df[variable], ax=subplot)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Interactive counterpart of the box plots: one notched box per variable,
# each in its own subplot column with its own colour.
variables = ["score", "class_size", "migration", "scholarship"]
colors = ['#00afb9', '#FDD5AF', '#f07167', '#B185DB']

fig = make_subplots(rows=1, cols=4)
for col, (variable, color) in enumerate(zip(variables, colors), start=1):
    fig.add_trace(
        go.Box(y=df[variable], marker_color=color, notched=True, name=""),
        row=1,
        col=col,
    )

fig.update_layout(
    height=500,
    width=1200,
    title_text="Box plot of each continuous variable",
    showlegend=False,
    template='plotly_white',
)
fig.show()

2. Class Size Bins Analysis

# Discretise migration and class size into three labelled categories each.
# pd.cut builds right-closed intervals from the bin edges, e.g. migration
# falls into (0, 33], (33, 66] or (66, 200]; values outside the edges
# become NaN.
# NOTE(review): with these defaults a value of exactly 0 is not in the
# first interval -- confirm whether include_lowest=True is wanted.
df['migration_bin'] = pd.cut(
    df['migration'],
    bins=[0, 33, 66, 200],
    labels=['Low', 'Medium', 'High'],
)
# Bin class_size itself (not migration) so that the class-size analysis
# below really groups schools by class size.
df['class_size_bin'] = pd.cut(
    df['class_size'],
    bins=[0, 10, 20, 100],
    labels=['Low', 'Medium', 'High'],
)

# Number of rows in each migration category.
df['migration_bin'].value_counts()

import plotly.figure_factory as ff

# Distribution plot of the score for each class-size category, with the
# histogram bars switched off.
group_labels = ["small class", "medium class", "large class"]
scores_by_bin = [
    df.loc[df["class_size_bin"] == level, "score"]
    for level in ("Low", "Medium", "High")
]

fig = ff.create_distplot(scores_by_bin, group_labels=group_labels, show_hist=False)
fig.update_layout(template='plotly_white')
fig.show()

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Horizontal notched box plot of the score for each class-size category,
# stacked in three rows.
group_labels = ["small class", "medium class", "large class"]
bins = ["Low", "Medium", "High"]

fig = make_subplots(rows=3, cols=1)
for row, mybin in enumerate(bins, start=1):
    in_bin = df['class_size_bin'] == mybin
    fig.add_trace(
        go.Box(x=df.loc[in_bin, 'score'], notched=True, name=""),
        row=row,
        col=1,
    )

fig.update_layout(height=350, width=1200, showlegend=False, template='plotly_white')
fig.show()

# Peek at the frame, which now includes the two bin columns.
df.head()

3. Univariate data analysis

import plotly.express as px

# One plotly histogram per variable, shown one after the other.
for column in ("score", "scholarship", "class_size", "migration"):
    fig = px.histogram(df[column])
    fig.show()

# Pairwise correlation matrix of the numeric variables.
# The categorical *_bin columns added to df earlier are excluded
# explicitly: recent pandas versions no longer drop non-numeric columns
# silently in corr() and fail on the 'Low'/'Medium'/'High' labels instead.
df_corr = df.select_dtypes(include="number").corr()
print(df_corr)

4. Multivariate Data Analysis

We start by examining the correlations between the variables with a correlation heatmap.

%config InlineBackend.figure_format='retina' fig, ax = plt.subplots(figsize = (7, 5)) sns.heatmap(df_corr, annot = True, cmap="Blues")

import plotly.express as px px.imshow(df_corr, color_continuous_scale="blues")

# Lets plot a pairplot %matplotlib inline sns.set_context('paper') pairplot = sns.PairGrid(df) pairplot.map_upper(sns.scatterplot, linewidths = 1, edgecolor = 'w', s = 10) pairplot.map_diag(plt.hist) pairplot.map_lower(sns.kdeplot)

5. Linear regression

# Baseline OLS regression: score explained by class size, scholarship and
# migration. Each predictor is listed exactly once; a repeated term in a
# formula adds nothing to the model and only obscures the specification.
model1 = smf.ols("score ~ class_size + scholarship + migration", data=df).fit()

# Print the regression summary table.
print(model1.summary())

# Second specification: same predictors plus a squared class-size term.
# I(...) makes the formula treat ** as ordinary arithmetic.
quadratic_formula = "score ~ class_size + I(class_size**2)+ scholarship + migration"
model2 = smf.ols(formula=quadratic_formula, data=df).fit()
print(model2.summary())

6. Visual representation of the model

To assess the quality of our model and to describe it, let's start by plotting a visual representation of each variable's impact:

# Horizontal bar chart of the fitted coefficients, skipping the first
# parameter and labelling each bar with its value rounded to 3 decimals.
slopes = model1.params[1:]
fig = px.bar(slopes, orientation="h", text=slopes.round(3))
fig.update_layout(template='plotly_white')
fig.show()

%config InlineBackend.figure_format='retina' # Coefficient confidence interval centers = model1.params[1:] conf_ints = model1.conf_int().iloc[1:, :] sort_index = np.argsort(centers.values) centers = centers[sort_index] conf_ints = conf_ints.iloc[sort_index] plt.barh(y = range(len(centers)), left = conf_ints[0], width = conf_ints[1] - conf_ints[0], height = 0.2, color= "#1460A8") plt.yticks(range(len(centers)), conf_ints.index) plt.plot(centers, range(len(centers)), 'ro') plt.plot([0, 0], [-0.5, len(centers) - 0.5], color = 'gray', linestyle = 'dashed') plt.xlim((-2, 2)) plt.ylim((-0.5, len(centers) - 0.5)) plt.title('Confidence interval explanatory variables')

# Residuals of the first model; the bare name echoes them in the notebook.
residuals = model1.resid
residuals

# Residuals plotted against the fitted values.
fig = px.scatter(x=model1.fittedvalues, y=residuals)
fig.show()

# Histogram of the residuals.
fig = px.histogram(residuals, title="Distribution of the residuals")
fig.update_layout(template='plotly_white')
fig.show()

Final Assignment - Yochan Khoi & Victor Lequertier