# Studentenprestaties (student performance analysis)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import pingouin as pg
import math
import statistics
import researchpy as rp
import scipy.stats as stats
import statsmodels.api as sa
import statsmodels.formula.api as sfa
import scikit_posthocs as sp
# Load the dataset.
students = pd.read_csv('StudentsPerformance.csv')
# Overall school performance is measured as the sum of the math, reading and
# writing scores, stored in a new column.
students['School_Score'] = (
    students['math score']
    .add(students['reading score'])
    .add(students['writing score'])
)
# Print the column summary to check for null values.
students.info()
# Original author's note: the dataset has 1000 rows and no null values.
# Encode values numerically.
# 'parental level of education' is an ordinal variable: its categories can be
# ranked, so they are encoded as 1 through 6, from lowest to highest level.
#
# The encoded column is assigned back explicitly. Calling
# replace(..., inplace=True) on students[...] is a chained in-place operation
# on an intermediate object and does not update the DataFrame when pandas
# Copy-on-Write is active.
# map() is used so the result is a numeric column; a label that is missing
# from the mapping becomes NaN instead of silently remaining a string.
students['parental level of education'] = students['parental level of education'].map(
    {
        "some high school": 1,
        "high school": 2,
        "some college": 3,
        "associate's degree": 4,
        "bachelor's degree": 5,
        "master's degree": 6,
    }
)
# 'race/ethnicity' holds nominal values: there is no ranking and no way to do
# arithmetic with them. They could be expanded into five dummy columns, but
# that is not needed here, so for simplicity only the group letter is kept.
#
# The replacement values must be string literals; bare names such as A or B
# are undefined and raise NameError. The result is assigned back to the column
# because a chained replace(..., inplace=True) does not update the DataFrame
# when pandas Copy-on-Write is active.
students['race/ethnicity'] = students['race/ethnicity'].replace(
    {
        "group A": "A",
        "group B": "B",
        "group C": "C",
        "group D": "D",
        "group E": "E",
    }
)
# 'lunch' is a binary variable, encoded as 1 (standard) and 0 (free/reduced).
# The encoded column is assigned back explicitly: a chained
# replace(..., inplace=True) on students[...] does not update the DataFrame
# when pandas Copy-on-Write is active. map() yields a numeric column; a label
# missing from the mapping becomes NaN.
students['lunch'] = students['lunch'].map({"standard": 1, "free/reduced": 0})
# 'gender' is a binary variable, encoded as 1 (female) and 0 (male). The
# column header is renamed to 'Female' further down for clarity, so a 0
# means male.
# The encoded column is assigned back explicitly: a chained
# replace(..., inplace=True) on students[...] does not update the DataFrame
# when pandas Copy-on-Write is active. map() yields a numeric column; a label
# missing from the mapping becomes NaN.
students['gender'] = students['gender'].map({"female": 1, "male": 0})
# 'test preparation course' is also binary, encoded as 1 (completed) and
# 0 (none).
# The encoded column is assigned back explicitly: a chained
# replace(..., inplace=True) on students[...] does not update the DataFrame
# when pandas Copy-on-Write is active. map() yields a numeric column; a label
# missing from the mapping becomes NaN.
students['test preparation course'] = students['test preparation course'].map(
    {"completed": 1, "none": 0}
)
# The score columns hold ratio-scale values and are left as they are.
# Rename the headers so they are easier to work with.
mapping = {
    'gender': 'Female',
    'race/ethnicity': 'Ethnic_Group',
    'parental level of education': 'Prnt_Lev_Edu',
    'lunch': 'Lunch',
    'test preparation course': 'Test_Prep_Course',
    'math score': 'Math_Score',
    'reading score': 'Reading_Score',
    'writing score': 'Writing_Score',
}
students.rename(mapper=mapping, axis='columns', inplace=True)
# Draw a random subset of 500 students from the dataset (no seed is set, so
# the subset differs between runs).
random_subset = students.sample(500)
# Transposed summary statistics of the subset; the result is not stored.
random_subset.describe().T
# Histograms of the total score and the three subject scores on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
score_columns = ['School_Score', 'Writing_Score', 'Reading_Score', 'Math_Score']
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, column in zip(axes.flat, score_columns):
    sns.histplot(random_subset[column], bins=15, ax=ax)
    # Set the title on the subplot itself. plt.title() acts on the *current*
    # axes (the last subplot created), so it would leave the other panels
    # untitled.
    ax.set_title(f'{column} Distribution')
# Count the students in each 'Female' category; the result is not stored.
random_subset['Female'].value_counts()
# Independent two-sample t-test on School_Score between Female == 1 and
# Female == 0, without assuming equal variances (equal_var=False).
female_scores = random_subset.loc[random_subset['Female'] == 1, 'School_Score']
male_scores = random_subset.loc[random_subset['Female'] == 0, 'School_Score']
stats.ttest_ind(female_scores, male_scores, equal_var=False)
# Split the subset by the 'Female' indicator (0 = man, 1 = vrouw).
man = random_subset.loc[random_subset['Female'] == 0]
vrouw = random_subset.loc[random_subset['Female'] == 1]
# One School_Score histogram per group, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
sns.histplot(man['School_Score'], bins=15, ax=axes[0])
# Titles are set on each subplot directly. plt.title() acts on the current
# axes (the last subplot created), so both titles would land on the bottom
# panel and the first would be overwritten.
axes[0].set_title('Man School_Score Distribution')
sns.histplot(vrouw['School_Score'], bins=15, ax=axes[1])
axes[1].set_title('Vrouw School_Score Distribution')
# researchpy t-test on School_Score: group "Vrouw" (Female == 1) against
# group "Man" (Female == 0). The result is not stored.
school_score = random_subset['School_Score']
female_mask = random_subset['Female'] == 1
male_mask = random_subset['Female'] == 0
rp.ttest(
    group1=school_score[female_mask],
    group1_name="Vrouw",
    group2=school_score[male_mask],
    group2_name="Man",
)
# Split the subset by ethnic group. 'Ethnic_Group' holds the group letters
# ('A'..'E'), not numbers, so comparing against 1..5 would select no rows.
Gr1 = random_subset.loc[random_subset['Ethnic_Group'] == 'A']
Gr2 = random_subset.loc[random_subset['Ethnic_Group'] == 'B']
Gr3 = random_subset.loc[random_subset['Ethnic_Group'] == 'C']
Gr4 = random_subset.loc[random_subset['Ethnic_Group'] == 'D']
Gr5 = random_subset.loc[random_subset['Ethnic_Group'] == 'E']
# School_Score histogram per group, plus the whole subset in the last panel.
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
panels = [
    (Gr1, 'Groep 1 School_Score Distribution'),
    (Gr2, 'Groep 2 School_Score Distribution'),
    (Gr3, 'Groep 3 School_Score Distribution'),
    (Gr4, 'Groep 4 School_Score Distribution'),
    (Gr5, 'Groep 5 School_Score Distribution'),
    (random_subset, 'School_Score Distribution'),
]
# axes.flat walks the grid row by row, matching the panel order above.
for ax, (frame, title) in zip(axes.flat, panels):
    sns.histplot(frame['School_Score'], bins=15, ax=ax)
    # Set the title on the subplot itself. plt.title() acts on the current
    # axes (the last subplot created), so only one panel would get a title.
    ax.set_title(title)
# Fit an OLS model of School_Score on the categorical Ethnic_Group factor and
# print the ANOVA table for it.
model = sfa.ols('School_Score~ C(Ethnic_Group)', data=random_subset)
lm = model.fit()
anova = sa.stats.anova_lm(lm)
print(anova)
import statsmodels.stats.multicomp as mc
# Tukey HSD post-hoc comparison of School_Score between the ethnic groups.
comp = mc.MultiComparison(random_subset['School_Score'], random_subset['Ethnic_Group'])
post_hoc_res = comp.tukeyhsd()
# Summary table of the pairwise comparisons; the result is not stored.
post_hoc_res.summary()
# Plot from the result already computed instead of running the Tukey HSD
# procedure a second time.
post_hoc_res.plot_simultaneous(ylabel="School_Score", xlabel="Ethnic_Group")
plt.show()
# Cross-tabulate Lunch against Ethnic_Group with a chi-square test
# (the result is not stored).
lunch = random_subset['Lunch']
ethnic_group = random_subset['Ethnic_Group']
rp.crosstab(lunch, ethnic_group, test="chi-square")
# Regression 1: School_Score on the three subject scores.
subject_scores = random_subset[['Math_Score', 'Writing_Score', 'Reading_Score']]
lm = pg.linear_regression(subject_scores, random_subset['School_Score'])
lm.round(decimals=4)

# Regression 2: Math_Score on writing score, reading score and parental
# education level.
math_predictors = random_subset[['Writing_Score', 'Reading_Score', 'Prnt_Lev_Edu']]
lm = pg.linear_regression(math_predictors, random_subset['Math_Score'])
lm.round(decimals=3)

# Regression 3: Math_Score on parental education level alone.
parental_education = random_subset['Prnt_Lev_Edu']
lm = pg.linear_regression(parental_education, random_subset['Math_Score'])
lm.round(decimals=3)