Studentenprestaties
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import pingouin as pg
import math
import statistics
import researchpy as rp
import scipy.stats as stats
import statsmodels.api as sa
import statsmodels.formula.api as sfa
import scikit_posthocs as sp
# Load the student-performance dataset from the working directory.
students = pd.read_csv('StudentsPerformance.csv')

# To measure overall school results, add a column holding the sum of the
# math, reading and writing scores.
math_score, reading_score, writing_score = (
    students[column]
    for column in ('math score', 'reading score', 'writing score')
)
students['School_Score'] = math_score + reading_score + writing_score

# Check for the presence of null values.
students.info()
# There are 1000 rows in the dataset, without null values.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 1000 non-null object
1 race/ethnicity 1000 non-null object
2 parental level of education 1000 non-null object
3 lunch 1000 non-null object
4 test preparation course 1000 non-null object
5 math score 1000 non-null int64
6 reading score 1000 non-null int64
7 writing score 1000 non-null int64
8 School_Score 1000 non-null int64
dtypes: int64(4), object(5)
memory usage: 70.4+ KB
# Make the values numeric.
# The column 'parental level of education' holds text labels, but they can be
# ranked, so they are recoded to 1 through 6, from low to high education.
education_rank = {
    "some high school": 1,
    "high school": 2,
    "some college": 3,
    "associate's degree": 4,
    "bachelor's degree": 5,
    "master's degree": 6,
}
# Assign the recoded column back instead of calling replace(..., inplace=True)
# on `students[...]`: that form is a chained in-place update, which pandas
# deprecates and which does not modify `students` under Copy-on-Write.
# map() expects the original text labels; astype('int64') then fails loudly
# if any value was not covered by the mapping (map() turns those into NaN).
students['parental level of education'] = (
    students['parental level of education'].map(education_rank).astype('int64')
)
# The ethnic-group column holds nominal values, so there is no rank order and
# no way to do arithmetic with it. It could be expanded into five dummy
# columns for calculations, but that is not needed here.
# For simplicity each group is recoded to an integer label (A=1 ... E=5).
# The mapping values used to be the bare names A..E, which are undefined and
# raise NameError; the per-group selections further on in this file filter on
# the codes 1 through 5. The numbers are identifiers only, not a ranking.
ethnic_group_code = {
    "group A": 1,
    "group B": 2,
    "group C": 3,
    "group D": 4,
    "group E": 5,
}
# Assign back rather than using a chained replace(..., inplace=True), which
# pandas deprecates and which has no effect under Copy-on-Write.
# astype('int64') fails loudly if a label was missing from the mapping.
students['race/ethnicity'] = (
    students['race/ethnicity'].map(ethnic_group_code).astype('int64')
)
# The columns 'lunch', 'gender' and 'test preparation course' each hold a
# binary value, so they are recoded to 0 and 1.
# For 'gender' the 1 stands for female (0 therefore means male); the column
# is renamed to Female further on for clarity.
binary_codes = {
    'lunch': {"standard": 1, "free/reduced": 0},
    'gender': {"female": 1, "male": 0},
    'test preparation course': {"completed": 1, "none": 0},
}
for column, codes in binary_codes.items():
    # Assign back rather than using a chained replace(..., inplace=True),
    # which pandas deprecates and which has no effect under Copy-on-Write.
    # astype('int64') fails loudly if a value was missing from the mapping.
    students[column] = students[column].map(codes).astype('int64')
# The score columns hold ratio-scale values; they are left as they are.
# Rename the headers so they are easier to use (no spaces or slashes).
mapping = {
    'gender': 'Female',
    'race/ethnicity': 'Ethnic_Group',
    'parental level of education': 'Prnt_Lev_Edu',
    'lunch': 'Lunch',
    'test preparation course': 'Test_Prep_Course',
    'math score': 'Math_Score',
    'reading score': 'Reading_Score',
    'writing score': 'Writing_Score',
}
students.rename(columns=mapping, inplace=True)
# Take a random subset of 500 students from the dataset.
# The seed is fixed so that the same rows are drawn on every run; without it
# each run samples different students and every statistic below changes.
random_subset = students.sample(n=500, random_state=42)
# Summary statistics, transposed so that each variable is a row.
random_subset.describe().transpose()
countfloat64
500.0 - 500.0
meanfloat64
0.368 - 204.14
Female
500
0.526
Ethnic_Group
500
3.142
Prnt_Lev_Edu
500
3.094
Lunch
500
0.666
Test_Prep_Course
500
0.368
Math_Score
500
66.338
Reading_Score
500
69.428
Writing_Score
500
68.374
School_Score
500
204.14
# Histograms of the total score and the three subject scores in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
score_columns = ['School_Score', 'Writing_Score', 'Reading_Score', 'Math_Score']
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, column in zip(axes.flat, score_columns):
    sns.histplot(random_subset[column], bins=15, ax=ax)
    # Title the panel through its own Axes. plt.title() only targets the
    # current axes, so it left all but the last panel untitled.
    ax.set_title(f'{column} Distribution')
# Group sizes for the gender indicator (1 = female, 0 = male).
random_subset['Female'].value_counts()

# Two-sample t-test on School_Score, female versus male, without assuming
# equal variances (equal_var=False).
is_female = random_subset['Female'] == 1
is_male = random_subset['Female'] == 0
stats.ttest_ind(
    random_subset['School_Score'][is_female],
    random_subset['School_Score'][is_male],
    equal_var=False,
)
# Split the subset by gender (Female == 0 is male, Female == 1 is female).
man = random_subset.loc[random_subset['Female'] == 0]
vrouw = random_subset.loc[random_subset['Female'] == 1]

# One School_Score histogram per gender, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
for ax, (label, group) in zip(axes, [('Man', man), ('Vrouw', vrouw)]):
    sns.histplot(group['School_Score'], bins=15, ax=ax)
    # Title the panel through its own Axes. plt.title() only targets the
    # current axes, so it left the top panel untitled.
    ax.set_title(f'{label} School_Score Distribution')
# researchpy t-test report on School_Score, female ("Vrouw") versus male
# ("Man").
vrouw_scores = random_subset.loc[random_subset['Female'] == 1, 'School_Score']
man_scores = random_subset.loc[random_subset['Female'] == 0, 'School_Score']
rp.ttest(
    group1=vrouw_scores,
    group1_name="Vrouw",
    group2=man_scores,
    group2_name="Man",
)
# One sub-frame per ethnic group; Ethnic_Group is compared against the
# integer codes 1 through 5.
Gr1, Gr2, Gr3, Gr4, Gr5 = (
    random_subset.loc[random_subset['Ethnic_Group'] == code]
    for code in range(1, 6)
)

# School_Score histograms: one panel per group plus the whole subset.
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
panels = [
    ('Groep 1 School_Score Distribution', Gr1),
    ('Groep 2 School_Score Distribution', Gr2),
    ('Groep 3 School_Score Distribution', Gr3),
    ('Groep 4 School_Score Distribution', Gr4),
    ('Groep 5 School_Score Distribution', Gr5),
    ('School_Score Distribution', random_subset),
]
# axes.flat walks the 3x2 grid row by row, matching the panel order above.
for ax, (title, frame) in zip(axes.flat, panels):
    sns.histplot(frame['School_Score'], bins=15, ax=ax)
    # Title the panel through its own Axes. plt.title() only targets the
    # current axes, so all six titles landed on the last panel.
    ax.set_title(title)
# One-way ANOVA of School_Score across ethnic groups: fit an OLS model with
# Ethnic_Group as a categorical factor, then print its ANOVA table.
ethnic_group_model = sfa.ols('School_Score~ C(Ethnic_Group)', data=random_subset)
lm = ethnic_group_model.fit()
anova = sa.stats.anova_lm(lm)
print(anova)
df sum_sq mean_sq F PR(>F)
C(Ethnic_Group) 4.0 21473.210353 5368.302588 3.03854 0.017118
Residual 495.0 874534.989647 1766.737353 NaN NaN
import statsmodels.stats.multicomp as mc

# Tukey HSD post-hoc comparison of School_Score between the ethnic groups.
comp = mc.MultiComparison(random_subset['School_Score'], random_subset['Ethnic_Group'])
post_hoc_res = comp.tukeyhsd()
post_hoc_res.summary()
# Plot from the result already computed instead of running tukeyhsd() again.
post_hoc_res.plot_simultaneous(ylabel="School_Score", xlabel="Ethnic_Group")
plt.show()
# Cross-tabulate Lunch against Ethnic_Group and request a chi-square test.
lunch = random_subset['Lunch']
ethnic_group = random_subset['Ethnic_Group']
rp.crosstab(lunch, ethnic_group, test="chi-square")
# Regress School_Score on the three subject scores. School_Score was built
# as their exact sum, which is why the recorded output shows an intercept of
# 0 and a coefficient of 1 for every predictor.
component_scores = random_subset[['Math_Score', 'Writing_Score', 'Reading_Score']]
lm = pg.linear_regression(component_scores, random_subset['School_Score'])
lm.round(4)
namesobject
coeffloat64
0
Intercept
0
1
Math_Score
1
2
Writing_Score
1
3
Reading_Score
1
# Regress Math_Score on the writing score, the reading score and the
# parental education level.
math_predictors = random_subset[['Writing_Score', 'Reading_Score', 'Prnt_Lev_Edu']]
lm = pg.linear_regression(math_predictors, random_subset['Math_Score'])
lm.round(3)
namesobject
coeffloat64
0
Intercept
10.282
1
Writing_Score
0.251
2
Reading_Score
0.572
3
Prnt_Lev_Edu
-0.274
# Simple regression of Math_Score on parental education level alone.
parental_education = random_subset['Prnt_Lev_Edu']
math_scores = random_subset['Math_Score']
lm = pg.linear_regression(parental_education, math_scores)
lm.round(3)
namesobject
coeffloat64
0
Intercept
62.145
1
Prnt_Lev_Edu
1.355