Studentenprestaties

import math
import os
import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import researchpy as rp
import scikit_posthocs as sp
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sa
import statsmodels.formula.api as sfa

# Load the dataset.
students = pd.read_csv('StudentsPerformance.csv')

# To measure overall school results, add a column holding the sum of the
# math, reading and writing scores.
students['School_Score'] = (
    students['math score']
    .add(students["reading score"])
    .add(students['writing score'])
)

# Check for the presence of null values.
students.info()
# Author's observation: the dataset has 1000 rows and no null values.

# Make the categorical values numeric.
#
# Every recoding below assigns its result back to the column. Calling
# replace(..., inplace=True) on a column selection operates on an intermediate
# object and does not reliably update the DataFrame itself.
#
# The numeric recodings use map() so the resulting columns get a numeric
# dtype; a value that is missing from a mapping becomes NaN instead of staying
# behind as a string.

# 'parental level of education' holds ordinal categories, so they can be
# ranked: recode them as 1 through 6, from lowest to highest education.
students['parental level of education'] = students['parental level of education'].map(
    {
        "master's degree": 6,
        "bachelor's degree": 5,
        "associate's degree": 4,
        "some college": 3,
        "high school": 2,
        "some high school": 1,
    }
)

# 'race/ethnicity' holds nominal values: there is no ranking and nothing to
# calculate with. It could be expanded into five dummy columns, but that is
# not needed here, so for simplicity only the group letter is kept.
# The replacement values must be string literals; bare names A..E are
# undefined and raise NameError.
students['race/ethnicity'] = students['race/ethnicity'].replace(
    {
        "group A": "A",
        "group E": "E",
        "group D": "D",
        "group C": "C",
        "group B": "B",
    }
)

# 'lunch' is binary: standard -> 1, free/reduced -> 0.
students['lunch'] = students['lunch'].map({"standard": 1, "free/reduced": 0})

# 'gender' is binary. The column is renamed to 'Female' below for clarity,
# so a 0 automatically means male.
students['gender'] = students['gender'].map({"female": 1, "male": 0})

# 'test preparation course' is binary as well: completed -> 1, none -> 0.
students['test preparation course'] = students['test preparation course'].map(
    {"completed": 1, "none": 0}
)

# The score columns hold ratio-scale values and are left as they are.

# Rename the headers for easier use.
mapping = {
    'race/ethnicity': 'Ethnic_Group',
    'parental level of education': 'Prnt_Lev_Edu',
    'test preparation course': 'Test_Prep_Course',
    'reading score': 'Reading_Score',
    'math score': 'Math_Score',
    'writing score': 'Writing_Score',
    'lunch': 'Lunch',
    'gender': 'Female',
}
students.rename(columns=mapping, inplace=True)

# Draw a random subset of 500 students from the dataset.
# NOTE(review): no random_state is passed, so every run draws a different
# subset and the statistics below are not reproducible between runs.
random_subset = students.sample(n=500)

# Summary statistics of the sample, one row per column.
summary_stats = random_subset.describe()
summary_stats.T

# Histograms of the total score and of the three individual scores.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Panels in row-major order: top-left, top-right, bottom-left, bottom-right.
score_columns = ['School_Score', 'Writing_Score', 'Reading_Score', 'Math_Score']

for ax, column in zip(axes.flat, score_columns):
    sns.histplot(random_subset[column], bins=15, ax=ax)
    # Title the panel through its own Axes: plt.title() only addresses the
    # current axes, so every title would end up on the last subplot.
    ax.set_title(f'{column} Distribution')

# Number of students per gender code in the sample (1 = female, 0 = male).
gender_codes = random_subset['Female']
gender_codes.value_counts()

# Independent two-sample t-test on School_Score, female versus male.
# equal_var=False selects Welch's variant, which does not assume equal
# variances in the two groups.
female_school_scores = random_subset['School_Score'][random_subset['Female'] == 1]
male_school_scores = random_subset['School_Score'][random_subset['Female'] == 0]
stats.ttest_ind(female_school_scores, male_school_scores, equal_var=False)

# Split the sample by gender code (0 = male, 1 = female).
man = random_subset.loc[random_subset['Female'] == 0]
vrouw = random_subset.loc[random_subset['Female'] == 1]

# One School_Score histogram per gender, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

panels = [
    (man, 'Man School_Score Distribution'),
    (vrouw, 'Vrouw School_Score Distribution'),
]
for ax, (group, title) in zip(axes, panels):
    sns.histplot(group['School_Score'], bins=15, ax=ax)
    # Title the panel through its own Axes: plt.title() only addresses the
    # current axes, so both titles would end up on the same subplot.
    ax.set_title(title)

# The same female-versus-male comparison of School_Score, this time through
# researchpy's ttest with the groups labelled "Vrouw" and "Man".
scores_vrouw = random_subset['School_Score'][random_subset['Female'] == 1]
scores_man = random_subset['School_Score'][random_subset['Female'] == 0]
rp.ttest(
    group1=scores_vrouw,
    group1_name="Vrouw",
    group2=scores_man,
    group2_name="Man",
)

# Split the sample per ethnic group. Ethnic_Group is recoded earlier in this
# file to the bare group letters, so the groups are selected by letter:
# comparing against the numbers 1-5 matches no rows and leaves every group
# empty. Groep 1..5 correspond to A..E.
Gr1, Gr2, Gr3, Gr4, Gr5 = (
    random_subset.loc[random_subset['Ethnic_Group'] == letter]
    for letter in ('A', 'B', 'C', 'D', 'E')
)

# One School_Score histogram per group, plus the whole sample for reference.
fig, axes = plt.subplots(3, 2, figsize=(15, 10))

panels = [
    (Gr1, 'Groep 1 School_Score Distribution'),
    (Gr2, 'Groep 2 School_Score Distribution'),
    (Gr3, 'Groep 3 School_Score Distribution'),
    (Gr4, 'Groep 4 School_Score Distribution'),
    (Gr5, 'Groep 5 School_Score Distribution'),
    (random_subset, 'School_Score Distribution'),
]
# axes.flat walks the grid row by row, matching the panel order above.
for ax, (group, title) in zip(axes.flat, panels):
    sns.histplot(group['School_Score'], bins=15, ax=ax)
    # Title the panel through its own Axes: plt.title() only addresses the
    # current axes, so every title would end up on the last subplot.
    ax.set_title(title)

# One-way ANOVA: School_Score explained by Ethnic_Group, which C() marks as a
# categorical factor in the formula.
ols_model = sfa.ols('School_Score~ C(Ethnic_Group)', data=random_subset)
lm = ols_model.fit()

anova = sa.stats.anova_lm(lm)
print(anova)

from statsmodels.stats import multicomp as mc

# Tukey HSD post-hoc test: pairwise comparison of School_Score between the
# ethnic groups.
comp = mc.MultiComparison(
    data=random_subset['School_Score'],
    groups=random_subset['Ethnic_Group'],
)
post_hoc_res = comp.tukeyhsd()
post_hoc_res.summary()

# Plot the Tukey HSD result per ethnic group.
tukey_result = comp.tukeyhsd()
tukey_result.plot_simultaneous(xlabel="Ethnic_Group", ylabel="School_Score")
plt.show()

# Cross-tabulate lunch type against ethnic group and run a chi-square test
# on the resulting table.
lunch_codes = random_subset['Lunch']
ethnic_groups = random_subset['Ethnic_Group']
rp.crosstab(lunch_codes, ethnic_groups, test="chi-square")

# Linear regression of School_Score on the three individual scores.
# NOTE(review): School_Score is built earlier in this file as the sum of these
# three columns, so this fit should be exact by construction.
score_predictors = random_subset[['Math_Score', 'Writing_Score', 'Reading_Score']]
school_score = random_subset['School_Score']

lm = pg.linear_regression(score_predictors, school_score)
lm.round(4)

# Linear regression of Math_Score on the writing score, the reading score and
# the parental education level.
math_predictors = random_subset[['Writing_Score', 'Reading_Score', 'Prnt_Lev_Edu']]
math_score = random_subset['Math_Score']

lm = pg.linear_regression(math_predictors, math_score)
lm.round(3)

# Simple linear regression of Math_Score on the parental education level only.
parent_education = random_subset['Prnt_Lev_Edu']
math_outcome = random_subset['Math_Score']

lm = pg.linear_regression(parent_education, math_outcome)
lm.round(3)

Studentenprestaties

Studentenprestaties