Student Actionable Insights
Build a report of actionable insights using modeling and data analysis
Model student behavior with XGBoost and predict struggling/at-risk students
Explore the student data and identify what makes a struggling student different from a successful one
Create a Tableau dashboard to present the results
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
# load the two UCI Student Performance files (semicolon-separated)
student_mat_df = pd.read_csv("student-mat.csv", sep=';')
student_mat_df.head()
student_por_df=pd.read_csv("student-por.csv",sep=';')
student_por_df.head()
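Side note: many students appear in both files; a minimal sketch for matching them, assuming the identifying columns listed in the UCI dataset documentation (both_courses is a hypothetical name):
# assumption: these identifying columns come from the UCI dataset's merge script
id_cols = ["school", "sex", "age", "address", "famsize", "Pstatus",
           "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet"]
both_courses = student_mat_df.merge(student_por_df, on=id_cols,
                                    suffixes=("_mat", "_por"))
print(len(both_courses), "students appear in both courses")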
# share of past class failures by student age
fig = px.pie(student_por_df, values='failures', names='age',
             title='Share of past class failures by age',
             hover_data=['G1'], labels={'G1': 'first-period grade'})
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# G3 is the final grade (0-20) in each course
por_final_grades = student_por_df['G3']
por_final_grades.describe()
mat_final_grades = student_mat_df['G3']
mat_final_grades.describe()
hist1 = por_final_grades.plot.hist(title="Student grades for the Portuguese course")
hist1.set_ylabel("Number of students", labelpad=20, weight='bold', size=12)
hist1.set_xlabel("Grade", labelpad=20, weight='bold', size=12)
plt.figure()  # start a new figure so the second histogram does not draw over the first
hist2 = mat_final_grades.plot.hist(title="Student grades for the math course")
hist2.set_ylabel("Number of students", labelpad=20, weight='bold', size=12)
hist2.set_xlabel("Grade", labelpad=20, weight='bold', size=12)
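An optional overlay puts the two final-grade distributions on one axis for direct comparison (a sketch; the bin edges are an arbitrary choice):
plt.figure()
plt.hist(por_final_grades, bins=range(0, 22), alpha=0.5, label='Portuguese')
plt.hist(mat_final_grades, bins=range(0, 22), alpha=0.5, label='Math')
plt.xlabel("Grade")
plt.ylabel("Number of students")
plt.legend()
plt.show()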
student_por_df
# select the categorical (non-integer) columns
cat_var = student_por_df[student_por_df.columns[student_por_df.dtypes != 'int64']]
cat_var
# some information about the categorical variables
for i in cat_var.columns:
    print('variable:\t', i,
          '\nvalues:\t', set(student_por_df[i]),
          '\ncounts:\t', student_por_df[i].value_counts().to_dict())
for i in cat_var.columns:
    # one-hot encode each categorical column, e.g. 'romantic' becomes
    # 'romantic_yes' and 'romantic_no', then concatenate with the data
    dum = pd.get_dummies(student_por_df[i]).rename(columns=lambda x: i + '_' + str(x))
    student_por_df = pd.concat([student_por_df, dum], axis=1)
student_por_df
# remove the original categorical columns
student_por_df = student_por_df.drop(columns=cat_var.columns)
student_por_df.head()
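For reference, the encoding loop above collapses to a single call; a sketch applied to a fresh copy of the raw file so it does not double-encode the frame we just built (raw and encoded are hypothetical names):
raw = pd.read_csv("student-por.csv", sep=';')
# pd.get_dummies with columns= produces the same 'column_value' names
# as the loop above and drops the original columns automatically
encoded = pd.get_dummies(raw, columns=raw.select_dtypes('object').columns)
encoded.head()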
# separate the dependent and the independent variables
# note: G1 and G2 (period grades) stay in X, so the model can lean heavily on earlier grades
Y = student_por_df['G3']
X = student_por_df.drop(columns='G3')
# create an XGBoost model
# run an XGBoost regression model (G3 is a numeric grade) and check the fit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
import xgboost as xgb
xgb_params = {
    'eta': 0.01,
    'max_depth': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',  # 'reg:linear' is the deprecated alias
    'seed': 0
}
dtrain = xgb.DMatrix(X_train, y_train, feature_names=list(X.columns))
dtest = xgb.DMatrix(X_test, y_test, feature_names=list(X.columns))
evals = [(dtrain,'train'),(dtest,'eval')]
xgb_model = xgb.train(params=xgb_params,
                      dtrain=dtrain,
                      num_boost_round=2000,
                      verbose_eval=50,
                      early_stopping_rounds=500,
                      evals=evals,
                      maximize=False)
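Before trusting a single train/test split, xgb.cv can sanity-check the boosting-round choice; a minimal sketch with the same parameters (the 5-fold setting is an assumption):
cv_results = xgb.cv(params=xgb_params,
                    dtrain=dtrain,
                    num_boost_round=2000,
                    nfold=5,                  # assumption: 5 folds
                    metrics='rmse',
                    early_stopping_rounds=50,
                    seed=0)
print('best CV RMSE:', cv_results['test-rmse-mean'].min())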
# plot the important features
fig, ax = plt.subplots(figsize=(6,9))
xgb.plot_importance(xgb_model, height=0.8, ax=ax, max_num_features=20)
plt.show()
xgb_impor = pd.DataFrame(xgb_model.get_fscore().items(),
                         columns=['feature', 'importance']).sort_values('importance', ascending=False)
xgb_impor.head(10)
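get_fscore() counts how often a feature is split on ('weight'); average gain is often a better measure of how much a feature actually improves the fit, so it is worth cross-checking (a sketch):
gain_impor = pd.DataFrame(xgb_model.get_score(importance_type='gain').items(),
                          columns=['feature', 'gain']).sort_values('gain', ascending=False)
gain_impor.head(10)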
# predict on the held-out test set
preds = xgb_model.predict(dtest)
preds
preds_df = pd.DataFrame(preds, columns=['predicted_G3'])
preds_df
X_test.shape
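A quick error check on the held-out set quantifies how far the predictions are from the true grades (a sketch, not a full evaluation):
from sklearn.metrics import mean_absolute_error, mean_squared_error
print('MAE :', mean_absolute_error(y_test, preds))
print('RMSE:', mean_squared_error(y_test, preds) ** 0.5)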
# take the 40 most important features as the comparison set
key_features = list(xgb_impor['feature'].values[0:40])
key_features
# students whose actual final grade 'G3' is below 10 (i.e. failing)
students_with_pro = X_test[y_test < 10]
# inspect the likely causes of the poor score
for i, r in students_with_pro.iterrows():
    print('Student Id', i)
    # compare the student's features to the rest of the class
    for feat in key_features:
        if r[feat] < student_por_df[feat].quantile(0.25):
            print('\t', 'Below:', feat, r[feat], 'Class:',
                  np.round(np.mean(student_por_df[feat]), 2))
        if r[feat] > student_por_df[feat].quantile(0.75):
            print('\t', 'Above:', feat, r[feat], 'Class:',
                  np.round(np.mean(student_por_df[feat]), 2))
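The per-student printout is hard to scan across the whole group; a sketch that tallies how often each key feature flags low or high, using the same 25%/75% cut-offs (q25, below_counts, etc. are hypothetical names):
q25 = student_por_df[key_features].quantile(0.25)
q75 = student_por_df[key_features].quantile(0.75)
below_counts = (students_with_pro[key_features] < q25).sum().sort_values(ascending=False)
above_counts = (students_with_pro[key_features] > q75).sum().sort_values(ascending=False)
print(below_counts.head(10))
print(above_counts.head(10))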
# start with features where the student falls below the class's 25th percentile
limit = 0.25
for i, r in students_with_pro.iterrows():
    Student_Id = i
    important_low_features = []
    # collect the features below the 25th-percentile cut-off
    for feat in key_features:
        if r[feat] < student_por_df[feat].quantile(limit):
            important_low_features.append(feat)
    # build a small comparison frame: the at-risk student vs the class mean
    at_risk_student = pd.DataFrame(r[important_low_features]).T
    at_risk_student['Retention_Risk'] = True
    student_mean = pd.DataFrame(student_por_df[important_low_features].mean(axis=0)).T
    student_mean['Retention_Risk'] = False
    student_profile = pd.concat([at_risk_student, student_mean])
    student_profile = pd.melt(student_profile, id_vars="Retention_Risk")
    var1 = f"Student Id : {Student_Id}"
    fig = px.bar(student_profile, x="variable", y="value", color="Retention_Risk", barmode="group")
    fig.update_layout(title_text=var1)
    fig.show()
# now the mirror image: features where the student is above the class's 75th percentile
limit = 0.75
for i, r in students_with_pro.iterrows():
    Student_Id = i
    important_high_features = []
    # collect the features above the 75th-percentile cut-off
    for feat in key_features:
        if r[feat] > student_por_df[feat].quantile(limit):
            important_high_features.append(feat)
    # build a small comparison frame: the at-risk student vs the class mean
    at_risk_student = pd.DataFrame(r[important_high_features]).T
    at_risk_student['Retention_Risk'] = True
    student_mean = pd.DataFrame(student_por_df[important_high_features].mean(axis=0)).T
    student_mean['Retention_Risk'] = False
    student_profile = pd.concat([at_risk_student, student_mean])
    student_profile = pd.melt(student_profile, id_vars="Retention_Risk")
    var1 = f"Student Id : {Student_Id}"
    fig = px.bar(student_profile, x="variable", y="value", color="Retention_Risk", barmode="group")
    fig.update_layout(title_text=var1)
    fig.show()
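Finally, to feed the Tableau dashboard mentioned at the top, the predictions and risk flags can be written out to a flat file (the file name and added column names are assumptions):
export_df = X_test.copy()
export_df['actual_G3'] = y_test
export_df['predicted_G3'] = preds
export_df['at_risk'] = y_test < 10
export_df.to_csv('student_risk_report.csv', index=True)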