Student Actionable Insights
Build a report of actionable insights using modeling and data analysis
Model student behavior with XGBoost and predict struggling/at-risk students
Explore the student data and identify what makes a struggling student different from a successful one
Create a Tableau dashboard to present the results
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
# load the two UCI Student Performance files (semicolon-separated)
student_mat_df = pd.read_csv("student-mat.csv", sep=';')
student_mat_df.head()
student_por_df=pd.read_csv("student-por.csv",sep=';')
student_por_df.head()
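Side note: many students appear in both files; a minimal sketch for matching them, assuming the identifying columns listed in the UCI dataset documentation (both_courses is a hypothetical name):
# assumption: these identifying columns come from the UCI dataset's merge script
id_cols = ["school", "sex", "age", "address", "famsize", "Pstatus",
           "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet"]
both_courses = student_mat_df.merge(student_por_df, on=id_cols,
                                    suffixes=("_mat", "_por"))
print(len(both_courses), "students appear in both courses")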
# share of past class failures by student age
fig = px.pie(student_por_df, values='failures', names='age',
             title='Share of past class failures by age',
             hover_data=['G1'], labels={'G1': 'first-period grade'})
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# G3 is the final grade (0-20) in each course
por_final_grades = student_por_df['G3']
por_final_grades.describe()
mat_final_grades = student_mat_df['G3']
mat_final_grades.describe()
hist1 = por_final_grades.plot.hist(title="Student grades for the Portuguese course")
hist1.set_ylabel("Number of students", labelpad=20, weight='bold', size=12)
hist1.set_xlabel("Grade", labelpad=20, weight='bold', size=12)
plt.figure()  # start a new figure so the second histogram does not draw over the first
hist2 = mat_final_grades.plot.hist(title="Student grades for the math course")
hist2.set_ylabel("Number of students", labelpad=20, weight='bold', size=12)
hist2.set_xlabel("Grade", labelpad=20, weight='bold', size=12)
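An optional overlay puts the two final-grade distributions on one axis for direct comparison (a sketch; the bin edges are an arbitrary choice):
plt.figure()
plt.hist(por_final_grades, bins=range(0, 22), alpha=0.5, label='Portuguese')
plt.hist(mat_final_grades, bins=range(0, 22), alpha=0.5, label='Math')
plt.xlabel("Grade")
plt.ylabel("Number of students")
plt.legend()
plt.show()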
student_por_df
# select the categorical (non-integer) columns
cat_var = student_por_df[student_por_df.columns[student_por_df.dtypes != 'int64']]
cat_var
# some information about the categorical variables
for i in cat_var.columns:
    print('variable:\t', i,
          '\nvalues:\t', set(student_por_df[i]),
          '\ncounts:\t', student_por_df[i].value_counts().to_dict())
for i in cat_var.columns:
    # one-hot encode each categorical column, e.g. 'romantic' becomes
    # 'romantic_yes' and 'romantic_no', then concatenate with the data
    dum = pd.get_dummies(student_por_df[i]).rename(columns=lambda x: i + '_' + str(x))
    student_por_df = pd.concat([student_por_df, dum], axis=1)
student_por_df
# remove the original categorical columns
student_por_df = student_por_df.drop(columns=cat_var.columns)
student_por_df.head()
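For reference, the encoding loop above collapses to a single call; a sketch applied to a fresh copy of the raw file so it does not double-encode the frame we just built (raw and encoded are hypothetical names):
raw = pd.read_csv("student-por.csv", sep=';')
# pd.get_dummies with columns= produces the same 'column_value' names
# as the loop above and drops the original columns automatically
encoded = pd.get_dummies(raw, columns=raw.select_dtypes('object').columns)
encoded.head()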
# separate the dependent and the independent variables
# note: G1 and G2 (period grades) stay in X, so the model can lean heavily on earlier grades
Y = student_por_df['G3']
X = student_por_df.drop(columns='G3')
# create an XGBoost model
# run an XGBoost regression model (G3 is a numeric grade) and check the fit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
import xgboost as xgb
xgb_params = {
    'eta': 0.01,
    'max_depth': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',  # 'reg:linear' is the deprecated alias
    'seed': 0
}
dtrain = xgb.DMatrix(X_train, y_train, feature_names=list(X.columns))
dtest = xgb.DMatrix(X_test, y_test, feature_names=list(X.columns))
evals = [(dtrain,'train'),(dtest,'eval')]
xgb_model = xgb.train(params=xgb_params,
                      dtrain=dtrain,
                      num_boost_round=2000,
                      verbose_eval=50,
                      early_stopping_rounds=500,
                      evals=evals,
                      maximize=False)
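Before trusting a single train/test split, xgb.cv can sanity-check the boosting-round choice; a minimal sketch with the same parameters (the 5-fold setting is an assumption):
cv_results = xgb.cv(params=xgb_params,
                    dtrain=dtrain,
                    num_boost_round=2000,
                    nfold=5,                  # assumption: 5 folds
                    metrics='rmse',
                    early_stopping_rounds=50,
                    seed=0)
print('best CV RMSE:', cv_results['test-rmse-mean'].min())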
# plot the important features
fig, ax = plt.subplots(figsize=(6,9))
xgb.plot_importance(xgb_model, height=0.8, ax=ax, max_num_features=20)
plt.show()
xgb_impor = pd.DataFrame(xgb_model.get_fscore().items(),
                         columns=['feature', 'importance']).sort_values('importance', ascending=False)
xgb_impor.head(10)
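get_fscore() counts how often a feature is split on ('weight'); average gain is often a better measure of how much a feature actually improves the fit, so it is worth cross-checking (a sketch):
gain_impor = pd.DataFrame(xgb_model.get_score(importance_type='gain').items(),
                          columns=['feature', 'gain']).sort_values('gain', ascending=False)
gain_impor.head(10)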
# predict on the held-out test set
preds = xgb_model.predict(dtest)
preds
preds_df = pd.DataFrame(preds, columns=['predicted_G3'])
preds_df
X_test.shape
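A quick error check on the held-out set quantifies how far the predictions are from the true grades (a sketch, not a full evaluation):
from sklearn.metrics import mean_absolute_error, mean_squared_error
print('MAE :', mean_absolute_error(y_test, preds))
print('RMSE:', mean_squared_error(y_test, preds) ** 0.5)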
# take the 40 most important features as the comparison set
key_features = list(xgb_impor['feature'].values[0:40])
key_features
# students whose actual final grade 'G3' is below 10 (i.e. failing)
students_with_pro = X_test[y_test < 10]
# inspect the likely causes of the poor score
for i, r in students_with_pro.iterrows():
    print('Student Id', i)
    # compare the student's features to the rest of the class
    for feat in key_features:
        if r[feat] < student_por_df[feat].quantile(0.25):
            print('\t', 'Below:', feat, r[feat], 'Class:',
                  np.round(np.mean(student_por_df[feat]), 2))
        if r[feat] > student_por_df[feat].quantile(0.75):
            print('\t', 'Above:', feat, r[feat], 'Class:',
                  np.round(np.mean(student_por_df[feat]), 2))
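The per-student printout is hard to scan across the whole group; a sketch that tallies how often each key feature flags low or high, using the same 25%/75% cut-offs (q25, below_counts, etc. are hypothetical names):
q25 = student_por_df[key_features].quantile(0.25)
q75 = student_por_df[key_features].quantile(0.75)
below_counts = (students_with_pro[key_features] < q25).sum().sort_values(ascending=False)
above_counts = (students_with_pro[key_features] > q75).sum().sort_values(ascending=False)
print(below_counts.head(10))
print(above_counts.head(10))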
# start with features where the student falls below the class's 25th percentile
limit = 0.25
for i, r in students_with_pro.iterrows():
    Student_Id = i
    important_low_features = []
    # collect the features below the 25th-percentile cut-off
    for feat in key_features:
        if r[feat] < student_por_df[feat].quantile(limit):
            important_low_features.append(feat)
    # build a small comparison frame: the at-risk student vs the class mean
    at_risk_student = pd.DataFrame(r[important_low_features]).T
    at_risk_student['Retention_Risk'] = True
    student_mean = pd.DataFrame(student_por_df[important_low_features].mean(axis=0)).T
    student_mean['Retention_Risk'] = False
    student_profile = pd.concat([at_risk_student, student_mean])
    student_profile = pd.melt(student_profile, id_vars="Retention_Risk")
    var1 = f"Student Id : {Student_Id}"
    fig = px.bar(student_profile, x="variable", y="value", color="Retention_Risk", barmode="group")
    fig.update_layout(title_text=var1)
    fig.show()
# now the mirror image: features where the student is above the class's 75th percentile
limit = 0.75
for i, r in students_with_pro.iterrows():
    Student_Id = i
    important_high_features = []
    # collect the features above the 75th-percentile cut-off
    for feat in key_features:
        if r[feat] > student_por_df[feat].quantile(limit):
            important_high_features.append(feat)
    # build a small comparison frame: the at-risk student vs the class mean
    at_risk_student = pd.DataFrame(r[important_high_features]).T
    at_risk_student['Retention_Risk'] = True
    student_mean = pd.DataFrame(student_por_df[important_high_features].mean(axis=0)).T
    student_mean['Retention_Risk'] = False
    student_profile = pd.concat([at_risk_student, student_mean])
    student_profile = pd.melt(student_profile, id_vars="Retention_Risk")
    var1 = f"Student Id : {Student_Id}"
    fig = px.bar(student_profile, x="variable", y="value", color="Retention_Risk", barmode="group")
    fig.update_layout(title_text=var1)
    fig.show()
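Finally, to feed the Tableau dashboard mentioned at the top, the predictions and risk flags can be written out to a flat file (the file name and added column names are assumptions):
export_df = X_test.copy()
export_df['actual_G3'] = y_test
export_df['predicted_G3'] = preds
export_df['at_risk'] = y_test < 10
export_df.to_csv('student_risk_report.csv', index=True)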