Who Is Dropping Out of College?
Prepared by: Viswajith E S
Connect with me on LinkedIn
View full analysis and code on GitHub
## Load the train and test splits
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Contingency table of special-needs status (rows) against Target (columns).
# Truthy values in "Educational special needs" are labelled "Special needs",
# everything else "Others".
# NOTE(review): the original note said non-dropout categories are aggregated
# because only dropout is of interest; no aggregation of Target happens in
# this statement -- presumably it is done upstream in the CSV, confirm.
special_needs_group = np.where(
    train["Educational special needs"], "Special needs", "Others"
)
cross_tab = pd.crosstab(special_needs_group, train["Target"])
cross_tab
from scipy.stats import chi2_contingency

# Chi-square test of independence on the contingency table in `cross_tab`.
stat, p, dof, expected = chi2_contingency(cross_tab)

## interpret p value
alpha = 0.05  # significance level (i.e. 95% confidence)
print(f"P value is {p}")
verdict = (
    "Null hypothesis is rejected."
    if p < alpha
    else "Failed to reject the Null hypothesis."
)
print(verdict)
def acc_type(x):
    """Classify an occupation label as "white", "blue" or "others".

    Matching is exact (case-sensitive) against two fixed lists of labels.

    Args:
        x: Occupation category label.

    Returns:
        "white" for managerial/professional/clerical/technical,
        "blue" for service/factory/craftsmen/agriculture/elementary,
        "others" for any other value.
    """
    collar_groups = (
        ("white", ("managerial", "professional", "clerical", "technical")),
        ("blue", ("service", "factory", "craftsmen", "agriculture", "elementary")),
    )
    for collar, occupations in collar_groups:
        if x in occupations:
            return collar
    return "others"
# Derive a collar-type column for each parent from the raw occupation column.
for raw_col, derived_col in (
    ("father_occ", "father_occ_type"),
    ("mother_occ", "mother_occ_type"),
):
    train[derived_col] = train[raw_col].apply(acc_type)

train["father_occ_type"].value_counts()
# Share of debtors among all students, and the share of debtors who dropped out.
# NOTE(review): summing the column assumes Debtor is coded 0/1 -- confirm.
total_debtors = train.Debtor.sum()
total_students = train.shape[0]
# Count the matching rows directly. Indexing value_counts()["Dropout"] raises
# KeyError when no debtor has Target == "Dropout"; this form yields 0 instead.
debtors_dropped = (train.loc[train["Debtor"] == 1, "Target"] == "Dropout").sum()
print(f"{total_debtors*100/total_students:.2f}% students are debtors. {debtors_dropped*100/total_debtors:.2f}% of them dropped out")
# Debtor status against tuition-fee status.
ct = pd.crosstab(train["Debtor"], train["Tuition fees up to date"])
ct

# Debtor status against scholarship status.
ct = pd.crosstab(train["Debtor"], train["Scholarship holder"])
ct

# Head-count and percentage of scholarship holders in the training data.
scholarship_holders = train[train["Scholarship holder"] == 1]
n_holders = scholarship_holders.shape[0]
n_all = train.shape[0]
print(f"There are {n_holders}({n_holders*100/n_all:.1f}%) scholarship holders among a total of {n_all} students.")
# Columns kept for modelling; 'Target' is the label column.
features_to_include = [
    'Marital status', 'Application mode', 'Application order', 'Course',
    'Attendance_mode', 'Previous qualification (grade)', 'Admission grade', 'Displaced',
    'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder',
    'Age at enrollment', 'International', 'father_occ', 'mother_occ', 'father_qual', 'mother_qual', 'previous_qual',
    'Target',
]

train_df = train[features_to_include]
test_df = test[features_to_include]

# Feature matrices: everything except the label.
X_train = train_df.drop(columns="Target")
X_test = test_df.drop(columns="Target")

# Binary labels: 1 where Target is 'Dropout', 0 for every other value.
y_train = np.where(train_df["Target"] == 'Dropout', 1, 0)
y_test = np.where(test_df["Target"] == 'Dropout', 1, 0)

# NOTE(review): 'Educational special needs' and 'Tuition fees up to date' are
# in features_to_include but in neither list below -- confirm this is intended.
cat_features = [
    'Course', 'previous_qual', 'Debtor', 'Gender', 'Scholarship holder',
    'father_occ', 'mother_occ', 'father_qual', 'mother_qual', 'Marital status',
    'Application mode', 'Attendance_mode', 'International', 'Displaced',
]
num_features = [
    'Previous qualification (grade)', 'Admission grade', 'Age at enrollment', 'Application order',
]
# Column selectors: each passes through only its own list of columns.
select_cat_features = ColumnTransformer(
    transformers=[('select_cat', 'passthrough', cat_features)]
)
select_num_features = ColumnTransformer(
    transformers=[('select_num', 'passthrough', num_features)]
)

# Categorical branch: select, then one-hot encode (unknown categories ignored).
cat_transformers = Pipeline(
    steps=[
        ('selector', select_cat_features),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numeric branch: select, then standardise.
num_transformers = Pipeline(
    steps=[
        ('selector', select_num_features),
        ('scaler', StandardScaler()),
    ]
)

# Concatenate both branches, categorical output first.
preprocess_pipe = FeatureUnion(
    transformer_list=[
        ('cat', cat_transformers),
        ('num', num_transformers),
    ]
)
preprocess_pipe
## Cross validation scheme: 5 stratified shuffle splits, 20% held out in each
cv = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=32,
)
class Model:
    """Bundle an estimator with preprocessing, optional resampling and CV settings.

    Args:
        model_name: Label used in the printed reports.
        estimator: Final estimator, placed in the 'model' pipeline step.
        preprocessor: Transformer placed in the 'preprocess' pipeline step.
        scoring: Forwarded to ``cross_validate`` and ``GridSearchCV``.
        cv: Forwarded to ``cross_validate`` and ``GridSearchCV``.
        sampler: Optional resampler. When given, it becomes the first step
            and the pipeline is built with ``imb_pipeline`` (presumably the
            imbalanced-learn Pipeline -- confirm against the file's imports).
        random_state: Seed applied to ``estimator`` when the estimator exposes
            a ``random_state`` parameter that is still ``None``.
    """

    def __init__(self, model_name, estimator, preprocessor=None, scoring=None, cv=5, sampler=None, random_state=42):
        self.name = model_name
        self.estimator = estimator
        self.preprocess = preprocessor
        self.scoring = scoring
        self.cv = cv
        self.rs = random_state
        self.sampler = sampler

    def _seed_estimator(self):
        """Apply ``self.rs`` to the estimator if its ``random_state`` is unset.

        An explicitly configured ``random_state`` on the estimator is never
        overridden. Estimators without ``get_params`` are left untouched.
        """
        if self.rs is None or not hasattr(self.estimator, "get_params"):
            return
        params = self.estimator.get_params(deep=False)
        if "random_state" in params and params["random_state"] is None:
            self.estimator.set_params(random_state=self.rs)

    def make_model_pipeline(self):
        """Build ``self.model`` from the sampler (if any), preprocessor and estimator."""
        # Previously random_state was stored but never used anywhere.
        self._seed_estimator()
        if self.sampler is None:
            self.model = Pipeline([('preprocess', self.preprocess),
                                   ('model', self.estimator)])
        else:
            self.model = imb_pipeline([('sampler', self.sampler),
                                       ('preprocess', self.preprocess),
                                       ('model', self.estimator)])

    def train(self, X_train, y_train):
        '''Cross-validates the model, prints the scores, then fits on all training data

        Args:
            X_train: Training data feature matrix
            y_train: Training data label vector
        Returns:
            trained model (also stored as ``self.fitted_model``)
        '''
        self.make_model_pipeline()
        self.cv_results = cross_validate(self.model, X_train, y_train, cv=self.cv, scoring=self.scoring, return_train_score=True)

        mean_train_score = self.cv_results["train_score"].mean()
        mean_val_score = self.cv_results["test_score"].mean()
        std_train_score = self.cv_results["train_score"].std()
        std_val_score = self.cv_results["test_score"].std()

        print(f"Cross validated training results for {self.name} model")
        print("---------------------------------------------------------")
        print(f"Train score: {mean_train_score} +/- {std_train_score}" )
        print(f"Validation score: {mean_val_score} +/- {std_val_score}" )

        # Final fit on the full training set; CV above was for reporting only.
        self.fitted_model = self.model.fit(X_train, y_train)
        return self.fitted_model

    def evaluate(self, X_test, y_test):
        '''Prints recall, accuracy and a classification report and draws a confusion matrix

        Args:
            X_test: Test data feature matrix
            y_test: Test data label vector
        Raises:
            AttributeError: if neither train() nor tune() has been called yet
        '''
        fitted = getattr(self, "fitted_model", None)
        if fitted is None:
            raise AttributeError(
                f"{self.name}: no fitted model; call train() or tune() before evaluate()"
            )
        y_pred = fitted.predict(X_test)
        recall = recall_score(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        print("Recall score on test set: ", recall)
        print("Accuracy score on test set: ", acc)
        print()
        print(classification_report(y_test, y_pred))
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

    def tune(self, param_grid, X_train, y_train):
        '''Do hyper parameter tuning using GridSearch strategy

        Args:
            param_grid: dict of parameters
            X_train: Feature matrix
            y_train: Label matrix
        Returns:
            best parameters
            best estimator (also stored as ``self.fitted_model``)
        '''
        self.make_model_pipeline()
        search = GridSearchCV(self.model, param_grid = param_grid,
                              cv = self.cv,
                              scoring=self.scoring,
                              return_train_score=True)
        search.fit(X_train, y_train)

        print("Best parameters: ", search.best_params_)
        print("-------------------Best model performance --------------------------")

        best = search.best_index_
        mean_train_score = search.cv_results_['mean_train_score'][best]
        mean_val_score = search.cv_results_['mean_test_score'][best]
        std_train_score = search.cv_results_['std_train_score'][best]
        std_val_score = search.cv_results_['std_test_score'][best]

        print(f"Score of the model on the train set:\n"
              f"{mean_train_score:.3f} +/- {std_train_score:.6f}")
        print(f"Score of the model on the validation set:\n"
              f"{mean_val_score:.3f} +/- {std_val_score:.6f}")

        self.fitted_model = search.best_estimator_
        return search.best_params_, search.best_estimator_
# Boolean mask over X_train's columns marking the categorical ones for SMOTENC.
cat_mask = [col in cat_features for col in X_train.columns]
smote = SMOTENC(categorical_features=cat_mask, random_state=32)

tree_os_clf = Model(
    model_name="Decision tree classifier",
    estimator=DecisionTreeClassifier(max_depth=3),
    preprocessor=preprocess_pipe,
    sampler=smote,
    scoring="recall",
    # Pass the StratifiedShuffleSplit scheme defined under
    # "## Cross validation scheme"; without it Model falls back to cv=5.
    cv=cv,
    random_state=32,
)
tree_os_clf.train(X_train, y_train)
tree_os_clf.evaluate(X_test, y_test)
Please feel free to email esviswajith95@gmail.com if you have any questions or comments.