Who is Dropping out of college?

Connect with me on LinkedIn

View full analysis and code on GitHub

# Load the training and hold-out splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Contingency table: special-needs flag (rows) against the outcome stored in
# "Target" (columns).
# NOTE(review): the original remark says only dropout is of interest and the
# other outcome categories are aggregated; no aggregation happens in this cell,
# so it is presumably done upstream -- confirm.
special_needs_label = np.where(
    train["Educational special needs"], "Special needs", "Others"
)
cross_tab = pd.crosstab(special_needs_label, train["Target"])
cross_tab

from scipy.stats import chi2_contingency

# Chi-squared test of independence on the `cross_tab` contingency table.
stat, p, dof, expected = chi2_contingency(cross_tab)

# Interpret the p-value at the 5% significance level (95% confidence).
alpha = 0.05
print(f"P value is {p}")
verdict = (
    "Null hypothesis is rejected."
    if p < alpha
    else "Failed to reject the Null hypothesis."
)
print(verdict)

def acc_type(x):
    """Classify an occupation label as "white", "blue" or "others".

    Args:
        x: Occupation label to look up.

    Returns:
        "white" or "blue" when the label is listed in the matching group,
        otherwise "others".
    """
    collar_groups = (
        ("white", ("managerial", "professional", "clerical", "technical")),
        ("blue", ("service", "factory", "craftsmen", "agriculture", "elementary")),
    )
    for collar, occupations in collar_groups:
        if x in occupations:
            return collar
    return "others"


# Derive a collar-type column for each parent's occupation.
train["father_occ_type"] = train["father_occ"].map(acc_type)
train["mother_occ_type"] = train["mother_occ"].map(acc_type)

train["father_occ_type"].value_counts()

# Share of debtors among all students, and the dropout rate inside that group.
total_students = len(train)
total_debtors = train["Debtor"].sum()
debtor_outcomes = train.loc[train["Debtor"] == 1, "Target"].value_counts()
debtors_dropped = debtor_outcomes["Dropout"]
print(f"{total_debtors*100/total_students:.2f}% students are debtors. {debtors_dropped*100/total_debtors:.2f}% of them dropped out")

# Debtor flag against the "Tuition fees up to date" flag.
ct = pd.crosstab(index=train["Debtor"], columns=train["Tuition fees up to date"])
ct

# Debtor flag against the "Scholarship holder" flag.
ct = pd.crosstab(index=train["Debtor"], columns=train["Scholarship holder"])
ct

# Count scholarship holders and report their share of the training set.
scholarship_holders = train.loc[train["Scholarship holder"] == 1]
n_holders, n_students = len(scholarship_holders), len(train)
print(f"There are {n_holders}({n_holders*100/n_students:.1f}%) scholarship holders among a total of {n_students} students.")

# Columns kept for modelling. "Target" is the label and is split off from the
# feature matrix afterwards. Order is preserved because a column-position mask
# is later derived from the resulting frame.
features_to_include = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Attendance_mode',
    'Previous qualification (grade)',
    'Admission grade',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'Age at enrollment',
    'International',
    'father_occ',
    'mother_occ',
    'father_qual',
    'mother_qual',
    'previous_qual',
    'Target',
]

# Restrict both splits to the selected columns.
train_df = train[features_to_include]
test_df = test[features_to_include]

# Feature matrices: everything except the label column.
X_train = train_df.drop(columns="Target")
X_test = test_df.drop(columns="Target")

# Binary labels: 1 where the outcome is 'Dropout', 0 for every other value.
y_train = np.where(train_df["Target"] == 'Dropout', 1, 0)
y_test = np.where(test_df["Target"] == 'Dropout', 1, 0)

# Columns routed to the categorical and numeric preprocessing branches.
# NOTE(review): 'Educational special needs' and 'Tuition fees up to date' are
# in `features_to_include` but in neither list below -- confirm that leaving
# them out of preprocessing is intentional.
cat_features = [
    'Course',
    'previous_qual',
    'Debtor',
    'Gender',
    'Scholarship holder',
    'father_occ',
    'mother_occ',
    'father_qual',
    'mother_qual',
    'Marital status',
    'Application mode',
    'Attendance_mode',
    'International',
    'Displaced',
]
num_features = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Application order',
]

# Categorical branch: pick the categorical columns, then one-hot encode them.
# Categories unseen during fit are ignored at transform time.
select_cat_features = ColumnTransformer(
    transformers=[('select_cat', 'passthrough', cat_features)]
)
cat_transformers = Pipeline(
    steps=[
        ('selector', select_cat_features),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numeric branch: pick the numeric columns, then standardise them.
select_num_features = ColumnTransformer(
    transformers=[('select_num', 'passthrough', num_features)]
)
num_transformers = Pipeline(
    steps=[
        ('selector', select_num_features),
        ('scaler', StandardScaler()),
    ]
)

# Concatenate the outputs of both branches into one feature matrix.
preprocess_pipe = FeatureUnion(
    transformer_list=[
        ('cat', cat_transformers),
        ('num', num_transformers),
    ]
)

preprocess_pipe

# Cross-validation scheme: 5 stratified shuffle splits, 20% held out per split.
# NOTE(review): the `Model` built later in this file is not given this
# splitter, so it uses its own default (`cv=5`) -- confirm that is intended.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=32)

class Model:
    """Wrap an estimator with preprocessing, optional resampling and CV reporting.

    The wrapped pipeline has the steps ``'preprocess'`` and ``'model'``, with a
    ``'sampler'`` step in front when a sampler is supplied.

    Attributes set by the methods:
        model: the (unfitted) pipeline, built by ``make_model_pipeline``.
        cv_results: output of ``cross_validate``, set by ``train``.
        fitted_model: the fitted pipeline, set by ``train`` or ``tune``.
    """

    def __init__(self, model_name, estimator, preprocessor=None, scoring=None,
                 cv=5, sampler=None, random_state=42):
        """Store the configuration; nothing is built or fitted here.

        Args:
            model_name: Label used in printed reports.
            estimator: Final estimator placed in the ``'model'`` step.
            preprocessor: Transformer placed in the ``'preprocess'`` step.
            scoring: Scoring passed to ``cross_validate`` / ``GridSearchCV``.
            cv: Cross-validation spec passed to ``cross_validate`` /
                ``GridSearchCV``.
            sampler: Optional resampler placed first in the pipeline.
            random_state: Stored as ``self.rs``.
        """
        self.name = model_name
        self.estimator = estimator
        self.preprocess = preprocessor
        self.scoring = scoring
        self.cv = cv
        # NOTE: kept for reference only; no method of this class forwards it
        # to the estimator, the sampler or the CV splitter.
        self.rs = random_state
        self.sampler = sampler

    def make_model_pipeline(self):
        """Build ``self.model`` from the stored sampler, preprocessor and estimator."""
        steps = [('preprocess', self.preprocess), ('model', self.estimator)]
        if self.sampler is None:
            self.model = Pipeline(steps)
        else:
            # The sampler runs first, i.e. on the data as passed in, before
            # the 'preprocess' step.
            self.model = imb_pipeline([('sampler', self.sampler)] + steps)

    def train(self, X_train, y_train):
        '''Cross-validate the pipeline, print the scores, then fit on all data.

        Args:
            X_train: Training data feature matrix
            y_train: Training data label vector

        Returns:
            None. Cross-validation output is stored in ``self.cv_results`` and
            the pipeline fitted on the full training data in
            ``self.fitted_model``.
        '''
        self.make_model_pipeline()
        self.cv_results = cross_validate(self.model, X_train, y_train,
                                         cv=self.cv, scoring=self.scoring,
                                         return_train_score=True)

        mean_train_score = self.cv_results["train_score"].mean()
        mean_val_score = self.cv_results["test_score"].mean()
        std_train_score = self.cv_results["train_score"].std()
        std_val_score = self.cv_results["test_score"].std()

        print(f"Cross validated training results for {self.name} model")
        print("---------------------------------------------------------")
        print(f"Train score: {mean_train_score} +/- {std_train_score}" )
        print(f"Validation score: {mean_val_score} +/- {std_val_score}" )

        self.fitted_model = self.model.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Print recall, accuracy and a classification report; plot the confusion matrix.

        Args:
            X_test: Feature matrix to predict on.
            y_test: True labels for ``X_test``.

        Raises:
            AttributeError: If neither ``train`` nor ``tune`` has been called,
                so there is no fitted pipeline to predict with.
        """
        if not hasattr(self, "fitted_model"):
            # Same exception type as the bare attribute lookup would raise,
            # but with a message that says what to do about it.
            raise AttributeError(
                f"{self.name} has not been fitted yet; "
                "call train() or tune() before evaluate()."
            )

        y_pred = self.fitted_model.predict(X_test)

        recall = recall_score(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)

        print("Recall score on test set: ", recall)
        print("Accuracy score on test set: ", acc)
        print()
        print(classification_report(y_test, y_pred))

        ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

    def tune(self, param_grid, X_train, y_train):
        '''Do hyper parameter tuning using GridSearch strategy

        The search runs over the whole pipeline, so the keys of ``param_grid``
        must address pipeline steps (for example ``model__<param>`` for the
        final estimator).

        Args:
            param_grid: dict of parameters
            X_train: Feature matrix
            y_train: Label matrix

        Returns:
            None. The best parameters are printed and the best estimator is
            stored in ``self.fitted_model``.
        '''
        self.make_model_pipeline()
        search = GridSearchCV(self.model, param_grid = param_grid,
                              cv = self.cv, scoring=self.scoring,
                              return_train_score=True)
        search.fit(X_train, y_train)

        print("Best parameters: ", search.best_params_)
        print("-------------------Best model performance --------------------------")

        best = search.best_index_
        mean_train_score = search.cv_results_['mean_train_score'][best]
        mean_val_score = search.cv_results_['mean_test_score'][best]
        std_train_score = search.cv_results_['std_train_score'][best]
        std_val_score = search.cv_results_['std_test_score'][best]

        print(f"Score of the model on the train set:\n"
              f"{mean_train_score:.3f} +/- {std_train_score:.6f}")
        print(f"Score of the model on the validation set:\n"
              f"{mean_val_score:.3f} +/- {std_val_score:.6f}")

        self.fitted_model = search.best_estimator_

# Boolean mask over X_train's columns: True where the column is listed in
# `cat_features`, handed to SMOTENC as its categorical-feature mask.
cat_mask = [column in cat_features for column in X_train.columns]
smote = SMOTENC(categorical_features=cat_mask, random_state=32)

# Depth-3 decision tree with SMOTENC oversampling, scored on recall.
tree_os_clf = Model(
    model_name="Decision tree classifier",
    estimator=DecisionTreeClassifier(max_depth=3),
    preprocessor=preprocess_pipe,
    sampler=smote,
    scoring="recall",
    random_state=32,
)

# Cross-validate and fit on the training split.
tree_os_clf.train(X_train, y_train)

# Report performance on the held-out test split.
tree_os_clf.evaluate(X_test, y_test)

Please feel free to drop an email to esviswajith95@gmail.com if you have a query/comment.

Who is Dropping out of college?

Who is Dropping out of college?