# A Primer On Working With Imbalanced Data

# Third-party dependencies, one import statement per line.
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
    matthews_corrcoef,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Synthetic binary-classification data with a heavily skewed class
# distribution: the requested class weights are 99% / 1%.
X, y = make_classification(
    n_samples=50000,
    n_features=30,
    n_informative=15,
    n_redundant=15,
    class_sep=0.66,
    weights=(0.99, 1 - 0.99),
    random_state=1,
)

# Hold out a test set. No test_size is given, so scikit-learn's default
# split proportion applies.
# NOTE(review): the split is not stratified; with a minority class this
# rare, consider stratify=y to keep the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Standardize Data
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (here 95%) of the training variance.
# The projection is likewise fitted on the training split only.
pca = PCA(.95)
pca.fit(X_train_scaled)
X_train_scaled_pca = pca.transform(X_train_scaled)
X_test_scaled_pca = pca.transform(X_test_scaled)

# Rebalance the TRAINING data only; the test split is never resampled, so
# evaluation still happens on the original class distribution.

# Random oversampling: sampling_strategy=1 requests a minority/majority
# ratio of 1 after resampling.
oversample = RandomOverSampler(sampling_strategy=1, random_state=2)
X_over, y_over = oversample.fit_resample(X_train_scaled_pca, y_train)

# SMOTE with its default sampling strategy.
smote = SMOTE(random_state=2)
X_SMOTE, y_SMOTE = smote.fit_resample(X_train_scaled_pca, y_train)

# Baseline: random forest trained on the original (imbalanced) training set
# and evaluated on the untouched test split.
RndFor = RandomForestClassifier(
    min_samples_leaf=25,
    max_depth=8,
    random_state=2,
)
RndFor.fit(X_train_scaled_pca, y_train)
y_pred = RndFor.predict(X_test_scaled_pca)

# Optional full per-class report:
# print(classification_report(y_test, y_pred))

# Flattened confusion matrix, in the order: tn fp fn tp
print(confusion_matrix(y_test, y_pred).ravel())
print(matthews_corrcoef(y_test, y_pred))
print(cohen_kappa_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# Same random forest configuration, trained on the randomly oversampled
# training set and evaluated on the untouched test split.
RndFor = RandomForestClassifier(
    min_samples_leaf=25,
    max_depth=8,
    random_state=2,
)
RndFor.fit(X_over, y_over)
y_pred = RndFor.predict(X_test_scaled_pca)

# Optional full per-class report:
# print(classification_report(y_test, y_pred))

# Flattened confusion matrix, followed by the individual summary metrics.
print(confusion_matrix(y_test, y_pred).ravel())
print(matthews_corrcoef(y_test, y_pred))
print(cohen_kappa_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# Same random forest configuration, trained on the SMOTE-resampled
# training set and evaluated on the untouched test split.
RndFor = RandomForestClassifier(
    min_samples_leaf=25,
    max_depth=8,
    random_state=2,
)
RndFor.fit(X_SMOTE, y_SMOTE)
y_pred = RndFor.predict(X_test_scaled_pca)

# Flattened confusion matrix, followed by the individual summary metrics.
print(confusion_matrix(y_test, y_pred).ravel())
print(matthews_corrcoef(y_test, y_pred))
print(cohen_kappa_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))