import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,matthews_corrcoef,cohen_kappa_score,precision_score,recall_score,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler,SMOTE
# Synthetic binary classification problem with a heavy class imbalance:
# the `weights` argument assigns 99% of the samples to the first class.
X, y = make_classification(
    n_samples=50000,
    n_features=30,
    n_informative=15,
    n_redundant=15,
    class_sep=.66,
    weights=(.99, 1-.99),
    random_state=1,
)

# Hold out a test set using train_test_split's default test fraction.
# NOTE(review): the split is not stratified; with so few positives,
# `stratify=y` may be worth considering -- it would change every result below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
# Standardize the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce dimensionality with PCA, keeping as many components as needed to
# explain 95% of the variance of the (scaled) training data.
pca = PCA(n_components=.95).fit(X_train_scaled)
X_train_scaled_pca = pca.transform(X_train_scaled)
X_test_scaled_pca = pca.transform(X_test_scaled)
# Build two rebalanced versions of the training data. Only the training
# split is resampled; the test split keeps its original class balance.

# Variant 1: random oversampling of the minority class to a 1:1 ratio.
oversample = RandomOverSampler(sampling_strategy=1, random_state=2)
X_over, y_over = oversample.fit_resample(
    X_train_scaled_pca,
    y_train,
)

# Variant 2: SMOTE with its default settings (apart from the fixed seed).
smote = SMOTE(random_state=2)
X_SMOTE, y_SMOTE = smote.fit_resample(
    X_train_scaled_pca,
    y_train,
)
# Baseline: random forest trained on the original, imbalanced training data
# (scaled + PCA-projected), evaluated on the untouched test split.
RndFor = RandomForestClassifier(
    min_samples_leaf=25, max_depth=8, random_state=2
)
RndFor.fit(X_train_scaled_pca, y_train)
y_pred = RndFor.predict(X_test_scaled_pca)

# classification_report(y_test, y_pred) gives a per-class breakdown if needed.

# Flattened binary confusion matrix, in the order: tn, fp, fn, tp.
print(confusion_matrix(y_test, y_pred).ravel())
print(matthews_corrcoef(y_test, y_pred))
print(cohen_kappa_score(y_test, y_pred))
# On imbalanced data the model can end up predicting no positives at all,
# which makes precision undefined (0 / 0). zero_division=0 reports 0.0 --
# the same value the default produces -- without raising
# UndefinedMetricWarning.
print(precision_score(y_test, y_pred, zero_division=0))
print(recall_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
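# Recorded output of the baseline run (no resampling). The model predicts the
# majority class for every test sample (fp = 0 and tp = 0):
#   [12303 0 197 0]    <- confusion matrix: tn fp fn tp
#   0.0                <- Matthews correlation coefficient
#   0.0                <- Cohen's kappa
#   0.0                <- precision
#   0.0                <- recall
#   0.98424            <- accuracy
# Warning emitted by precision_score when it is called without `zero_division`:
#   /shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, msg_start, len(result))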
# Experiment: the same forest configuration, trained on the randomly
# oversampled training set and evaluated on the untouched test split.
RndFor = RandomForestClassifier(
    min_samples_leaf=25, max_depth=8, random_state=2
).fit(X_over, y_over)
y_pred = RndFor.predict(X_test_scaled_pca)

# classification_report(y_test, y_pred) gives a per-class breakdown if needed.

# Flattened confusion matrix first (tn, fp, fn, tp), then one scalar score
# per line: MCC, Cohen's kappa, precision, recall, accuracy.
print(confusion_matrix(y_test, y_pred).ravel())
for score in (
    matthews_corrcoef,
    cohen_kappa_score,
    precision_score,
    recall_score,
    accuracy_score,
):
    print(score(y_test, y_pred))
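# Recorded output of the run trained on randomly oversampled data:
#   [11082 1221 119 78]      <- confusion matrix: tn fp fn tp
#   0.12109211863048315      <- Matthews correlation coefficient
#   0.07907294149160704      <- Cohen's kappa
#   0.06004618937644342      <- precision
#   0.39593908629441626      <- recall
#   0.8928                   <- accuracy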
# Experiment: the same forest configuration, trained on the SMOTE-resampled
# training set and evaluated on the untouched test split.
RndFor = RandomForestClassifier(
    min_samples_leaf=25, max_depth=8, random_state=2
).fit(X_SMOTE, y_SMOTE)
y_pred = RndFor.predict(X_test_scaled_pca)

# Flattened confusion matrix first (tn, fp, fn, tp), then one scalar score
# per line: MCC, Cohen's kappa, precision, recall, accuracy.
print(confusion_matrix(y_test, y_pred).ravel())
for score in (
    matthews_corrcoef,
    cohen_kappa_score,
    precision_score,
    recall_score,
    accuracy_score,
):
    print(score(y_test, y_pred))
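# Recorded output of the run trained on SMOTE-resampled data:
#   [9971 2332 99 98]        <- confusion matrix: tn fp fn tp
#   0.09690610210675733      <- Matthews correlation coefficient
#   0.04681857761962949      <- Cohen's kappa
#   0.040329218106995884     <- precision
#   0.49746192893401014      <- recall
#   0.80552                  <- accuracy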