import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pydotplus
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
def split(df, column):
    """Separate *df* into a feature frame and a target series.

    Returns ``(x, y)`` where ``x`` is *df* with *column* dropped and
    ``y`` is the *column* series itself.
    """
    features = df.drop(columns=[column])
    target = df[column]
    return features, target
# Load the pre-split train/test CSVs; "indicator" is the binary target
# column (0 = negative, 1 = positive) separated out by split().
data_train = read_csv("train.csv")
data_test = read_csv("test.csv")
x_train,y_train = split(data_train, "indicator")
x_test,y_test = split(data_test, "indicator")
x_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8064 entries, 0 to 8063
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 8064 non-null int64
1 smoker 8064 non-null int64
2 erytrocyty 8064 non-null float64
3 hbver 8064 non-null float64
4 hemoglobin 8064 non-null float64
5 relationship 8064 non-null int64
6 alp 8064 non-null float64
7 etytr 8064 non-null float64
8 hematokrit 8064 non-null float64
9 alt 8064 non-null float64
10 weight 8064 non-null float64
11 ast 8064 non-null float64
12 leukocyty 8064 non-null float64
13 trombocyty 8064 non-null float64
14 er-cv 8064 non-null float64
15 blood_group 8064 non-null int64
16 sex 8064 non-null int64
17 race 8064 non-null int64
dtypes: float64(12), int64(6)
memory usage: 1.1 MB
# Human-readable class labels reused by every classification_report below.
info = ['0 - negative', '1 - positive']
# Explore how well hematokrit separates the two classes.
sns.boxplot(x = 'indicator', y = 'hematokrit', data = data_test)
data_train.groupby("indicator").hematokrit.plot(kind="kde")
# Hand-crafted baseline: predict positive (1) when hematokrit < 6.8.
predictions = [1 if row.hematokrit < 6.8 else 0 for _, row in x_test.iterrows()]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.702 0.440 0.541 716
1 - positive 0.744 0.897 0.814 1301
accuracy 0.735 2017
macro avg 0.723 0.668 0.677 2017
weighted avg 0.729 0.735 0.717 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Most frequent hematokrit values among positive training samples.
rslt_df = data_train[data_train['indicator'] == 1]
x = rslt_df['hematokrit'].value_counts().head()
x
# Band rule: positive when hematokrit lies strictly inside (3.71539, 8.33747).
predictions = [
    1 if 3.71539 < row.hematokrit < 8.33747 else 0
    for _, row in x_test.iterrows()
]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.504 0.159 0.242 716
1 - positive 0.664 0.914 0.769 1301
accuracy 0.646 2017
macro avg 0.584 0.537 0.506 2017
weighted avg 0.607 0.646 0.582 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Same exploration for hemoglobin.
sns.boxplot(x = 'indicator', y = 'hemoglobin', data = data_test)
data_train.groupby("indicator").hemoglobin.plot(kind="kde")
# Baseline rule: positive when hemoglobin < 6.9.
predictions = [1 if row.hemoglobin < 6.9 else 0 for _, row in x_test.iterrows()]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.655 0.443 0.528 716
1 - positive 0.740 0.872 0.800 1301
accuracy 0.719 2017
macro avg 0.697 0.657 0.664 2017
weighted avg 0.710 0.719 0.704 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Most frequent hemoglobin values among positive training samples.
rslt_df = data_train[data_train['indicator'] == 1]
x = rslt_df['hemoglobin'].value_counts().head()
x
predictions = []
for index, row in x_test.iterrows():
    # BUG FIX: the 3.2623 threshold comes from the hemoglobin value counts
    # computed just above, but the original condition tested row.hematokrit,
    # so every sample was predicted positive — see the UndefinedMetricWarning
    # output that follows this cell.
    if row.hemoglobin > 3.2623:
        predictions.append(1)
    else:
        predictions.append(0)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.000 0.000 0.000 716
1 - positive 0.645 1.000 0.784 1301
accuracy 0.645 2017
macro avg 0.323 0.500 0.392 2017
weighted avg 0.416 0.645 0.506 2017
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Same exploration for alp.
sns.boxplot(x = 'indicator', y = 'alp', data = data_test)
data_train.groupby("indicator").alp.plot(kind="kde")
# Baseline rule: positive when alp < 90.3.
predictions = [1 if row.alp < 90.3 else 0 for _, row in x_test.iterrows()]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.268 0.036 0.064 716
1 - positive 0.641 0.945 0.764 1301
accuracy 0.623 2017
macro avg 0.454 0.491 0.414 2017
weighted avg 0.508 0.623 0.515 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Most frequent alp values among positive training samples.
rslt_df = data_train[data_train['indicator'] == 1]
x = rslt_df['alp'].value_counts().head()
x
predictions = []
for index, row in x_test.iterrows():
    # BUG FIX: 90.32745 is on the alp scale (cf. the "alp < 90.3" rule
    # earlier), but the original condition tested row.hematokrit, which
    # never comes close to 90, so the rule degenerated to "always predict
    # positive" — see the UndefinedMetricWarning output that follows.
    if row.alp < 90.32745:
        predictions.append(1)
    else:
        predictions.append(0)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.000 0.000 0.000 716
1 - positive 0.645 1.000 0.784 1301
accuracy 0.645 2017
macro avg 0.323 0.500 0.392 2017
weighted avg 0.416 0.645 0.506 2017
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1308: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Combine the alp and hemoglobin thresholds.
predictions = [
    1 if (row.alp < 90.3 and row.hemoglobin < 6.9) else 0
    for _, row in x_test.iterrows()
]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.590 0.478 0.528 716
1 - positive 0.740 0.817 0.776 1301
accuracy 0.697 2017
macro avg 0.665 0.647 0.652 2017
weighted avg 0.686 0.697 0.688 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Combine the alp and hematokrit thresholds.
predictions = [
    1 if (row.alp < 90.3 and row.hematokrit < 6.8) else 0
    for _, row in x_test.iterrows()
]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.628 0.466 0.535 716
1 - positive 0.743 0.848 0.792 1301
accuracy 0.712 2017
macro avg 0.685 0.657 0.664 2017
weighted avg 0.702 0.712 0.701 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Combine the hemoglobin and hematokrit thresholds (best manual rule).
predictions = [
    1 if (row.hemoglobin < 6.9 and row.hematokrit < 6.8) else 0
    for _, row in x_test.iterrows()
]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.683 0.803 0.738 716
1 - positive 0.880 0.795 0.835 1301
accuracy 0.798 2017
macro avg 0.781 0.799 0.787 2017
weighted avg 0.810 0.798 0.801 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Combine all three thresholds: alp, hemoglobin, and hematokrit.
predictions = [
    1 if (row.alp < 90.3 and row.hemoglobin < 6.9 and row.hematokrit < 6.8)
    else 0
    for _, row in x_test.iterrows()
]
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.642 0.828 0.723 716
1 - positive 0.887 0.746 0.810 1301
accuracy 0.775 2017
macro avg 0.765 0.787 0.767 2017
weighted avg 0.800 0.775 0.779 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Replace the hand-crafted thresholds with a learned decision tree.
strom = tree.DecisionTreeClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.859 0.853 0.856 716
1 - positive 0.920 0.923 0.921 1301
accuracy 0.898 2017
macro avg 0.889 0.888 0.889 2017
weighted avg 0.898 0.898 0.898 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
tree.plot_tree(strom)
# Render the fitted tree to a PNG via Graphviz.
dot_data = export_graphviz(strom)
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.write_png('original_tree.png')
# An ensemble of trees generally outperforms the single tree above.
strom = RandomForestClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.924 0.923 0.924 716
1 - positive 0.958 0.958 0.958 1301
accuracy 0.946 2017
macro avg 0.941 0.941 0.941 2017
weighted avg 0.946 0.946 0.946 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Exhaustive grid search over decision-tree hyper-parameters,
# refitting the best model by cross-validated accuracy.
estimator = DecisionTreeClassifier(random_state=0)
parameters = {
    'max_depth': list(range(1, 11)),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
gs = GridSearchCV(estimator=estimator, param_grid=parameters, cv=10,
                  scoring=scoring, refit='accuracy')
search = gs.fit(x_train, y_train)
print(search.best_estimator_)
print(search.best_score_)
print(search.best_params_)
DecisionTreeClassifier(max_depth=9, min_samples_leaf=3, random_state=0)
0.9275824746864441
{'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 3, 'splitter': 'best'}
# Exhaustive grid search for the random forest (same scoring/refit setup).
estimator = RandomForestClassifier()
parameters = {
    'max_depth': list(range(1, 11)),
    'n_estimators': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
gs = GridSearchCV(estimator=estimator, param_grid=parameters, cv=10,
                  scoring=scoring, refit='accuracy')
search = gs.fit(x_train, y_train)
print(search.best_estimator_)
print(search.best_score_)
print(search.best_params_)
RandomForestClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
n_estimators=500)
0.934649207769486
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 500}
# Randomized search over the same decision-tree parameter space
# (samples a subset of combinations instead of trying them all).
estimator = DecisionTreeClassifier(random_state=0)
parameters = {
    'max_depth': list(range(1, 11)),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
clf = RandomizedSearchCV(estimator=estimator, param_distributions=parameters,
                         cv=10, scoring=scoring, refit='accuracy')
search = clf.fit(x_train, y_train)
print(search.best_estimator_)
print(search.best_score_)
print(search.best_params_)
search.best_estimator_
DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
random_state=0)
0.9251000089170134
{'splitter': 'best', 'min_samples_leaf': 2, 'max_depth': 10, 'criterion': 'entropy'}
# Randomized search over the random-forest parameter space.
estimator = RandomForestClassifier()
parameters = {
    'max_depth': list(range(1, 11)),
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200, 500],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
clf = RandomizedSearchCV(estimator=estimator, param_distributions=parameters,
                         cv=10, scoring=scoring, refit='accuracy')
search = clf.fit(x_train, y_train)
print(search.best_estimator_)
print(search.best_score_)
print(search.best_params_)
search.best_estimator_
RandomForestClassifier(max_depth=10, min_samples_leaf=3)
0.9320449786452905
{'n_estimators': 100, 'min_samples_leaf': 3, 'max_depth': 10, 'criterion': 'gini'}
# NOTE(review): Lasso is a *regression* model applied here to a binary
# classification target, on only the first 150 training rows; the
# cross-validation scores are R^2 values, so near-zero/negative numbers
# (as in the output below) are expected. Confirm this cell is intentional.
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
x = x_train[:150]
y = y_train[:150]
lasso = linear_model.Lasso()
print(cross_val_score(lasso, x, y, cv=5))
[-0.00550156 0.00307471 -0.07524299 -0.13240974 -0.01734805]
# Repeat the tree experiment on the KNN-imputed variant of the dataset.
train_knn = read_csv("train_knn.csv")
test_knn = read_csv("test_knn.csv")
x_train, y_train = split(train_knn, "indicator")
x_test, y_test = split(test_knn, "indicator")
strom = tree.DecisionTreeClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.869 0.864 0.867 708
1 - positive 0.927 0.930 0.928 1309
accuracy 0.907 2017
macro avg 0.898 0.897 0.898 2017
weighted avg 0.907 0.907 0.907 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Random forest on the KNN-imputed data.
strom = RandomForestClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.935 0.912 0.924 708
1 - positive 0.953 0.966 0.959 1309
accuracy 0.947 2017
macro avg 0.944 0.939 0.941 2017
weighted avg 0.947 0.947 0.947 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Repeat the tree experiment on the median-imputed variant of the dataset.
train_median = read_csv("train_median.csv")
test_median = read_csv("test_median.csv")
x_train, y_train = split(train_median, "indicator")
x_test, y_test = split(test_median, "indicator")
strom = tree.DecisionTreeClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.827 0.857 0.842 719
1 - positive 0.919 0.901 0.910 1298
accuracy 0.885 2017
macro avg 0.873 0.879 0.876 2017
weighted avg 0.886 0.885 0.885 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Random forest on the median-imputed data.
strom = RandomForestClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.908 0.921 0.914 719
1 - positive 0.956 0.948 0.952 1298
accuracy 0.939 2017
macro avg 0.932 0.935 0.933 2017
weighted avg 0.939 0.939 0.939 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Repeat the tree experiment on the mean-imputed variant of the dataset.
train_mean = read_csv("train_mean.csv")
test_mean = read_csv("test_mean.csv")
x_train, y_train = split(train_mean, "indicator")
x_test, y_test = split(test_mean, "indicator")
strom = tree.DecisionTreeClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.838 0.862 0.850 739
1 - positive 0.919 0.904 0.911 1278
accuracy 0.888 2017
macro avg 0.879 0.883 0.881 2017
weighted avg 0.889 0.888 0.889 2017
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# Random forest on the mean-imputed data.
strom = RandomForestClassifier().fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits = 3, target_names=info))
precision recall f1-score support
0 - negative 0.926 0.930 0.928 739
1 - positive 0.959 0.957 0.958 1278
accuracy 0.947 2017
macro avg 0.943 0.943 0.943 2017
weighted avg 0.947 0.947 0.947 2017
# Confusion matrix for the random forest on the mean-imputed data.
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()