import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pydotplus
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
def split(df, column):
    """Separate a data frame into a feature table and a target column.

    Args:
        df: pandas DataFrame containing both the features and the target.
        column: Name of the target column to split off.

    Returns:
        A tuple ``(x, y)``: ``x`` is a copy of ``df`` without ``column`` and
        ``y`` is the ``column`` Series. ``df`` itself is left unchanged.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    x = df.drop(columns=[column])
    y = df[column]
    return x, y
# Display names for the two classes, used by every classification report below.
info = ['0 - negative', '1 - positive']

# Load the pre-split training and test sets.
data_train = read_csv("train.csv")
data_test = read_csv("test.csv")

# Separate the feature columns from the "indicator" target in both sets.
x_train, y_train = split(data_train, "indicator")
x_test, y_test = split(data_test, "indicator")

# Print a summary of the training feature columns.
x_train.info()
# --- Hand-written threshold rules on "hematokrit" ---------------------------
# Each exploratory plot gets its own figure so it is not drawn on top of
# whatever axes happen to be current.
plt.figure()
sns.boxplot(x='indicator', y='hematokrit', data=data_test)
plt.figure()
data_train.groupby("indicator").hematokrit.plot(kind="kde")

# Rule 1: predict positive (1) when hematokrit is below 6.8, else negative (0).
# The comparison runs on the whole column at once; missing values compare as
# False and therefore become 0, the same outcome a per-row `if` would give.
predictions = (x_test['hematokrit'] < 6.8).astype(int).tolist()
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()

# Five most frequent hematokrit values among positive training samples.
rslt_df = data_train[data_train['indicator'] == 1]
x = rslt_df['hematokrit'].value_counts().head()
x  # bare expression: no effect in a script (presumably a notebook cell output)

# Rule 2: predict positive when hematokrit lies strictly inside
# (3.71539, 8.33747).
predictions = (
    (x_test['hematokrit'] > 3.71539) & (x_test['hematokrit'] < 8.33747)
).astype(int).tolist()
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# --- Hand-written threshold rules on "hemoglobin" ---------------------------
# Each exploratory plot gets its own figure so it is not drawn on top of
# whatever axes happen to be current.
plt.figure()
sns.boxplot(x='indicator', y='hemoglobin', data=data_test)
plt.figure()
data_train.groupby("indicator").hemoglobin.plot(kind="kde")

# Rule 1: predict positive (1) when hemoglobin is below 6.9, else negative (0).
# The comparison runs on the whole column at once; missing values compare as
# False and therefore become 0, the same outcome a per-row `if` would give.
predictions = (x_test['hemoglobin'] < 6.9).astype(int).tolist()
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()

# Five most frequent hemoglobin values among positive training samples.
rslt_df = data_train[data_train['indicator'] == 1]
x = rslt_df['hemoglobin'].value_counts().head()
x  # bare expression: no effect in a script (presumably a notebook cell output)

# Rule 2: predict positive when hemoglobin is above 3.2623.
# Fix: this rule previously tested the hematokrit column, although the whole
# section (plots, value counts, first rule) analyses hemoglobin.
predictions = (x_test['hemoglobin'] > 3.2623).astype(int).tolist()
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# --- Hand-written threshold rules on "alp" ----------------------------------
# Each exploratory plot gets its own figure so it is not drawn on top of
# whatever axes happen to be current.
plt.figure()
sns.boxplot(x='indicator', y='alp', data=data_test)
plt.figure()
data_train.groupby("indicator").alp.plot(kind="kde")

# Rule 1: predict positive (1) when alp is below 90.3, else negative (0).
# The comparison runs on the whole column at once; missing values compare as
# False and therefore become 0, the same outcome a per-row `if` would give.
predictions = (x_test['alp'] < 90.3).astype(int).tolist()
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()

# Five most frequent alp values among positive training samples.
rslt_df = data_train[data_train['indicator'] == 1]
x = rslt_df['alp'].value_counts().head()
x  # bare expression: no effect in a script (presumably a notebook cell output)

# Rule 2: predict positive when alp is below 90.32745.
# Fix: this rule previously compared the hematokrit column against the alp
# threshold, although the whole section analyses alp.
predictions = (x_test['alp'] < 90.32745).astype(int).tolist()
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# --- Combined threshold rules ------------------------------------------------
# The three single-feature conditions are computed once as boolean columns
# and then AND-ed together, instead of re-testing every row in four loops.
# Missing values compare as False, so such rows are predicted 0, the same
# outcome a per-row `if` would give.
low_alp = x_test['alp'] < 90.3
low_hemoglobin = x_test['hemoglobin'] < 6.9
low_hematokrit = x_test['hematokrit'] < 6.8

# Evaluated in this order: alp+hemoglobin, alp+hematokrit,
# hemoglobin+hematokrit, and finally all three together.
combined_rules = (
    low_alp & low_hemoglobin,
    low_alp & low_hematokrit,
    low_hemoglobin & low_hematokrit,
    low_alp & low_hemoglobin & low_hematokrit,
)
for rule_mask in combined_rules:
    # 1 where every condition of the rule holds, 0 otherwise.
    predictions = rule_mask.astype(int).tolist()
    print(classification_report(y_test, predictions, digits=3, target_names=info))
    ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# --- Baseline models with default hyper-parameters ---------------------------
# Decision tree: fit on the training set, evaluate on the test set.
strom = tree.DecisionTreeClassifier()
strom = strom.fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()

# Without an explicit `ax`, plot_tree draws on the current axes, which at this
# point belong to the confusion matrix above. Open a new figure so the tree
# does not replace that plot.
plt.figure()
tree.plot_tree(strom)

# Export the same tree through Graphviz and save it as a PNG file.
stromcek = export_graphviz(strom)
pydot_graph = pydotplus.graph_from_dot_data(stromcek)
pydot_graph.write_png('original_tree.png')

# Random forest: same fit / evaluate sequence.
strom = RandomForestClassifier()
strom.fit(x_train, y_train)
predictions = strom.predict(x_test)
print(classification_report(y_test, predictions, digits=3, target_names=info))
ConfusionMatrixDisplay(confusion_matrix(y_test, predictions)).plot()
# --- Exhaustive grid search: decision tree -----------------------------------
# Every parameter combination is cross-validated with 10 folds. Three metrics
# are recorded, and the final model is refit using the best accuracy.
estimator = DecisionTreeClassifier(random_state=0)
parameters = {
    'max_depth': list(range(1, 11)),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
gs = GridSearchCV(
    estimator,
    parameters,
    scoring=scoring,
    refit='accuracy',
    cv=10,
)
search = gs.fit(x_train, y_train)

# Best model, its mean cross-validated accuracy, and its parameters.
print(search.best_estimator_, search.best_score_, search.best_params_, sep="\n")
# --- Exhaustive grid search: random forest -----------------------------------
# Every parameter combination is cross-validated with 10 folds. Three metrics
# are recorded, and the final model is refit using the best accuracy.
estimator = RandomForestClassifier()
parameters = {
    'max_depth': list(range(1, 11)),
    'n_estimators': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
gs = GridSearchCV(
    estimator,
    parameters,
    scoring=scoring,
    refit='accuracy',
    cv=10,
)
search = gs.fit(x_train, y_train)

# Best model, its mean cross-validated accuracy, and its parameters.
print(search.best_estimator_, search.best_score_, search.best_params_, sep="\n")
# --- Randomized search: decision tree ----------------------------------------
# Same parameter space as the grid search, but only a random sample of the
# combinations is cross-validated (10 folds); the final model is refit using
# the best accuracy.
# NOTE(review): the search is given no random_state, so the sampled
# candidates presumably differ from run to run - confirm this is acceptable.
estimator = DecisionTreeClassifier(random_state=0)
parameters = {
    'max_depth': list(range(1, 11)),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
clf = RandomizedSearchCV(
    estimator,
    parameters,
    scoring=scoring,
    refit='accuracy',
    cv=10,
)
search = clf.fit(x_train, y_train)

# Best model, its mean cross-validated accuracy, and its parameters.
print(search.best_estimator_, search.best_score_, search.best_params_, sep="\n")
search.best_estimator_  # bare expression: no effect when run as a script
# --- Randomized search: random forest ----------------------------------------
# Same parameter space as the grid search, but only a random sample of the
# combinations is cross-validated (10 folds); the final model is refit using
# the best accuracy.
# NOTE(review): the search is given no random_state, so the sampled
# candidates presumably differ from run to run - confirm this is acceptable.
estimator = RandomForestClassifier()
parameters = {
    'max_depth': list(range(1, 11)),
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200, 500],
    'min_samples_leaf': [2, 3],
}
scoring = ['accuracy', 'precision_micro', 'recall']
clf = RandomizedSearchCV(
    estimator,
    parameters,
    scoring=scoring,
    refit='accuracy',
    cv=10,
)
search = clf.fit(x_train, y_train)

# Best model, its mean cross-validated accuracy, and its parameters.
print(search.best_estimator_, search.best_score_, search.best_params_, sep="\n")
search.best_estimator_  # bare expression: no effect when run as a script
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a default Lasso model on the first 150 training
# rows; the per-fold scores are printed.
# NOTE(review): Lasso is a regression model while "indicator" is used as a
# class label everywhere else, so these scores are presumably not comparable
# to the accuracy figures above - confirm this experiment is intended.
x, y = x_train[:150], y_train[:150]
lasso = linear_model.Lasso()
scores = cross_val_score(lasso, x, y, cv=5)
print(scores)
def _fit_and_report(model, features_train, labels_train, features_test, labels_test):
    """Fit ``model``, print its classification report and plot its confusion matrix.

    Args:
        model: Unfitted scikit-learn classifier.
        features_train: Training feature table.
        labels_train: Training target column.
        features_test: Test feature table.
        labels_test: Test target column.

    Returns:
        A tuple ``(fitted_model, predictions)`` where ``predictions`` are the
        model's predictions for ``features_test``.
    """
    fitted = model.fit(features_train, labels_train)
    predicted = fitted.predict(features_test)
    print(classification_report(labels_test, predicted, digits=3, target_names=info))
    ConfusionMatrixDisplay(confusion_matrix(labels_test, predicted)).plot()
    return fitted, predicted


# The default decision tree and random forest are re-evaluated on three more
# train/test file pairs (named knn, median and mean).
# NOTE(review): the file names suggest differently imputed variants of the
# data - confirm against the preprocessing step.

# --- "knn" variant ---
train_knn = read_csv("train_knn.csv")
test_knn = read_csv("test_knn.csv")
x_train, y_train = split(train_knn, "indicator")
x_test, y_test = split(test_knn, "indicator")
strom, predictions = _fit_and_report(
    tree.DecisionTreeClassifier(), x_train, y_train, x_test, y_test
)
strom, predictions = _fit_and_report(
    RandomForestClassifier(), x_train, y_train, x_test, y_test
)

# --- "median" variant ---
train_median = read_csv("train_median.csv")
test_median = read_csv("test_median.csv")
x_train, y_train = split(train_median, "indicator")
x_test, y_test = split(test_median, "indicator")
strom, predictions = _fit_and_report(
    tree.DecisionTreeClassifier(), x_train, y_train, x_test, y_test
)
strom, predictions = _fit_and_report(
    RandomForestClassifier(), x_train, y_train, x_test, y_test
)

# --- "mean" variant ---
train_mean = read_csv("train_mean.csv")
test_mean = read_csv("test_mean.csv")
x_train, y_train = split(train_mean, "indicator")
x_test, y_test = split(test_mean, "indicator")
strom, predictions = _fit_and_report(
    tree.DecisionTreeClassifier(), x_train, y_train, x_test, y_test
)
strom, predictions = _fit_and_report(
    RandomForestClassifier(), x_train, y_train, x_test, y_test
)