# Start by importing the required modules
import pandas as pd
# Pandas is for data manipulation and analysis; in particular,
# it offers data structures and operations for manipulating numerical tables and time series.
import numpy as np
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices,
# along with a large collection of high-level mathematical functions (linear algebra).
import matplotlib.pyplot as plt
# Matplotlib is a plotting library for the Python programming language and its numerical mathematics extension NumPy.
import seaborn as sns
# Seaborn is a Python data visualization library based on matplotlib;
# it provides a high-level interface for drawing attractive and informative statistical graphics.
import sklearn
# Scikit-learn is a free machine learning library for the Python programming language;
# it features various classification, regression, and clustering algorithms.
%matplotlib inline
# It is a magic function that renders the figure in a notebook (instead of displaying a dump of the figure object).
sns.set()
# This applies the default seaborn theme; you can also customize it or use one of six variations,
# which are called deep, muted, pastel, bright, dark, and colorblind.
print('Setup Complete!')
data_path = '/content/healthcare-dataset-stroke-data.csv' # Store the dataset path in a variable so we can reuse it for different tasks
df = pd.read_csv(data_path) # pd.read_csv reads a CSV file into a DataFrame we can operate on
df.head() # Show the first five rows
df.tail() # Show the last five rows of the dataframe
df.dtypes # Return the dtypes in the DataFrame. This returns a Series with the data type of each column.
df.info() # This method prints information about a DataFrame,
# including the index dtype and column dtypes, non-null values and memory usage
df.dtypes.value_counts() # Count how many columns there are of each dtype.
# First, let's check what missing values we have in this dataframe; then we can decide
# whether to clean them up (remove them) or leave them.
missing_values = df.isnull().sum() # get the number of missing data points per column.
missing_values[:] # Look at the missing-value counts for all columns
# Now we need to know the percentage of missing values in the dataset.
total_cells = np.prod(df.shape) # total number of cells: the product of the DataFrame's dimensions (np.prod replaces the deprecated np.product)
total_missing = missing_values.sum()
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)
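# A per-column view (a small optional sketch, using only pandas calls on the existing df)
# shows which column drives the missing values; it should be the 'bmi' column dropped below:
percent_missing_per_col = (df.isnull().mean() * 100).sort_values(ascending=False)
print(percent_missing_per_col.head())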
# It seems we do not have a huge amount of missing values, but we still want to drop
# some columns: 'bmi' because it contains NaNs that would affect the ML model's predictions,
# and 'id' because it is just a row identifier.
df.drop(['id', 'bmi'],axis=1, inplace=True) #Remove rows or columns by specifying label names and corresponding axis,
# or by specifying directly index or column names.
df.describe() # Descriptive statistics include those that summarize the central tendency,
# dispersion and shape of a dataset's distribution, excluding NaN values. NOTE: by default this function only summarizes the numeric columns.
df.describe(include='object') # Passing include='object' lets us look at the object (string) columns instead
df_uniques = df.nunique()
# Count number of distinct elements in specified axis.
#Return Series with number of distinct elements. Can ignore NaN values.
df_uniques
binary_vals = list(df_uniques[df_uniques == 2].index) # Create a list of the binary columns (those with exactly two unique values)
binary_vals
df[binary_vals].dtypes # Check the data types in case they need to be encoded (converted into numeric values)
categorical_vals = list(df_uniques[(df_uniques > 2) & (df_uniques <=6)].index)
# Collect the categorical columns: those with more than 2 and at most 6 unique values
categorical_vals
[[i, list(df[i].unique())] for i in categorical_vals]
# Iterate over the categorical columns and list their unique values before applying the encoding
numaric_vals = list(set(df.columns) - set(categorical_vals)- set(binary_vals))
# The remaining columns (neither categorical nor binary) are treated as numeric
numaric_vals
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder
lb, le, lo = LabelBinarizer(), LabelEncoder(), OrdinalEncoder()
for col in categorical_vals:
    df[col] = le.fit_transform(df[col])
for col in binary_vals:
    df[col] = lb.fit_transform(df[col]).ravel() # LabelBinarizer returns a column vector, so flatten it before assigning
df.head()
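# Aside: the OrdinalEncoder instance `lo` created above is never used. As a hedged sketch,
# it could replace the per-column LabelEncoder loop by encoding all categorical columns at
# once; shown here on a fresh, un-encoded copy read from the same data_path (`df_alt` is
# just an illustrative name, not part of the original workflow):
df_alt = pd.read_csv(data_path)
df_alt[categorical_vals] = lo.fit_transform(df_alt[categorical_vals].astype(str))
df_alt[categorical_vals].head()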
df.stroke.value_counts()
# Return a Series with the counts of each value in our target (y) after encoding:
# 1 means the patient had a stroke, 0 means they did not
# create a histogram:
ax = plt.axes()
ax.hist(df.stroke, bins=5, alpha=0.8, color='red')
ax.set(xlabel='Class distribution: 249 patients diagnosed with brain stroke vs. 4861 not diagnosed',
       ylabel='Frequency',
       title='Brain Stroke')
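# Since seaborn is already imported, the same class imbalance can be shown more directly
# with a countplot (a sketch offered purely as an alternative view of the histogram above):
plt.figure(figsize=(6, 4))
sns.countplot(x='stroke', data=df)
plt.title('Brain Stroke class counts')
plt.show()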
# Create the feature columns (everything except the target):
features_col = [x for x in df.columns if x != 'stroke']
features_col
df[features_col]
from sklearn.model_selection import StratifiedShuffleSplit
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=1000, random_state=42)
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds.
# The folds are made by preserving the percentage of samples for each class.
# Get the index values from the generator:
train_index , test_index = next(strat_shuff_split.split(df[features_col], df['stroke']))
# Create the datasets :
x_train = df.loc[train_index, features_col]
y_train = df.loc[train_index, 'stroke']
x_test = df.loc[test_index, features_col]
y_test = df.loc[test_index, 'stroke']
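# Quick sanity check on the split sizes (test_size=1000 rows were requested above):
print('Train shape:', x_train.shape, '| Test shape:', x_test.shape)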
# Check the percentage composition of each class in the train and test sets:
y_train.value_counts(normalize=True).sort_index()
y_test.value_counts(normalize=True).sort_index()
df.describe()
# Estimate a KNN model and report the outcome:
from sklearn.neighbors import KNeighborsClassifier
# Create the model :
knn = KNeighborsClassifier(n_neighbors=3)
# Train (fit) the model :
knn = knn.fit(x_train, y_train)
# Make predictions on the test set and show the first 5, using k = 3 for now until we determine the optimal value of k (see the search sketch below)
y_pred = knn.predict(x_test)
y_pred[:5]
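# The comment above leaves the optimal k open. A minimal sketch of that search, scoring
# each candidate k on the held-out test set (the odd values 1 to 19 are an arbitrary assumption):
k_scores = {}
for k in range(1, 21, 2):
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    k_scores[k] = knn_k.score(x_test, y_test)
best_k = max(k_scores, key=k_scores.get)
print('Best k:', best_k, '| accuracy:', round(k_scores[best_k], 3))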
from sklearn.metrics import accuracy_score, classification_report, f1_score
print(classification_report(y_test, y_pred))
print('Accuracy Score:', round(accuracy_score(y_test, y_pred), 2))
print('F1 Score:', round(f1_score(y_test, y_pred), 2))
from sklearn.tree import DecisionTreeClassifier
# Create the model
dt = DecisionTreeClassifier(random_state=42)
# Fit the model
dt = dt.fit(x_train, y_train)
# Determine the number of nodes and maximum depth:
dt.tree_.node_count, dt.tree_.max_depth
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def measure_error(y_true, y_pred, label):
    return pd.Series({'accuracy': accuracy_score(y_true, y_pred),
                      'precision': precision_score(y_true, y_pred),
                      'recall': recall_score(y_true, y_pred),
                      'f1': f1_score(y_true, y_pred)},
                     name=label)
# This step may lead to overfitting because we did not prune the tree:
y_train_pred = dt.predict(x_train)
y_test_pred = dt.predict(x_test)
y_test_pred[:5]
train_test_full_error = pd.concat([measure_error(y_train, y_train_pred, 'train'),
                                   measure_error(y_test, y_test_pred, 'test')], axis=1)
train_test_full_error
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':range(1, dt.tree_.max_depth+1,2),
'max_features': range(1, len(dt.feature_importances_)+1)}
GR = GridSearchCV(DecisionTreeClassifier(random_state=42),
param_grid = param_grid,
scoring = 'accuracy',
n_jobs = -1)
# Fit the Grid Search model:
GR = GR.fit(x_train, y_train)
# Get the number of nodes and the maximum depth:
GR.best_estimator_.tree_.node_count , GR.best_estimator_.tree_.max_depth
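# The winning hyper-parameters are also worth reporting (best_params_ is a standard
# GridSearchCV attribute):
print(GR.best_params_)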
# Make predictions:
y_train_pred_gr = GR.predict(x_train)
y_train_pred_gr[:]
y_test_pred_gr = GR.predict(x_test)
y_test_pred_gr[:5]
train_test_full_error_gr = pd.concat([measure_error(y_train, y_train_pred_gr, 'train'),
                                      measure_error(y_test, y_test_pred_gr, 'test')], axis=1)
train_test_full_error_gr
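# A short sketch of which features the tuned tree actually relies on
# (feature_importances_ is a standard attribute of fitted sklearn tree models):
importances = pd.Series(GR.best_estimator_.feature_importances_, index=features_col)
print(importances.sort_values(ascending=False))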
# Create a heatmap to check variable correlations:
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr())
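# A quicker numeric view of the same information: how strongly each feature correlates
# with the target (this just sorts the 'stroke' column of the df.corr() matrix above):
print(df.corr()['stroke'].sort_values(ascending=False))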
import warnings
warnings.filterwarnings("ignore", category= UserWarning)
warnings.filterwarnings("ignore", category= RuntimeWarning)
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(oob_score=True,
random_state=42,
warm_start=True,
n_jobs = -1)
oob_list = list()
for n_trees in [15, 20, 50, 60, 100, 110, 115, 200, 260, 300, 500, 550, 600, 670, 700, 780, 800]:
    RF.set_params(n_estimators=n_trees)
    # Fit the model:
    RF.fit(x_train, y_train)
    # Get the out-of-bag error:
    oob_error = 1 - RF.oob_score_
    # Record it:
    oob_list.append(pd.Series({'n_trees': n_trees, 'OOb': oob_error}))
rf_oob_df = pd.concat(oob_list, axis=1).T.set_index('n_trees')
rf_oob_df
sns.set_context('talk')
sns.set_style('white')
ax = rf_oob_df.plot(legend=False, marker='o', figsize=(12, 8),linewidth=5, color='red')
ax.set(ylabel='Out-of-bag error')
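# The loop above only tracks the out-of-bag error. As a final sketch, score the last fitted
# forest (800 trees after the final warm-start step) on the held-out test set, reusing the
# measure_error helper defined earlier:
y_test_pred_rf = RF.predict(x_test)
print(measure_error(y_test, y_test_pred_rf, 'rf_test'))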