# Start by importing the required modules
import pandas as pd
# Pandas is for data manipulation and analysis; in particular,
# it offers data structures and operations for manipulating numerical tables and time series.
import numpy as np
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices,
# along with a large collection of high-level mathematical functions (linear algebra).
import matplotlib.pyplot as plt
# Matplotlib is a plotting library for the Python programming language and its numerical mathematics extension NumPy.
import seaborn as sns
# Seaborn is a Python data visualization library based on matplotlib;
# it provides a high-level interface for drawing attractive and informative statistical graphics.
import sklearn
# Scikit-learn is a free machine learning library for the Python programming language;
# it features various classification, regression, and clustering algorithms.
%matplotlib inline
# It is a magic function that renders the figure in a notebook (instead of displaying a dump of the figure object).
sns.set()
# This applies the default seaborn theme; you can also customize it or use one of six variations,
# which are called deep, muted, pastel, bright, dark, and colorblind.
print('Setup Complete!')
data_path = '/content/healthcare-dataset-stroke-data.csv' # Store the dataset path in a variable so we can reuse it for different tasks
df = pd.read_csv(data_path) # pd.read_csv reads a CSV file into a DataFrame we can operate on
df.head() # Show the first five rows
df.tail() # Show the last five rows of the dataframe
df.dtypes # Return the dtypes in the DataFrame. This returns a Series with the data type of each column.
df.info() # This method prints information about a DataFrame,
# including the index dtype and column dtypes, non-null values and memory usage
df.dtypes.value_counts() # Count how many columns there are of each dtype.
# First, let's check what missing values we have in this dataframe; then we can decide
# whether to clean them up (remove them) or leave them.
missing_values = df.isnull().sum() # get the number of missing data points per column.
missing_values[:] # Look at the missing-value counts for all columns
# Now we need to know the percentage of missing values in the dataset.
total_cells = np.prod(df.shape) # total number of cells: the product of the DataFrame's dimensions (np.prod replaces the deprecated np.product)
total_missing = missing_values.sum()
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)
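# A per-column view (a small optional sketch, using only pandas calls on the existing df)
# shows which column drives the missing values; it should be the 'bmi' column dropped below:
percent_missing_per_col = (df.isnull().mean() * 100).sort_values(ascending=False)
print(percent_missing_per_col.head())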
# It seems we do not have a huge amount of missing values, but we still want to drop
# some columns: 'bmi' because it contains NaNs that would affect the ML model's predictions,
# and 'id' because it is just a row identifier.
df.drop(['id', 'bmi'],axis=1, inplace=True) #Remove rows or columns by specifying label names and corresponding axis,
# or by specifying directly index or column names.
df.describe() # Descriptive statistics include those that summarize the central tendency,
# dispersion and shape of a dataset's distribution, excluding NaN values. NOTE: by default this function only summarizes the numeric columns.
df.describe(include='object') # Passing include='object' lets us look at the object (string) columns instead
df_uniques = df.nunique()
# Count number of distinct elements in specified axis.
#Return Series with number of distinct elements. Can ignore NaN values.
df_uniques
binary_vals = list(df_uniques[df_uniques == 2].index) # Create a list of the binary columns (those with exactly two unique values)
binary_vals
df[binary_vals].dtypes # Check the data types in case they need to be encoded (converted into numeric values)
categorical_vals = list(df_uniques[(df_uniques > 2) & (df_uniques <=6)].index)
# Collect the categorical columns: those with more than 2 and at most 6 unique values
categorical_vals
[[i, list(df[i].unique())] for i in categorical_vals]
# Iterate over the categorical columns and list their unique values before applying the encoding
numaric_vals = list(set(df.columns) - set(categorical_vals)- set(binary_vals))
# The remaining columns (neither categorical nor binary) are treated as numeric
numaric_vals
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder
lb, le, lo = LabelBinarizer(), LabelEncoder(), OrdinalEncoder()
for col in categorical_vals:
    df[col] = le.fit_transform(df[col])
for col in binary_vals:
    df[col] = lb.fit_transform(df[col]).ravel() # LabelBinarizer returns a column vector, so flatten it before assigning
df.head()
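# Aside: the OrdinalEncoder instance `lo` created above is never used. As a hedged sketch,
# it could replace the per-column LabelEncoder loop by encoding all categorical columns at
# once; shown here on a fresh, un-encoded copy read from the same data_path (`df_alt` is
# just an illustrative name, not part of the original workflow):
df_alt = pd.read_csv(data_path)
df_alt[categorical_vals] = lo.fit_transform(df_alt[categorical_vals].astype(str))
df_alt[categorical_vals].head()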
df.stroke.value_counts()
# Return a Series with the counts of each value in our target (y) after encoding:
# 1 means the patient had a stroke, 0 means they did not
# create a histogram:
ax = plt.axes()
ax.hist(df.stroke, bins=5, alpha=0.8, color='red')
ax.set(xlabel='Class distribution: 249 patients diagnosed with brain stroke vs. 4861 not diagnosed',
       ylabel='Frequency',
       title='Brain Stroke')
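# Since seaborn is already imported, the same class imbalance can be shown more directly
# with a countplot (a sketch offered purely as an alternative view of the histogram above):
plt.figure(figsize=(6, 4))
sns.countplot(x='stroke', data=df)
plt.title('Brain Stroke class counts')
plt.show()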
# Create the feature columns (everything except the target):
features_col = [x for x in df.columns if x != 'stroke']
features_col
df[features_col]
from sklearn.model_selection import StratifiedShuffleSplit
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=1000, random_state=42)
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds.
# The folds are made by preserving the percentage of samples for each class.
# Get the index values from the generator:
train_index , test_index = next(strat_shuff_split.split(df[features_col], df['stroke']))
# Create the datasets :
x_train = df.loc[train_index, features_col]
y_train = df.loc[train_index, 'stroke']
x_test = df.loc[test_index, features_col]
y_test = df.loc[test_index, 'stroke']
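# Quick sanity check on the split sizes (test_size=1000 rows were requested above):
print('Train shape:', x_train.shape, '| Test shape:', x_test.shape)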
# Check the percentage composition of each class in the train and test sets:
y_train.value_counts(normalize=True).sort_index()
y_test.value_counts(normalize=True).sort_index()
df.describe()
# Estimate a KNN model and report the outcome:
from sklearn.neighbors import KNeighborsClassifier
# Create the model :
knn = KNeighborsClassifier(n_neighbors=3)
# Train (fit) the model :
knn = knn.fit(x_train, y_train)
# Make predictions on the test set and show the first 5, using k = 3 for now until we determine the optimal value of k (see the search sketch below)
y_pred = knn.predict(x_test)
y_pred[:5]
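# The comment above leaves the optimal k open. A minimal sketch of that search, scoring
# each candidate k on the held-out test set (the odd values 1 to 19 are an arbitrary assumption):
k_scores = {}
for k in range(1, 21, 2):
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    k_scores[k] = knn_k.score(x_test, y_test)
best_k = max(k_scores, key=k_scores.get)
print('Best k:', best_k, '| accuracy:', round(k_scores[best_k], 3))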
from sklearn.metrics import accuracy_score, classification_report, f1_score
print(classification_report(y_test, y_pred))
print('Accuracy Score:', round(accuracy_score(y_test, y_pred), 2))
print('F1 Score:', round(f1_score(y_test, y_pred), 2))
from sklearn.tree import DecisionTreeClassifier
# Create the model
dt = DecisionTreeClassifier(random_state=42)
# Fit the model
dt = dt.fit(x_train, y_train)
# Determine the number of nodes and maximum depth:
dt.tree_.node_count, dt.tree_.max_depth
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def measure_error(y_true, y_pred, label):
    return pd.Series({'accuracy': accuracy_score(y_true, y_pred),
                      'precision': precision_score(y_true, y_pred),
                      'recall': recall_score(y_true, y_pred),
                      'f1': f1_score(y_true, y_pred)},
                     name=label)
# This step may lead to overfitting because we did not prune the tree:
y_train_pred = dt.predict(x_train)
y_test_pred = dt.predict(x_test)
y_test_pred[:5]
train_test_full_error = pd.concat([measure_error(y_train, y_train_pred, 'train'),
                                   measure_error(y_test, y_test_pred, 'test')], axis=1)
train_test_full_error
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':range(1, dt.tree_.max_depth+1,2),
'max_features': range(1, len(dt.feature_importances_)+1)}
GR = GridSearchCV(DecisionTreeClassifier(random_state=42),
param_grid = param_grid,
scoring = 'accuracy',
n_jobs = -1)
# Fit the Grid Search model:
GR = GR.fit(x_train, y_train)
# Get the number of nodes and the maximum depth:
GR.best_estimator_.tree_.node_count , GR.best_estimator_.tree_.max_depth
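# The winning hyper-parameters are also worth reporting (best_params_ is a standard
# GridSearchCV attribute):
print(GR.best_params_)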
# Make predictions:
y_train_pred_gr = GR.predict(x_train)
y_train_pred_gr[:]
y_test_pred_gr = GR.predict(x_test)
y_test_pred_gr[:5]
train_test_full_error_gr = pd.concat([measure_error(y_train, y_train_pred_gr, 'train'),
                                      measure_error(y_test, y_test_pred_gr, 'test')], axis=1)
train_test_full_error_gr
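# A short sketch of which features the tuned tree actually relies on
# (feature_importances_ is a standard attribute of fitted sklearn tree models):
importances = pd.Series(GR.best_estimator_.feature_importances_, index=features_col)
print(importances.sort_values(ascending=False))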
# Create a heatmap to check variable correlations:
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr())
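# A quicker numeric view of the same information: how strongly each feature correlates
# with the target (this just sorts the 'stroke' column of the df.corr() matrix above):
print(df.corr()['stroke'].sort_values(ascending=False))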
import warnings
warnings.filterwarnings("ignore", category= UserWarning)
warnings.filterwarnings("ignore", category= RuntimeWarning)
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(oob_score=True,
random_state=42,
warm_start=True,
n_jobs = -1)
oob_list = list()
for n_trees in [15, 20, 50, 60, 100, 110, 115, 200, 260, 300, 500, 550, 600, 670, 700, 780, 800]:
    RF.set_params(n_estimators=n_trees)
    # Fit the model:
    RF.fit(x_train, y_train)
    # Get the out-of-bag error:
    oob_error = 1 - RF.oob_score_
    # Record it:
    oob_list.append(pd.Series({'n_trees': n_trees, 'OOb': oob_error}))
rf_oob_df = pd.concat(oob_list, axis=1).T.set_index('n_trees')
rf_oob_df
sns.set_context('talk')
sns.set_style('white')
ax = rf_oob_df.plot(legend=False, marker='o', figsize=(12, 8),linewidth=5, color='red')
ax.set(ylabel='Out-of-bag error')
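# The loop above only tracks the out-of-bag error. As a final sketch, score the last fitted
# forest (800 trees after the final warm-start step) on the held-out test set, reusing the
# measure_error helper defined earlier:
y_test_pred_rf = RF.predict(x_test)
print(measure_error(y_test, y_test_pred_rf, 'rf_test'))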