# Environment setup for a Jupyter-notebook style EDA of the Hepatitis C dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelBinarizer
# Register pandas' datetime converters with matplotlib (needed for date axes).
pd.plotting.register_matplotlib_converters()
import seaborn as sns
# IPython magic: render figures inline in the notebook (not valid plain Python).
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots.
sns.set()
print('Setup completed!!')
# Load the Hepatitis C dataset and take a first look at its structure.
data = pd.read_csv('HepatitisCdata.csv')
data.head()
data.dtypes
# Count missing values per column.
data.isnull().sum()
# Replace every missing value with 0 (reassignment; same result as inplace fill).
data = data.fillna(0)
data.describe().T
# 'Unnamed: 0' is just the exported row index -- it carries no information.
data = data.drop(columns=['Unnamed: 0'])
# Summary of the non-numeric (object) columns.
data.describe(include='object')
# --- Categorical overviews -------------------------------------------------
# Horizontal count plot (a.k.a. bar plot) of the diagnosis categories.
sns.set_style('white')
sns.countplot(y='Category', data=data)
# Rotate x-labels
plt.xticks(rotation=-45)
# Mean ALP level per diagnosis category.
sns.barplot(x='ALP', y='Category', data=data)
# ALB distribution per category as a violin plot.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='ALB', data=data)
# AST distribution per category.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='AST', data=data, palette='Reds')
# ALT distribution per category.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='ALT', data=data, palette='Greens')
# BIL distribution per category.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='BIL', data=data, palette='Blues')
# --- Univariate and joint distributions ------------------------------------
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in later
# releases; histplot(stat='density', kde=True) reproduces its output.
sns.set_style("white")
plt.figure(figsize=(13, 8))
sns.histplot(data=data, x='BIL', color='Red', kde=True, stat='density')
sns.set_style('white')
plt.figure(figsize=(10, 8))
sns.histplot(data=data, x='ALB', kde=True, stat='density')
# FIX: kdeplot's shade= keyword is deprecated; fill= is the current name.
sns.set_style('white')
plt.figure(figsize=(10, 8))
sns.kdeplot(data=data['AST'], fill=True, color='Red')
# 2D KDE plots.
# FIX: jointplot creates its own figure, so a preceding plt.figure() call only
# left an empty orphan figure; size the joint grid via height= instead.
sns.set_style('white')
sns.jointplot(data=data, x='ALT', y='AST', kind='kde', color='green', height=10)
sns.set_style('white')
sns.jointplot(data=data, x='ALT', y='ALP', kind='kde', color='Red', height=10)
# --- Relationships between lab measurements --------------------------------
# Scatterplots coloured by diagnosis category.
sns.scatterplot(data=data, x='ALB', y='BIL', hue='Category')
# Scatter for categorical values (one swarm per category).
sns.swarmplot(data=data, x='ALB', y='Category')
sns.scatterplot(data=data, x='ALT', y='AST', hue='Category')
sns.scatterplot(data=data, x='ALP', y='AST', hue='Category')
sns.scatterplot(data=data, x='ALB', y='ALT', hue='Category')
# Pairwise scatter matrix coloured by diagnosis category.
plt.figure(figsize=(12, 10))
sns.pairplot(data, kind='scatter', hue='Category', palette='Set1')
plt.show()
# Number of distinct values per column (NaNs are ignored by nunique()).
df_uniques = data.nunique()
df_uniques
# Columns with exactly two distinct values are treated as binary features.
bin_vals = [col for col, n in df_uniques.items() if n == 2]
bin_vals
# Columns with 3..6 distinct values are treated as (multi-class) categoricals.
categorical_vals = [col for col, n in df_uniques.items() if 2 < n <= 6]
categorical_vals
# Encode the low-cardinality columns numerically so they can be scaled.
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
binarizer = LabelBinarizer()
encoder = LabelEncoder()
# Multi-class columns -> consecutive integer codes.
for column in categorical_vals:
    data[column] = encoder.fit_transform(data[column])
# Binary columns -> 0/1 indicators.
for column in bin_vals:
    data[column] = binarizer.fit_transform(data[column])
data.head()
data.tail()
# --- Feature matrix preparation --------------------------------------------
# FIX 1: the original used X = data, so the encoded target column 'Category'
# was itself part of the feature matrix fed to KMeans (target leakage, and the
# clusters would partly just recover the label). Exclude it from X.
X = data.drop('Category', axis=1)
y = data['Category']
X.head()
y.head()
cols = X.columns
# Scale every feature into [0, 1] so no single lab value dominates the
# Euclidean distances used by KMeans.
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
X = ms.fit_transform(X)
# FIX 2: columns=[cols] wrapped the Index in a list, which pandas interprets
# as a one-level MultiIndex; pass the Index itself.
X = pd.DataFrame(X, columns=cols)
X.head()
# Correlation heatmap of the scaled features.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(X.corr())
from sklearn.cluster import KMeans
# Fit a KMeans model with 4 clusters (fixed seed for reproducibility).
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
kmeans.inertia_  # within-cluster sum of squared distances
pred = kmeans.predict(X)
# Attach the cluster assignments to a copy of the original data.
data_with_clusters = data.copy()
data_with_clusters['clusters'] = pred
sns.scatterplot(
    x=data_with_clusters['AST'],
    y=data_with_clusters['ALT'],
    c=data_with_clusters['clusters'],
)
plt.xlabel('AST levels')
plt.ylabel('ALT levels')
plt.title('Making a clustering when k=4')
# Elbow method: record the inertia for each k in 1..10 and plot the curve.
inertias = []
for k in range(1, 11):
    candidate = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    )
    candidate.fit(X)
    inertias.append(candidate.inertia_)
plt.plot(range(1, 11), inertias)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
# From the elbow curve above, k = 8 looks like the right number of clusters.
k_means = KMeans(n_clusters=8, random_state=0)
k_means.fit(X)
k_means.inertia_  # within-cluster sum of squared distances for k = 8
pred_2 = k_means.predict(X)
# Overwrite the annotated copy with the k = 8 assignments.
data_with_clusters = data.copy()
data_with_clusters['clusters'] = pred_2
sns.scatterplot(
    x=data_with_clusters['AST'],
    y=data_with_clusters['ALT'],
    c=data_with_clusters['clusters'],
)
plt.xlabel('AST levels')
plt.ylabel('ALT levels')
plt.title('Making a clustering when k=8')