# Environment setup for a Jupyter-notebook style EDA of the Hepatitis C dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelBinarizer
# Register pandas' datetime converters with matplotlib (needed for date axes).
pd.plotting.register_matplotlib_converters()
import seaborn as sns
# IPython magic: render figures inline in the notebook (not valid plain Python).
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots.
sns.set()
print('Setup completed!!')
# Load the Hepatitis C dataset and take a first look at its structure.
data = pd.read_csv('HepatitisCdata.csv')
data.head()
data.dtypes
# Count missing values per column.
data.isnull().sum()
# Replace every missing value with 0 (reassignment; same result as inplace fill).
data = data.fillna(0)
data.describe().T
# 'Unnamed: 0' is just the exported row index -- it carries no information.
data = data.drop(columns=['Unnamed: 0'])
# Summary of the non-numeric (object) columns.
data.describe(include='object')
# --- Categorical overviews -------------------------------------------------
# Horizontal count plot (a.k.a. bar plot) of the diagnosis categories.
sns.set_style('white')
sns.countplot(y='Category', data=data)
# Rotate x-labels
plt.xticks(rotation=-45)
# Mean ALP level per diagnosis category.
sns.barplot(x='ALP', y='Category', data=data)
# ALB distribution per category as a violin plot.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='ALB', data=data)
# AST distribution per category.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='AST', data=data, palette='Reds')
# ALT distribution per category.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='ALT', data=data, palette='Greens')
# BIL distribution per category.
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x='Category', y='BIL', data=data, palette='Blues')
# --- Univariate and joint distributions ------------------------------------
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in later
# releases; histplot(stat='density', kde=True) reproduces its output.
sns.set_style("white")
plt.figure(figsize=(13, 8))
sns.histplot(data=data, x='BIL', color='Red', kde=True, stat='density')
sns.set_style('white')
plt.figure(figsize=(10, 8))
sns.histplot(data=data, x='ALB', kde=True, stat='density')
# FIX: kdeplot's shade= keyword is deprecated; fill= is the current name.
sns.set_style('white')
plt.figure(figsize=(10, 8))
sns.kdeplot(data=data['AST'], fill=True, color='Red')
# 2D KDE plots.
# FIX: jointplot creates its own figure, so a preceding plt.figure() call only
# left an empty orphan figure; size the joint grid via height= instead.
sns.set_style('white')
sns.jointplot(data=data, x='ALT', y='AST', kind='kde', color='green', height=10)
sns.set_style('white')
sns.jointplot(data=data, x='ALT', y='ALP', kind='kde', color='Red', height=10)
# --- Relationships between lab measurements --------------------------------
# Scatterplots coloured by diagnosis category.
sns.scatterplot(data=data, x='ALB', y='BIL', hue='Category')
# Scatter for categorical values (one swarm per category).
sns.swarmplot(data=data, x='ALB', y='Category')
sns.scatterplot(data=data, x='ALT', y='AST', hue='Category')
sns.scatterplot(data=data, x='ALP', y='AST', hue='Category')
sns.scatterplot(data=data, x='ALB', y='ALT', hue='Category')
# Pairwise scatter matrix coloured by diagnosis category.
plt.figure(figsize=(12, 10))
sns.pairplot(data, kind='scatter', hue='Category', palette='Set1')
plt.show()
# Number of distinct values per column (NaNs are ignored by nunique()).
df_uniques = data.nunique()
df_uniques
# Columns with exactly two distinct values are treated as binary features.
bin_vals = [col for col, n in df_uniques.items() if n == 2]
bin_vals
# Columns with 3..6 distinct values are treated as (multi-class) categoricals.
categorical_vals = [col for col, n in df_uniques.items() if 2 < n <= 6]
categorical_vals
# Encode the low-cardinality columns numerically so they can be scaled.
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
binarizer = LabelBinarizer()
encoder = LabelEncoder()
# Multi-class columns -> consecutive integer codes.
for column in categorical_vals:
    data[column] = encoder.fit_transform(data[column])
# Binary columns -> 0/1 indicators.
for column in bin_vals:
    data[column] = binarizer.fit_transform(data[column])
data.head()
data.tail()
# --- Feature matrix preparation --------------------------------------------
# FIX 1: the original used X = data, so the encoded target column 'Category'
# was itself part of the feature matrix fed to KMeans (target leakage, and the
# clusters would partly just recover the label). Exclude it from X.
X = data.drop('Category', axis=1)
y = data['Category']
X.head()
y.head()
cols = X.columns
# Scale every feature into [0, 1] so no single lab value dominates the
# Euclidean distances used by KMeans.
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
X = ms.fit_transform(X)
# FIX 2: columns=[cols] wrapped the Index in a list, which pandas interprets
# as a one-level MultiIndex; pass the Index itself.
X = pd.DataFrame(X, columns=cols)
X.head()
# Correlation heatmap of the scaled features.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(X.corr())
from sklearn.cluster import KMeans
# Fit a KMeans model with 4 clusters (fixed seed for reproducibility).
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
kmeans.inertia_  # within-cluster sum of squared distances
pred = kmeans.predict(X)
# Attach the cluster assignments to a copy of the original data.
data_with_clusters = data.copy()
data_with_clusters['clusters'] = pred
sns.scatterplot(
    x=data_with_clusters['AST'],
    y=data_with_clusters['ALT'],
    c=data_with_clusters['clusters'],
)
plt.xlabel('AST levels')
plt.ylabel('ALT levels')
plt.title('Making a clustering when k=4')
# Elbow method: record the inertia for each k in 1..10 and plot the curve.
inertias = []
for k in range(1, 11):
    candidate = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    )
    candidate.fit(X)
    inertias.append(candidate.inertia_)
plt.plot(range(1, 11), inertias)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
# From the elbow curve above, k = 8 looks like the right number of clusters.
k_means = KMeans(n_clusters=8, random_state=0)
k_means.fit(X)
k_means.inertia_  # within-cluster sum of squared distances for k = 8
pred_2 = k_means.predict(X)
# Overwrite the annotated copy with the k = 8 assignments.
data_with_clusters = data.copy()
data_with_clusters['clusters'] = pred_2
sns.scatterplot(
    x=data_with_clusters['AST'],
    y=data_with_clusters['ALT'],
    c=data_with_clusters['clusters'],
)
plt.xlabel('AST levels')
plt.ylabel('ALT levels')
plt.title('Making a clustering when k=8')