import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelBinarizer
pd.plotting.register_matplotlib_converters()
import seaborn as sns
%matplotlib inline
sns.set()
print('Setup completed!!')
Setup completed!!
data = pd.read_csv('HepatitisCdata.csv')
data.head()
Unnamed: 0int64
Categoryobject
0
1
0=Blood Donor
1
2
0=Blood Donor
2
3
0=Blood Donor
3
4
0=Blood Donor
4
5
0=Blood Donor
data.dtypes
data.isnull().sum()
data.fillna(0, axis=0, inplace=True)
data.describe().T
countfloat64
615.0 - 615.0
meanfloat64
5.280813008130081 - 308.0
Unnamed: 0
615
308
Age
615
47.40813008
ALB
615
41.55252033
ALP
615
66.28536585
ALT
615
28.40455285
AST
615
34.78634146
BIL
615
11.39674797
CHE
615
8.196634146
CHOL
615
5.280813008
CREA
615
81.28780488
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.describe(include='object')
Categoryobject
Sexobject
count
615
615
unique
5
2
top
0=Blood Donor
m
freq
533
377
# Count Plot (a.k.a. Bar Plot)
sns.set_style('white')
sns.countplot(y='Category', data=data)
# Rotate x-labels
plt.xticks(rotation=-45)
sns.barplot(x=data['ALP'], y=data['Category'])
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x=data['Category'], y=data['ALB'])
# Create violinplot:
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x=data['Category'], y=data['AST'],palette='Reds')
# Create violinplot:
plt.figure(figsize=(10, 8))
sns.set_style('white')
sns.violinplot(x=data['Category'], y=data['ALT'],palette='Greens')
plt.figure(figsize=(10,8))
sns.set_style('white')
sns.violinplot(x=data['Category'], y=data['BIL'],palette='Blues')
# Create a histogram:
# Change the style of the figure to the "white" theme
sns.set_style("white")
plt.figure(figsize=(13, 8))
sns.distplot(a=data['BIL'],color='Red', kde=True)
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
sns.set_style('white')
plt.figure(figsize=(10, 8))
sns.distplot(a= data['ALB'], kde=True)
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
sns.set_style('white')
plt.figure(figsize=(10, 8))
sns.kdeplot(data=data['AST'], shade=True, color='Red')
# 2D KDE plots:
sns.set_style('white')
plt.figure(figsize=(20, 10))
sns.jointplot(x=data['ALT'], y=data['AST'], kind='kde', color='green')
sns.set_style('white')
plt.figure(figsize=(20, 10))
sns.jointplot(x=data['ALT'], y=data['ALP'], kind='kde', color='Red')
# Scatterplots:
sns.scatterplot(x=data['ALB'], y=data['BIL'], hue=data['Category'])
# Scatter for Categoriacal Values:
sns.swarmplot(x=data['ALB'], y=data['Category'])
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/categorical.py:1296: UserWarning: 78.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
sns.scatterplot(x=data['ALT'], y=data['AST'], hue=data['Category'])
sns.scatterplot(x=data['ALP'], y=data['AST'], hue=data['Category'])
sns.scatterplot(x=data['ALB'], y=data['ALT'], hue=data['Category'])
# Pairplot:
plt.figure(figsize=(12, 10))
sns.pairplot(data, kind='scatter', hue='Category', palette='Set1')
plt.show()
df_uniques = data.nunique()
# Count number of distinct elements in specified axis.
#Return Series with number of distinct elements. Can ignore NaN values.
df_uniques
bin_vals = list(df_uniques[df_uniques == 2].index) # Create alist for our bin_vales (composed of at least two values)
bin_vals
categorical_vals = list(df_uniques[(df_uniques > 2) & (df_uniques <=6)].index)
# This function allow us to determine how many categorical do we have actually those are more than 2 and less than 6
categorical_vals
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
lb , le = LabelBinarizer(), LabelEncoder()
for col in categorical_vals:
data[col] = le.fit_transform(data[col])
for col in bin_vals:
data[col] = lb.fit_transform(data[col])
data.head()
Categoryint64
Ageint64
0
0
32
1
0
32
2
0
32
3
0
32
4
0
32
data.tail()
Categoryint64
Ageint64
610
4
62
611
4
64
612
4
64
613
4
46
614
4
59
X = data
y = data['Category']
X.head()
Categoryint64
Ageint64
0
0
32
1
0
32
2
0
32
3
0
32
4
0
32
y.head()
cols = X.columns
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
X = ms.fit_transform(X)
X = pd.DataFrame(X, columns=[cols])
X.head()
Categoryfloat64
Agefloat64
0
1
2
3
4
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(X.corr())
from sklearn.cluster import KMeans
# Kmeans model with k=4:
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
# Get the inertia:
kmeans.inertia_
pred = kmeans.predict(X)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
data_with_clusters = data.copy()
data_with_clusters['clusters'] = pred
sns.scatterplot(x=data_with_clusters['AST'], y=data_with_clusters['ALT'],c=data_with_clusters['clusters'])
plt.xlabel('AST levels')
plt.ylabel('ALT levels')
plt.title('Making a clustering when k=4')
cs = []
for i in range(1, 11):
K_model = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
K_model.fit(X)
cs.append(K_model.inertia_)
plt.plot(range(1, 11), cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
# from the graph above it seems that the right value of k = 8:
k_means = KMeans(n_clusters=8, random_state=0)
k_means.fit(X)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
# Get the inertia :
k_means.inertia_
pred_2 = k_means.predict(X)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:1679: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
data_with_clusters = data.copy()
data_with_clusters['clusters'] = pred_2
sns.scatterplot(x=data_with_clusters['AST'], y=data_with_clusters['ALT'],c=data_with_clusters['clusters'])
plt.xlabel('AST levels')
plt.ylabel('ALT levels')
plt.title('Making a clustering when k=8')