Assignment 3
By Ayam Jain (approximately 12 hours spent on the assignment)
#importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_blobs
# Load the assignment CSV and drop rows that are entirely empty.
# (Renamed from `xls`: the source is a CSV read, not an Excel workbook.)
# NOTE(review): dropna(how='all') keeps rows with *partial* NaNs; KMeans/
# DBSCAN below would fail on NaN phi/psi — confirm the file has none.
csv_df = pd.read_csv('/work/Dataset/assignment3-data.csv')
data = csv_df.dropna(how='all')
data
Question 1
# --- Question 1: scatter plot of the raw (phi, psi) dihedral angles ---
plt.figure(figsize=(10, 10))
plt.scatter(data['phi'], data['psi'])
plt.title("Scatter distribution of Phi vs Psi", fontsize=10)
plt.xlabel("Phi (in degrees)", fontsize=10)  # fixed typo "degress"
plt.ylabel("Psi (in degrees)", fontsize=10)
plt.show()

# --- 2-D histogram ("heatmap") of the same data; colour = sample count ---
plt.figure(figsize=(15, 10))
plt.hist2d(data['phi'], data['psi'], bins=300, cmap='inferno')
plt.title('Heatmap distribution of Phi vs Psi', fontsize=10)
plt.xlabel('Phi (in degrees)', fontsize=10)
plt.ylabel('Psi (in degrees)', fontsize=10)
cb = plt.colorbar()
cb.set_label('Number of samples')
plt.show()
Question 2 - K-means
# K-means with k=3 on the raw (phi, psi) angles.
# random_state pins the centroid initialisation so the plot is reproducible
# between runs (the original KMeans(3) call was non-deterministic).
kdata = data[['phi', 'psi']]
data_with_clusters = kdata.copy()
kmeans = KMeans(n_clusters=3, random_state=0)
data_with_clusters['Clusters'] = kmeans.fit_predict(kdata)
plt.figure(figsize=(7, 7))
plt.scatter(data_with_clusters['phi'], data_with_clusters['psi'],
            c=data_with_clusters['Clusters'], cmap='rainbow')
# Title/labels/show added for consistency with the other plot cells.
plt.title('K-means (k=3) on raw data')
plt.xlabel('phi (in degrees)')
plt.ylabel('psi (in degrees)')
plt.show()
# Sweep k = 1..nrk: plot each clustering in a 4-per-row grid, recording
# WCSS (inertia) for the elbow plot and the silhouette score (k >= 2 only).
nrk = 8  # number of cluster counts to try
kdata = data[['phi', 'psi']]
data_with_clusters = kdata.copy()
wcss = []  # within-cluster sum of squares, one entry per k
sil = []   # silhouette scores, one entry per k >= 2
# ceil(nrk / 4) rows; the original round()-then-bump formula over-allocated
# a blank row for e.g. nrk = 6.
nrx = (nrk + 3) // 4
fig, axs = plt.subplots(nrx, 4, figsize=(30, 15))
for k in range(1, nrk + 1):
    kmeans = KMeans(n_clusters=k, random_state=0)  # seeded: reproducible
    data_with_clusters['Clusters'] = kmeans.fit_predict(kdata)
    wcss.append(kmeans.inertia_)
    if k >= 2:  # silhouette score is undefined for a single cluster
        sil.append(metrics.silhouette_score(kdata, kmeans.labels_,
                                            metric='euclidean'))
    row, col = divmod(k - 1, 4)  # replaces the manual i/j counters
    ax = axs[row][col]
    ax.scatter(data_with_clusters['phi'], data_with_clusters['psi'],
               c=data_with_clusters['Clusters'], cmap='rainbow')
    ax.set_xlabel('Phi (in degrees)', fontsize=15)  # fixed typo "degress"
    ax.set_ylabel('Psi (in degrees)', fontsize=15)
    ax.set_title(f'k = {k}', fontsize=20)
plt.subplots_adjust(wspace=0.2, hspace=0.2)
fig.suptitle(f'Clustered scatterplots for k ranging from 1 to {nrk}', fontsize=20)
plt.show()

# Elbow plot: WCSS vs number of clusters.
number_clusters = range(1, nrk + 1)
plt.figure(figsize=(10, 10))
plt.plot(number_clusters, wcss)
plt.title('The Elbow Plot')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS (inertia)')  # was mislabelled "Diameter"

# Silhouette plot starts at k = 2 (sil has nrk-1 entries).
plt.figure(figsize=(10, 10))
plt.plot(number_clusters[1:], sil)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette score vs number of Clusters')
plt.show()
# Stability check: re-run k-means (k=3) on progressively smaller random
# subsamples; if the clustering is robust, the partitions should look alike.
sample_fracs = [1, .75, .5, .25, .125]
optimal_k = 3  # chosen from the elbow/silhouette plots above
kdata = data[['phi', 'psi']]
fig, axs = plt.subplots(1, len(sample_fracs), figsize=(30, 7))
for ax, frac in zip(axs, sample_fracs):
    # Seeded sample + seeded KMeans so each panel is reproducible
    # (the original subsamples changed on every run).
    subset = kdata.sample(frac=frac, random_state=0)
    clustered = subset.copy()
    kmeans = KMeans(n_clusters=optimal_k, random_state=0)
    clustered['Clusters'] = kmeans.fit_predict(subset)
    ax.scatter(clustered['phi'], clustered['psi'],
               c=clustered['Clusters'], cmap='rainbow')
    ax.set_xlabel('Phi (in degrees)', fontsize=15)  # fixed typo "degress"
    ax.set_ylabel('Psi (in degrees)', fontsize=15)
    ax.set_title(f'sample size = {frac}', fontsize=20)
plt.subplots_adjust(wspace=0.2, hspace=0.2)
fig.suptitle('Clustered scatterplots for varying sample size', fontsize=20)
plt.show()
# Shift psi by +100 degrees (mod 360) so the cluster that wraps around the
# +/-180 boundary becomes contiguous; phi needs no shift.
# Vectorised column arithmetic replaces the original row-wise
# data.apply(lambda row: ...) — identical values, one pass in C.
phi_offset = 0
psi_offset = 100
data['phi corrected'] = (data['phi'] + phi_offset) % 360
data['psi corrected'] = (data['psi'] + psi_offset) % 360
plt.figure(figsize=(8, 8))
plt.scatter(data['phi corrected'], data['psi corrected'])
plt.title('Corrected Data')
plt.xlabel('phi (in degrees)')
plt.ylabel('psi (in degrees)')
plt.show()
# K-means (k=3) on the shifted angles; the fixed seed makes the run
# reproducible (the original KMeans(3) call was non-deterministic).
kdata = data[['phi corrected', 'psi corrected']]
data_with_clusters = kdata.copy()
kmeans = KMeans(n_clusters=3, random_state=0)
data_with_clusters['Clusters'] = kmeans.fit_predict(kdata)
plt.figure(figsize=(7, 7))
plt.scatter(data_with_clusters['phi corrected'], data_with_clusters['psi corrected'],
            c=data_with_clusters['Clusters'], cmap='rainbow')
plt.title(' K-Cluster on Corrected Data')
plt.xlabel('phi (in degrees)')
plt.ylabel('psi (in degrees)')
plt.show()
Question 3 - DBSCAN method
# Question 3: DBSCAN on a 30% subsample of the raw angles. Standardising
# first puts eps in units of standard deviations. DBSCAN labels noise
# points as -1. (Dead commented-out code removed; sample seeded so the
# plot is reproducible.)
kdata = data[['phi', 'psi']]
dbscan_sample = kdata.sample(frac=.3, random_state=0)
scaled = StandardScaler().fit_transform(dbscan_sample)
labels = DBSCAN(eps=.2, min_samples=5).fit(scaled).labels_
plt.figure(figsize=(7, 7))
plt.scatter(scaled[:, 0], scaled[:, 1], c=labels, cmap='rainbow', s=10)
plt.title(' DBSCAN Cluster on Data')
plt.xlabel('phi (in transformed scale)')
plt.ylabel('psi (in transformed scale)')
plt.show()
# Grid search over (eps, min_samples): an 8x8 panel of DBSCAN results.
# Rows vary eps, columns vary min_samples.
min_samples = [5, 10, 20, 40, 80, 120, 160, 200]
eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
# Scaling does not depend on the DBSCAN parameters, so it is hoisted out
# of the loop (the original recomputed it on all 64 iterations).
scaled = StandardScaler().fit_transform(data[['phi', 'psi']])
fig, axs = plt.subplots(8, 8, figsize=(30, 50))
for k in range(64):
    i, j = divmod(k, 8)  # i: eps index (row), j: min_samples index (col)
    labels = DBSCAN(eps=eps[i], min_samples=min_samples[j]).fit(scaled).labels_
    # Label -1 marks noise, so it does not count as a cluster.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    axs[i][j].scatter(scaled[:, 0], scaled[:, 1], c=labels, cmap='rainbow')
    axs[i][j].set_title('eps=%.2f, \nCluster=%d, \n min_sample=%d'
                        % (eps[i], n_clusters_, min_samples[j]))
plt.subplots_adjust(wspace=0.2, hspace=0.2)
fig.suptitle('DBSCAN scatterplots for varying Epsilon and minimum sample values', fontsize=20)
plt.show()
# Final DBSCAN (eps=0.5, min_samples=120) on the full standardised data,
# then a bar chart of which residue types make up the noise points.
# The original mixed subplots() with a stray plt.figure(), so the suptitle
# landed on a figure the bar plot never used; each plot now gets its own
# explicit figure.
scaled = StandardScaler().fit_transform(data[['phi', 'psi']])
labels = DBSCAN(eps=.5, min_samples=120).fit(scaled).labels_
plt.figure(figsize=(10, 10))
plt.scatter(scaled[:, 0], scaled[:, 1], c=labels, cmap='rainbow', s=10)
plt.title('DBSCAN scatterplots and outlier bar plot')
plt.show()
# DBSCAN marks noise with label -1; those rows are the outliers.
# Boolean ndarray indexing is positional, matching the row order of `labels`.
data_outliers = data[labels == -1]
data_outliers['residue name'].value_counts(sort=True).plot.bar()
plt.title('Outlier bar plot')
plt.ylabel('Count')
plt.show()
Question 4 - Data file Stratified by amino acid residue type
def plot_residue_dbscan(df, residue, eps=.5, min_samples=120):
    """Run DBSCAN on the (phi, psi) angles of one residue type and plot it.

    df: frame with 'residue name', 'phi' and 'psi' columns.
    residue: residue-name value to filter on (e.g. 'PRO', 'GLY').
    eps / min_samples: DBSCAN parameters (defaults match Question 3).
    """
    subset = df[df['residue name'] == residue]
    # Standardise so eps is in units of standard deviations.
    scaled = StandardScaler().fit_transform(subset[['phi', 'psi']])
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled).labels_
    plt.figure(figsize=(10, 10))
    plt.scatter(scaled[:, 0], scaled[:, 1], c=labels, cmap='rainbow', s=10)
    plt.title(f' DBSCAN Cluster on {residue} residue type')
    plt.xlabel('phi (in transformed scale)')
    plt.ylabel('psi (in transformed scale)')
    plt.show()

# The original copy-pasted this cell per residue and reused the misleading
# name `data_PRO` for the GLY subset; one helper removes both problems.
plot_residue_dbscan(data, 'PRO')
plot_residue_dbscan(data, 'GLY')