import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
plt.rcParams['figure.figsize'] = (16,8)
df = pd.read_csv('data_all.csv')
df.head()
phi = df.phi.to_numpy().copy()
psi = df.psi.to_numpy().copy()
heatmap, xedges, yedges = np.histogram2d(phi, psi, bins=360)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T, extent=extent, origin='lower')
plt.colorbar()
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.show()
from sklearn.cluster import KMeans
X = np.stack((phi,psi), axis = 1)
km = KMeans(n_clusters=2, random_state=0)
kmeans=km.fit(X)
plt.subplot(121)
plt.scatter(X[kmeans.labels_ == 0, 0], X[kmeans.labels_ == 0, -1],c='green',label='Cluster 0')
plt.scatter(X[kmeans.labels_ == 1, 0], X[kmeans.labels_ == 1, -1],c='blue',label='Cluster 1')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red',label='centroid')
plt.legend()
plt.title('Data points and cluster centroids (K=2)')
plt.xlabel('phi in degree')
plt.ylabel('psi in degree')
plt.show()
from sklearn.metrics import silhouette_samples, silhouette_score
for i, k in enumerate([2,3,4]):
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
# Run the Kmeans algorithm
km = KMeans(n_clusters=k)
labels = km.fit_predict(X)
centroids = km.cluster_centers_
# Get silhouette samples
silhouette_vals = silhouette_samples(X, labels)
# Silhouette plot
y_ticks = []
y_lower, y_upper = 0, 0
for i, cluster in enumerate(np.unique(labels)):
cluster_silhouette_vals = silhouette_vals[labels == cluster]
cluster_silhouette_vals.sort()
y_upper += len(cluster_silhouette_vals)
ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
ax1.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
y_lower += len(cluster_silhouette_vals)
# Get the average silhouette score and plot it
avg_score = np.mean(silhouette_vals)
ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
ax1.set_yticks([])
ax1.set_xlim([-0.1, 1])
ax1.set_xlabel('Silhouette coefficient values')
ax1.set_ylabel('Cluster labels')
ax1.set_title('Silhouette plot for the various clusters', y=1.02);
# Scatter plot of data colored with labels
ax2.scatter(X[:, 0], X[:, 1], c=labels)
ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
ax2.set_xlim([-180, 180])
ax2.set_ylim([-180, 180])
ax2.set_xlabel('Phi')
ax2.set_ylabel('Psi')
ax2.set_title('Visualization of clustered data', y=1.02)
ax2.set_aspect('equal')
plt.tight_layout()
plt.suptitle(f'Silhouette analysis using k = {k}',
fontsize=16, fontweight='semibold', y=1.05);
plt.show()
shiftPsi = 60
shiftPhi = 170
phi = df.phi.to_numpy().copy()
psi = df.psi.to_numpy().copy()
phiShifted = ((phi-180+45) % 360) -180
psiShifted = ((psi-180-45) % 360) -180
heatmap, xedges, yedges = np.histogram2d(phiShifted, psiShifted, bins=120)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T, extent=extent, origin='lower')
plt.colorbar()
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.show()
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Compute DBSCAN
#X1 = StandardScaler().fit_transform(X)
X1 = X
db = DBSCAN(eps=15, min_samples=30).fit(X1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
print('Estimated number of noise points: %d' % n_noise_)
sns.countplot(x=df['residue name'][labels == -1])
plt.ylabel('Number of noise points over residue types')
plt.show()
X1 = np.stack((phi,psi), axis = 1)
db = DBSCAN(eps=15, min_samples=30).fit(X1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters with epsilon = 15: %d' % n_clusters_)
X1 = X
db = DBSCAN(eps=5, min_samples=30).fit(X1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters with epsilon = 5: %d' % n_clusters_)
phiPro = df.phi[df['residue name'] == 'PRO']
psiPro = df.psi[df['residue name'] == 'PRO']
XPro = np.stack((phiPro,psiPro), axis = 1)
db = DBSCAN(eps=10, min_samples=10).fit(XPro)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
plt.subplot(121)
# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = XPro[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = XPro[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.title('Only looking at residue type pro, estimated number of clusters: %d' % n_clusters_)
plt.subplot(122)
X1 = X
db = DBSCAN(eps=15, min_samples=30).fit(X1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.title('All residue types, Estimated number of clusters: %d' % n_clusters_)
plt.show()
phiGly = df.phi[df['residue name'] == 'GLY']
psiGly = df.psi[df['residue name'] == 'GLY']
XGly = np.stack((phiGly,psiGly), axis = 1)
db = DBSCAN(eps=15, min_samples=30).fit(XGly)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
# Plot result
# Black removed and is used for noise instead.pl
plt.subplot(121)
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = XGly[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = XGly[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.title('Residue type GLY, Estimated number of clusters: %d' % n_clusters_)
plt.subplot(122)
X1 = X
db = DBSCAN(eps=15, min_samples=10).fit(X1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.xlabel('phi in deg')
plt.ylabel('psi in deg')
plt.title('All residue types, Estimated number of clusters: %d' % n_clusters_)
plt.show()