DAT405 - Assignment 3

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import sklearn plt.rcParams['figure.figsize'] = (16,8)

df = pd.read_csv('data_all.csv') df.head()

phi = df.phi.to_numpy().copy() psi = df.psi.to_numpy().copy() heatmap, xedges, yedges = np.histogram2d(phi, psi, bins=360) extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] plt.clf() plt.imshow(heatmap.T, extent=extent, origin='lower') plt.colorbar() plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.show()

from sklearn.cluster import KMeans X = np.stack((phi,psi), axis = 1) km = KMeans(n_clusters=2, random_state=0) kmeans=km.fit(X) plt.subplot(121) plt.scatter(X[kmeans.labels_ == 0, 0], X[kmeans.labels_ == 0, -1],c='green',label='Cluster 0') plt.scatter(X[kmeans.labels_ == 1, 0], X[kmeans.labels_ == 1, -1],c='blue',label='Cluster 1') plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red',label='centroid') plt.legend() plt.title('Data points and cluster centroids (K=2)') plt.xlabel('phi in degree') plt.ylabel('psi in degree') plt.show()

from sklearn.metrics import silhouette_samples, silhouette_score for i, k in enumerate([2,3,4]): fig, (ax1, ax2) = plt.subplots(1, 2) fig.set_size_inches(18, 7) # Run the Kmeans algorithm km = KMeans(n_clusters=k) labels = km.fit_predict(X) centroids = km.cluster_centers_ # Get silhouette samples silhouette_vals = silhouette_samples(X, labels) # Silhouette plot y_ticks = [] y_lower, y_upper = 0, 0 for i, cluster in enumerate(np.unique(labels)): cluster_silhouette_vals = silhouette_vals[labels == cluster] cluster_silhouette_vals.sort() y_upper += len(cluster_silhouette_vals) ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1) ax1.text(-0.03, (y_lower + y_upper) / 2, str(i + 1)) y_lower += len(cluster_silhouette_vals) # Get the average silhouette score and plot it avg_score = np.mean(silhouette_vals) ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green') ax1.set_yticks([]) ax1.set_xlim([-0.1, 1]) ax1.set_xlabel('Silhouette coefficient values') ax1.set_ylabel('Cluster labels') ax1.set_title('Silhouette plot for the various clusters', y=1.02); # Scatter plot of data colored with labels ax2.scatter(X[:, 0], X[:, 1], c=labels) ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250) ax2.set_xlim([-180, 180]) ax2.set_ylim([-180, 180]) ax2.set_xlabel('Phi') ax2.set_ylabel('Psi') ax2.set_title('Visualization of clustered data', y=1.02) ax2.set_aspect('equal') plt.tight_layout() plt.suptitle(f'Silhouette analysis using k = {k}', fontsize=16, fontweight='semibold', y=1.05); plt.show()

shiftPsi = 60 shiftPhi = 170 phi = df.phi.to_numpy().copy() psi = df.psi.to_numpy().copy() phiShifted = ((phi-180+45) % 360) -180 psiShifted = ((psi-180-45) % 360) -180 heatmap, xedges, yedges = np.histogram2d(phiShifted, psiShifted, bins=120) extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] plt.clf() plt.imshow(heatmap.T, extent=extent, origin='lower') plt.colorbar() plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.show()

import numpy as np from sklearn.cluster import DBSCAN from sklearn import metrics from sklearn import datasets from sklearn.datasets import make_blobs from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt

# Compute DBSCAN #X1 = StandardScaler().fit_transform(X) X1 = X db = DBSCAN(eps=15, min_samples=30).fit(X1) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters: %d' % n_clusters_)

# Plot result # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = X[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.title('Estimated number of clusters: %d' % n_clusters_) plt.show() print('Estimated number of noise points: %d' % n_noise_)

sns.countplot(x=df['residue name'][labels == -1]) plt.ylabel('Number of noise points over residue types') plt.show()

X1 = np.stack((phi,psi), axis = 1) db = DBSCAN(eps=15, min_samples=30).fit(X1) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters with epsilon = 15: %d' % n_clusters_) X1 = X db = DBSCAN(eps=5, min_samples=30).fit(X1) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters with epsilon = 5: %d' % n_clusters_)

phiPro = df.phi[df['residue name'] == 'PRO'] psiPro = df.psi[df['residue name'] == 'PRO'] XPro = np.stack((phiPro,psiPro), axis = 1) db = DBSCAN(eps=10, min_samples=10).fit(XPro) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of noise points: %d' % n_noise_) plt.subplot(121) # Plot result # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = XPro[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = XPro[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.title('Only looking at residue type pro, estimated number of clusters: %d' % n_clusters_) plt.subplot(122) X1 = X db = DBSCAN(eps=15, min_samples=30).fit(X1) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) # Plot result # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = X[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.title('All residue types, Estimated number of clusters: %d' % n_clusters_) plt.show()

phiGly = df.phi[df['residue name'] == 'GLY'] psiGly = df.psi[df['residue name'] == 'GLY'] XGly = np.stack((phiGly,psiGly), axis = 1) db = DBSCAN(eps=15, min_samples=30).fit(XGly) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) # Plot result # Black removed and is used for noise instead.pl plt.subplot(121) unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = XGly[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = XGly[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.title('Residue type GLY, Estimated number of clusters: %d' % n_clusters_) plt.subplot(122) X1 = X db = DBSCAN(eps=15, min_samples=10).fit(X1) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) # Plot result # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = X[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.xlabel('phi in deg') plt.ylabel('psi in deg') plt.title('All residue types, Estimated number of clusters: %d' % n_clusters_) plt.show()