DAT405 Assignment 3

import pandas as pd import matplotlib.pyplot as plt import numpy as np import sklearn from sklearn.cluster import DBSCAN from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score from sklearn.neighbors import NearestNeighbors from sklearn.metrics import plot_confusion_matrix

data = pd.read_csv("data_all.csv")

data.head(5)

data.shape

data

plt.scatter(data["phi"], data["psi"], s=1) plt.title("Ramachandran plot") plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)") plt.rcParams["figure.figsize"] = [10,10]

X = data[["phi", "psi"]]

plt.hist2d(X['phi'],X['psi'], bins=150) plt.colorbar().set_label('Number of chains') plt.title("Ramachandran Heatmap") plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)") plt.rcParams["figure.figsize"] = [10,10] plt.show()

error = [] def draw_kmeans(X): k=1 for ax in (axes.flatten()): kmeans = KMeans(n_clusters=k) kmeans.fit(X) y_kmeans = kmeans.predict(X) error.append(kmeans.inertia_) ax.scatter(x=X["phi"], y=X["psi"], c=y_kmeans, s=50, cmap='viridis') ax.set_xlabel("Phi (degrees)") ax.set_ylabel("Psi (degrees)") ax.title.set_text("k="+str(k)) k=k+1 fig, axes = plt.subplots(3,2, figsize=(15, 15)) draw_kmeans(X) plt.tight_layout() plt.show()

plt.figure(figsize=(16,8)) K = range(1, 7) plt.plot(K, error, 'bx-') plt.xlabel('Values of K') plt.ylabel('Squared error') plt.title('The Elbow Method using Inertia') plt.show()

def draw_kmeans_silhouette_coefficient(): i=2 random_sample = np.random.choice(X.index,1,replace=False) for ax in (axes.flatten()): kmeans = KMeans(n_clusters=i) kmeans.fit(X) y_kmeans = kmeans.predict(X) silhouette_coefficient=silhouette_score(X, y_kmeans, metric='euclidean') ax.scatter(x=X["phi"], y=X["psi"], c=y_kmeans, s=50, cmap='magma') ax.set_xlabel("Phi (degrees)") ax.set_ylabel("Psi (degrees)") ax.title.set_text("Silhouette coefficient="+str(round(silhouette_coefficient,3))+" k="+str(i)) i=i+1 fig, axes = plt.subplots(3,2, figsize=(15, 15)) draw_kmeans_silhouette_coefficient() plt.tight_layout() plt.show()

def draw_kmeans_removed_values(Value): i=0 len_x=len(X) for ax in (axes.flatten()): remove_n=Value[i] X_copy=X.copy() #Number of value to remove drop_indices = np.random.choice(X_copy.index, remove_n, replace=False) X_subset = X_copy.drop(drop_indices) kmeans = KMeans(n_clusters=3) kmeans.fit(X_subset) y_kmeans = kmeans.predict(X_subset) ax.scatter(x=X_subset["phi"], y=X_subset["psi"], c=y_kmeans, s=50, cmap='cividis') ax.set_xlabel("Phi (degrees)") ax.set_ylabel("Psi (degrees)") percentage = round((remove_n/len_x)*100, 1) ax.title.set_text(f"Percentage of data points removed: {percentage}%") i=i+1 fig, axes = plt.subplots(3,3, figsize=(15, 15)) draw_kmeans_removed_values([100,1000,5000,12500,15000,17500,20000,22500,25000]) plt.tight_layout() plt.show()

X = X.copy() X["psi"] = ((X["psi"] + 145) % 360) X["phi"] = ((X["phi"] + 200) % 360)

# Here we modify X directly and use it for question 3 and 4 kmeans = KMeans(n_clusters=3) kmeans.fit(X) y_kmeans = kmeans.predict(X) error.append(kmeans.inertia_) plt.scatter(X["phi"], X["psi"], c=y_kmeans, s=50) plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)") plt.title("Cluster groups for k=3")

model = NearestNeighbors(n_neighbors=3) nbrs = model.fit(X) distances, indices = nbrs.kneighbors(X) distances = np.sort(distances, axis=0)[:,2] plt.plot(distances) plt.title("Distance to second neighbor for each datapoint") plt.xlabel("Datapoint number") plt.ylabel("Distance to second neighbor")

clustering = DBSCAN(eps=12.5, min_samples=160).fit(X) plt.scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) plt.title("Ramachandran plot") plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)")

num_outliers = np.sum(clustering.labels_ == -1) print(f"Number of outliers: {num_outliers}") unique, counts = np.unique(data[clustering.labels_ == -1]["residue name"], return_counts=True) plt.figure(figsize=(9, 5)) plt.bar(unique, counts) plt.title("Residue type of outlier datapoints") plt.xlabel("Residue name") plt.ylabel("Frequency")

fig, axs = plt.subplots(5, figsize=(12, 12)) clustering = DBSCAN(eps=12.5, min_samples=160).fit(X) axs[0].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[0].set_xlabel('Phi (degrees)') axs[0].set_ylabel('Psi (degrees)') axs[0].set_title('Ramachandran plot with epsilon=12.5') clustering = DBSCAN(eps=15, min_samples=160).fit(X) axs[1].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[1].set_xlabel('Phi (degrees)') axs[1].set_ylabel('Psi (degrees)') axs[1].set_title('Ramachandran plot with epsilon=15') clustering = DBSCAN(eps=20, min_samples=160).fit(X) axs[2].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[2].set_xlabel('Phi (degrees)') axs[2].set_ylabel('Psi (degrees)') axs[2].set_title('Ramachandran plot with epsilon=20') clustering = DBSCAN(eps=10, min_samples=160).fit(X) axs[3].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[3].set_xlabel('Phi (degrees)') axs[3].set_ylabel('Psi (degrees)') axs[3].set_title('Ramachandran plot with epsilon=10') clustering = DBSCAN(eps=0.5, min_samples=160).fit(X) axs[4].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[4].set_xlabel('Phi (degrees)') axs[4].set_ylabel('Psi (degrees)') axs[4].set_title('Ramachandran plot with epsilon=0.5') fig.tight_layout()

fig, axs = plt.subplots(3, figsize=(12, 12)) clustering = DBSCAN(eps=12.5, min_samples=160).fit(X) axs[0].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[0].set_xlabel('Phi (degrees)') axs[0].set_ylabel('Psi (degrees)') axs[0].set_title('Ramachandran plot with eps=12.5 and min_samples=160') clustering = DBSCAN(eps=12.5, min_samples=50).fit(X) axs[1].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[1].set_xlabel('Phi (degrees)') axs[1].set_ylabel('Psi (degrees)') axs[1].set_title('Ramachandran plot with eps=12.5 and min_samples=50') clustering = DBSCAN(eps=12.5, min_samples=5).fit(X) axs[2].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_) axs[2].set_xlabel('Phi (degrees)') axs[2].set_ylabel('Psi (degrees)') axs[2].set_title('Ramachandran plot with eps=12.5 and min_samples=5') fig.tight_layout()

X_sub = data[data["residue name"] == "PRO"][["phi", "psi"]] X_sub = X_sub.copy() X_sub["psi"] = ((X_sub["psi"] + 145) % 360) X_sub["phi"] = ((X_sub["phi"] + 200) % 360) plt.scatter(X_sub["phi"], X_sub["psi"], s=0.75) plt.xlim(0, 360) plt.ylim(0, 360) plt.title("Ramachandran plot") plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)")

# Since we now have less data we modified min_samples from 160 to 100 clustering = DBSCAN(eps=12.5, min_samples=100).fit(X_sub) plt.scatter(X_sub["phi"], X_sub["psi"], s=1, c=clustering.labels_) plt.title("Ramachandran plot for Proline") plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)")

X_sub = data[data["residue name"] == "GLY"][["phi", "psi"]] X_sub = X_sub.copy() X_sub["psi"] = ((X_sub["psi"] + 145) % 360) X_sub["phi"] = ((X_sub["phi"] + 200) % 360) clustering = DBSCAN(eps=25, min_samples=100).fit(X_sub) plt.scatter(X_sub["phi"], X_sub["psi"], s=0.75, c=clustering.labels_) plt.title("Ramachandran plot for Glycine") plt.xlabel("Phi (degrees)") plt.ylabel("Psi (degrees)")