import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import plot_confusion_matrix
data = pd.read_csv("data_all.csv")
data.head(5)
data.shape
data
plt.scatter(data["phi"], data["psi"], s=1)
plt.title("Ramachandran plot")
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")
plt.rcParams["figure.figsize"] = [10,10]
X = data[["phi", "psi"]]
plt.hist2d(X['phi'],X['psi'], bins=150)
plt.colorbar().set_label('Number of chains')
plt.title("Ramachandran Heatmap")
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")
plt.rcParams["figure.figsize"] = [10,10]
plt.show()
error = []
def draw_kmeans(X):
k=1
for ax in (axes.flatten()):
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
error.append(kmeans.inertia_)
ax.scatter(x=X["phi"], y=X["psi"], c=y_kmeans, s=50, cmap='viridis')
ax.set_xlabel("Phi (degrees)")
ax.set_ylabel("Psi (degrees)")
ax.title.set_text("k="+str(k))
k=k+1
fig, axes = plt.subplots(3,2, figsize=(15, 15))
draw_kmeans(X)
plt.tight_layout()
plt.show()
plt.figure(figsize=(16,8))
K = range(1, 7)
plt.plot(K, error, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Squared error')
plt.title('The Elbow Method using Inertia')
plt.show()
def draw_kmeans_silhouette_coefficient():
i=2
random_sample = np.random.choice(X.index,1,replace=False)
for ax in (axes.flatten()):
kmeans = KMeans(n_clusters=i)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
silhouette_coefficient=silhouette_score(X, y_kmeans, metric='euclidean')
ax.scatter(x=X["phi"], y=X["psi"], c=y_kmeans, s=50, cmap='magma')
ax.set_xlabel("Phi (degrees)")
ax.set_ylabel("Psi (degrees)")
ax.title.set_text("Silhouette coefficient="+str(round(silhouette_coefficient,3))+" k="+str(i))
i=i+1
fig, axes = plt.subplots(3,2, figsize=(15, 15))
draw_kmeans_silhouette_coefficient()
plt.tight_layout()
plt.show()
def draw_kmeans_removed_values(Value):
i=0
len_x=len(X)
for ax in (axes.flatten()):
remove_n=Value[i]
X_copy=X.copy() #Number of value to remove
drop_indices = np.random.choice(X_copy.index, remove_n, replace=False)
X_subset = X_copy.drop(drop_indices)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_subset)
y_kmeans = kmeans.predict(X_subset)
ax.scatter(x=X_subset["phi"], y=X_subset["psi"], c=y_kmeans, s=50, cmap='cividis')
ax.set_xlabel("Phi (degrees)")
ax.set_ylabel("Psi (degrees)")
percentage = round((remove_n/len_x)*100, 1)
ax.title.set_text(f"Percentage of data points removed: {percentage}%")
i=i+1
fig, axes = plt.subplots(3,3, figsize=(15, 15))
draw_kmeans_removed_values([100,1000,5000,12500,15000,17500,20000,22500,25000])
plt.tight_layout()
plt.show()
X = X.copy()
X["psi"] = ((X["psi"] + 145) % 360)
X["phi"] = ((X["phi"] + 200) % 360)
# Here we modify X directly and use it for question 3 and 4
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
error.append(kmeans.inertia_)
plt.scatter(X["phi"], X["psi"], c=y_kmeans, s=50)
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")
plt.title("Cluster groups for k=3")
model = NearestNeighbors(n_neighbors=3)
nbrs = model.fit(X)
distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)[:,2]
plt.plot(distances)
plt.title("Distance to second neighbor for each datapoint")
plt.xlabel("Datapoint number")
plt.ylabel("Distance to second neighbor")
clustering = DBSCAN(eps=12.5, min_samples=160).fit(X)
plt.scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
plt.title("Ramachandran plot")
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")
num_outliers = np.sum(clustering.labels_ == -1)
print(f"Number of outliers: {num_outliers}")
unique, counts = np.unique(data[clustering.labels_ == -1]["residue name"], return_counts=True)
plt.figure(figsize=(9, 5))
plt.bar(unique, counts)
plt.title("Residue type of outlier datapoints")
plt.xlabel("Residue name")
plt.ylabel("Frequency")
fig, axs = plt.subplots(5, figsize=(12, 12))
clustering = DBSCAN(eps=12.5, min_samples=160).fit(X)
axs[0].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[0].set_xlabel('Phi (degrees)')
axs[0].set_ylabel('Psi (degrees)')
axs[0].set_title('Ramachandran plot with epsilon=12.5')
clustering = DBSCAN(eps=15, min_samples=160).fit(X)
axs[1].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[1].set_xlabel('Phi (degrees)')
axs[1].set_ylabel('Psi (degrees)')
axs[1].set_title('Ramachandran plot with epsilon=15')
clustering = DBSCAN(eps=20, min_samples=160).fit(X)
axs[2].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[2].set_xlabel('Phi (degrees)')
axs[2].set_ylabel('Psi (degrees)')
axs[2].set_title('Ramachandran plot with epsilon=20')
clustering = DBSCAN(eps=10, min_samples=160).fit(X)
axs[3].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[3].set_xlabel('Phi (degrees)')
axs[3].set_ylabel('Psi (degrees)')
axs[3].set_title('Ramachandran plot with epsilon=10')
clustering = DBSCAN(eps=0.5, min_samples=160).fit(X)
axs[4].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[4].set_xlabel('Phi (degrees)')
axs[4].set_ylabel('Psi (degrees)')
axs[4].set_title('Ramachandran plot with epsilon=0.5')
fig.tight_layout()
fig, axs = plt.subplots(3, figsize=(12, 12))
clustering = DBSCAN(eps=12.5, min_samples=160).fit(X)
axs[0].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[0].set_xlabel('Phi (degrees)')
axs[0].set_ylabel('Psi (degrees)')
axs[0].set_title('Ramachandran plot with eps=12.5 and min_samples=160')
clustering = DBSCAN(eps=12.5, min_samples=50).fit(X)
axs[1].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[1].set_xlabel('Phi (degrees)')
axs[1].set_ylabel('Psi (degrees)')
axs[1].set_title('Ramachandran plot with eps=12.5 and min_samples=50')
clustering = DBSCAN(eps=12.5, min_samples=5).fit(X)
axs[2].scatter(X["phi"], X["psi"], s=0.75, c=clustering.labels_)
axs[2].set_xlabel('Phi (degrees)')
axs[2].set_ylabel('Psi (degrees)')
axs[2].set_title('Ramachandran plot with eps=12.5 and min_samples=5')
fig.tight_layout()
X_sub = data[data["residue name"] == "PRO"][["phi", "psi"]]
X_sub = X_sub.copy()
X_sub["psi"] = ((X_sub["psi"] + 145) % 360)
X_sub["phi"] = ((X_sub["phi"] + 200) % 360)
plt.scatter(X_sub["phi"], X_sub["psi"], s=0.75)
plt.xlim(0, 360)
plt.ylim(0, 360)
plt.title("Ramachandran plot")
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")
# Since we now have less data we modified min_samples from 160 to 100
clustering = DBSCAN(eps=12.5, min_samples=100).fit(X_sub)
plt.scatter(X_sub["phi"], X_sub["psi"], s=1, c=clustering.labels_)
plt.title("Ramachandran plot for Proline")
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")
X_sub = data[data["residue name"] == "GLY"][["phi", "psi"]]
X_sub = X_sub.copy()
X_sub["psi"] = ((X_sub["psi"] + 145) % 360)
X_sub["phi"] = ((X_sub["phi"] + 200) % 360)
clustering = DBSCAN(eps=25, min_samples=100).fit(X_sub)
plt.scatter(X_sub["phi"], X_sub["psi"], s=0.75, c=clustering.labels_)
plt.title("Ramachandran plot for Glycine")
plt.xlabel("Phi (degrees)")
plt.ylabel("Psi (degrees)")