import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.ticker as mtick
df = pd.read_csv("/work/data/data_all.csv")
df_phi = df.loc[:,'phi']
df_psi = df.loc[:,'psi']
df_matrix = df.loc[:,['phi','psi']]
fig,ax = plt.subplots(figsize = (8,8))
ax.scatter(df_phi,df_psi, s=1)
# Formatting
ax.set_title(f"Scatter of bond rotation-pairs $\phi$ and $\psi$")
ax.set_xlabel('$\phi$')
ax.set_ylabel('$\psi$')
# Set the x and y-ticks.
ax.set_xticks(np.linspace(-180, 180, 9))
ax.set_yticks(np.linspace(-180, 180, 9))
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
fig.show()
fig, ax = plt.subplots(figsize =(10, 7))
# 2D histogram will return the count of rotation-pairs in a given bin
# Each bin has width and height: 360°/45=8°
bins=45
H = ax.hist2d(df_phi, df_psi, bins=bins)
# Add a colorbar
fig.colorbar(H[3], ax=ax)
# Formatting
ax.set_title(f"Heatmap of bond rotation-pairs $\phi$ and $\psi$ (bins={bins})")
ax.set_xlabel('$\phi$')
ax.set_ylabel('$\psi$')
# Set the x and y-ticks.
ax.set_xticks(np.linspace(-180, 180, 9))
ax.set_yticks(np.linspace(-180, 180, 9))
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
fig.show()
fig,axs = plt.subplots(3,2,figsize=(9,8))
sse = {}
for k, ax in enumerate(axs.ravel(), 1):
# Formatting
ax.set_title(f'k = {k}')
ax.set_xlabel('$\phi$')
ax.set_ylabel('$\psi$')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
# Instantiate a KMeans cluster with k centroids
kmeans = KMeans(n_clusters=k,random_state=0)
# Fit the model
kmeans.fit(df_matrix)
y_kmeans = kmeans.predict(df_matrix)
# Scatter the clusters in respective color
ax.scatter(df_matrix.iloc[:, 0], df_matrix.iloc[:, 1], c=y_kmeans, s=1, cmap='viridis')
# Plot the centroids for each cluster
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=80, alpha=0.5);
# Insert the sum of squares from centroid to within-cluster points
# for all clusters.
sse[k] = kmeans.inertia_
fig.tight_layout()
fig.show()
fig,axs = plt.subplots(1,1,figsize = (6,6))
axs.set_title("Elbow method, SSE vs $k$")
axs.set_ylabel('SSE')
axs.set_xlabel('$k$')
plt.plot(list(sse.keys()), list(sse.values()))
plt.show()
fig,axs = plt.subplots(2,2,figsize=(8,8))
#Divide into random sets
num_rows = df_matrix.shape[0]
random_indices = np.random.choice(num_rows, size=int(0.8*num_rows), replace=False)
random_rows = df_matrix.iloc[random_indices, :]
for ax in axs.ravel():
# Cluster
kmeans = KMeans(n_clusters=3,random_state=0)
y_kmeans = kmeans.fit(random_rows).predict(random_rows)
ax.scatter(random_rows.iloc[:, 0], random_rows.iloc[:, 1], c=y_kmeans, s=1, cmap='viridis')
# Formatting
ax.set_xlabel('$\phi$')
ax.set_ylabel('$\psi$')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
fig.suptitle('Four different randomly partitioned sets of data to prove stability of our subsets')
plt.tight_layout()
fig,axs = plt.subplots(1,1,figsize=(10,10))
angle_df = df_matrix.copy()
# Shift the data and try match the low density areas to the wrap around point.
psi_shift = 100
phi_shift = 0
angle_df['psi'] = (angle_df.loc[:,'psi'] + psi_shift) % 360
angle_df['phi'] = (angle_df.loc[:,'phi'] + phi_shift) % 360
kmeans = KMeans(n_clusters=3,random_state=0)
y_kmeans = kmeans.fit(angle_df).predict(angle_df)
axs.scatter(angle_df.iloc[:, 0], angle_df.iloc[:, 1], c=y_kmeans, s=1, cmap='viridis')
axs.set_xlabel('$\phi$')
axs.set_ylabel('$\psi$')
axs.yaxis.set_major_formatter(tick1)
axs.xaxis.set_major_formatter(tick1)
axs.set_title('Shifted Ramachandran plot')
fig.show()
fig, ax = plt.subplots(figsize=(7,7))
# Set x and y label to indicate the angle shift
ax.set_title(f"Scatter of bond rotation-pairs $\phi$ and $\psi$")
phi_label = f'+ {phi_shift}°' if phi_shift > 0 else f'- {-phi_shift}°'
psi_label = f'+ {psi_shift}°' if psi_shift > 0 else f'- {-psi_shift}°'
if phi_shift == 0:
ax.set_xlabel('$\phi$')
else:
ax.set_xlabel(f'$\phi {phi_label}$')
if psi_shift == 0:
ax.set_ylabel('$\psi$')
else:
ax.set_ylabel(f'$\psi {psi_label}$')
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
# Set the x and y-ticks.
ax.set_xticks(np.linspace(0, 360, 9))
ax.set_yticks(np.linspace(0, 360, 9))
eps = 10
min_samples = 100
db = DBSCAN(eps = eps, min_samples = min_samples)
pred_dbscan = db.fit_predict(angle_df[['phi', 'psi']])
# Scatter the clusters in respective color
ax.scatter(angle_df.loc[:, 'phi'], angle_df.loc[:, 'psi'], c=pred_dbscan, s=0.3, cmap='gnuplot')
ax.set_title(f'Ramachandran plot\n$\epsilon=${eps} min samples={min_samples}')
fig.show()
angle_df['residue name'] = df['residue name']
angle_df['db_pred'] = db.labels_
outlier_counts = {}
# For each residue type, count the number of classified -1's
for res in np.unique(df['residue name']):
outlier_counts[res] = np.count_nonzero(angle_df.loc[angle_df['residue name']==res,'db_pred'] == -1)
print(f'Total outlier count: {sum(outlier_counts.values())}.')
# Bar chart of residue type vs -1 count
fig, ax = plt.subplots(figsize=(10,5))
ax.set_title('Bar chart of residue type vs outlier count')
ax.bar(*zip(*outlier_counts.items()))
ax.text(6.25, 170, outlier_counts['GLY'], fontsize=21, backgroundcolor='white')
ax.set_ylim([0, 250])
ax.set_ylabel('Number of outliers')
fig.show()
fig, ax = plt.subplots(2,2,figsize=(8,8))
# Try these values for epsilon and num_samples
eps_vals = [8, 12,8,12]
m_samples = [80, 120,120,80]
for i, ax in enumerate(ax.ravel()):
# Set x and y label to indicate the angle shift
ax.set_title(f"Scatter of bond rotation-pairs $\phi$ and $\psi$")
phi_label = f'+ {phi_shift}°' if phi_shift > 0 else f'- {-phi_shift}°'
psi_label = f'+ {psi_shift}°' if psi_shift > 0 else f'- {-psi_shift}°'
if phi_shift == 0:
ax.set_xlabel('$\phi$')
else:
ax.set_xlabel(f'$\phi {phi_label}$')
if psi_shift == 0:
ax.set_ylabel('$\psi$')
else:
ax.set_ylabel(f'$\psi {psi_label}$')
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
# Set the x and y-ticks.
ax.set_xticks(np.linspace(0, 360, 5))
ax.set_yticks(np.linspace(0, 360, 5))
# here we set epsilon and min_samples
eps = eps_vals[i]
min_samples = m_samples[i]
db = DBSCAN(eps = eps, min_samples = min_samples)
pred_dbscan = db.fit_predict(angle_df[['phi', 'psi']])
# Scatter the clusters in respective color
ax.scatter(angle_df.loc[:, 'phi'], angle_df.loc[:, 'psi'], c=pred_dbscan, s=0.3, cmap='gnuplot')
ax.set_title(f'$\epsilon=${eps} min samples={min_samples}')
fig.tight_layout()
fig.show()
angle_df = df.loc[df['residue name']=='PRO',['phi', 'psi']].copy()
# Shift the data and try match the low density areas to the wrap around point.
psi_shift = 130
phi_shift = -115
angle_df['psi'] = (angle_df.loc[:,'psi'] + psi_shift) % 360
angle_df['phi'] = (angle_df.loc[:,'phi'] + phi_shift) % 360
fig, ax = plt.subplots(figsize=(8,8))
# Set x and y label to indicate the angle shift
ax.set_title(f"Scatter of bond rotation-pairs $\phi$ and $\psi$")
phi_label = f'+ {phi_shift}°' if phi_shift > 0 else f'- {-phi_shift}°'
psi_label = f'+ {psi_shift}°' if psi_shift > 0 else f'- {-psi_shift}°'
if phi_shift == 0:
ax.set_xlabel('$\phi$')
else:
ax.set_xlabel(f'$\phi {phi_label}$')
if psi_shift == 0:
ax.set_ylabel('$\psi$')
else:
ax.set_ylabel(f'$\psi {psi_label}$')
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
# Set the x and y-ticks.
ax.set_xticks(np.linspace(0, 360, 9))
ax.set_yticks(np.linspace(0, 360, 9))
ax.set_ylim(0,360)
ax.set_xlim(0,360)
# Scatter the clusters in respective color
ax.scatter(angle_df.loc[:, 'phi'], angle_df.loc[:, 'psi'], s=1, cmap='gnuplot')
ax.set_title(f'Ramachandran plot for residue type PRO.\n$\epsilon=${eps}')
fig.show()
plt.tight_layout()
angle_df = df.loc[df['residue name']=='PRO',['phi', 'psi']].copy()
# Shift the data and try match the low density areas to the wrap around point.
psi_shift = 130
phi_shift = -115
angle_df['psi'] = (angle_df.loc[:,'psi'] + psi_shift) % 360
angle_df['phi'] = (angle_df.loc[:,'phi'] + phi_shift) % 360
fig, ax = plt.subplots(figsize=(8,8))
# Set x and y label to indicate the angle shift
ax.set_title(f"Scatter of bond rotation-pairs $\phi$ and $\psi$")
phi_label = f'+ {phi_shift}°' if phi_shift > 0 else f'- {-phi_shift}°'
psi_label = f'+ {psi_shift}°' if psi_shift > 0 else f'- {-psi_shift}°'
if phi_shift == 0:
ax.set_xlabel('$\phi$')
else:
ax.set_xlabel(f'$\phi {phi_label}$')
if psi_shift == 0:
ax.set_ylabel('$\psi$')
else:
ax.set_ylabel(f'$\psi {psi_label}$')
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
# Set the x and y-ticks.
ax.set_xticks(np.linspace(0, 360, 9))
ax.set_yticks(np.linspace(0, 360, 9))
ax.set_ylim(0,360)
ax.set_xlim(0,360)
eps = 5
min_samples = 30
db = DBSCAN(eps = eps, min_samples = min_samples)
#only_PRO = angle_df.loc[angle_df['residue name']=='PRO',['phi', 'psi']]
pred_dbscan = db.fit_predict(angle_df)
# Scatter the clusters in respective color
ax.scatter(angle_df.loc[:, 'phi'], angle_df.loc[:, 'psi'], c=pred_dbscan, s=1, cmap='gnuplot')
ax.set_title(f'Ramachandran plot for residue type PRO $\epsilon=${eps} min samples={min_samples}')
fig.show()
plt.tight_layout()
angle_df = df.loc[df['residue name']=='GLY',['phi', 'psi']].copy()
# Shift the data and try match the low density areas to the wrap around point.
psi_shift = -90
phi_shift = 0
angle_df['psi'] = (angle_df.loc[:,'psi'] + psi_shift) % 360
angle_df['phi'] = (angle_df.loc[:,'phi'] + phi_shift) % 360
fig, ax = plt.subplots(figsize=(8,8))
# Set x and y label to indicate the angle shift
ax.set_title(f"Scatter of bond rotation-pairs $\phi$ and $\psi$")
phi_label = f'+ {phi_shift}°' if phi_shift > 0 else f'- {-phi_shift}°'
psi_label = f'+ {psi_shift}°' if psi_shift > 0 else f'- {-psi_shift}°'
if phi_shift == 0:
ax.set_xlabel('$\phi$')
else:
ax.set_xlabel(f'$\phi {phi_label}$')
if psi_shift == 0:
ax.set_ylabel('$\psi$')
else:
ax.set_ylabel(f'$\psi {psi_label}$')
# Add degree symbol to x and y labels
tick1 = mtick.StrMethodFormatter('{x:,.0f}°')
ax.yaxis.set_major_formatter(tick1)
ax.xaxis.set_major_formatter(tick1)
# Set the x and y-ticks.
ax.set_xticks(np.linspace(0, 360, 9))
ax.set_yticks(np.linspace(0, 360, 9))
ax.set_ylim(0,360)
ax.set_xlim(0,360)
eps = 15
min_samples = 30
db = DBSCAN(eps = eps, min_samples = min_samples)
#only_PRO = angle_df.loc[angle_df['residue name']=='PRO',['phi', 'psi']]
pred_dbscan = db.fit_predict(angle_df)
# Scatter the clusters in respective color
ax.scatter(angle_df.loc[:, 'phi'], angle_df.loc[:, 'psi'], c=pred_dbscan, s=1, cmap='gnuplot')
ax.set_title(f'Ramachandran plot for residue type GRY $\epsilon=${eps} min samples={min_samples}')
fig.show()
plt.tight_layout()