Assignment 3: Introduction to Data Science and AI

import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib.cm as cm import math from scipy.ndimage.filters import gaussian_filter import seaborn as sns from matplotlib import mlab as ml from sklearn.datasets import make_blobs from sklearn import metrics from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler

# Read the whole data set from disk into a data frame.
file_name = "data_all.csv"
df = pd.read_csv(file_name)

# True if at least one cell in the frame is NaN (the value is only displayed
# when this is the last expression of a notebook cell).
df.isna().to_numpy().any()

# Keep the two angle columns under their own names; later cells reuse them.
X_phi, X_psi = df['phi'], df['psi']

# Scatter plot of every (phi, psi) pair in the data set.
scatter_fig, scatter_ax = plt.subplots(figsize=(14, 9))
scatter_ax.scatter(X_phi, X_psi, s=10, c='darkcyan')
scatter_ax.grid(True)
scatter_ax.set_title('Distribution of Phi and Psi combinations for protein molecules')
scatter_ax.set_xlabel('Phi, in degrees')
scatter_ax.set_ylabel('Psi, in degrees')
plt.show()

# Density view of the same data: bin the (phi, psi) pairs into a 220 x 220
# 2-D histogram, then blur the counts with a Gaussian filter (sigma = 32 bins).
plt.figure(figsize=(14, 9))
heatmap, xedges, yedges = np.histogram2d(X_phi, X_psi, bins=220)
heatmap = gaussian_filter(heatmap, sigma=32)

# Outer bin edges, in the (left, right, bottom, top) order imshow expects.
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

plt.clf()
plt.axis([X_phi.min(), X_phi.max(), X_psi.min(), X_psi.max()])

# histogram2d puts phi on the first axis; transpose so phi runs along x.
plt.imshow(heatmap.T, extent=extent, origin='lower', cmap=cm.jet)
cb = plt.colorbar()
cb.set_label('Number of samples per bin')

plt.title("Distribution of Phi and Psi combinations for protein molecules")
plt.xlabel('Phi, in degrees')
plt.ylabel('Psi, in degrees')
plt.grid(True)
plt.show()

def kmeans_clustering(X, n_clusters):
    """Run K-means on ``X`` and show a scatter plot of the result.

    Points are coloured by their assigned cluster and the fitted cluster
    centres are overlaid as black crosses.

    Args:
        X: Two-column array; it is indexed as ``X[:, 0]`` and ``X[:, 1]``,
            so it must support NumPy-style slicing (not a DataFrame).
        n_clusters: Number of clusters handed to ``KMeans``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(6, 4))

    # Fixed seed so repeated calls with the same input give the same plot.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    assignments = model.fit_predict(X)
    centres = model.cluster_centers_

    plt.scatter(
        X[:, 0], X[:, 1],
        c=assignments, cmap='gist_rainbow', edgecolor='black', s=20,
    )
    plt.scatter(centres[:, 0], centres[:, 1], marker='x', c='black')
    plt.title(f'Data points with K-means algorithm when k = {n_clusters}')
    plt.show()

# K-means relies on Euclidean distance, so standardise both columns first.
scaler = StandardScaler()
X = scaler.fit_transform(df[['phi', 'psi']])

# Candidate cluster counts. range(2, 8) excludes its end point, so this
# covers k = 2, 3, ..., 7.
k_values = range(2, 8)

# One clustering plot per candidate k.
for n_k in k_values:
    kmeans_clustering(X, n_k)

# Elbow method: fit K-means for k = 1..9 and record the inertia (sum of
# squared distances of the samples to their closest cluster centre).
n_clusters = range(1, 10)

# random_state is pinned so the curve is reproducible between runs; 42 is the
# seed kmeans_clustering uses for its own fits.
distortions = [
    KMeans(n_clusters=n_k, random_state=42).fit(X).inertia_
    for n_k in n_clusters
]

# Plot inertia against the number of clusters; the "elbow" of the curve is
# the candidate for the best k.
elbow_fig, elbow_ax = plt.subplots(figsize=(6, 4))
elbow_ax.plot(n_clusters, distortions, 'bx-')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('Sum of Squared Distance')
elbow_ax.set_title('Elbow Method Showing The Optimal # Clusters')
elbow_ax.grid(True)
plt.show()

# Selecting k as 4
k = 4

# Fractions of the full data set to cluster, one subplot per fraction.
percentages = [1, 0.75, 0.5, 0.25, 0.10]
plots = len(percentages)

# Work on a private, standardised copy so the module-level X is not replaced
# and every fraction is taken relative to the FULL data set.
X_scaled = StandardScaler().fit_transform(df[['phi', 'psi']])
n_total = len(X_scaled)

# One fixed random ordering of the rows; each subset is a prefix of it, so a
# smaller subset is the larger one with random points removed.
order = np.random.default_rng(0).permutation(n_total)

fig, axs = plt.subplots(1, plots, figsize=(20, 4))
for ax, fraction in zip(axs, percentages):
    n_sample_size = math.floor(n_total * fraction)

    # Real observations only: subsample the data rather than generating
    # synthetic points around it.
    X_subset = X_scaled[order[:n_sample_size]]

    kmeans = KMeans(n_clusters=k, random_state=0)
    y_pred = kmeans.fit_predict(X_subset)

    ax.scatter(
        X_subset[:, 0], X_subset[:, 1],
        c=y_pred, cmap='gist_rainbow', edgecolor='black', s=20,
    )
    # Plot kmeans cluster centers
    ax.scatter(
        kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
        marker='x', c='black',
    )
    # Fractions are converted to percent (x100); ':g' drops float noise such
    # as 10.000000000000002.
    ax.set_title(f'Displaying {fraction * 100:g}% of the total data points')

plt.tight_layout()
plt.show()

This plot illustrates how the clustering is affected by the removal of random data points. The results show strong consistency in the clustering when using k=4. The groups remain consistent throughout the iterations and keep the overarching structure: points that were clustered together in a previous iteration remain clustered together.

# Cluster the standardised data once per random seed, using the module-level
# k (set to 4 in an earlier cell), to see how much the result depends on the
# initial centroid positions.
scaler = StandardScaler()
X = scaler.fit_transform(df[['phi', 'psi']])

random_init = [0, 1, 2, 3, 4, 5]
plots = len(random_init)

fig, axs = plt.subplots(1, plots, figsize=(20, 4))
# The subplot index doubles as the random_state: panel n is seeded with n.
for seed, ax in enumerate(axs):
    kmeans = KMeans(n_clusters=k, random_state=seed)
    y_pred = kmeans.fit_predict(X)
    centres = kmeans.cluster_centers_

    ax.scatter(
        X[:, 0], X[:, 1],
        c=y_pred, cmap='gist_rainbow', edgecolor='black', s=20,
    )
    ax.scatter(centres[:, 0], centres[:, 1], marker='x', c='black')
    ax.set_title(f'Random state {seed}')

plt.tight_layout()
plt.show()

The plot displays a remarkable consistency in the clustering when using 4 groups, regardless of the initial positions of the centroids in the K-means algorithm. This speaks very highly of the consistency of k=4 and the validity of the clusters.

# Silhouette analysis on the unscaled angles for k = 2..7 (range end is
# exclusive).
X = df[['phi', 'psi']]
k_values = range(2, 8)

# Silhouette scores lie in [-1, 1], so start below every possible score;
# starting at 0 would never record a best k if all scores were <= 0.
k_opt = None
high_score = float('-inf')

# The loop variable is deliberately not called `k`, so the module-level
# `k = 4` chosen earlier is left untouched.
for n_k in k_values:
    kmeans = KMeans(n_clusters=n_k, random_state=0).fit(X)
    labels = kmeans.labels_
    score = metrics.silhouette_score(X, labels, metric='euclidean')
    print(f"Silhoutte score for k = {n_k} is: {score}")
    # Strict '>' keeps the smallest k when two scores tie.
    if score > high_score:
        k_opt, high_score = n_k, score

print(f"The optimal silhoutte score is for k = {k_opt} and is: {high_score}")

The silhouette score ranges from -1 to 1; values closer to 1 indicate a better clustering. Across the candidate values of k (ranging from 2 to 7), k=3 and k=4 provide the best silhouette scores, and because the two scores are so similar it is hard to choose between them on this basis alone. Using the findings from the 'elbow method' and visual inspection of the plots, we choose k=4 as the best fit.

# Work on a copy and wrap both angle columns into [0, 360): adding 360 and
# taking the remainder maps every negative angle to its positive equivalent.
df_mod = df.copy()
for angle in ('phi', 'psi'):
    df_mod[angle] = (df[angle] + 360) % 360

# Keep the wrapped columns under their own names.
X_phi_mod, X_psi_mod = df_mod['phi'], df_mod['psi']

# Scatter plot of the wrapped angles.
plt.figure(figsize=(14, 9))
plt.scatter(
    X_phi_mod, X_psi_mod,
    s=10, c='darkcyan', label='Distribution of phi and psi',
)
plt.title('Distribution of Phi and Psi combinations for protein molecules (shifted by 360 degrees)')
plt.xlabel('Phi, in degrees')
plt.ylabel('Psi, in degrees')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

# Repeat the K-means sweep (k = 2..7) on the standardised, wrapped angles.
X = StandardScaler().fit_transform(df_mod[['phi', 'psi']])
k_values = range(2, 8)
for n_k in k_values:
    kmeans_clustering(X, n_k)

It appears that the most intuitive fit is now 3 clusters instead. Let's see how this is reflected in the silhouette score by comparing k=3 and k=4 (the latter being the most effective before shifting the data set).

# Silhouette comparison of k = 3 and k = 4 on the wrapped (unscaled) angles.
X = df_mod[['phi', 'psi']]
k_values = [3, 4]

# Silhouette scores lie in [-1, 1], so start below every possible score;
# starting at 0 would never record a best k if all scores were <= 0.
k_opt = None
high_score = float('-inf')

# The loop variable is deliberately not called `k`, so a module-level `k`
# chosen elsewhere is left untouched.
for n_k in k_values:
    kmeans = KMeans(n_clusters=n_k, random_state=0).fit(X)
    labels = kmeans.labels_
    score = metrics.silhouette_score(X, labels, metric='euclidean')
    print(f"Silhoutte score for k = {n_k} is: {score}")
    # Strict '>' keeps the smallest k when two scores tie.
    if score > high_score:
        k_opt, high_score = n_k, score

print(f"The optimal silhoutte score is for k = {k_opt} and is: {high_score}")

The silhouette score is now clearly optimal for k=3 instead, which is consistent with the graphical displays above.

from sklearn.cluster import DBSCAN from sklearn import metrics from sklearn.preprocessing import StandardScaler from sklearn.neighbors import NearestNeighbors import collections

# Back to the original (unwrapped) angles, standardised for DBSCAN.
scaler = StandardScaler()
X = scaler.fit_transform(df[['phi', 'psi']])

def createDBSCAN(X, eps=0.5, min_samples=100, add_bar_plot=False):
    """Run DBSCAN on ``X``, print a summary and plot clusters and outliers.

    The clustering is computed on ``X``, but the points are plotted from the
    module-level ``df`` (columns ``'phi'`` and ``'psi'``), so ``X`` must have
    one row per row of ``df``, in the same order.

    Args:
        X: Feature matrix passed to ``DBSCAN.fit_predict``.
        eps: Neighbourhood radius passed to ``DBSCAN``.
        min_samples: Minimum neighbourhood size passed to ``DBSCAN``.
        add_bar_plot: When truthy, also draw a bar chart counting the
            ``'residue name'`` values of the outlier rows.

    Returns:
        None. Results are printed and shown with ``plt.show()``.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)

    # DBSCAN marks noise points with the label -1.
    is_noise = labels == -1
    n_noise_ = int(is_noise.sum())
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(np.unique(labels)) - (1 if n_noise_ else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    outliers_df = df[is_noise]
    clusters_df = df[~is_noise]
    color_clusters = labels[~is_noise]
    color_outliers = 'black'

    plt.figure(figsize=(7, 4))
    plt.scatter(
        clusters_df['phi'], clusters_df['psi'],
        c=color_clusters, edgecolors='black', cmap='gist_rainbow', s=30,
    )
    plt.scatter(
        outliers_df['phi'], outliers_df['psi'],
        c=color_outliers, edgecolors='black', label='Outliers', s=30,
    )
    plt.title(f"Datapoints with DBSCAN, minimum samples:{min_samples}, eps: {eps}")
    plt.xlabel('Phi, in degrees')
    plt.ylabel('Psi, in degrees')
    plt.legend(loc='upper left')
    plt.show()

    # The bar chart gets its own figure and show() so it is also displayed
    # outside a notebook; it is skipped when there are no outliers to count.
    if add_bar_plot and n_noise_:
        plt.figure(figsize=(7, 4))
        bar = outliers_df['residue name'].value_counts(sort=True).plot.bar()
        bar.set_title('Amino acid residue types that are most frequently outliers')
        plt.show()


# Trying the function with eps = 0.5 and different values for min_samples
for n_min in (10, 100, 500):
    createDBSCAN(X, min_samples=n_min)

# Minimum samples to test
min_samples = [200, 250, 300]

# k-distance plots: for each candidate min_samples value, sort the distance
# from every point to its k-th nearest neighbour; the "knee" of the curve is
# a candidate for eps.
color_list = ['orchid', 'darkcyan', 'darkviolet']
for value, colour in zip(min_samples, color_list):
    # Fitting NearestNeighbors to the data
    nbrs = NearestNeighbors(n_neighbors=value).fit(X)

    # Querying the training data returns each point itself in column 0, so
    # the LAST column is the k-th neighbour with the point itself counted.
    # Column 1 would be the nearest other point for every `value`, making
    # all plots identical.
    distances, indices = nbrs.kneighbors(X)
    k_distances = np.sort(distances[:, -1])

    plt.plot(k_distances, c=colour, linewidth=3)
    plt.xlabel('Number of points')
    plt.ylabel('Distance to k-th nearest neighbour')
    plt.title(f'Finding optimal eps, # of nearest neighbours:{value}')
    plt.grid(True)
    plt.show()

# Grid search by eye: run DBSCAN for every combination of the candidate eps
# values and the min_samples values defined above.
eps = [0.3, 0.4, 0.5]
for radius in eps:
    print(f'DBSCAN with eps = {radius} and various values for min_samples')
    for n_min in min_samples:
        createDBSCAN(X, eps=radius, min_samples=n_min)

# Plot the clusters found by DBSCAN with eps = 0.4 and min_samples = 200,
# together with the bar chart of outlier residue types.
createDBSCAN(X, eps=0.4, min_samples=200, add_bar_plot=True)

# Side-by-side comparison on the same standardised data: K-means with k = 4
# against DBSCAN with eps = 0.4 and min_samples = 200.
print('For non-translated data, k = 4 is optimal')
kmeans_clustering(X, n_clusters=4)
createDBSCAN(X, eps=0.4, min_samples=200)

# DBSCAN restricted to the rows whose residue name is PRO.
pro_df = df[df['residue name'] == 'PRO'].copy()
X = StandardScaler().fit_transform(pro_df[['phi', 'psi']])

min_samples = 200
eps = 0.5

# Fit DBSCAN with the parameters above; labels_ holds one label per row,
# with -1 marking noise.
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
y = labels = dbscan.fit(X).labels_
is_noise = labels == -1

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = int(np.count_nonzero(is_noise))
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# Split the PRO rows into clustered points and outliers for plotting.
outliers_df = pro_df[is_noise]
clusters_df = pro_df[~is_noise]
color_clusters = labels[~is_noise]
color_outliers = 'black'

plt.figure(figsize=(7, 4))
plt.scatter(
    clusters_df['phi'], clusters_df['psi'],
    c=color_clusters, edgecolors='black', cmap='gist_rainbow', s=30,
)
plt.scatter(
    outliers_df['phi'], outliers_df['psi'],
    c=color_outliers, edgecolors='black', label='Outliers', s=30,
)
plt.title(f"Datapoints with DBSCAN for PRO, minimum samples:{min_samples}, eps: {eps}")
plt.xlabel('Phi, in degrees')
plt.ylabel('Psi, in degrees')
plt.legend(loc='upper left')
plt.show()

The initial parameters seem to produce consistent results; even varying them slightly does not impact the solution.

The clustering using only the residue type PRO differs from the general DBSCAN clustering by not having any clusters with positive Phi values. Furthermore, it produces two well-defined clusters and does not find any values in the top left corner, a region that was very prevalent in the previous DBSCAN clusters. This is interesting, as DBSCAN never seems to cluster these exact spots on the full data set, whereas for large k the K-means algorithm seems to find these clusters (found in residue type PRO) more accurately.

# DBSCAN restricted to the rows whose residue name is GLY. The subset gets
# its own name so it does not overwrite the PRO frame (`pro_df`).
gly_df = df[df['residue name'] == 'GLY'].copy()
X = StandardScaler().fit_transform(gly_df[['phi', 'psi']])

min_samples = 200
eps = 0.5

# Fit DBSCAN with the parameters above; one label per row, -1 marks noise.
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(X)
is_noise = labels == -1

# Number of clusters in labels, ignoring noise if present.
n_noise_ = int(np.count_nonzero(is_noise))
n_clusters_ = len(set(labels)) - (1 if n_noise_ else 0)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# Split the GLY rows into clustered points and outliers for plotting.
outliers_df = gly_df[is_noise]
clusters_df = gly_df[~is_noise]
color_clusters = labels[~is_noise]
color_outliers = 'black'

plt.figure(figsize=(7, 4))
plt.scatter(
    clusters_df['phi'], clusters_df['psi'],
    c=color_clusters, edgecolors='black', cmap='gist_rainbow', s=30,
)
plt.scatter(
    outliers_df['phi'], outliers_df['psi'],
    c=color_outliers, edgecolors='black', label='Outliers', s=30,
)
plt.title(f"Datapoints with DBSCAN for GLY, minimum samples:{min_samples}, eps: {eps}")
plt.xlabel('Phi, in degrees')
plt.ylabel('Psi, in degrees')
plt.legend(loc='upper left')
plt.show()

The initial parameters seem to produce consistent results; even varying them slightly does not impact the solution.

The residue type GLY seems to represent somewhat more of the clusters found in the general case. We see one cluster with phi>0, and we find clusters both in the upper left and the middle left. Some data points also fall in the remaining clusters found in the general case, but these are deemed outliers by the DBSCAN method.

It is important to consider that in previous tasks we found that the GLY residue had the highest number of outliers, by a factor of several. This is also visible in the clustering of only GLY residues: there are no clear clusters, there seem to be data points in each quadrant of the graph, and some are almost randomly scattered.