import numpy as np
import pandas as pd
import time
import scipy.linalg
import plotly.express as px
import matplotlib.pyplot as plt

airline = pd.read_csv('EastWestAirlinesCluster.csv') #Import dataset EastWestAirlinesCluster.csv
airline = airline.iloc[:,1:] #remove the ID column from the dataset
#normalize the dataset: shift each column by its minimum, scale by its standard deviation
normalized_data_array = (airline.to_numpy() - airline.to_numpy().min(axis=0)) \
    / airline.to_numpy().std(axis=0)
airline['cluster number'] = np.zeros(len(airline), dtype=int) #add new column 'cluster number'
#Define the distance: the squared 2-norm (squared Euclidean distance)
dist = lambda x, y: np.power(x - y, 2).sum()
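#Note (added remark, not in the original): dist returns the *squared* 2-norm, e.g.
#    dist(np.array([0., 0.]), np.array([3., 4.]))  # -> 25.0
#Minimising squared distances yields the same nearest centroid as minimising distances.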
def k_means_clustering(k, dataset, data_norm):
    """Cluster the dataset (where data_norm is the normalised dataset) into k clusters
    using K-means clustering with the 2-norm as distance function."""
    start_time = time.time()
    #Select k distinct random rows as the initial centroids
    c_k = np.random.choice(len(dataset), size=k, replace=False)
    centroids = data_norm[c_k] #store centroids in array
    #Starting values of the while loop
    obj_new = 10**50 #Placeholder big value
    obj_old = float('inf')
    while obj_new < obj_old:
        #Generate an array with the distances to the centroids for every row
        distance_to_centroid = np.asarray(
            [[dist(centroid, row) for centroid in centroids]
             for row in data_norm])
        #Assign to each row the cluster number of the nearest centroid
        dataset['cluster number'] = np.argmin(distance_to_centroid, axis=1)
        #Create an array with the distance of each row to its centroid
        min_distance = distance_to_centroid.min(axis=1)
        for k_ in range(k):
            #Consider the rows with cluster number k_
            cluster = data_norm[list(dataset[dataset['cluster number'] == k_].index)]
            #Recompute the centroid as the cluster mean (skip empty clusters)
            if len(cluster) > 0:
                centroids[k_] = np.sum(cluster, axis=0) / len(cluster)
        obj_old = obj_new
        obj_new = np.sum(min_distance)
    print("K-means clustering into %i clusters took %.3f seconds"
          % (k, time.time() - start_time))
    return obj_old
#Define the kernel matrix
def Kernel_matrix(kernel_type, data_norm, gamma=1):
    if kernel_type == "2-norm":
        #Linear kernel: inner products of the rows
        K = np.matmul(data_norm, data_norm.T)
    elif kernel_type == "gaussian":
        kernel_distance = lambda x, y: np.exp(-gamma * np.square(x - y).sum())
        K = np.asarray([[kernel_distance(p, q) for p in data_norm] for q in data_norm])
    else:
        raise ValueError("Error in defining Kernel: invalid kernel_type")
    return K
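#Aside (an addition, not in the original): the double comprehension above runs two
#Python-level loops over all row pairs. A minimal vectorised sketch of the same
#Gaussian kernel matrix, assuming data_norm has shape (n, d), uses the expansion
#||p - q||^2 = ||p||^2 + ||q||^2 - 2 p.q:
def gaussian_kernel_vectorised(data_norm, gamma=1):
    sq_norms = np.square(data_norm).sum(axis=1) #||row||^2 for every row
    sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2 * data_norm @ data_norm.T
    return np.exp(-gamma * sq_dists)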
def k_means_clustering_kernelised(k, dataset, data_norm, kernel_type, gamma=1):
    """Cluster the dataset (where data_norm is the normalised dataset) into k clusters
    using kernelised K-means clustering, where kernel_type is the 2-norm or the
    gaussian kernel."""
    start_time = time.time()
    #Initialise the necessary variables
    cluster_distance = np.zeros(k, dtype=float) #Squared distance of a row to each cluster
    min_centroid_distance = np.zeros(len(dataset), dtype=float)
    no_iterations = 0
    obj_old = float('inf')
    obj_new = 10**50 #Placeholder big value
    #Calculate the kernel value of every row pair in advance
    #(data_norm does not contain the 'cluster number' column)
    K = Kernel_matrix(kernel_type, data_norm, gamma)
    #Assign a random cluster to every row
    dataset['cluster number'] = np.random.randint(k, size=len(dataset))
    #Generate a list of lists with the indices of the rows in each cluster
    index_cluster = []
    for k_ in range(k):
        index_cluster.append(list(dataset[dataset['cluster number'] == k_].index))
    #Calculate the double sum of the kernels in each cluster in advance
    double_sum_cluster = np.zeros(k, dtype=float)
    for k_ in range(k):
        for i in index_cluster[k_]:
            double_sum_cluster[k_] += np.sum(K[i][index_cluster[k_]])
    #While there is improvement in the objective function
    while obj_new < obj_old:
        for j in range(len(dataset)):
            #For each cluster, calculate the squared distance from the row to the
            #cluster centroid in feature space using only kernel evaluations
            for k_ in range(k):
                if len(index_cluster[k_]) == 0:
                    cluster_distance[k_] = np.inf #An empty cluster cannot be chosen
                    continue
                cluster_distance[k_] = K[j][j]
                cluster_distance[k_] += (-2/len(index_cluster[k_])) * np.sum(K[j][index_cluster[k_]])
                cluster_distance[k_] += len(index_cluster[k_])**(-2) * double_sum_cluster[k_]
            old_cluster = dataset['cluster number'].iloc[j]
            new_cluster = np.argmin(cluster_distance)
            #If the cluster of the row changed, update the double sum terms
            if old_cluster != new_cluster:
                #Update the double sum variables (index_cluster[new_cluster] does not
                #contain j yet, while index_cluster[old_cluster] still does)
                double_sum_cluster[new_cluster] += 2 * np.sum(K[j][index_cluster[new_cluster]]) \
                    + K[j][j]
                double_sum_cluster[old_cluster] += -2 * np.sum(K[j][index_cluster[old_cluster]]) \
                    + K[j][j]
                #Update the cluster number of the row
                dataset.loc[j, 'cluster number'] = new_cluster
                #Update the index lists of the old and the new cluster
                index_cluster[old_cluster].remove(j)
                index_cluster[new_cluster].append(j)
            #Update the distance of the row to its (possibly new) centroid;
            #this must also happen when the cluster did not change
            min_centroid_distance[j] = cluster_distance[new_cluster]
        #Update the value of the objective function
        obj_old = obj_new
        obj_new = np.sum(min_centroid_distance)
        no_iterations += 1
    print("Kernelised K-means clustering based on the %s kernel into %i clusters took %.3f seconds"
          % (kernel_type, k, time.time() - start_time))
    print("Objective function value on iteration %i equals %.4f" % (no_iterations - 1, obj_old))
    return obj_old
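#Explanatory note (added): the per-row distance above is the feature-space identity
#    ||phi(x_j) - mu_k||^2 = K_jj - (2/|C_k|) * sum_{i in C_k} K_ji
#                            + (1/|C_k|^2) * sum_{i,i' in C_k} K_ii',
#which is why only the kernel matrix K and the per-cluster double sums are needed,
#never the feature map phi itself.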
n = 5
min_k, max_k = 2, 7
#For every cluster number k from min_k to max_k, perform n clusterings
list_distances = np.zeros([n, max_k-min_k+1], dtype=float)
for i in range(min_k, max_k+1):
    for j in range(n):
        list_distances[j, i-min_k] = k_means_clustering(i, airline, normalized_data_array)
fig = px.box(pd.DataFrame(list_distances, columns=list(range(min_k, max_k+1))))
fig.update_layout(
    title='Boxplot of the error sum of squares of ' + str(n) + ' clusterings for values of k',
    xaxis_title='k',
    yaxis_title='Error sum of squares',
    autosize=False,
    width=600,
    height=600)
fig.show()
def boxplot(k, dataset, data_norm):
    """Make a boxplot of the normalised data clustered into k clusters."""
    #Use the normalised values of the dataset for the plot
    dataset_norm = dataset.copy()
    dataset_norm.iloc[:,:-1] = data_norm
    dataset_norm = dataset_norm.sort_values(by='cluster number', ascending=True)
    #Make the title, including the number of points inside each cluster
    title_box = ""
    for k_ in range(k):
        text = str((dataset['cluster number'] == k_).sum())
        title_box += " Cluster "+str(k_)+": "+text+" per."
    fig = px.box(dataset_norm,
                 x=dataset_norm.columns,
                 color='cluster number',
                 facet_col_spacing=0.2,
                 points=False, title=title_box)
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
        title_font_size=14)
    fig.show()
k = 4
k_means_clustering(k, airline, normalized_data_array)
boxplot(k, airline, normalized_data_array)
k = 4
kernel_type = "2-norm"
obj = k_means_clustering_kernelised(k, airline, normalized_data_array, kernel_type, gamma = 1)
boxplot(k, airline, normalized_data_array)
kernel_type = "gaussian"
gamma = 10**(-1)
obj = k_means_clustering_kernelised(k, airline, normalized_data_array, kernel_type, gamma)
boxplot(k, airline, normalized_data_array)
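#The spectral clustering below calls construct_Laplacians, which is defined elsewhere
#in the report. For running this section standalone, a minimal sketch consistent with
#the call signature and the docstring's flag convention (normalized=0 requests the
#normalised Laplacian), assuming the kernel matrix serves as similarity matrix, could be:
def construct_Laplacians(data_norm, kernel_type, gamma=0.0001, normalized=1):
    W = Kernel_matrix(kernel_type, data_norm, gamma) #similarity (adjacency) matrix
    d = W.sum(axis=1) #degrees of the vertices
    if normalized:
        return np.diag(d) - W #unnormalised Laplacian L = D - W
    d_inv_sqrt = 1 / np.sqrt(d)
    return np.eye(len(d)) - d_inv_sqrt[:, None] * W * d_inv_sqrt[None, :] #L_sym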
def spectral_clustering(k, dataset, data_norm, kernel_type, gamma=0.0001, normalized=1):
    """Spectral clustering, using the scipy eigh function to compute the eigenvalues.
    Pass 0 or False as normalized when normalized spectral clustering is required."""
    start_time = time.time()
    L = construct_Laplacians(data_norm, kernel_type, gamma, normalized)
    eigenvalues_time = time.time()
    #Compute only the k smallest eigenvalues and the corresponding eigenvectors
    eigenvalues, eigenvectors = scipy.linalg.eigh(L, subset_by_index=[0, k-1])
    print('Calculating the eigenvalues took %.3f seconds' % (time.time() - eigenvalues_time))
    print('eigenvalues = ', eigenvalues)
    #Cluster the rows of the eigenvector matrix with ordinary K-means
    k_means_clustering(k, dataset, eigenvectors)
    print("(Normalized = %r) Spectral clustering using the %s kernel into %i clusters took %.3f seconds"
          % (normalized, kernel_type, k, time.time() - start_time))
k = 4
gamma = 10**(-1)
spectral_clustering(k, airline, normalized_data_array, 'gaussian', gamma, normalized=0)
boxplot(k, airline, normalized_data_array)
spectral_clustering(k, airline, normalized_data_array, 'gaussian', gamma, normalized=1)
boxplot(k, airline, normalized_data_array)
normalized = 0
#own_spectral_clustering and the tolerance eps are defined elsewhere in the report
own_spectral_clustering(k, airline, normalized_data_array, "gaussian", gamma, normalized, eps)
boxplot(k, airline, normalized_data_array)
#Construct new data with reduced rank
def construct_reduced_data(rank, original_data):
    Sigma = np.zeros(original_data.shape)
    U, singular_values, VT = np.linalg.svd(original_data)
    #Set all but the first `rank` singular values to zero
    singular_values[rank:] = 0
    #Put the truncated singular values on the diagonal of the zero matrix Sigma
    np.fill_diagonal(Sigma, singular_values)
    #Construct the new matrix by using the truncated Sigma
    reduced_rank_data = np.matmul(U, np.matmul(Sigma, VT))
    return reduced_rank_data
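#Sanity check (an addition, not part of the original report): the truncated SVD
#reconstruction should have exactly the requested rank.
assert np.linalg.matrix_rank(construct_reduced_data(3, normalized_data_array)) == 3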
#Plot the singular values sigma_1, sigma_2, ..., sigma_11
list_singular_values = np.linalg.svd(normalized_data_array)[1]
plt.plot(range(1, len(list_singular_values)+1), list_singular_values, 'ro')
plt.title('Singular values')
plt.ylabel(r'$\sigma_i$')
plt.xlabel('i')
plt.show()
#Check how many singular values are best to keep
percentages = []
sing_values_remove = []
for i in range(1, normalized_data_array.shape[1]):
    #Construct a new data matrix with reduced rank
    reduced_rank_data_i = construct_reduced_data(normalized_data_array.shape[1] - i,
                                                 normalized_data_array)
    #Compare the Frobenius norms (the squared Frobenius norm equals the
    #sum of squares of the singular values)
    Fr_norm_orig = np.linalg.norm(normalized_data_array, 'fro')
    Fr_norm_new = np.linalg.norm(reduced_rank_data_i, 'fro')
    percentages.append(Fr_norm_new/Fr_norm_orig)
    sing_values_remove.append(i)
#Plot the fraction of the original Frobenius norm retained by the
#matrices with reduced rank
plt.plot(sing_values_remove, percentages, 'ro')
plt.title('Fraction of the Frobenius norm retained after rank reduction')
plt.xlabel('Number of singular values removed')
plt.ylabel('Fraction retained')
plt.show()
#Perform clustering with new reduced data
new_reduced_data_rank3 = construct_reduced_data(3, normalized_data_array)
k_means_clustering(4, airline, new_reduced_data_rank3)
boxplot(4, airline, normalized_data_array)