import numpy as np
import pandas as pd
import time
import scipy.linalg
import plotly.express as px
import matplotlib.pyplot as plt

airline = pd.read_csv('EastWestAirlinesCluster.csv')   #Import dataset EastWestAirlinesCluster.csv
airline = airline.iloc[:,1:]    #drop the ID column from the dataset
data_array = airline.to_numpy()
normalized_data_array = (data_array - data_array.min(axis=0)) \
/ data_array.std(axis=0)    #scale each column: shift by its minimum, divide by its standard deviation
airline['cluster number'] = np.zeros(len(airline),dtype=int)  #add new column 'cluster number'
#Define the distance function (squared Euclidean distance)
dist = lambda x, y: np.square(x - y).sum()
def k_means_clustering(k, dataset, data_norm): 
    """Cluster the dataset (where data_norm is the normalised dataset) into k clusters 
    using K-means clustering with the squared 2-norm as distance function."""
    #Select k distinct random rows as the initial centroids
    c_k = np.random.choice(len(dataset), size=k, replace=False)
    centroids = data_norm[c_k]  #store centroids in array
    #Starting values of the while loop
    obj_new = 10**50   #Placeholder big value
    obj_old = float('inf')
    
    while obj_new < obj_old:     
        #Generate an array with distances to centroids for every row   
        distance_to_centroid = np.asarray(
            [[dist(centroid, row) for centroid in centroids]
            for row in data_norm])
        #Assign to a row the cluster number of the nearest centroid
        dataset['cluster number'] = np.argmin(distance_to_centroid, axis=1)  
        #Create an array with the distance of each row to its centroid
        min_distance = distance_to_centroid.min(axis=1)
        for k_ in range(k):
            #Consider rows with cluster number k_
            cluster = data_norm[dataset['cluster number'].to_numpy() == k_]
            #Recompute the centroid as the mean of its cluster (skip empty clusters)
            if len(cluster) > 0:
                centroids[k_] = cluster.mean(axis=0)
        obj_old = obj_new
        obj_new = np.sum(min_distance)
    return obj_old
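#Illustrative sanity check (an added sketch; 'toy' is a hypothetical frame that
#mimics the expected input: data columns plus a 'cluster number' column):
#two well-separated blobs should give a near-trivial objective for k = 2.
toy = pd.DataFrame(np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 10]),
                   columns=['x1', 'x2'])
toy['cluster number'] = np.zeros(len(toy), dtype=int)
print("Toy objective (k = 2):", k_means_clustering(2, toy, toy[['x1', 'x2']].to_numpy()))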
#Define kernel matrix 
def Kernel_matrix(kernel_type, data_norm, gamma = 1):
    if kernel_type == "2-norm":
        K = np.matmul(data_norm, data_norm.T)   #linear kernel
    elif kernel_type == "gaussian":
        kernel_distance = lambda x, y: np.exp(- gamma * np.square(x-y).sum())
        K = np.asarray([[kernel_distance(p,q) for p in data_norm] for q in data_norm])
    else:
        raise ValueError("Error in defining Kernel: invalid kernel_type")
    return K
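#The nested Python loops above cost O(n^2) interpreted iterations. A vectorised
#alternative (a sketch, equivalent up to floating-point error) expands
#||p - q||^2 = ||p||^2 + ||q||^2 - 2 p.q and builds the whole Gaussian kernel
#matrix in a few array operations; gaussian_kernel_matrix_vectorised is a new
#helper name, not part of the original code.
def gaussian_kernel_matrix_vectorised(data_norm, gamma = 1):
    sq_norms = np.square(data_norm).sum(axis=1)   #||x_i||^2 for every row
    sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2 * data_norm @ data_norm.T
    return np.exp(-gamma * np.maximum(sq_dists, 0))   #clip tiny negatives from round-off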
def k_means_clustering_kernelised(k, dataset, data_norm, kernel_type, gamma = 1):
    """Cluster the dataset (where data_norm is the normalised dataset) into k clusters 
    using kernelised K-means clustering with the kernel_type is the 2-norm or gaussian kernel."""
    start_time = time.time()
    #Initialise the necessary variables
    cluster_distance = np.zeros(k, dtype = float)    #distance from the current row to each cluster
    min_centroid_distance = np.zeros(len(dataset), dtype = float)   #distance of each row to its nearest cluster
    no_iterations = 0
    obj_old = float('inf')
    obj_new = 10**50   #Placeholder big value
    #Precompute the kernel value for every pair of rows (data_norm excludes the 'cluster number' column)
    K = Kernel_matrix(kernel_type, data_norm, gamma)
    #Assign a random cluster to every row
    dataset['cluster number'] = np.random.randint(k, size=len(dataset))
    #Generate a list of lists with the index of each row in a cluster
    index_cluster = []
    for k_ in range(k):
        index_cluster.append(list(dataset[dataset['cluster number'] == k_].index))
    #Calculate the double sum of the kernels in each cluster in advance
    double_sum_cluster = np.zeros(k, dtype = float)
    for k_ in range(k):
        for i in index_cluster[k_]:
            double_sum_cluster[k_] += np.sum(K[i][index_cluster[k_]])
    #While there is improvement in the objective function
    while obj_new < obj_old:
        for j in range(len(dataset)):     
            #For each cluster calculate the cluster distance to the row using the kernel function
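            #In feature space the squared distance of row j to the centroid of
            #cluster C expands entirely in kernel evaluations:
            #||phi(x_j) - mu_C||^2 = K[j,j] - (2/|C|) * sum_{i in C} K[j,i]
            #                        + (1/|C|^2) * sum_{i,i' in C} K[i,i'],
            #where the double sum is maintained incrementally in double_sum_cluster.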
            for k_ in range(k):     
                cluster_distance[k_] = K[j][j]
                cluster_distance[k_] += (-2/len(index_cluster[k_])) * np.sum(K[j][index_cluster[k_]])
                cluster_distance[k_] += len(index_cluster[k_])**(-2) * double_sum_cluster[k_]
            
            #If the cluster of the row changed, update the double sum element of the kernel function
            old_cluster = dataset['cluster number'].iloc[j]
            new_cluster = np.argmin(cluster_distance) 
            if(old_cluster != new_cluster):
                #Update the double sum variable
                double_sum_cluster[new_cluster] +=  2 * np.sum(K[j][index_cluster[new_cluster]]) \
                + K[j][j]
                double_sum_cluster[old_cluster] += -2 * np.sum(K[j][index_cluster[old_cluster]]) \
                + K[j][j]
                #Update the cluster number of the row (the index equals the position here)
                dataset.loc[j, 'cluster number'] = new_cluster
                #Update the index lists of the old and new clusters
                index_cluster[old_cluster].remove(j)
                index_cluster[new_cluster].append(j)
            #Record the distance of row j to its nearest cluster
            min_centroid_distance[j] = cluster_distance[new_cluster]
        #Update the value of the objective function
        obj_old = obj_new
        obj_new = np.sum(min_centroid_distance) 
        no_iterations += 1
    print("Kernelised K-means clustering based on the %s kernel into %i clusters took %s seconds"  \
    % (kernel_type, k, float("{:.3f}".format(time.time() - start_time))))
    print("Objective function value on iteration ", no_iterations-1 ," equals ", \
    float("{:.4f}".format(obj_old)))
    return obj_old
n = 5
min_k, max_k = 2, 7
#for every cluster number k from min_k to max_k perform n clusterings
list_distances = np.zeros([n, max_k-min_k+1], dtype = float)
for i in range(min_k, max_k+1):
    for j in range(n):
        list_distances[j, i-min_k] = k_means_clustering(i, airline, normalized_data_array)
fig = px.box(pd.DataFrame(list_distances, columns=list(range(min_k, max_k+1))))
fig.update_layout(
    title = 'Boxplot of the error sum of squares of '+ str(n) + ' clusterings for values of k',
    xaxis_title = 'k',
    yaxis_title = 'Error sum of squares',
    autosize=False,
    width=600,
    height=600,)
fig.show()
def boxplot(k, dataset, data_norm):
    """Make a boxplot of the normalised data clustered into k clusters."""
    
    #Use the normalised values of the data columns for the plot
    dataset_norm = dataset.copy()
    dataset_norm.iloc[:,:-1] = data_norm
    dataset_norm = dataset_norm.sort_values(by='cluster number', ascending=True)
    #Build the title, including the number of points inside each cluster
    title_box = ""
    for k_ in range(k):
        text = str(len(dataset[dataset['cluster number'] == k_]))
        title_box += " Cluster " + str(k_) + ": " + text + " points."
    fig = px.box(dataset_norm,
        x = dataset_norm.columns, 
        color='cluster number',
        facet_col_spacing = 0.2,
        points=False,title = title_box)
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
        title_font_size = 14)
    fig.show()
    return
k = 4
k_means_clustering(k, airline, normalized_data_array)
boxplot(k, airline, normalized_data_array)
k = 4
kernel_type = "2-norm"
obj = k_means_clustering_kernelised(k, airline, normalized_data_array, kernel_type, gamma = 1)
boxplot(k, airline, normalized_data_array)
kernel_type = "gaussian"
gamma = 10**(-1)
obj = k_means_clustering_kernelised(k, airline, normalized_data_array, kernel_type, gamma)
boxplot(k, airline, normalized_data_array)
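#construct_Laplacians is called by spectral_clustering below but is not defined
#in this section. A minimal sketch consistent with the call signature, assuming
#the graph weights come from Kernel_matrix and the normalised variant is the
#symmetric Laplacian D^(-1/2) (D - W) D^(-1/2) (degrees are positive for the
#gaussian kernel used here):
def construct_Laplacians(data_norm, kernel_type, gamma = 0.0001, normalized = 1):
    W = Kernel_matrix(kernel_type, data_norm, gamma)   #weight (affinity) matrix
    degrees = W.sum(axis=1)
    L = np.diag(degrees) - W                           #unnormalized Laplacian
    if normalized:
        D_inv_sqrt = np.diag(1 / np.sqrt(degrees))
        L = D_inv_sqrt @ L @ D_inv_sqrt                #symmetric normalized Laplacian
    return L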
def spectral_clustering(k, dataset, data_norm, kernel_type, gamma = 0.0001, normalized = 1):
    """Spectral clustering using the scipy eigh function to compute eigenvalues. 
    Pass 0 or False as normalized when unnormalized spectral clustering is required."""
    start_time = time.time()
    L = construct_Laplacians(data_norm, kernel_type, gamma, normalized)
    eigenvalues_time = time.time()
    eigenvalues, eigenvectors = scipy.linalg.eigh(L, subset_by_index=[0,k-1])
    print('calculating the eigenvalues took %.3f seconds' % (time.time() - eigenvalues_time))
    print('eigenvalues = ', eigenvalues)
    k_means_clustering(k, dataset, eigenvectors)
    print("(Normalized = %r) Spectral clustering using kernel %s into %i clusters took %s seconds"\
     % (normalized, kernel_type, k, time.time() - start_time))
k = 4
gamma = 10**(-1)
spectral_clustering(k, airline, normalized_data_array, 'gaussian', gamma, normalized = 0)
boxplot(k, airline, normalized_data_array)
spectral_clustering(k, airline, normalized_data_array, 'gaussian', gamma, normalized = 1)
boxplot(k, airline, normalized_data_array)
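#own_spectral_clustering and eps are used below but not defined in this section.
#A minimal sketch under the assumption that it mirrors spectral_clustering while
#computing the k smallest eigenvectors of the Laplacian with a hand-rolled
#orthogonal iteration that stops once the subspace moves by less than eps:
def own_spectral_clustering(k, dataset, data_norm, kernel_type, gamma, normalized, eps):
    L = construct_Laplacians(data_norm, kernel_type, gamma, normalized)
    #Shift the spectrum so the smallest eigenvalues of L become the largest of B
    B = np.linalg.norm(L, 2) * np.eye(len(L)) - L
    Q = np.linalg.qr(np.random.rand(len(L), k))[0]   #random orthonormal start
    diff = float('inf')
    while diff > eps:
        Q_new = np.linalg.qr(B @ Q)[0]
        #Subspace change, insensitive to column sign flips
        diff = np.linalg.norm(Q_new - Q @ (Q.T @ Q_new))
        Q = Q_new
    k_means_clustering(k, dataset, Q)   #cluster the rows of the eigenvector matrix
eps = 10**(-6)   #assumed convergence tolerance for the sketch above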
normalized = 0
own_spectral_clustering(k, airline, normalized_data_array, "gaussian", gamma, normalized, eps)
boxplot(k, airline, normalized_data_array)
#Construct new data with reduced ranks
def construct_reduced_data(rank, original_data):
    Sigma = np.zeros(original_data.shape)
    U, singular_values, VT = np.linalg.svd(original_data)
    #set the smallest (len(singular_values) - rank) singular values to zero
    singular_values[rank:] = 0
    #place the truncated singular values on the diagonal of the zero matrix Sigma
    np.fill_diagonal(Sigma, singular_values)
    #construct the new matrix by using the new reduced Sigma
    reduced_rank_data = np.matmul(U,np.matmul(Sigma,VT))
    return reduced_rank_data
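#Quick sanity check (illustrative addition): the reconstruction should have
#exactly the requested rank.
print("Rank of reduced matrix:", np.linalg.matrix_rank(construct_reduced_data(3, normalized_data_array)))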
#Plot the singular values sigma_1, sigma_2, ..., sigma_11
list_singular_values = np.linalg.svd(normalized_data_array)[1]
plt.plot(range(1, len(list_singular_values) + 1), list_singular_values, 'ro')
plt.title('Singular values')
plt.ylabel(r'$\sigma_i$')
plt.xlabel('i')
plt.show()
#Check how many singular values are best to keep
percentages = []
sing_values_remove = []
for i in range(1,normalized_data_array.shape[1]):
    #construct new data matrix with reduced rank
    reduced_rank_data_i = construct_reduced_data(normalized_data_array.shape[1] \
    -i, normalized_data_array)
    #calculate the Frobenius norms (the square root of the sum of the squared singular values)
    Fr_norm_orig = np.linalg.norm(normalized_data_array, 'fro')
    Fr_norm_new = np.linalg.norm(reduced_rank_data_i, 'fro')
    percentages.append(Fr_norm_new/Fr_norm_orig)   #fraction of the Frobenius norm retained
    sing_values_remove.append(i)
#Plot the fraction of the Frobenius norm retained by
#the matrices with reduced ranks
plt.plot(sing_values_remove, percentages, 'ro')
plt.title('Fraction of the Frobenius norm retained after rank reduction')
plt.xlabel('Number of singular values removed')
plt.ylabel('Fraction of Frobenius norm')
plt.show()
#Perform clustering with new reduced data
new_reduced_data_rank3 = construct_reduced_data(3, normalized_data_array)
k_means_clustering(4, airline, new_reduced_data_rank3)
boxplot(4, airline, normalized_data_array)   #plot the normalised data, coloured by the reduced-rank clustering