import numpy as np
import pandas as pd
import time
import scipy.linalg
import plotly.express as px
import matplotlib.pyplot as plt

airline = pd.read_csv('EastWestAirlinesCluster.csv')   #Import dataset EastWestAirlinesCluster.csv
airline = airline.iloc[:,1:]    #drop the ID column from the dataset
data_array = airline.to_numpy()
normalized_data_array = (data_array - data_array.min(axis=0)) \
/ data_array.std(axis=0)    #scale each column: shift by its minimum, divide by its standard deviation
airline['cluster number'] = np.zeros(len(airline),dtype=int)  #add new column 'cluster number'
#Define the distance function (squared Euclidean distance)
dist = lambda x, y: np.square(x - y).sum()
def k_means_clustering(k, dataset, data_norm): 
    """Cluster the dataset (where data_norm is the normalised dataset) into k clusters 
    using K-means clustering with the squared 2-norm as distance function."""
    #Select k distinct random rows as the initial centroids
    c_k = np.random.choice(len(dataset), size=k, replace=False)
    centroids = data_norm[c_k]  #store centroids in array
    #Starting values of the while loop
    obj_new = 10**50   #Placeholder big value
    obj_old = float('inf')
    
    while obj_new < obj_old:     
        #Generate an array with distances to centroids for every row   
        distance_to_centroid = np.asarray(
            [[dist(centroid, row) for centroid in centroids]
            for row in data_norm])
        #Assign to a row the cluster number of the nearest centroid
        dataset['cluster number'] = np.argmin(distance_to_centroid, axis=1)  
        #Create an array with the distance of each row to its centroid
        min_distance = distance_to_centroid.min(axis=1)
        for k_ in range(k):
            #Consider rows with cluster number k_
            cluster = data_norm[dataset['cluster number'].to_numpy() == k_]
            #Recompute the centroid as the mean of its cluster (skip empty clusters)
            if len(cluster) > 0:
                centroids[k_] = cluster.mean(axis=0)
        obj_old = obj_new
        obj_new = np.sum(min_distance)
    return obj_old
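#Illustrative sanity check (an added sketch; 'toy' is a hypothetical frame that
#mimics the expected input: data columns plus a 'cluster number' column):
#two well-separated blobs should give a near-trivial objective for k = 2.
toy = pd.DataFrame(np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 10]),
                   columns=['x1', 'x2'])
toy['cluster number'] = np.zeros(len(toy), dtype=int)
print("Toy objective (k = 2):", k_means_clustering(2, toy, toy[['x1', 'x2']].to_numpy()))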
#Define kernel matrix 
def Kernel_matrix(kernel_type, data_norm, gamma = 1):
    if kernel_type == "2-norm":
        K = np.matmul(data_norm, data_norm.T)   #linear kernel
    elif kernel_type == "gaussian":
        kernel_distance = lambda x, y: np.exp(- gamma * np.square(x-y).sum())
        K = np.asarray([[kernel_distance(p,q) for p in data_norm] for q in data_norm])
    else:
        raise ValueError("Error in defining Kernel: invalid kernel_type")
    return K
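#The nested Python loops above cost O(n^2) interpreted iterations. A vectorised
#alternative (a sketch, equivalent up to floating-point error) expands
#||p - q||^2 = ||p||^2 + ||q||^2 - 2 p.q and builds the whole Gaussian kernel
#matrix in a few array operations; gaussian_kernel_matrix_vectorised is a new
#helper name, not part of the original code.
def gaussian_kernel_matrix_vectorised(data_norm, gamma = 1):
    sq_norms = np.square(data_norm).sum(axis=1)   #||x_i||^2 for every row
    sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2 * data_norm @ data_norm.T
    return np.exp(-gamma * np.maximum(sq_dists, 0))   #clip tiny negatives from round-off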
def k_means_clustering_kernelised(k, dataset, data_norm, kernel_type, gamma = 1):
    """Cluster the dataset (where data_norm is the normalised dataset) into k clusters 
    using kernelised K-means clustering with the kernel_type is the 2-norm or gaussian kernel."""
    start_time = time.time()
    #Initialise the necessary variables
    cluster_distance = np.zeros(k, dtype = float)    #distance from the current row to each cluster
    min_centroid_distance = np.zeros(len(dataset), dtype = float)   #distance of each row to its nearest cluster
    no_iterations = 0
    obj_old = float('inf')
    obj_new = 10**50   #Placeholder big value
    #Precompute the kernel value for every pair of rows (data_norm excludes the 'cluster number' column)
    K = Kernel_matrix(kernel_type, data_norm, gamma)
    #Assign a random cluster to every row
    dataset['cluster number'] = np.random.randint(k, size=len(dataset))
    #Generate a list of lists with the index of each row in a cluster
    index_cluster = []
    for k_ in range(k):
        index_cluster.append(list(dataset[dataset['cluster number'] == k_].index))
    #Calculate the double sum of the kernels in each cluster in advance
    double_sum_cluster = np.zeros(k, dtype = float)
    for k_ in range(k):
        for i in index_cluster[k_]:
            double_sum_cluster[k_] += np.sum(K[i][index_cluster[k_]])
    #While there is improvement in the objective function
    while obj_new < obj_old:
        for j in range(len(dataset)):     
            #For each cluster calculate the cluster distance to the row using the kernel function
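            #In feature space the squared distance of row j to the centroid of
            #cluster C expands entirely in kernel evaluations:
            #||phi(x_j) - mu_C||^2 = K[j,j] - (2/|C|) * sum_{i in C} K[j,i]
            #                        + (1/|C|^2) * sum_{i,i' in C} K[i,i'],
            #where the double sum is maintained incrementally in double_sum_cluster.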
            for k_ in range(k):     
                cluster_distance[k_] = K[j][j]
                cluster_distance[k_] += (-2/len(index_cluster[k_])) * np.sum(K[j][index_cluster[k_]])
                cluster_distance[k_] += len(index_cluster[k_])**(-2) * double_sum_cluster[k_]
            
            #If the cluster of the row changed, update the double sum element of the kernel function
            old_cluster = dataset['cluster number'].iloc[j]
            new_cluster = np.argmin(cluster_distance) 
            if(old_cluster != new_cluster):
                #Update the double sum variable
                double_sum_cluster[new_cluster] +=  2 * np.sum(K[j][index_cluster[new_cluster]]) \
                + K[j][j]
                double_sum_cluster[old_cluster] += -2 * np.sum(K[j][index_cluster[old_cluster]]) \
                + K[j][j]
                #Update the cluster number of the row (the index equals the position here)
                dataset.loc[j, 'cluster number'] = new_cluster
                #Update the index lists of the old and new clusters
                index_cluster[old_cluster].remove(j)
                index_cluster[new_cluster].append(j)
            #Record the distance of row j to its nearest cluster
            min_centroid_distance[j] = cluster_distance[new_cluster]
        #Update the value of the objective function
        obj_old = obj_new
        obj_new = np.sum(min_centroid_distance) 
        no_iterations += 1
    print("Kernelised K-means clustering based on the %s kernel into %i clusters took %s seconds"  \
    % (kernel_type, k, float("{:.3f}".format(time.time() - start_time))))
    print("Objective function value on iteration ", no_iterations-1 ," equals ", \
    float("{:.4f}".format(obj_old)))
    return obj_old
n = 5
min_k, max_k = 2, 7
#for every cluster number k from min_k to max_k perform n clusterings
list_distances = np.zeros([n, max_k-min_k+1], dtype = float)
for i in range(min_k, max_k+1):
    for j in range(n):
        list_distances[j, i-min_k] = k_means_clustering(i, airline, normalized_data_array)
fig = px.box(pd.DataFrame(list_distances, columns=list(range(min_k, max_k+1))))
fig.update_layout(
    title = 'Boxplot of the error sum of squares of '+ str(n) + ' clusterings for values of k',
    xaxis_title = 'k',
    yaxis_title = 'Error sum of squares',
    autosize=False,
    width=600,
    height=600,)
fig.show()
def boxplot(k, dataset, data_norm):
    """Make a boxplot of the normalised data clustered into k clusters."""
    
    #Use the normalised values of the data columns for the plot
    dataset_norm = dataset.copy()
    dataset_norm.iloc[:,:-1] = data_norm
    dataset_norm = dataset_norm.sort_values(by='cluster number', ascending=True)
    #Build the title, including the number of points inside each cluster
    title_box = ""
    for k_ in range(k):
        text = str(len(dataset[dataset['cluster number'] == k_]))
        title_box += " Cluster " + str(k_) + ": " + text + " points."
    fig = px.box(dataset_norm,
        x = dataset_norm.columns, 
        color='cluster number',
        facet_col_spacing = 0.2,
        points=False,title = title_box)
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
        title_font_size = 14)
    fig.show()
    return
k = 4
k_means_clustering(k, airline, normalized_data_array)
boxplot(k, airline, normalized_data_array)
k = 4
kernel_type = "2-norm"
obj = k_means_clustering_kernelised(k, airline, normalized_data_array, kernel_type, gamma = 1)
boxplot(k, airline, normalized_data_array)
kernel_type = "gaussian"
gamma = 10**(-1)
obj = k_means_clustering_kernelised(k, airline, normalized_data_array, kernel_type, gamma)
boxplot(k, airline, normalized_data_array)
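#construct_Laplacians is called by spectral_clustering below but is not defined
#in this section. A minimal sketch consistent with the call signature, assuming
#the graph weights come from Kernel_matrix and the normalised variant is the
#symmetric Laplacian D^(-1/2) (D - W) D^(-1/2) (degrees are positive for the
#gaussian kernel used here):
def construct_Laplacians(data_norm, kernel_type, gamma = 0.0001, normalized = 1):
    W = Kernel_matrix(kernel_type, data_norm, gamma)   #weight (affinity) matrix
    degrees = W.sum(axis=1)
    L = np.diag(degrees) - W                           #unnormalized Laplacian
    if normalized:
        D_inv_sqrt = np.diag(1 / np.sqrt(degrees))
        L = D_inv_sqrt @ L @ D_inv_sqrt                #symmetric normalized Laplacian
    return L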
def spectral_clustering(k, dataset, data_norm, kernel_type, gamma = 0.0001, normalized = 1):
    """Spectral clustering using the scipy eigh function to compute eigenvalues. 
    Pass 0 or False as normalized when unnormalized spectral clustering is required."""
    start_time = time.time()
    L = construct_Laplacians(data_norm, kernel_type, gamma, normalized)
    eigenvalues_time = time.time()
    eigenvalues, eigenvectors = scipy.linalg.eigh(L, subset_by_index=[0,k-1])
    print('calculating the eigenvalues took %.3f seconds' % (time.time() - eigenvalues_time))
    print('eigenvalues = ', eigenvalues)
    k_means_clustering(k, dataset, eigenvectors)
    print("(Normalized = %r) Spectral clustering using kernel %s into %i clusters took %s seconds"\
     % (normalized, kernel_type, k, time.time() - start_time))
k = 4
gamma = 10**(-1)
spectral_clustering(k, airline, normalized_data_array, 'gaussian', gamma, normalized = 0)
boxplot(k, airline, normalized_data_array)
spectral_clustering(k, airline, normalized_data_array, 'gaussian', gamma, normalized = 1)
boxplot(k, airline, normalized_data_array)
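#own_spectral_clustering and eps are used below but not defined in this section.
#A minimal sketch under the assumption that it mirrors spectral_clustering while
#computing the k smallest eigenvectors of the Laplacian with a hand-rolled
#orthogonal iteration that stops once the subspace moves by less than eps:
def own_spectral_clustering(k, dataset, data_norm, kernel_type, gamma, normalized, eps):
    L = construct_Laplacians(data_norm, kernel_type, gamma, normalized)
    #Shift the spectrum so the smallest eigenvalues of L become the largest of B
    B = np.linalg.norm(L, 2) * np.eye(len(L)) - L
    Q = np.linalg.qr(np.random.rand(len(L), k))[0]   #random orthonormal start
    diff = float('inf')
    while diff > eps:
        Q_new = np.linalg.qr(B @ Q)[0]
        #Subspace change, insensitive to column sign flips
        diff = np.linalg.norm(Q_new - Q @ (Q.T @ Q_new))
        Q = Q_new
    k_means_clustering(k, dataset, Q)   #cluster the rows of the eigenvector matrix
eps = 10**(-6)   #assumed convergence tolerance for the sketch above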
normalized = 0
own_spectral_clustering(k, airline, normalized_data_array, "gaussian", gamma, normalized, eps)
boxplot(k, airline, normalized_data_array)
#Construct new data with reduced ranks
def construct_reduced_data(rank, original_data):
    Sigma = np.zeros(original_data.shape)
    U, singular_values, VT = np.linalg.svd(original_data)
    #set the smallest (len(singular_values) - rank) singular values to zero
    singular_values[rank:] = 0
    #place the truncated singular values on the diagonal of the zero matrix Sigma
    np.fill_diagonal(Sigma, singular_values)
    #construct the new matrix by using the new reduced Sigma
    reduced_rank_data = np.matmul(U,np.matmul(Sigma,VT))
    return reduced_rank_data
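#Quick sanity check (illustrative addition): the reconstruction should have
#exactly the requested rank.
print("Rank of reduced matrix:", np.linalg.matrix_rank(construct_reduced_data(3, normalized_data_array)))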
#Plot the singular values sigma_1, sigma_2, ..., sigma_11
list_singular_values = np.linalg.svd(normalized_data_array)[1]
plt.plot(range(1, len(list_singular_values) + 1), list_singular_values, 'ro')
plt.title('Singular values')
plt.ylabel(r'$\sigma_i$')
plt.xlabel('i')
plt.show()
#Check how many singular values are best to keep
percentages = []
sing_values_remove = []
for i in range(1,normalized_data_array.shape[1]):
    #construct new data matrix with reduced rank
    reduced_rank_data_i = construct_reduced_data(normalized_data_array.shape[1] \
    -i, normalized_data_array)
    #calculate the Frobenius norms (the square root of the sum of the squared singular values)
    Fr_norm_orig = np.linalg.norm(normalized_data_array, 'fro')
    Fr_norm_new = np.linalg.norm(reduced_rank_data_i, 'fro')
    percentages.append(Fr_norm_new/Fr_norm_orig)   #fraction of the Frobenius norm retained
    sing_values_remove.append(i)
#Plot the fraction of the Frobenius norm retained by
#the matrices with reduced ranks
plt.plot(sing_values_remove, percentages, 'ro')
plt.title('Fraction of the Frobenius norm retained after rank reduction')
plt.xlabel('Number of singular values removed')
plt.ylabel('Fraction of Frobenius norm')
plt.show()
#Perform clustering with new reduced data
new_reduced_data_rank3 = construct_reduced_data(3, normalized_data_array)
k_means_clustering(4, airline, new_reduced_data_rank3)
boxplot(4, airline, normalized_data_array)   #plot the normalised data, coloured by the reduced-rank clustering