import numpy as np

colors = ['red', 'green', 'blue', 'yellow', 'darkblue', 'brown',
          'purple', 'cyan', 'yellowgreen', 'silver', 'hotpink']  # used for visualisation
def fit(data, k, N=100):
    # data: (n, d) array; k: number of centroids; N: max iterations (default 100, no strong reason)
    # pick k distinct random points as initial centroids; using random indices
    # avoids shuffling the caller's data in place, and casting to float stops
    # uint8 image data from wrapping around under the subtraction below
    idx = np.random.choice(len(data), k, replace=False)
    centroids = {i: data[idx[i]].astype(float) for i in range(k)}

    for run in range(N):  # run until the system converges OR N times
        classifications = {i: [] for i in range(k)}  # points assigned to each centroid

        for point in data:  # compute the distance from each point to every centroid
            distances = [
                np.linalg.norm(point - centroids[c]) for c in centroids  # idea to use np.linalg.norm from: https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
            ]
            classification = distances.index(min(distances))  # assign point to the closest centroid
            classifications[classification].append(point)

        copy_centroids = dict(centroids)  # keep a copy to test convergence against

        # move each centroid to the mean of its assigned points
        # (caveat: an empty cluster would make np.average return nan)
        for i in range(k):
            centroids[i] = np.average(classifications[i], axis=0)

        #################### Check if done ##################
        best = True  # if we survive the next loop for all k centroids we have converged
        for i in range(k):
            if (copy_centroids[i] != centroids[i]).any():
                best = False  # a centroid moved, so not converged yet
                break

        if best:  # converged, stop early
            break

    return centroids, classifications
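# The per-point loop above is easy to follow but slow for large data. A
# minimal vectorized sketch of the same assignment step (illustration only,
# not used by fit()), assuming `data` is an (n, d) array and `centroids` the
# dict built above:
def assign_vectorized(data, centroids):
    cents = np.stack([centroids[c] for c in sorted(centroids)])  # (k, d)
    # (n, 1, d) - (1, k, d) broadcasts to (n, k, d); norm over the last axis
    dists = np.linalg.norm(data[:, None, :] - cents[None, :, :], axis=2)
    return np.argmin(dists, axis=1)  # index of the closest centroid per point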
def kmeans(x, K, n_init):
    # x: input data
    # K: number of centroids
    # n_init: number of random restarts; the run with the lowest SSE wins
    for i in range(n_init):
        centroids, labels = fit(x, K)

        SSE = 0
        for centroid in centroids:
            for point in labels[centroid]:  # sum squared distances of points to their centroid
                SSE = SSE + np.linalg.norm(point - centroids[centroid]) ** 2

        if i == 0 or SSE < prevSSE:  # the first run always becomes the current "best"
            bestCentroids = centroids
            bestLabels = labels
            prevSSE = SSE

    print("best SSE for ", K, " clusters is: ", prevSSE)
    return bestCentroids, bestLabels, prevSSE
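# Quick sanity check of the two functions above on hypothetical toy data
# (three well-separated pairs, so K=3 should converge almost immediately):
toy = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.], [-10., 5.], [-10., 6.]])
toy_cents, toy_labs, toy_sse = kmeans(toy, 3, 5)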
#fit(X,3)
#kmeans(X, 10, 10)
import matplotlib.pyplot as plt
# Generate test data; make_blobs code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_plusplus.html#sphx-glr-auto-examples-cluster-plot-kmeans-plusplus-py
from sklearn.datasets import make_blobs
n_samples = 300
n_components = 3
X, y_true = make_blobs(
    n_samples=n_samples, centers=n_components, cluster_std=0.65, random_state=0
)
X = X[:, ::-1]
plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
# to help us decide K, make an elbow plot of SSE
allSSE = []
for k in range(1,11):
    centroids, labels, prevSSE = kmeans(X,k,10)
    allSSE.append(prevSSE)
    
    
plt.plot(range(1,11), allSSE)
plt.title('Elbow plot')
plt.xlabel('clusters')
plt.ylabel('Within cluster scatter')
plt.show()   
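# A rough heuristic for reading the elbow off the curve automatically (an
# addition for illustration, not part of the task): the elbow sits roughly
# where the SSE curve bends most, i.e. where its second difference peaks.
sse = np.array(allSSE)
elbow_k = int(np.argmax(np.diff(sse, n=2))) + 2  # +2 maps the diff index back to K (K starts at 1)
print("heuristic elbow at K =", elbow_k)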
# plot kmeans for the best run; plot code based on https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
centroids, labels, prevSSE = kmeans(X,3,1)
for classification in labels:
    color = colors[classification]
    for featureset in labels[classification]:
        plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5)
    
for centroid in centroids:
    plt.scatter(centroids[centroid][0], centroids[centroid][1],
                    marker="o", color="k", s=150, linewidths=5)
                
plt.show()
prevSSE
# Generate test data; make_blobs code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_plusplus.html#sphx-glr-auto-examples-cluster-plot-kmeans-plusplus-py
from sklearn.datasets import make_blobs
n_samples = 300
n_components = 3
X, y_true = make_blobs(
    n_samples=n_samples, centers=n_components, cluster_std=1, random_state=0
)
X = X[:, ::-1]
plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
# to help us decide K, make an elbow plot of SSE
allSSE = []
for k in range(1,11):
    centroids, labels, prevSSE = kmeans(X,k,15)
    allSSE.append(prevSSE)
    
    
plt.plot(range(1,11), allSSE)
plt.title('Elbow plot')
plt.xlabel('clusters')
plt.ylabel('Within cluster scatter')
plt.show() 
from skimage import io
#image = io.imread('C:/Users/maru7/Min enhet/bodyflight.png')
image = io.imread('C:/Users/maru7/Min enhet/4mbr1x6t7k251.jpg')
io.imshow(image)
io.show()
#image.shape  # shape is 600x900x3
# dimensions of the original image
rows = image.shape[0]
cols = image.shape[1]
image = image.reshape(rows*cols, 3)  # one row per pixel, 3 features (RGB)
image.shape
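# Each row is now one pixel's (R, G, B) triple; the clustering below works in
# this colour space. For scale, the number of distinct colours before
# quantising (k-means will reduce this to k):
print("unique colours:", len(np.unique(image, axis=0)))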
#code source: https://towardsdatascience.com/image-compression-using-k-means-clustering-aa0c91bb0eeb
from sklearn.cluster import KMeans

for k in [4, 8, 16, 32]:
    km = KMeans(n_clusters=k)  # named km so it does not shadow our kmeans() above

    km.fit(image)
    # replace each pixel with its closest centroid colour
    compressed = km.cluster_centers_[km.labels_]

    # clip BEFORE casting: casting to uint8 first would wrap out-of-range values
    compressed = np.clip(compressed, 0, 255).astype('uint8')
    # reshape the image back to its original dimensions
    compressed = compressed.reshape(rows, cols, 3)

    # save and display the output image
    name = "compressed_" + str(k)
    io.imsave(name + '.jpg', compressed)
    io.imshow(compressed)
    print('Kmeans compression with ' + str(k) + ' clusters')
    io.show()
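# Why this counts as compression: each pixel now stores an index into a
# k-colour palette, i.e. log2(k) bits instead of 24 bits of raw RGB. A
# back-of-envelope estimate (the files saved above are additionally shrunk
# by JPEG's own entropy coding):
for k in [4, 8, 16, 32]:
    bits = rows * cols * np.log2(k) + k * 24  # index map + palette
    print(f"k={k:2d}: ~{bits / (rows * cols * 24):.1%} of raw 24-bit size")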
def fit2(data, k, N=100):
    # same as fit(), but also returns a flat per-point label array
    # data: (n, d) array; k: number of centroids; N: max iterations (default 100, no strong reason)
    # pick k distinct random start points by index -- shuffling data in place
    # here would scramble the pixel order that the returned labels refer to
    idx = np.random.choice(len(data), k, replace=False)
    centroids = {i: data[idx[i]].astype(float) for i in range(k)}

    labels = np.zeros(len(data))  # cluster index of every point, in data order
    for run in range(N):  # run until the system converges OR N times
        print("run: ", run)
        classifications = {o: [] for o in range(k)}

        for j, point in enumerate(data):  # compute the distance from each point to every centroid
            distances = [
                np.linalg.norm(point - centroids[c]) for c in centroids  # idea to use np.linalg.norm from: https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
            ]
            classification = distances.index(min(distances))  # assign point to the closest centroid
            labels[j] = classification
            classifications[classification].append(point)

        copy_centroids = dict(centroids)  # keep a copy to test convergence against

        # move each centroid to the mean of its assigned points
        for i in range(k):
            centroids[i] = np.average(classifications[i], axis=0)

        #################### Check if done ##################
        best = True
        for i in range(k):
            if (copy_centroids[i] != centroids[i]).any():
                best = False  # a centroid moved, so not converged yet
                break

        if best:  # converged, stop early
            break

    return centroids, classifications, labels
def kmeans2(x, K, n_init):
    # x: input data
    # K: number of centroids
    # n_init: number of random restarts; the run with the lowest SSE wins
    for i in range(n_init):
        centroids, labels, realLabels = fit2(x, K)
        print(i)

        SSE = 0
        for centroid in centroids:
            for point in labels[centroid]:  # sum squared distances of points to their centroid
                SSE = SSE + np.linalg.norm(point - centroids[centroid]) ** 2

        if i == 0 or SSE < prevSSE:  # the first run always becomes the current "best"
            bestCentroids = centroids
            bestLabels = labels
            bestRealLabels = realLabels
            prevSSE = SSE

    print("best SSE for ", K, " clusters is: ", prevSSE)
    return bestCentroids, bestLabels, prevSSE, bestRealLabels
#fit2(X,3)
#kmeans2(X, 10, 10)
centroids, labels, prevSSE, trueLabels = kmeans2(image, 4, 1)  # data, clusters, reruns
# collect the centroid colours into a (k, 3) array
an_array = np.array(list(centroids.values()))
an_array
# replace each pixel value with its nearest centroid colour
compressed_image = an_array[trueLabels.astype(int)]
# clip BEFORE casting to uint8: casting first would wrap out-of-range values
compressed_image = np.clip(compressed_image, 0, 255).astype('uint8')
#Reshape the image to original dimension
compressed_image = compressed_image.reshape(rows, cols, 3)
#Save and display output image
#io.imsave('compressed_image_64.png', compressed_image)
io.imshow(compressed_image)
io.show()
import pandas as pd
# load the WDBC data from UCI (the CSV keeps the diagnosis column we need below)
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data", 
    sep=",",
    header=None,
    names=[
        "id_number", "diagnosis", "radius_mean",
        "texture_mean", "perimeter_mean", "area_mean",
        "smoothness_mean", "compactness_mean",
        "concavity_mean","concave_points_mean",
        "symmetry_mean", "fractal_dimension_mean",
        "radius_se", "texture_se", "perimeter_se",
        "area_se", "smoothness_se", "compactness_se",
        "concavity_se", "concave_points_se",
        "symmetry_se", "fractal_dimension_se",
        "radius_worst", "texture_worst",
        "perimeter_worst", "area_worst",
        "smoothness_worst", "compactness_worst",
        "concavity_worst", "concave_points_worst",
        "symmetry_worst", "fractal_dimension_worst"
    ],).drop("id_number", axis=1) #här försvinner idnr
y = data.diagnosis.map({"B": 0, "M": 1}).to_numpy()
X = data.drop("diagnosis", axis=1).to_numpy() #drop diagnosis tyo get same data as in task
from sklearn.mixture import GaussianMixture
m = 100  # number of repeats to average over
Kupper = 10
Klower = 2
# metrics
aic_Gmm = np.zeros((Kupper-Klower, m))
bic_Gmm = np.zeros((Kupper-Klower, m))
aic2_Gmm = np.zeros((Kupper-Klower, m))
bic2_Gmm = np.zeros((Kupper-Klower, m))
for i in range(Klower,Kupper):
    print(f"loop {i}")
    for k in range(m):
        Gmm = GaussianMixture(n_components=i).fit(X)   
        y_gmm = Gmm.predict(X)
        
        aic_Gmm[i-Klower,k] = Gmm.aic(X)
        bic_Gmm[i-Klower,k] = Gmm.bic(X)
        
        ####### first attempt with calc_aic/calc_bic (defined further down)
        ####### did not work: k here is the repeat index, not the model's
        ####### free-parameter count -- corrected in the last cell
        #logLike = Gmm.score(X)
        #size = X.shape[0]
        #aic2_Gmm[i-Klower,k] = calc_aic(size, logLike ,k)
        #bic2_Gmm[i-Klower,k] = calc_bic(size, logLike ,k)
fig, ax = plt.subplots(1, 2, figsize=(24, 6))
# AIC
ax[0].errorbar(
    range(Klower, Kupper),  # the cluster counts we checked (2..9)
    np.median(aic_Gmm, axis=1),
    [
        np.median(aic_Gmm, axis=1) - np.min(aic_Gmm, axis=1),
        np.max(aic_Gmm, axis=1) - np.median(aic_Gmm, axis=1),
    ], fmt='-o', ecolor='r'
)
ax[0].set_xlabel("Clusters")
ax[0].set_ylabel("AIC")
ax[0].set_title("aic_Gmm scores")
# BIC
ax[1].errorbar(
    range(Klower, Kupper),  # the cluster counts we checked (2..9)
    np.median(bic_Gmm, axis=1),
    [
        np.median(bic_Gmm, axis=1) - np.min(bic_Gmm, axis=1),
        np.max(bic_Gmm, axis=1) - np.median(bic_Gmm, axis=1),
    ], fmt='-o', ecolor='r'
)
ax[1].set_xlabel("Clusters")
ax[1].set_ylabel("BIC")
ax[1].set_title("bic_Gmm scores")
plt.show()
from sklearn.mixture import GaussianMixture
k = 3
Gmm = GaussianMixture(n_components=k).fit(X)  # fit X; the model now holds the mixture parameters
Gmm.aic(X)  # check how good the parameters are
logLike = Gmm.score(X)  # .score() returns the MEAN log-likelihood per sample
size = X.shape[0]
AIC = -2 * logLike * size + 2 * k  # -2 * total log-likelihood + penalty of 2k
print(AIC)
Gmm.aic(X)  # not the same -- the penalty term above is wrong, see below
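# They are not the same because sklearn's aic() penalises the model's total
# number of free parameters, not the number of components k. For the default
# covariance_type='full' with d features that count is:
#   (k - 1) weights + k*d means + k*d*(d+1)/2 covariance entries.
# A short sketch reproducing sklearn's value from that count:
d = X.shape[1]
n_params = (k - 1) + k * d + k * d * (d + 1) // 2
AIC2 = -2 * logLike * size + 2 * n_params
print(AIC2, Gmm.aic(X))  # these agree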
# AIC/BIC for a fitted mixture; logl is the mean log-likelihood per sample
# (as returned by .score()) and n_params is the model's total number of
# free parameters -- NOT the number of clusters
def calc_aic(n, logl, n_params):
    return -2 * logl * n + 2 * n_params

def calc_bic(n, logl, n_params):
    return -2 * logl * n + n_params * np.log(n)
from sklearn.mixture import GaussianMixture
m = 100  # number of repeats to average over
Kupper = 10
Klower = 2
# metrics
aic_Gmm = np.zeros((Kupper-Klower, m))
bic_Gmm = np.zeros((Kupper-Klower, m))
aic2_Gmm = np.zeros((Kupper-Klower, m))
bic2_Gmm = np.zeros((Kupper-Klower, m))
for i in range(Klower,Kupper):
    print(f"loop {i}")
    for k in range(m):
        Gmm = GaussianMixture(n_components=i).fit(X)   
        y_gmm = Gmm.predict(X)
        
        aic_Gmm[i-Klower,k] = Gmm.aic(X)
        bic_Gmm[i-Klower,k] = Gmm.bic(X)
        
        logLike = Gmm.score(X)
        size = X.shape[0]
        d = X.shape[1]
        # free parameters of an i-component full-covariance GMM (see the
        # sketch above); passing the repeat index k here was the bug
        n_params = (i - 1) + i * d + i * d * (d + 1) // 2
        aic2_Gmm[i-Klower, k] = calc_aic(size, logLike, n_params)
        bic2_Gmm[i-Klower, k] = calc_bic(size, logLike, n_params)
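# Sanity check: with the correct parameter count the hand-rolled AIC/BIC
# should now match sklearn's values to floating-point precision.
print("AIC match:", np.allclose(aic_Gmm, aic2_Gmm))
print("BIC match:", np.allclose(bic_Gmm, bic2_Gmm))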