colors = ['red',
'green',
'blue',
'yellow',
'darkblue',
'brown',
'purple',
'cyan',
'yellowgreen',
'silver',
'hotpink',] #will use for visualisation
def fit(data, k, N=100):  # data - the data points, k - the number of centroids, N - max number of iterations (defaults to 100, no particular reason)
    np.random.shuffle(data)  # shuffle in place so each call starts from different initial centroids (note: this also reorders the caller's array)
#print(data)
    centroids = {}  # dict: centroid index -> coordinates
    for i in range(k):  # use the first k shuffled points as the initial centroids
        centroids[i] = data[i]
for run in range(N): #we run until system converges OR N times.
#print(run)
        classifications = {}  # points assigned to each centroid in this iteration
#print(i)
        for i in range(k):
            classifications[i] = []  # one empty list of assigned points per cluster
        for point in data:  # loop through the data and calculate each point's distance to every centroid
distances = [
np.linalg.norm(point-centroids[c]) for c in centroids #idea to use np.linalg.norm from: https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
]
            classification = distances.index(min(distances))  # assign the point to the closest centroid
classifications[classification].append(point) #add point to classification
        # keep a copy of the old centroids so we can check convergence below
        copy_centroids = dict(centroids)
        # move each centroid to the mean of the points currently assigned to it
        for i in range(k):
            centroids[i] = np.average(classifications[i], axis=0)
#################### Check if done ##################
        best = True  # assume we have converged; set to False below if any centroid moved
for i in range(k):
prev_centroid = copy_centroids[i]
current_centroid = centroids[i]
#print("previous centroid: ",prev_centroid)
#print("current centroid: ",current_centroid)
#print(current_centroid)
if (prev_centroid != current_centroid).any():
#print("they are not the same!")
best = False #if not the same, we break!
break #break loop
if best: #if we got best centroids we are done
break
return centroids, classifications
def kmeans(x, K, n_init):  # x: input data, K: number of centroids, n_init: number of random restarts
    for i in range(n_init):  # rerun fit() n_init times and keep the run with the lowest SSE
centroids, labels = fit(x,K)
#print(i)
SSE = 0
for centroid in centroids:
            for point in labels[centroid]:  # loop through the points assigned to this centroid
                SSE = SSE + np.linalg.norm(point - centroids[centroid]) ** 2  # squared distance, so SSE really is a sum of squared errors
#print("SSE is: ", SSE)
        if i == 0 or SSE < prevSSE:  # the first run always becomes the current best
            bestCentroids = centroids
            bestLabels = labels
            prevSSE = SSE
            #print("new best SSE! ", prevSSE)
print("best SSE for ",K, " clusters is: ", prevSSE)
return bestCentroids, bestLabels, prevSSE
#fit(X,3)
#kmeans(X, 10, 10)
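# A minimal usage sketch of the two functions above (the toy array below is made up purely for
# illustration, it is not data from this assignment): fit() does a single k-means run, kmeans()
# repeats it n_init times and keeps the run with the lowest SSE.
import numpy as np
toy = np.array([[0.0, 0.0], [0.2, 0.1], [0.1, 0.3],
                [5.0, 5.0], [5.1, 4.9], [4.8, 5.2]])
toy_centroids, toy_clusters = fit(toy, 2)               # one run: dicts of centroids and member points
best_c, best_l, best_sse = kmeans(toy, K=2, n_init=5)   # keep the best of 5 random restarts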
import matplotlib.pyplot as plt
# Generate test data # make_blobs code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_plusplus.html#sphx-glr-auto-examples-cluster-plot-kmeans-plusplus-py
from sklearn.datasets import make_blobs
n_samples = 300
n_components = 3
X, y_true = make_blobs(
n_samples=n_samples, centers=n_components, cluster_std=0.65, random_state=0
)
X = X[:, ::-1]
plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
import numpy as np
# to help us decide on K we make an elbow plot of the SSE
allSSE = []
for k in range(1,11):
centroids, labels, prevSSE = kmeans(X,k,10)
allSSE.append(prevSSE)
plt.plot(range(1,11), allSSE)
plt.title('Elbow plot')
plt.xlabel('clusters')
plt.ylabel('Within cluster scatter')
plt.show()
# plot kmeans for best: #plot code based on https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
centroids, labels, prevSSE = kmeans(X,3,1)
for classification in labels:
color = colors[classification]
for featureset in labels[classification]:
plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5)
for centroid in centroids:
plt.scatter(centroids[centroid][0], centroids[centroid][1],
marker="o", color="k", s=150, linewidths=5)
plt.show()
prevSSE  # show the SSE of the 3-cluster fit above
# Generate test data # make_blobs code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_plusplus.html#sphx-glr-auto-examples-cluster-plot-kmeans-plusplus-py
from sklearn.datasets import make_blobs
n_samples = 300
n_components = 3
X, y_true = make_blobs(
n_samples=n_samples, centers=n_components, cluster_std=1, random_state=0
)
X = X[:, ::-1]
plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
# to help us decide on K we make an elbow plot of the SSE
allSSE = []
for k in range(1,11):
centroids, labels, prevSSE = kmeans(X,k,15)
allSSE.append(prevSSE)
plt.plot(range(1,11), allSSE)
plt.title('Elbow plot')
plt.xlabel('clusters')
plt.ylabel('Within cluster scatter')
plt.show()
from skimage import io
#image = io.imread('C:/Users/maru7/Min enhet/bodyflight.png') 4mbr1x6t7k251
image = io.imread('C:/Users/maru7/Min enhet/4mbr1x6t7k251.jpg')
io.imshow(image)
io.show()
#image.shape #shape is 600x900x3
#Dimension of the original image
rows = image.shape[0]
cols = image.shape[1]
image = image.reshape(rows*cols, 3)  # flatten to one row per pixel with its 3 RGB values as features
image.shape
#code source: https://towardsdatascience.com/image-compression-using-k-means-clustering-aa0c91bb0eeb
from sklearn.cluster import KMeans
import numpy as np
for k in [4,8,16,32]:
    km = KMeans(n_clusters=k)  # a different name so we do not shadow our own kmeans() function defined above
    km.fit(image)
    # replace every pixel with the colour of its closest centroid
    compressed = km.cluster_centers_[km.labels_]
    compressed = np.clip(compressed, 0, 255).astype('uint8')  # clip to the valid RGB range first, then cast to uint8
#Reshape the image to original dimension
compressed = compressed.reshape(rows, cols, 3)
#Save and display output image
name = "compressed_"+str(k)
io.imsave(name+'.jpg', compressed)
io.imshow(compressed)
print('Kmeans compression with '+str(k)+' clusters')
io.show()
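# Rough back-of-the-envelope estimate of what this palette compression can save, assuming the
# 600x900 RGB image noted above: the raw image stores 24 bits per pixel, while a k-colour version
# only needs ceil(log2(k)) bits per pixel plus a k*3-byte palette. Actual JPEG/PNG file sizes will
# differ because of their own entropy coding, so this is only an illustration.
import math
raw_bits = rows * cols * 24                               # uncompressed RGB
for k in [4, 8, 16, 32]:
    palette_bits = k * 3 * 8                              # the k centroid colours
    index_bits = rows * cols * math.ceil(math.log2(k))    # one palette index per pixel
    print(k, "colours:", round((palette_bits + index_bits) / raw_bits * 100, 1), "% of raw size")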
def fit2(data, k, N=100):  # like fit(), but also returns an integer cluster label per point; N is the max number of iterations (default 100)
    # pick k distinct random points as the initial centroids WITHOUT shuffling the data in place:
    # the caller relies on the returned labels lining up with the original point (here: pixel) order
    init_idx = np.random.choice(len(data), k, replace=False)
    centroids = {}  # dict: centroid index -> coordinates
    for i in range(k):
        centroids[i] = data[init_idx[i]]
    for run in range(N):  # iterate until the centroids converge or at most N times
        classifications = {}  # points assigned to each centroid in this iteration
        print("run: ", run)
        labels = np.zeros(len(data))  # integer cluster label for every point, in the original data order
        for o in range(k):
            classifications[o] = []  # one empty list of assigned points per cluster
        for j, point in enumerate(data):  # loop through the data and calculate each point's distance to every centroid
distances = [
np.linalg.norm(point-centroids[c]) for c in centroids #idea to use np.linalg.norm from: https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
]
            classification = distances.index(min(distances))  # assign the point to the closest centroid
#print(classification)
labels[j] = classification
classifications[classification].append(point) #add point to classification
        # keep a copy of the old centroids so we can check convergence below
        copy_centroids = dict(centroids)
        # move each centroid to the mean of the points currently assigned to it
        for i in range(k):
            centroids[i] = np.average(classifications[i], axis=0)
#################### Check if done ##################
        best = True  # assume we have converged; set to False below if any centroid moved
for i in range(k):
prev_centroid = copy_centroids[i]
current_centroid = centroids[i]
#print("previous centroid: ",prev_centroid)
#print("current centroid: ",current_centroid)
#print(current_centroid)
if (prev_centroid != current_centroid).any():
#print("they are not the same!")
best = False #if not the same, we break!
break #break loop
if best: #if we got best centroids we are done
break
#print("check 4 !")
return centroids, classifications, labels
def kmeans2(x, K, n_init):  # x: input data, K: number of centroids, n_init: number of random restarts
    for i in range(n_init):  # rerun fit2() n_init times and keep the run with the lowest SSE
centroids, labels, realLabels = fit2(x,K)
print(i)
SSE = 0
for centroid in centroids:
            for point in labels[centroid]:  # loop through the points assigned to this centroid
                SSE = SSE + np.linalg.norm(point - centroids[centroid]) ** 2  # squared distance, so SSE really is a sum of squared errors
#print("SSE is: ", SSE)
if i == 0 or SSE < prevSSE: #first time we skip check and make "best"
bestCentroids = centroids
bestLabels = labels
bestRealLabels = realLabels
prevSSE = SSE
print("best SSE for ",K, " clusters is: ", prevSSE)
return bestCentroids, bestLabels, prevSSE, bestRealLabels
#fit2(X,3)
#kmeans2(X, 10, 10)
centroids, labels, prevSSE, trueLabels = kmeans2(image, 4,1)#data, clusters, reruns
# stack the k centroid colours into a (k, 3) array
dat = list(centroids.values())
an_array = np.array(dat)
an_array
#Replace each pixel value with the colour of its nearest centroid
compressed_image = an_array[trueLabels.astype(int)]
#print(compressed_image)
compressed_image = np.clip(compressed_image, 0, 255).astype('uint8')  # clip to the valid RGB range first, then cast to uint8
#Reshape the image to original dimension
compressed_image = compressed_image.reshape(rows, cols, 3)
#Save and display output image
#io.imsave('compressed_image_64.png', compressed_image)
io.imshow(compressed_image)
io.show()
#from sklearn.datasets import load_breast_cancer  # not needed: the same data is read from the UCI repository below
import pandas as pd
data = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
sep=",",
header=None,
names=[
"id_number", "diagnosis", "radius_mean",
"texture_mean", "perimeter_mean", "area_mean",
"smoothness_mean", "compactness_mean",
"concavity_mean","concave_points_mean",
"symmetry_mean", "fractal_dimension_mean",
"radius_se", "texture_se", "perimeter_se",
"area_se", "smoothness_se", "compactness_se",
"concavity_se", "concave_points_se",
"symmetry_se", "fractal_dimension_se",
"radius_worst", "texture_worst",
"perimeter_worst", "area_worst",
"smoothness_worst", "compactness_worst",
"concavity_worst", "concave_points_worst",
"symmetry_worst", "fractal_dimension_worst"
],).drop("id_number", axis=1)  # the id number column is dropped here
y = data.diagnosis.map({"B": 0, "M": 1}).to_numpy()
X = data.drop("diagnosis", axis=1).to_numpy()  # drop diagnosis to get the same feature matrix as in the task
from sklearn.mixture import GaussianMixture
m = 100  # number of repeats to average over
Kupper = 10
Klower = 2
# metrics
aic_Gmm = np.zeros((Kupper-Klower, m))
bic_Gmm = np.zeros((Kupper-Klower, m))
aic2_Gmm = np.zeros((Kupper-Klower, m))
bic2_Gmm = np.zeros((Kupper-Klower, m))
for i in range(Klower,Kupper):
print(f"loop {i}")
for k in range(m):
Gmm = GaussianMixture(n_components=i).fit(X)
y_gmm = Gmm.predict(X)
aic_Gmm[i-Klower,k] = Gmm.aic(X)
bic_Gmm[i-Klower,k] = Gmm.bic(X)
        ####### could not get the calc_aic / calc_bic helpers to match Gmm.aic / Gmm.bic here; see the fix further down
#logLike = Gmm.score(X)
#size = X.shape[0]
#aic2_Gmm[i-Klower,k] = calc_aic(size, logLike ,k)
#bic2_Gmm[i-Klower,k] = calc_bic(size, logLike ,k)
fig, ax = plt.subplots(1,2, figsize = (24,6))
# aic
ax[0].errorbar(
    list(range(Klower, Kupper)),  # the cluster counts we checked (2..9)
np.median(aic_Gmm,axis=1),
[
np.median(aic_Gmm,axis=1)-np.min(aic_Gmm, axis=1),
np.max(aic_Gmm, axis=1)-np.median(aic_Gmm,axis=1),
], fmt='-o', ecolor='r'
)
ax[0].set_xlabel("Clusters")
ax[0].set_ylabel("AIC")
ax[0].set_title("aic_Gmm scores")
# BIC
ax[1].errorbar(
    list(range(Klower, Kupper)),  # the cluster counts we checked (2..9)
np.median(bic_Gmm,axis=1),
[
np.median(bic_Gmm,axis=1)-np.min(bic_Gmm, axis=1),
np.max(bic_Gmm, axis=1)-np.median(bic_Gmm,axis=1),
], fmt='-o', ecolor='r'
)
ax[1].set_xlabel("Clusters")
ax[1].set_ylabel("BIC")
ax[1].set_title("bic_Gmm scores")
from sklearn.mixture import GaussianMixture
k= 3
Gmm = GaussianMixture(n_components=k).fit(X)  # fit the mixture to X; the model now holds the estimated weights, means and covariances
Gmm.aic(X)  # sklearn's AIC for the fitted model
logLike = Gmm.score(X)  # average log-likelihood per sample
size = X.shape[0]
AIC = -2 * logLike * size + 2 * k  # attempt at AIC by hand, using the number of components as the penalty term
print(AIC)
Gmm.aic(X)  # why are these two values not the same?
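# The two numbers differ because of the penalty term. Gmm.score(X) returns the *average*
# log-likelihood per sample (so multiplying by size is right), but the k in AIC = -2*logL + 2*k
# must be the number of free parameters of the mixture, not the number of components. For the
# default full covariance that is, per component, d means plus d*(d+1)/2 covariance entries, plus
# k-1 free mixture weights. A corrected sketch (n_params is a name introduced here, not from sklearn):
d = X.shape[1]
n_params = k * d + k * d * (d + 1) // 2 + (k - 1)  # means + full covariances + mixture weights
AIC_correct = -2 * logLike * size + 2 * n_params
print(AIC_correct, Gmm.aic(X))  # these should now agree (up to floating point)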
# AIC / BIC helpers built around the per-sample log-likelihood from GaussianMixture.score().
# Note: k here must be the number of free model parameters (see above), not the number of clusters;
# passing the number of clusters is why these did not reproduce Gmm.aic / Gmm.bic at first.
def calc_aic(n, logl, k):
    aic = -2 * logl * n + 2 * k  # -2 * total log-likelihood + 2 * number of free parameters
    return aic
def calc_bic(n, logl, k):
    bic = -2 * logl * n + k * np.log(n)  # BIC penalises each parameter by log(n) instead of 2
    return bic
from sklearn.mixture import GaussianMixture
m = 100  # number of repeats to average over
Kupper = 10
Klower = 2
# metrics
aic_Gmm = np.zeros((Kupper-Klower, m))
bic_Gmm = np.zeros((Kupper-Klower, m))
aic2_Gmm = np.zeros((Kupper-Klower, m))
bic2_Gmm = np.zeros((Kupper-Klower, m))
for i in range(Klower,Kupper):
print(f"loop {i}")
for k in range(m):
Gmm = GaussianMixture(n_components=i).fit(X)
y_gmm = Gmm.predict(X)
aic_Gmm[i-Klower,k] = Gmm.aic(X)
bic_Gmm[i-Klower,k] = Gmm.bic(X)
        logLike = Gmm.score(X)  # average log-likelihood per sample
        size = X.shape[0]
        d = X.shape[1]
        n_params = i * d + i * d * (d + 1) // 2 + (i - 1)  # free parameters of a full-covariance GMM with i components
        aic2_Gmm[i-Klower,k] = calc_aic(size, logLike, n_params)
        bic2_Gmm[i-Klower,k] = calc_bic(size, logLike, n_params)
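# Sanity check for the corrected helpers (added as a sketch, assuming the default full covariance):
# the hand-rolled AIC/BIC should now match sklearn's own values for every fit above.
print("AIC match:", np.allclose(aic_Gmm, aic2_Gmm))
print("BIC match:", np.allclose(bic_Gmm, bic2_Gmm))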