colors = ['red',
'green',
'blue',
'yellow',
'darkblue',
'brown',
'purple',
'cyan',
'yellowgreen',
'silver',
'hotpink',] #will use for visualisation
def fit(data, k, N=100):  # data - the data points, k - the number of centroids, N - max number of iterations (defaults to 100, no particular reason)
    np.random.shuffle(data)  # shuffle in place so each call starts from different initial centroids (note: this also reorders the caller's array)
#print(data)
    centroids = {}  # dict: centroid index -> coordinates
    for i in range(k):  # use the first k shuffled points as the initial centroids
        centroids[i] = data[i]
for run in range(N): #we run until system converges OR N times.
#print(run)
        classifications = {}  # points assigned to each centroid in this iteration
#print(i)
        for i in range(k):
            classifications[i] = []  # one empty list of assigned points per cluster
        for point in data:  # loop through the data and calculate each point's distance to every centroid
distances = [
np.linalg.norm(point-centroids[c]) for c in centroids #idea to use np.linalg.norm from: https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
]
            classification = distances.index(min(distances))  # assign the point to the closest centroid
classifications[classification].append(point) #add point to classification
        # keep a copy of the old centroids so we can check convergence below
        copy_centroids = dict(centroids)
        # move each centroid to the mean of the points currently assigned to it
        for i in range(k):
            centroids[i] = np.average(classifications[i], axis=0)
#################### Check if done ##################
        best = True  # assume we have converged; set to False below if any centroid moved
for i in range(k):
prev_centroid = copy_centroids[i]
current_centroid = centroids[i]
#print("previous centroid: ",prev_centroid)
#print("current centroid: ",current_centroid)
#print(current_centroid)
if (prev_centroid != current_centroid).any():
#print("they are not the same!")
best = False #if not the same, we break!
break #break loop
if best: #if we got best centroids we are done
break
return centroids, classifications
def kmeans(x, K, n_init):  # x: input data, K: number of centroids, n_init: number of random restarts
    for i in range(n_init):  # rerun fit() n_init times and keep the run with the lowest SSE
centroids, labels = fit(x,K)
#print(i)
SSE = 0
for centroid in centroids:
            for point in labels[centroid]:  # loop through the points assigned to this centroid
                SSE = SSE + np.linalg.norm(point - centroids[centroid]) ** 2  # squared distance, so SSE really is a sum of squared errors
#print("SSE is: ", SSE)
        if i == 0 or SSE < prevSSE:  # the first run always becomes the current best
            bestCentroids = centroids
            bestLabels = labels
            prevSSE = SSE
            #print("new best SSE! ", prevSSE)
print("best SSE for ",K, " clusters is: ", prevSSE)
return bestCentroids, bestLabels, prevSSE
#fit(X,3)
#kmeans(X, 10, 10)
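# A minimal usage sketch of the two functions above (the toy array below is made up purely for
# illustration, it is not data from this assignment): fit() does a single k-means run, kmeans()
# repeats it n_init times and keeps the run with the lowest SSE.
import numpy as np
toy = np.array([[0.0, 0.0], [0.2, 0.1], [0.1, 0.3],
                [5.0, 5.0], [5.1, 4.9], [4.8, 5.2]])
toy_centroids, toy_clusters = fit(toy, 2)               # one run: dicts of centroids and member points
best_c, best_l, best_sse = kmeans(toy, K=2, n_init=5)   # keep the best of 5 random restarts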
import matplotlib.pyplot as plt
# Generate test data # make_blobs code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_plusplus.html#sphx-glr-auto-examples-cluster-plot-kmeans-plusplus-py
from sklearn.datasets import make_blobs
n_samples = 300
n_components = 3
X, y_true = make_blobs(
n_samples=n_samples, centers=n_components, cluster_std=0.65, random_state=0
)
X = X[:, ::-1]
plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
import numpy as np
# to help us decide on K we make an elbow plot of the SSE
allSSE = []
for k in range(1,11):
centroids, labels, prevSSE = kmeans(X,k,10)
allSSE.append(prevSSE)
plt.plot(range(1,11), allSSE)
plt.title('Elbow plot')
plt.xlabel('clusters')
plt.ylabel('Within cluster scatter')
plt.show()
# plot kmeans for best: #plot code based on https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
centroids, labels, prevSSE = kmeans(X,3,1)
for classification in labels:
color = colors[classification]
for featureset in labels[classification]:
plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5)
for centroid in centroids:
plt.scatter(centroids[centroid][0], centroids[centroid][1],
marker="o", color="k", s=150, linewidths=5)
plt.show()
prevSSE  # show the SSE of the 3-cluster fit above
# Generate test data # make_blobs code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_plusplus.html#sphx-glr-auto-examples-cluster-plot-kmeans-plusplus-py
from sklearn.datasets import make_blobs
n_samples = 300
n_components = 3
X, y_true = make_blobs(
n_samples=n_samples, centers=n_components, cluster_std=1, random_state=0
)
X = X[:, ::-1]
plt.scatter(X[:,0], X[:,1], s=150)
plt.show()
# to help us decide on K we make an elbow plot of the SSE
allSSE = []
for k in range(1,11):
centroids, labels, prevSSE = kmeans(X,k,15)
allSSE.append(prevSSE)
plt.plot(range(1,11), allSSE)
plt.title('Elbow plot')
plt.xlabel('clusters')
plt.ylabel('Within cluster scatter')
plt.show()
from skimage import io
#image = io.imread('C:/Users/maru7/Min enhet/bodyflight.png') 4mbr1x6t7k251
image = io.imread('C:/Users/maru7/Min enhet/4mbr1x6t7k251.jpg')
io.imshow(image)
io.show()
#image.shape #shape is 600x900x3
#Dimension of the original image
rows = image.shape[0]
cols = image.shape[1]
image = image.reshape(rows*cols, 3)  # flatten to one row per pixel with its 3 RGB values as features
image.shape
#code source: https://towardsdatascience.com/image-compression-using-k-means-clustering-aa0c91bb0eeb
from sklearn.cluster import KMeans
import numpy as np
for k in [4,8,16,32]:
    km = KMeans(n_clusters=k)  # a different name so we do not shadow our own kmeans() function defined above
    km.fit(image)
    # replace every pixel with the colour of its closest centroid
    compressed = km.cluster_centers_[km.labels_]
    compressed = np.clip(compressed, 0, 255).astype('uint8')  # clip to the valid RGB range first, then cast to uint8
#Reshape the image to original dimension
compressed = compressed.reshape(rows, cols, 3)
#Save and display output image
name = "compressed_"+str(k)
io.imsave(name+'.jpg', compressed)
io.imshow(compressed)
print('Kmeans compression with '+str(k)+' clusters')
io.show()
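# Rough back-of-the-envelope estimate of what this palette compression can save, assuming the
# 600x900 RGB image noted above: the raw image stores 24 bits per pixel, while a k-colour version
# only needs ceil(log2(k)) bits per pixel plus a k*3-byte palette. Actual JPEG/PNG file sizes will
# differ because of their own entropy coding, so this is only an illustration.
import math
raw_bits = rows * cols * 24                               # uncompressed RGB
for k in [4, 8, 16, 32]:
    palette_bits = k * 3 * 8                              # the k centroid colours
    index_bits = rows * cols * math.ceil(math.log2(k))    # one palette index per pixel
    print(k, "colours:", round((palette_bits + index_bits) / raw_bits * 100, 1), "% of raw size")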
def fit2(data, k, N=100):  # like fit(), but also returns an integer cluster label per point; N is the max number of iterations (default 100)
    # pick k distinct random points as the initial centroids WITHOUT shuffling the data in place:
    # the caller relies on the returned labels lining up with the original point (here: pixel) order
    init_idx = np.random.choice(len(data), k, replace=False)
    centroids = {}  # dict: centroid index -> coordinates
    for i in range(k):
        centroids[i] = data[init_idx[i]]
    for run in range(N):  # iterate until the centroids converge or at most N times
        classifications = {}  # points assigned to each centroid in this iteration
        print("run: ", run)
        labels = np.zeros(len(data))  # integer cluster label for every point, in the original data order
        for o in range(k):
            classifications[o] = []  # one empty list of assigned points per cluster
        for j, point in enumerate(data):  # loop through the data and calculate each point's distance to every centroid
distances = [
np.linalg.norm(point-centroids[c]) for c in centroids #idea to use np.linalg.norm from: https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875
]
            classification = distances.index(min(distances))  # assign the point to the closest centroid
#print(classification)
labels[j] = classification
classifications[classification].append(point) #add point to classification
        # keep a copy of the old centroids so we can check convergence below
        copy_centroids = dict(centroids)
        # move each centroid to the mean of the points currently assigned to it
        for i in range(k):
            centroids[i] = np.average(classifications[i], axis=0)
#################### Check if done ##################
        best = True  # assume we have converged; set to False below if any centroid moved
for i in range(k):
prev_centroid = copy_centroids[i]
current_centroid = centroids[i]
#print("previous centroid: ",prev_centroid)
#print("current centroid: ",current_centroid)
#print(current_centroid)
if (prev_centroid != current_centroid).any():
#print("they are not the same!")
best = False #if not the same, we break!
break #break loop
if best: #if we got best centroids we are done
break
#print("check 4 !")
return centroids, classifications, labels
def kmeans2(x, K, n_init):  # x: input data, K: number of centroids, n_init: number of random restarts
    for i in range(n_init):  # rerun fit2() n_init times and keep the run with the lowest SSE
centroids, labels, realLabels = fit2(x,K)
print(i)
SSE = 0
for centroid in centroids:
            for point in labels[centroid]:  # loop through the points assigned to this centroid
                SSE = SSE + np.linalg.norm(point - centroids[centroid]) ** 2  # squared distance, so SSE really is a sum of squared errors
#print("SSE is: ", SSE)
if i == 0 or SSE < prevSSE: #first time we skip check and make "best"
bestCentroids = centroids
bestLabels = labels
bestRealLabels = realLabels
prevSSE = SSE
print("best SSE for ",K, " clusters is: ", prevSSE)
return bestCentroids, bestLabels, prevSSE, bestRealLabels
#fit2(X,3)
#kmeans2(X, 10, 10)
centroids, labels, prevSSE, trueLabels = kmeans2(image, 4,1)#data, clusters, reruns
# stack the k centroid colours into a (k, 3) array
dat = list(centroids.values())
an_array = np.array(dat)
an_array
#Replace each pixel value with the colour of its nearest centroid
compressed_image = an_array[trueLabels.astype(int)]
#print(compressed_image)
compressed_image = np.clip(compressed_image, 0, 255).astype('uint8')  # clip to the valid RGB range first, then cast to uint8
#Reshape the image to original dimension
compressed_image = compressed_image.reshape(rows, cols, 3)
#Save and display output image
#io.imsave('compressed_image_64.png', compressed_image)
io.imshow(compressed_image)
io.show()
#from sklearn.datasets import load_breast_cancer  # not needed: the same data is read from the UCI repository below
import pandas as pd
data = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
sep=",",
header=None,
names=[
"id_number", "diagnosis", "radius_mean",
"texture_mean", "perimeter_mean", "area_mean",
"smoothness_mean", "compactness_mean",
"concavity_mean","concave_points_mean",
"symmetry_mean", "fractal_dimension_mean",
"radius_se", "texture_se", "perimeter_se",
"area_se", "smoothness_se", "compactness_se",
"concavity_se", "concave_points_se",
"symmetry_se", "fractal_dimension_se",
"radius_worst", "texture_worst",
"perimeter_worst", "area_worst",
"smoothness_worst", "compactness_worst",
"concavity_worst", "concave_points_worst",
"symmetry_worst", "fractal_dimension_worst"
],).drop("id_number", axis=1)  # the id number column is dropped here
y = data.diagnosis.map({"B": 0, "M": 1}).to_numpy()
X = data.drop("diagnosis", axis=1).to_numpy()  # drop diagnosis to get the same feature matrix as in the task
from sklearn.mixture import GaussianMixture
m = 100  # number of repeats to average over
Kupper = 10
Klower = 2
# metrics
aic_Gmm = np.zeros((Kupper-Klower, m))
bic_Gmm = np.zeros((Kupper-Klower, m))
aic2_Gmm = np.zeros((Kupper-Klower, m))
bic2_Gmm = np.zeros((Kupper-Klower, m))
for i in range(Klower,Kupper):
print(f"loop {i}")
for k in range(m):
Gmm = GaussianMixture(n_components=i).fit(X)
y_gmm = Gmm.predict(X)
aic_Gmm[i-Klower,k] = Gmm.aic(X)
bic_Gmm[i-Klower,k] = Gmm.bic(X)
        ####### could not get the calc_aic / calc_bic helpers to match Gmm.aic / Gmm.bic here; see the fix further down
#logLike = Gmm.score(X)
#size = X.shape[0]
#aic2_Gmm[i-Klower,k] = calc_aic(size, logLike ,k)
#bic2_Gmm[i-Klower,k] = calc_bic(size, logLike ,k)
fig, ax = plt.subplots(1,2, figsize = (24,6))
# aic
ax[0].errorbar(
    list(range(Klower, Kupper)),  # the cluster counts we checked (2..9)
np.median(aic_Gmm,axis=1),
[
np.median(aic_Gmm,axis=1)-np.min(aic_Gmm, axis=1),
np.max(aic_Gmm, axis=1)-np.median(aic_Gmm,axis=1),
], fmt='-o', ecolor='r'
)
ax[0].set_xlabel("Clusters")
ax[0].set_ylabel("AIC")
ax[0].set_title("aic_Gmm scores")
# BIC
ax[1].errorbar(
    list(range(Klower, Kupper)),  # the cluster counts we checked (2..9)
np.median(bic_Gmm,axis=1),
[
np.median(bic_Gmm,axis=1)-np.min(bic_Gmm, axis=1),
np.max(bic_Gmm, axis=1)-np.median(bic_Gmm,axis=1),
], fmt='-o', ecolor='r'
)
ax[1].set_xlabel("Clusters")
ax[1].set_ylabel("BIC")
ax[1].set_title("bic_Gmm scores")
from sklearn.mixture import GaussianMixture
k= 3
Gmm = GaussianMixture(n_components=k).fit(X)  # fit the mixture to X; the model now holds the estimated weights, means and covariances
Gmm.aic(X)  # sklearn's AIC for the fitted model
logLike = Gmm.score(X)  # average log-likelihood per sample
size = X.shape[0]
AIC = -2 * logLike * size + 2 * k  # attempt at AIC by hand, using the number of components as the penalty term
print(AIC)
Gmm.aic(X)  # why are these two values not the same?
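# The two numbers differ because of the penalty term. Gmm.score(X) returns the *average*
# log-likelihood per sample (so multiplying by size is right), but the k in AIC = -2*logL + 2*k
# must be the number of free parameters of the mixture, not the number of components. For the
# default full covariance that is, per component, d means plus d*(d+1)/2 covariance entries, plus
# k-1 free mixture weights. A corrected sketch (n_params is a name introduced here, not from sklearn):
d = X.shape[1]
n_params = k * d + k * d * (d + 1) // 2 + (k - 1)  # means + full covariances + mixture weights
AIC_correct = -2 * logLike * size + 2 * n_params
print(AIC_correct, Gmm.aic(X))  # these should now agree (up to floating point)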
# AIC / BIC helpers built around the per-sample log-likelihood from GaussianMixture.score().
# Note: k here must be the number of free model parameters (see above), not the number of clusters;
# passing the number of clusters is why these did not reproduce Gmm.aic / Gmm.bic at first.
def calc_aic(n, logl, k):
    aic = -2 * logl * n + 2 * k  # -2 * total log-likelihood + 2 * number of free parameters
    return aic
def calc_bic(n, logl, k):
    bic = -2 * logl * n + k * np.log(n)  # BIC penalises each parameter by log(n) instead of 2
    return bic
from sklearn.mixture import GaussianMixture
m = 100  # number of repeats to average over
Kupper = 10
Klower = 2
# metrics
aic_Gmm = np.zeros((Kupper-Klower, m))
bic_Gmm = np.zeros((Kupper-Klower, m))
aic2_Gmm = np.zeros((Kupper-Klower, m))
bic2_Gmm = np.zeros((Kupper-Klower, m))
for i in range(Klower,Kupper):
print(f"loop {i}")
for k in range(m):
Gmm = GaussianMixture(n_components=i).fit(X)
y_gmm = Gmm.predict(X)
aic_Gmm[i-Klower,k] = Gmm.aic(X)
bic_Gmm[i-Klower,k] = Gmm.bic(X)
        logLike = Gmm.score(X)  # average log-likelihood per sample
        size = X.shape[0]
        d = X.shape[1]
        n_params = i * d + i * d * (d + 1) // 2 + (i - 1)  # free parameters of a full-covariance GMM with i components
        aic2_Gmm[i-Klower,k] = calc_aic(size, logLike, n_params)
        bic2_Gmm[i-Klower,k] = calc_bic(size, logLike, n_params)
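# Sanity check for the corrected helpers (added as a sketch, assuming the default full covariance):
# the hand-rolled AIC/BIC should now match sklearn's own values for every fit above.
print("AIC match:", np.allclose(aic_Gmm, aic2_Gmm))
print("BIC match:", np.allclose(bic_Gmm, bic2_Gmm))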