import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt

# Load the Iris dataset and inspect it before clustering.
iris = datasets.load_iris()

# Raw feature matrix: one row per flower, four measurements per row.
print(iris.data)

# Ground-truth species codes (0/1/2) — the labels k-means should rediscover.
print(iris.target)

# Full dataset description shipped with scikit-learn.
print(iris.DESCR)

# Visualize two of the features as a 2D scatter plot: sepal length
# (column 0) against sepal width (column 1).  alpha=0.5 makes regions
# where points overlap appear darker.
samples = iris.data
x = samples[:, 0]
y = samples[:, 1]
plt.scatter(x, y, alpha=0.5)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()

# Initialise k = 3 centroids at random, drawing each coordinate
# uniformly from the observed range of the corresponding feature.
k = 3
centroids_x = np.random.uniform(min(x), max(x), size=k)
centroids_y = np.random.uniform(min(y), max(y), size=k)
centroids = np.column_stack((centroids_x, centroids_y))
print(centroids)

# Show the random starting centroids on top of the data points.
plt.scatter(x, y, alpha=0.5)
plt.scatter(centroids_x, centroids_y)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()

# Prepare for the assignment step: pair the two plotted features so that
# each sample becomes a single 2D point (sepal length, sepal width).
# Each point will be assigned to whichever centroid is closest, using
# the Euclidean distance and argmin over the k distances.
x = samples[:, 0]
y = samples[:, 1]
sepal_length_width = np.column_stack((x, y))
def distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Generalized from the original two-component version: ``a`` and ``b``
    may be equal-length sequences of any dimension.  For this script's
    2D points the result is unchanged.
    """
    # Sum the squared per-coordinate differences, then take the root.
    return sum((ai - bi) ** 2 for ai, bi in zip(a, b)) ** 0.5
# Assign every sample to its nearest centroid.
# labels[i] holds the cluster index chosen for sample i; distances is a
# scratch array holding the k centroid distances for one sample.
labels = np.zeros(len(samples))
distances = np.zeros(k)

for i in range(len(samples)):
    # Distance from sample i to each centroid.  The original hard-coded
    # exactly three centroids; looping over range(k) works for any k.
    for c in range(k):
        distances[c] = distance(sepal_length_width[i], centroids[c])
    # argmin returns the index of the smallest distance.
    labels[i] = np.argmin(distances)

# Inspect the initial assignment.
print(labels)

# Update each centroid to the mean of the samples assigned to it.
from copy import deepcopy

# Keep a deep copy of the previous centroids so movement can later be
# measured (a deep copy recursively duplicates the nested arrays, so
# updating `centroids` cannot mutate `centroids_old`).
centroids_old = deepcopy(centroids)

for i in range(k):
    # All points currently carrying cluster label i.
    points = [sepal_length_width[j]
              for j in range(len(sepal_length_width)) if labels[j] == i]
    # Robustness fix: an empty cluster used to produce np.mean([]) = nan
    # (with a RuntimeWarning); leave such a centroid where it is instead.
    if points:
        centroids[i] = np.mean(points, axis=0)

print(centroids_old)
print(centroids)

# Repeat the assign/update steps until the centroids stop moving.
# Fixes vs the original:
#   * the error array was initialised twice (duplicate line removed);
#   * the loop condition was `error.all() != 0`, which exits as soon as
#     ANY single centroid stops moving — `error.any()` keeps iterating
#     until ALL centroids are stationary;
#   * the cluster count 3 was hard-coded in several places; k is used.
error = np.zeros(k)
for c in range(k):
    error[c] = distance(centroids[c], centroids_old[c])

while error.any():
    # Step 2: assign every sample to its nearest centroid.
    for i in range(len(samples)):
        for c in range(k):
            distances[c] = distance(sepal_length_width[i], centroids[c])
        labels[i] = np.argmin(distances)

    # Step 3: move each centroid to the mean of its assigned points.
    centroids_old = deepcopy(centroids)
    for i in range(k):
        points = [sepal_length_width[j]
                  for j in range(len(sepal_length_width)) if labels[j] == i]
        # Guard: np.mean of an empty cluster would yield nan.
        if points:
            centroids[i] = np.mean(points, axis=0)

    # Recompute how far each centroid moved this iteration.
    for c in range(k):
        error[c] = distance(centroids[c], centroids_old[c])

# Plot the converged clustering: one colour per cluster, final centroids
# drawn as large diamonds.
colors = ['r', 'g', 'b']
for i in range(k):
    points = np.array([sepal_length_width[j]
                       for j in range(len(samples)) if labels[j] == i])
    # Guard: indexing points[:, 0] on an empty cluster would raise.
    if len(points):
        plt.scatter(points[:, 0], points[:, 1], c=colors[i], alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='D', s=150)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()

# The same clustering done with scikit-learn instead of the hand-rolled
# implementation above.
from sklearn.cluster import KMeans

iris = datasets.load_iris()
samples = iris.data

# Fit a 3-cluster model on the full 4-feature data, then read back each
# training sample's cluster assignment.
model = KMeans(n_clusters=3)
model.fit(samples)
labels = model.predict(samples)
print(labels)

# Three unseen flowers (same four measurements each) to classify.
new_samples = np.array([[5.7, 4.4, 1.5, 0.4],
                        [6.5, 3.0, 5.5, 0.4],
                        [5.8, 2.7, 5.1, 1.9]])
print(new_samples)

new_labels = model.predict(new_samples)
print(new_labels)

# Scatter of sepal length vs sepal width, coloured by cluster label.
x = samples[:, 0]
y = samples[:, 1]
plt.scatter(x, y, c=labels, alpha=0.5)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()

# Evaluate the clustering against the ground-truth species.
import pandas as pd

target = iris.target

# Map each numeric target code to its species name.  Fixes two defects
# in the original: np.chararray is deprecated (and was created with a
# meaningless itemsize=150), and 'versicolor' was misspelled
# 'veriscolor'.
species_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
species = [species_names[t] for t in target]

df = pd.DataFrame({'labels': labels, 'species': species})
print(df)

# Cross-tabulation reveals how the cluster labels line up with the true
# species — a relationship not readily apparent from the raw labels.
ct = pd.crosstab(df['labels'], df['species'])
print(ct)

# Optical-digits dataset from UCI: 64 pixel-count columns followed by
# one ground-truth class column.
digits = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra", header=None)
test_set = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes", header=None)

# Fix: the original clustered on `digits` directly, which includes the
# ground-truth class in the last column (a label leaking into the
# features).  Cluster on the 64 feature columns only.
samples = digits.iloc[:, :-1]

# Elbow plot: inertia for k = 1..49 to help choose a cluster count.
num_clusters = list(range(1, 50))
inertias = []
for k in num_clusters:
    model = KMeans(n_clusters=k)
    model.fit(samples)
    inertias.append(model.inertia_)

plt.plot(num_clusters, inertias, '-o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.show()

# Fit the chosen model and show its cluster assignments.
model = KMeans(n_clusters=20)
model.fit(samples)
labels = model.predict(samples)
print(labels)

import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets

# Load scikit-learn's bundled 8x8 handwritten-digit images and inspect
# the raw structures.
digits = datasets.load_digits()
print(digits)
print(digits.DESCR)
print(digits.data)
print(digits.target)

# Display the first 64 digit images in an 8x8 grid, each annotated with
# its ground-truth label near the lower-left corner.
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    # One tick-free subplot per image; positions are 1-based.
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    ax.text(0, 7, str(digits.target[i]))
plt.show()

from sklearn.cluster import KMeans

# Ten digits (0-9), so we need ten clusters; random_state pins the
# initialisation so the cluster centers are reproducible.
model = KMeans(n_clusters=10, random_state=42)
model.fit(digits.data)

# Render each learned cluster center as an 8x8 grayscale image.
fig = plt.figure(figsize=(8, 3))
# Fix: title typo 'Cluser' -> 'Cluster'.
fig.suptitle('Cluster Center Images', fontsize=14, fontweight='bold')
for i in range(10):
    # 2x5 grid of subplots; positions are 1-based.
    ax = fig.add_subplot(2, 5, 1 + i)
    ax.imshow(model.cluster_centers_[i].reshape((8, 8)), cmap=plt.cm.binary)
plt.show()

# Hand-drawn digit samples, each row a flattened 8x8 grayscale grid of
# 64 intensity values in the same format as digits.data.
# Presumably these four drawings are meant to read "2221" — TODO confirm.
#2221
new_samples1 = np.array([
[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.69,0.61,0.00,0.00,0.00,0.00,0.00,4.25,7.62,7.62,3.70,0.00,0.00,0.00,0.00,5.33,6.55,5.23,7.62,2.13,0.00,0.00,0.00,4.64,6.24,0.38,7.62,3.58,0.00,0.00,0.00,0.15,0.38,0.07,7.61,3.80,0.00,0.00,0.00,0.00,0.61,6.39,7.62,7.24,6.86,0.00,0.00,0.00,0.30,4.41,4.57,4.27,3.81],
[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.14,2.29,2.29,1.22,0.00,0.00,0.00,0.00,7.47,7.62,7.62,7.62,6.63,3.80,0.00,0.00,6.77,0.99,1.37,3.04,6.63,7.62,3.79,0.00,6.63,0.45,0.00,0.00,0.15,5.63,6.85,0.00,7.54,7.01,4.49,2.51,2.73,6.78,5.86,0.00,1.52,4.95,7.23,7.62,7.62,6.78,1.73,0.00,0.00,0.00,0.22,0.76,0.76,0.15,0.00,0.00],
[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.67,4.41,3.80,0.07,0.00,0.00,0.00,0.53,7.54,7.53,7.62,3.11,0.00,0.00,0.00,0.76,7.62,3.04,7.61,4.11,0.00,0.00,0.00,0.30,6.47,2.66,6.46,6.17,0.00,0.00,0.00,0.00,0.00,0.00,4.56,6.85,0.00,0.00,5.18,5.34,5.34,5.41,7.09,6.17,0.00,0.00,5.18,5.34,5.34,5.34,4.87,1.20,0.00,0.00],
[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.57,5.33,0.00,0.00,0.00,0.00,0.00,0.00,5.33,6.10,0.00,0.00,0.00,0.00,0.00,0.00,5.33,6.10,0.00,0.00,0.00,0.00,0.00,0.00,5.33,6.10,0.00,0.00,0.00,0.00,0.00,0.00,5.33,6.09,0.00,0.00,0.00,0.00,0.00,0.00,2.97,3.50,0.00,0.00,0.00]
])
# Four more drawings, presumably meant to read "2223" — TODO confirm.
#2223
new_samples2 = np.array([
[0.00,5.72,7.55,6.64,7.62,4.35,0.00,0.00,0.00,1.68,1.37,0.08,6.10,6.10,0.00,0.00,0.00,0.00,0.08,3.74,7.55,5.03,0.00,0.00,0.00,0.00,4.96,7.62,5.26,0.46,0.00,0.00,0.00,1.14,7.62,4.50,0.00,0.00,0.00,0.00,0.00,2.90,7.62,4.80,3.81,3.81,3.36,0.08,0.00,4.80,7.55,6.86,6.86,6.86,6.41,0.38,0.00,0.23,0.53,0.00,0.00,0.00,0.00,0.00],
[0.00,2.29,7.55,7.62,7.62,7.24,0.99,0.00,0.00,3.58,7.55,1.76,2.21,7.62,2.82,0.00,0.00,0.23,1.30,3.13,6.40,7.62,1.98,0.00,0.00,2.13,6.71,7.62,6.25,2.21,0.00,0.00,1.52,7.55,7.55,5.26,3.05,3.05,3.58,0.46,2.29,7.62,7.62,7.62,7.62,7.62,7.55,1.60,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00],
[1.98,7.40,7.40,7.40,7.48,3.97,0.00,0.00,0.76,3.74,0.84,1.14,5.65,7.32,0.69,0.00,0.00,0.00,0.00,0.00,1.52,7.55,3.97,0.00,0.00,0.00,0.00,0.00,0.23,7.09,4.58,0.00,0.00,0.15,1.30,3.74,6.94,7.63,2.44,0.00,3.05,7.09,7.62,7.62,7.62,7.47,6.86,4.73,4.04,6.86,6.33,5.34,4.65,4.57,4.04,2.29,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00],
[0.00,2.67,7.09,7.62,7.40,2.82,0.00,0.00,0.00,5.64,6.10,2.75,6.25,6.86,0.00,0.00,0.00,0.31,0.23,1.68,6.03,6.79,0.00,0.00,0.00,1.14,6.63,7.62,7.62,6.94,0.23,0.00,0.00,0.69,4.50,3.89,3.74,7.63,3.74,0.00,0.69,1.98,0.69,0.00,0.38,7.09,4.58,0.00,5.87,7.62,3.89,1.22,6.03,7.55,2.82,0.00,5.56,7.62,7.24,7.62,7.25,2.60,0.00,0.00]
])
# NOTE(review): new_samples1 is defined but never predicted — only
# new_samples2 is passed to the model.  Looks like an oversight; confirm
# whether a prediction for new_samples1 was intended.
new_labels = model.predict(new_samples2)
print(new_labels, end='')