# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Build a synthetic 2-D data set with make_blobs.
#
# make_blobs hands back the point coordinates (kept in x) and, for every
# point, the index of the blob it was drawn from (kept in y).
blob_settings = dict(
    n_samples=200,
    n_features=2,
    centers=5,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)
x, y = make_blobs(**blob_settings)

# Show the raw points without any cluster colouring.
first_coord, second_coord = x[:, 0], x[:, 1]
plt.scatter(first_coord, second_coord, c='white', edgecolor='black', marker='o', s=50)
plt.show()
# Calculate the centroid of every blob; y[i] is the blob index of point x[i].
n = len(y)
# Derive the blob sizes and the number of blobs from the labels themselves
# instead of hard-coding them, so the centroids stay correct if the data set
# above is generated with different n_samples / centers.
n_points_blob = np.bincount(y)
n_blobs = len(n_points_blob)
blob_mean = np.zeros((n_blobs, 2))
#
# For each blob: sum the coordinates of its members (selected with a boolean
# mask on y), divide by the member count, and print the result.
for k in range(n_blobs):
    blob_mean[k, :] = x[y == k].sum(axis=0) / n_points_blob[k]
    print(f"Blob {k + 1} has centroid: ( {blob_mean[k, 0]}, {blob_mean[k, 1]} ) ")
# Create the KMeans model and fit the data in array x (one row per point,
# one column per feature).
# Ask for as many clusters as there are blobs, so that the fitted centroids
# can be compared one-to-one with blob_mean below.
# n_init is pinned explicitly because its default differs between
# scikit-learn releases; random_state makes the random initialisation
# reproducible.
km = KMeans(n_clusters=n_blobs, init='random', n_init=10, random_state=0)
km.fit(x)

# Plot the clusters using colors, with the fitted centroids as black stars
plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap='rainbow')
plt.scatter(
    km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
    marker='*', c='black', label='centroids',
)
plt.legend(scatterpoints=1)
plt.grid()
plt.show()

# Re-draw the uncoloured blobs for a visual comparison
plt.scatter(x[:, 0], x[:, 1], c='white', edgecolors='black', marker='o', s=50)
plt.show()

# Print out the final cluster centroids and the actual blob centroids to compare
#
print(" KMeans Centroids ", " Original Centroids ")
print(f" {km.cluster_centers_} {blob_mean} ")
# KMeans numbers its clusters arbitrarily, so the rows of the two arrays need
# not appear in the same order; match them by coordinates, not by row index.
# NOTE(review): the original author observed KMeans cluster 3 matching blob 5
# and vice versa - re-check against the printed output.
# Watch KMeans move the centroids: start from the same hand-picked initial
# centroids and stop after 1 iteration, then after 2 iterations.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
for n_iter, ordinal, plot_title in ((1, "1st", '1 iteration'), (2, "2nd", '2 iterations')):
    print()
    print("******************************************")
    print(f"Results for {ordinal} iteration of KMeans")
    # n_init=1: with explicit starting centroids every restart would be
    # identical, and older scikit-learn releases warn when n_init is left
    # at a value other than 1.
    # NOTE: stopping this early may raise a ConvergenceWarning; that is
    # expected here.
    km = KMeans(n_clusters=len(centroids), init=centroids, n_init=1, max_iter=n_iter)
    km.fit(x)
    print(f'Centroids are {km.cluster_centers_}')
    plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap='rainbow')
    plt.title(plot_title)
    plt.grid()
    plt.show()
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
# Import models
from sklearn.cluster import KMeans
# Create dataframe from file insurance.csv
df = pd.read_csv('insurance.csv')
# A bare df.head(10) only displays something in a notebook cell; print it so
# the preview also appears when this runs as a script.
print(df.head(10))
# Create a new dataframe which only has the two features (age and bmi) and
# the charges; .copy() makes it independent of df.
newdf = df[['age', 'bmi', 'charges']].copy()
print(newdf.head())
# Determine how many data instances there are (info() prints its own report)
newdf.info()
# so 1338 data instances
# Find the maximum & minimum value of the charges.
# charges is a snapshot of the column taken BEFORE the filtering further down.
charges = newdf.charges.values
print(f'maximum {charges.max()}')
print(f'minimum {charges.min()}')
# Print out the data instance (the whole row) where the max occurs;
# idxmax gives the row label of the largest charge.
print(newdf.loc[[newdf['charges'].idxmax()]])
# Print out all data instances where charges are >60,000
print(newdf[newdf['charges'] > 60000])
# Print out all data instances where bmi > 35 and charges > 50,000
print(newdf[(newdf['bmi'] > 35) & (newdf['charges'] > 50000)])
# Delete all data instances where the charges are more than $60,000.
# "<=" keeps a charge of exactly 60,000, which is not "more than" 60,000.
newdf = newdf[newdf['charges'] <= 60000].copy()
# Print out all data instances where charges are >60000 to check (should be empty)
print(newdf[newdf['charges'] > 60000])
# Scale costs between $0 and $100; round costs to 2 decimal places (for cents)
# and print the scaled values to check.
#
# Restrict df to the rows that survived the outlier filtering of newdf and
# take the charges from those rows, so the deleted outliers do not come back
# into the scaling and clustering.
df = df.loc[newdf.index].copy()
charges = df['charges'].to_numpy()
cmin = charges.min()
cmax = charges.max()
scaledcost = 100. * (charges - cmin) / (cmax - cmin)
# np.round replaces np.round_, which was removed in NumPy 2.0
scost = np.round(scaledcost, 2)
print(scost)

# Add column to dataframe with scaled charges and remove column with full charges.
# DataFrame.drop returns a new frame, so the result has to be assigned back.
df['scaled costs'] = scost
df = df.drop(columns=['charges'])

# Create 2D array (one row per instance, a single feature column) and use
# KMeans with 4 clusters
x = scost.reshape(-1, 1)
n = len(x)
# n_init is pinned because its default differs between scikit-learn releases;
# random_state makes the cluster labels reproducible from run to run.
km = KMeans(n_clusters=4, n_init=10, random_state=0)
km.fit(x)
# Print out the labels
print(km.labels_)
#
# Add column to dataframe giving cluster labels
df['clusters'] = km.labels_

# Set white background grid for Seaborn plots
sns.set_style("whitegrid")
# The full charges column has been removed, so the plots use the scaled costs
# (a linear rescaling of the charges, so the shape of the plots is unchanged).
# Create scatterplot of scaled costs vs bmi with cluster indicated by hue using Seaborn's relplot
sns.relplot(x='scaled costs', y='bmi', data=df, hue='clusters')
# Create scatterplot of scaled costs vs age with cluster indicated by hue using Seaborn's relplot
sns.relplot(x='scaled costs', y='age', data=df, hue='clusters')
# Needed to actually display the figures when run as a script
plt.show()