# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Create a data set in 2 dimensions using make_blobs
#
# make_blobs returns the point coordinates (kept in x) and, for every point,
# the index of the blob it was drawn from (kept in y)
#
x, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=5,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)
# Unlabelled view of the raw points: first column on the horizontal axis,
# second column on the vertical axis
plt.scatter(*x.T, s=50, marker='o', c='white', edgecolor='black')
plt.show()
# Calculate centroids of blobs; remember that the array y contains cluster number
n = len(y)
n_blobs = 5
# Count the members of each blob from the labels instead of assuming a fixed
# 40 points per blob, so the centroids stay correct if the sample size or the
# number of centers passed to make_blobs changes.
n_points_blob = np.bincount(y, minlength=n_blobs)
blob_mean = np.zeros((n_blobs, 2))
#
# For each blob, average the coordinates of the points labelled with it and print out
for k in range(n_blobs):
    if n_points_blob[k] > 0:
        # Boolean mask selects the rows of x that belong to blob k
        blob_mean[k, :] = x[y == k].mean(axis=0)
    # A blob with no points keeps its (0, 0) placeholder rather than dividing by zero
    print(f"Blob {k+1} has centroid: ( {blob_mean[k, 0]}, {blob_mean[k, 1]} ) ")
Blob 1 has centroid: ( 0.9148227503116141, 4.280336857833884 )
Blob 2 has centroid: ( 2.054873265582555, 1.1110568334393494 )
Blob 3 has centroid: ( -1.543516197703645, 2.8134551104965806 )
Blob 4 has centroid: ( -1.3675452152752192, 7.891534958387625 )
Blob 5 has centroid: ( 9.210599887923255, -2.473923314565855 )
# Build a 5-cluster KMeans model with random initial centers and fit it to x,
# which already has the (200, 2) samples-by-features layout KMeans expects.
# fit() returns the fitted estimator, so construction and fitting are chained.
km = KMeans(n_clusters=5, init='random', random_state=0).fit(x)
print(km.cluster_centers_)
[[ 2.05487327 1.11105683]
[-1.5435162 2.81345511]
[-1.36754522 7.89153496]
[ 0.91482275 4.28033686]
[ 9.21059989 -2.47392331]]
# Plot clusters using colors and compare visually with blob plot above
# First figure: points colored by their KMeans label, centroids as black stars
plt.scatter(*x.T, c=km.labels_, cmap='rainbow')
plt.scatter(*km.cluster_centers_.T, marker='*', c='black', label='centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.show()
# Second figure: the same points without labels, for side-by-side comparison
plt.scatter(*x.T, s=50, marker='o', c='white', edgecolor='black')
plt.show()
# They are the same because the centroids are the same.
# Print out final clusters centroids and print out actual centroids to compare
#
# KMeans numbers its clusters independently of the blob numbering, so each
# KMeans centroid is paired with the closest blob centroid before printing.
# (The previous version indexed blob_mean with a leftover loop variable and
# therefore only ever showed the last blob.)
print(" KMeans Centroids ", " Original Centroids ")
for j, km_center in enumerate(km.cluster_centers_):
    # Euclidean distance from this KMeans centroid to every blob centroid
    distances = np.linalg.norm(blob_mean - km_center, axis=1)
    nearest = int(np.argmin(distances))
    print(
        f"KMeans {j+1}: ( {km_center[0]}, {km_center[1]} )",
        f"Blob {nearest+1}: ( {blob_mean[nearest, 0]}, {blob_mean[nearest, 1]} )",
    )
# Note that KMeans numbers its clusters in a different order than the blobs: in the output shown,
# KMeans clusters 1-5 correspond to blobs 2, 3, 4, 1 and 5 respectively
KMeans Centroids Original Centroids
[[ 2.05487327 1.11105683]
[-1.5435162 2.81345511]
[-1.36754522 7.89153496]
[ 0.91482275 4.28033686]
[ 9.21059989 -2.47392331]] ( 9.210599887923255, -2.473923314565855 )
# 1 iteration
# Start KMeans from hand-picked centers and stop after a single iteration to
# see where the centroids move first.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 1st iteration of KMeans")
# n_init=1: with explicit starting centers only one initialization is
# meaningful; stating it avoids the "performing only one init" RuntimeWarning.
km = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=1)
km.fit(x)
print("Centroids are ", km.cluster_centers_)
plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap='rainbow')
# Show the figure now so later plots do not draw on top of this one
plt.show()
******************************************
Results for 1st iteration of KMeans
Centroids are [[ 0.12781901 1.96326963]
[-1.58138962 8.0665792 ]
[ 0.53251796 4.92290316]
[ 9.21059989 -2.47392331]
[ 2.989047 1.35068599]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/cluster/_kmeans.py:1146: RuntimeWarning: Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.
self._check_params(X)
# 2 iterations
# Same hand-picked starting centers as the 1-iteration run, now allowing two
# iterations so the two sets of centroids can be compared.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 2nd iteration of KMeans")
# n_init=1: with explicit starting centers only one initialization is
# meaningful; stating it avoids the "performing only one init" RuntimeWarning.
km = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=2)
km.fit(x)
print("Centroids are ", km.cluster_centers_)
plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap='rainbow')
# Show the figure now so later plots do not draw on top of this one
plt.show()
******************************************
Results for 2nd iteration of KMeans
Centroids are [[-1.25277421 2.68078382]
[-1.36754522 7.89153496]
[ 0.89038117 4.28715579]
[ 9.21059989 -2.47392331]
[ 2.12650031 1.07647868]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/cluster/_kmeans.py:1146: RuntimeWarning: Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.
self._check_params(X)
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
# Import models
from sklearn.cluster import KMeans
# Create dataframe from file insurance.csv
df = pd.read_csv('insurance.csv')
# Number of rows to preview; used below instead of a second hard-coded 10
n = 10
print(df.head(n))
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
5 31 female 25.740 0 no southeast 3756.62160
6 46 female 33.440 1 no southeast 8240.58960
7 37 female 27.740 3 no northwest 7281.50560
8 37 male 29.830 2 no northeast 6406.41070
9 60 female 25.840 0 no northwest 28923.13692
# Create a new dataframe which only has the two features (age and bmi) and the charges
#
new_df = DataFrame(df, columns=["age", "bmi", "charges"])
# Preview the first 10 rows (the earlier bare new_df.head() call discarded its
# result and displayed nothing, so it has been removed)
print(new_df.head(10))
age bmi charges
0 19 27.900 16884.92400
1 18 33.770 1725.55230
2 28 33.000 4449.46200
3 33 22.705 21984.47061
4 32 28.880 3866.85520
5 31 25.740 3756.62160
6 46 33.440 8240.58960
7 37 27.740 7281.50560
8 37 29.830 6406.41070
9 60 25.840 28923.13692
# Determine how many data instances there are
# DataFrame exposes its underlying array as `.values` (there is no `.value`
# attribute); one row of the array corresponds to one data instance.
my_data = new_df.values
print(len(my_data))
# so 1338 data instances
Execution error
AttributeError: 'DataFrame' object has no attribute 'value'
# Find the maximum & minimum value of the charges
# Series.max()/min() run vectorized and skip missing values, unlike the Python
# builtins, which loop element by element and give order-dependent results
# when a NaN is present.
print(new_df.charges.max())
print(new_df.charges.min())
63770.42801
1121.8739
# Print out the data instance where the max occurs
# max(df) iterates over the column names, so it cannot locate the row; instead
# select the row(s) whose charges equal the maximum charge (ties are all shown).
print(df[df.charges == df.charges.max()])
# Show every data instance whose charges exceed 60,000
over_60k = df["charges"] > 60000
df.loc[over_60k]
ageint64
sexobject
543
54
female
1230
52
male
1300
45
male
# Show every data instance with bmi above 35 and charges above 50,000
high_charges = df["charges"] > 50000
high_bmi = df["bmi"] > 35
df.loc[high_charges & high_bmi]
ageint64
sexobject
34
28
male
543
54
female
577
31
female
819
33
female
# Delete all data instances where the charges are more than $60,000
# A DataFrame cannot be used as an `if` condition and rows are not removed by
# subscripting df.drop, so the rows are filtered out with a boolean mask
# instead of a row-by-row loop.
charge_cap = 60000
# `~(charges > cap)` keeps rows with missing charges; only rows strictly above
# the cap are removed. .copy() makes the result independent of the old frame.
df = df[~(df.charges > charge_cap)].copy()
# new_df holds the same instances restricted to age/bmi/charges, so remove the
# same rows there to keep both frames consistent.
new_df = new_df[~(new_df.charges > charge_cap)].copy()
# n is now the number of instances remaining after the deletion
n = len(new_df)
# Print out all data instances where charges are >60000 to check
# (an empty dataframe is expected)
print(df[df.charges > charge_cap])
Execution error
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
# Scale costs between $0 and $100; round costs to 2 decimal places (for cents)
# print out the first few entries to check
# Add column to dataframe with scaled charges and remove column with full charges
# Create 2D array and use KMeans with 4 clusters
# Print out the labels
#
# add column to dataframe giving cluster labels
# Use Seaborn's white-background-with-grid style for the plots that follow
sns.set_style(style="whitegrid")
# Create scatterplot of charges vs bmi with cluster indicated by hue using Seaborn's relplot
# Create scatterplot of charges vs age with cluster indicated by hue using Seaborn's relplot