# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Create a data set in 2 dimensions using make_blobs.
# make_blobs returns the points in x (200 rows, 2 columns) and, in y, the
# index of the blob each point was drawn from.
# NOTE(review): no random_state is passed, so the blobs differ between runs;
# the hard-coded starting centroids used later presumably assume one
# particular data set -- confirm and pin a seed if so.
(x, y) = make_blobs(
    n_samples=200, n_features=2, centers=5, cluster_std=0.5)
# Plot the raw, uncoloured points. The marker styling belongs to plt.scatter;
# make_blobs does not accept these keywords.
plt.scatter(
    x[:, 0], x[:, 1],
    c='white', edgecolor='black', marker='o', s=50)
plt.show()
# Calculate centroids of blobs; remember that the array y contains cluster number
n = len(y)
# Derive the number of blobs from the labels instead of relying on a name
# that is never defined.
n_blobs = len(np.unique(y))
# Number of points in each blob, indexed by blob number (0,1,2,...). Counting
# them avoids assuming every blob holds exactly 40 points.
n_points_blob = np.bincount(y, minlength=n_blobs)
blob_mean = np.zeros((n_blobs, 2))
# For each blob, average the x and y coordinates of its points and print out
for k in range(n_blobs):
    # The boolean mask y == k selects the rows of x that belong to blob k.
    blob_mean[k, :] = x[y == k].mean(axis=0)
    print (f"Blob {k+1} has centroid: ( { blob_mean [k,0] }, { blob_mean [k,1] } ) " )
# Create KMeans model and fit data in array x which is in the correct shape: 200 by 2
km = KMeans(n_clusters=5, init='random', random_state=0)
# cluster_centers_ and labels_ only exist once the model has been fitted.
km.fit(x)
print(km.cluster_centers_)
# Plot clusters using colors and compare visually with blob plot above.
# Each point is coloured by the cluster label KMeans assigned to it.
plt.scatter(
    x[:, 0], x[:, 1],
    c=km.labels_, edgecolor='black', marker='o', s=50)
plt.show()
# They are the same because the centroids are the same.
# Print out final clusters centroids and print out actual centroids to compare
print (" KMeans Centroids ", " Original Centroids ")
# Print one row per cluster; pairing is by row position only, and KMeans
# numbers its clusters independently of the blob numbering.
for km_center, blob_center in zip(km.cluster_centers_, blob_mean):
    print(f"{ km_center }", f"( { blob_center[0] }, { blob_center[1] } )")
# Note that Kmeans cluster 3 corresponds to our cluster 5 and vice versa
# NOTE(review): that correspondence depends on the generated data -- confirm.
# 1 iteration
# Hand-picked starting centroids, one row per cluster.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 1st iteration of KMeans")
# n_init=1 because explicit starting centroids leave nothing to re-initialise;
# max_iter=1 stops the algorithm after a single update step.
km = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=1)
# The model must be fitted before cluster_centers_ is available.
km.fit(x)
print("Centroids are ", km.cluster_centers_)
# 2 iterations
# Same hand-picked starting centroids as for the single-iteration run.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 2nd iteration of KMeans")
# n_init=1 because explicit starting centroids leave nothing to re-initialise;
# max_iter=2 stops the algorithm after two update steps.
km = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=2)
# The model must be fitted before cluster_centers_ is available.
km.fit(x)
print("Centroids are ", km.cluster_centers_)
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
# Import models
from sklearn.cluster import KMeans
# Load the insurance data from insurance.csv into a dataframe
df = pd.read_csv('insurance.csv')
n = 10
# Show the first ten rows as a quick sanity check
print(df.head(10))
# Build a second dataframe restricted to the two features (age and bmi)
# plus the charges
new_df = DataFrame(data=df, columns=["age", "bmi", "charges"])
print(new_df.head(10))
# Determine how many data instances there are
# DataFrame exposes the underlying 2D array as .values (there is no .value);
# its first dimension is the number of rows, i.e. data instances.
my_data = new_df.values
print(my_data.shape[0])
# so 1338 data instances
# Report the largest charge and then the smallest charge, one per line
print(
    max(new_df["charges"].values),
    min(new_df["charges"].values),
    sep="\n",
)
# Print out the data instance where the max occurs
# max(df) would iterate the column labels, not the charges, so compare each
# row's charge against the largest charge instead. All rows tied for the
# maximum are shown.
print(df[df.charges == df.charges.max()])
# Print out all data instances where charges are >60,000
# (a bare expression shows nothing when run as a script, so print explicitly)
print(df[df.charges > 60000])
# Print out all data instances where bmi > 35 and charges > 50,000
# Each comparison is parenthesised because & binds tighter than >.
print(df[(df.charges > 50000) & (df.bmi > 35)])
# Delete all data instances where the charges are more than $60,000
# Collect the row labels once and drop them in a single call; a dataframe
# cannot be used as an if-condition and drop is a method, not a subscript.
too_expensive = df.index[df.charges > 60000]
df = df.drop(index=too_expensive)
# new_df was built from df and shares its row labels, so remove the same rows
# there to keep the two dataframes aligned.
new_df = new_df.drop(index=too_expensive, errors="ignore")
# Print out all data instances where charges are >60000 to check
# (an empty dataframe is expected here)
print(df[df.charges > 60000])
# Scale costs between $0 and $100; round costs to 2 decimal places (for cents)
# print out the first few entries to check
# Add column to dataframe with scaled charges and remove column with full charges
# Create 2D array and use KMeans with 4 clusters
# Print out the labels
# add column to dataframe giving cluster labels
# Use Seaborn's white background with grid lines for the plots
sns.set_style(style="whitegrid")
# Create scatterplot of charges vs bmi with cluster indicated by hue using Seaborn's relplot
# Create scatterplot of charges vs age with cluster indicated by hue using Seaborn's relplot