# Week 11 Practice Notebooks

# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Generate a synthetic 2-D data set with make_blobs: 200 points, 5 centers.
# make_blobs returns the point coordinates (stored in x) and, for every
# point, the index of the cluster it belongs to (stored in y).
x, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=5,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)

# Plot the raw points, all in the same colour (no cluster information).
raw_point_style = dict(c='white', edgecolor='black', marker='o', s=50)
plt.scatter(x[:, 0], x[:, 1], **raw_point_style)
plt.show()

# Calculate the centroid of every blob.  The array y holds, for each row of
# x, the 0-based index of the blob that point belongs to.
n = len(y)
n_blobs = int(np.max(y)) + 1  # labels are 0-based, so largest label + 1

# Number of points in each blob, counted from the labels rather than assuming
# that every blob contains exactly 40 points.
n_points_blob = np.bincount(y, minlength=n_blobs).astype(float)

# Sum the coordinates of the points of each blob.  np.add.at performs an
# unbuffered in-place add, so every row of x whose label is k is accumulated
# into row k of blob_mean, even though the labels repeat.
blob_mean = np.zeros((n_blobs, 2))
np.add.at(blob_mean, y, x)

# Divide each coordinate sum by the size of its blob and print the centroids.
blob_mean /= n_points_blob[:, np.newaxis]
for k in range(n_blobs):
    print(f"Blob {k+1} has centroid: ( {blob_mean[k, 0]}, {blob_mean[k, 1]} ) ")

# Create the KMeans model and fit it to the array x, which is already in the
# shape fit() expects: 200 samples by 2 features.
# The data set was generated with 5 blobs, so 5 clusters are requested; the
# fitted centroids can then be matched one-to-one with the 5 blob centroids.
km = KMeans(n_clusters=5, init='random', random_state=0)
km.fit(x)

# Colour every point by the cluster KMeans assigned to it so the result can
# be compared by eye with the uncoloured blob plot.
cluster_labels = km.labels_
cluster_centers = km.cluster_centers_
plt.scatter(x[:, 0], x[:, 1], c=cluster_labels, cmap='rainbow')

# Overlay the fitted cluster centres as black stars and label them.
plt.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    marker='*',
    c='black',
    label='centroids',
)
plt.legend(scatterpoints=1)
plt.grid()
plt.show()

# Draw the raw, uncoloured points again for the visual comparison.
plt.scatter(x[:, 0], x[:, 1], c='white', edgecolors='black', marker='o', s=50)
plt.show()

# Show the centroids found by KMeans next to the centroids computed directly
# from the blob labels, so the two sets can be compared.
left_header, right_header = " KMeans Centroids ", " Original Centroids "
print(left_header, right_header)

kmeans_text = f" {km.cluster_centers_}"
original_text = f" {blob_mean}"
print(kmeans_text, original_text)
# Note that Kmeans cluster 3 corresponds to our cluster 5 and vice versa

# 1 iteration: start KMeans from hand-picked centroids and stop after a
# single iteration to see how far the centroids have moved.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 1st iteration of KMeans")

# The number of clusters is taken from the centroid array so the two cannot
# disagree.  n_init=1 is stated explicitly: with an array passed as init
# there is only one initialisation to run, and leaving n_init at its default
# makes older scikit-learn releases emit a warning about it.
km = KMeans(n_clusters=len(centroids), init=centroids, n_init=1, max_iter=1)
km.fit(x)
print(f'Centroids: {km.cluster_centers_}')

# Colour each point by the cluster it is assigned to after this iteration.
plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap='rainbow')
plt.title('1 Iteration')
plt.grid()
plt.show()

# 2 iterations: start KMeans from the same hand-picked centroids and stop
# after two iterations.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 2nd iteration of KMeans")

# The number of clusters is taken from the centroid array so the two cannot
# disagree.  n_init=1 is stated explicitly: with an array passed as init
# there is only one initialisation to run, and leaving n_init at its default
# makes older scikit-learn releases emit a warning about it.
km = KMeans(n_clusters=len(centroids), init=centroids, n_init=1, max_iter=2)
km.fit(x)
print(f'Centroids: {km.cluster_centers_}')

# Colour each point by the cluster it is assigned to after two iterations.
plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap='rainbow')
plt.title('2 iterations')
plt.grid()
plt.show()

# Import libraries import numpy as np import matplotlib.pyplot as plt import pandas as pd from pandas import DataFrame, Series import seaborn as sns # Import models from sklearn.cluster import KMeans

# Load the insurance data from insurance.csv and look at the first ten rows.
dataframe = pd.read_csv('insurance.csv')
dataframe.head(10)

# Build a separate dataframe that only holds the two features (age and bmi)
# and the charges; .copy() makes it independent of the original dataframe.
selected_columns = ['age', 'bmi', 'charges']
newdata = dataframe.loc[:, selected_columns].copy()
newdata.head()

# Summarise the new dataframe to see how many data instances it contains.
newdata.info()
# so 1338 data instances

# Find the maximum & minimum value of the charges
charges = newdata['charges'].to_numpy()
max_charge = charges.max()
min_charge = charges.min()
print(f'Maximum: {max_charge}')
print(f'Minimum: {min_charge}')

# Print out the data instance where the max occurs: select the row(s) whose
# charges equal the maximum, rather than printing only the maximum value.
print(newdata[newdata['charges'] == max_charge])

# Print out all data instances where charges are >60,000
print(newdata[newdata['charges'] > 60000])

# Print out all data instances where bmi > 35 and charges > 50,000
high_bmi = newdata['bmi'] > 35
high_charges = newdata['charges'] > 50000
print(newdata[high_bmi & high_charges])

# Delete all data instances where the charges are more than $60,000.
# Rows with charges of exactly 60,000 are kept ("more than" is strict).
newdata = newdata[newdata['charges'] <= 60000]
# Print out all data instances where charges are >60000 to check (should be
# an empty dataframe).
print(newdata[newdata['charges'] > 60000])

# Remove the same rows from the full dataframe so the columns added below
# line up with it row for row.
dataframe = dataframe.loc[newdata.index].copy()

# Scale costs between $0 and $100; round costs to 2 decimal places (for
# cents).  The charges are re-read from the filtered data so the deleted
# rows no longer determine the maximum used for scaling.
charges = newdata['charges'].to_numpy()
minimum = charges.min()
maximum = charges.max()
scaledcost = 100. * (charges - minimum) / (maximum - minimum)
# np.round replaces np.round_, which is no longer available in NumPy 2.0.
cost = np.round(scaledcost, 2)
print(cost)

# Add column to dataframe with scaled charges.  The full 'charges' column is
# left in place: the scatterplots further down plot against it.
dataframe['scaled costs'] = cost

# Create 2D array and use KMeans with 4 clusters.  KMeans expects an
# (n_samples, n_features) array, so the 1-D costs become a single column.
x = np.asarray(cost).reshape(-1, 1)
n = len(x)
kmean = KMeans(n_clusters=4)
kmean.fit(x)

# Print out the labels and add a column to the dataframe giving the cluster
# label of every data instance.
print(kmean.labels_)
dataframe['clusters'] = kmean.labels_

# Use Seaborn's white-grid background for the plots.
sns.set_style("whitegrid")

# Scatterplots of charges vs bmi and of charges vs age, drawn with Seaborn's
# relplot; the hue of each point indicates its cluster.
for feature in ('bmi', 'age'):
    sns.relplot(x='charges', y=feature, data=dataframe, hue='clusters')