Week 11 Practice Notebooks

# Import libraries import numpy as np import matplotlib.pyplot as plt import pandas as pd from pandas import DataFrame, Series import seaborn as sns # from sklearn.cluster import KMeans from sklearn.datasets import make_blobs

# Create a data set in 2 dimensions using make_blobs.
# make_blobs hands back the point coordinates (stored in x) and, for each
# point, the index of the blob it belongs to (stored in y).
blob_settings = dict(
    n_samples=200,
    n_features=2,
    centers=5,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)
x, y = make_blobs(**blob_settings)

# Show the raw points without any cluster coloring.
plt.scatter(x[:, 0], x[:, 1], c="white", edgecolor="black", marker="o", s=50)
plt.show()

# Calculate the centroid (mean position) of every blob.
# y[i] is the blob index (0, 1, 2, 3, 4) of the point stored in x[i].
n = len(y)
n_blobs = 5

# Count the points per blob from the labels instead of assuming that every
# blob holds exactly 40 points; the means stay correct if n_samples or
# centers is changed in the cell that generates the data.
n_points_blob = np.bincount(y, minlength=n_blobs)

# Sum the coordinates of the points of each blob.  np.add.at accumulates
# row x[i] into row y[i] of blob_mean, in order, for every i.
blob_mean = np.zeros((n_blobs, 2))
np.add.at(blob_mean, y, x)

# Divide each sum by the number of points in that blob and print the result.
for k in range(n_blobs):
    blob_mean[k, :] = blob_mean[k, :] / n_points_blob[k]
    print(f"Blob {k+1} has centroid: ( {blob_mean[k,0]}, {blob_mean[k,1]} ) ")

# Create the KMeans model and fit it to the array x, which is already in the
# correct shape: 200 by 2.
# n_clusters = 3 for 2D array
# NOTE(review): the data set was generated with centers=5 -- confirm that
# fitting only 3 clusters here is intentional.
kmeans_options = {"n_clusters": 3, "init": "random", "random_state": 0}
km = KMeans(**kmeans_options)
km.fit(x)

# Plot the KMeans clusters using colors and compare visually with the blob plot.
first_feature = x[:, 0]
second_feature = x[:, 1]

# Original inputs, colored by the label KMeans assigned to each point
plt.scatter(first_feature, second_feature, marker="o", c=km.labels_, cmap="rainbow")

# Fitted centroids, drawn as black plus signs
fitted_centers = km.cluster_centers_
plt.scatter(
    fitted_centers[:, 0],
    fitted_centers[:, 1],
    marker="+",
    c="black",
    label="centroids",
)

# Show legend
plt.legend(scatterpoints=1)
plt.show()

# Redraw the original (uncolored) blob plot for a side-by-side comparison
plt.scatter(first_feature, second_feature, c="white", edgecolor="black", marker="o", s=50)
plt.show()

# Print the centroids found by KMeans next to the centroids computed directly
# from the blob labels so the two can be compared.
header = f" KMeans Centroids Original Centroids"
print(header)

kmeans_centroids = km.cluster_centers_
print(f" {kmeans_centroids} {blob_mean}")

# 1 iteration: run KMeans from hand-picked starting centroids and stop after
# a single iteration to see how far the centroids have moved.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])

print()
print("******************************************")
print("Results for 1st iteration of KMeans")

# n_init=1: with explicit starting centroids only one initialisation is ever
# run, so say so explicitly; this avoids the n_init warning that older
# scikit-learn releases emit when an array is passed as init.
km = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=1)
km.fit(x)
print(f"The centroids are {km.cluster_centers_}")

# Plot the points colored by the cluster assigned after one iteration
plt.scatter(x[:, 0], x[:, 1], c=km.labels_, cmap="rainbow")
plt.title("1st Iteration of KMeans")
plt.show()

# 2 iterations: same hand-picked starting centroids as the 1-iteration run,
# but KMeans is allowed two iterations.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])

print()
print("******************************************")
print("Results for 2nd iteration of KMeans")

# n_init=1: with explicit starting centroids only one initialisation is ever
# run, so say so explicitly; this avoids the n_init warning that older
# scikit-learn releases emit when an array is passed as init.
km2 = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=2)
km2.fit(x)

# Report the centroids, as the 1-iteration cell does, so the banner above is
# actually followed by results.
print(f"The centroids are {km2.cluster_centers_}")

# Plot the points colored by the cluster assigned after two iterations
plt.scatter(x[:, 0], x[:, 1], c=km2.labels_, cmap="rainbow")
plt.title("2nd Iteration of KMeans")
plt.show()

# Import libraries import numpy as np import matplotlib.pyplot as plt import pandas as pd from pandas import DataFrame, Series import seaborn as sns # Import models from sklearn.cluster import KMeans

# Load insurance.csv into a dataframe and preview its first 10 rows
csv_path = "insurance.csv"
df = pd.read_csv(csv_path)
df.head(n=10)

# Build a new dataframe that keeps only the two features (age and bmi) and
# the charges by discarding every other column.
unused_columns = ["sex", "children", "smoker", "region"]
new_df = df.drop(unused_columns, axis="columns")

# Determine how many data instances there are
new_df.info()

# Take the count from the dataframe itself rather than typing in the number
# read off the info() output, so the message stays right if the file changes.
n_instances = len(new_df)
print("***********************************")
print(f"There are {n_instances} data instances")

# Find the maximum & minimum value of the charges
charges = new_df.charges

# Series.min()/.max() run vectorised and skip missing values, whereas the
# builtin min()/max() loop element by element and give order-dependent
# results if a NaN is present.
print(f" The minimum charge is {charges.min()}")
print(f" The maximum charge is {charges.max()}")

# Show the full record (all columns of df) for the instance where the
# maximum charge occurs
is_max_charge = new_df.charges.eq(max(charges))
df[is_max_charge]

# Show all data instances where charges are >60,000
over_60k = new_df.charges.gt(60000)
df[over_60k]

# Show all data instances where bmi > 35 and charges > 50,000
high_charge = new_df.charges > 50000
high_bmi = new_df.bmi > 35
df[high_charge & high_bmi]

# Delete all data instances where the charges are more than $60,000
# LT60k = Less Than 60 k
# Filter on the condition itself instead of dropping hard-coded row labels
# (543, 1230, 1300), so the right rows are removed even if the file changes.
# .copy() makes df_LT60k an independent dataframe, so columns can be added to
# it later without a SettingWithCopyWarning.
over_60k = new_df.charges > 60000
df_LT60k = new_df[~over_60k].copy()

# Check: this should show no rows.  The mask is built from df_LT60k itself so
# that its index lines up with the dataframe being filtered.
df_LT60k[df_LT60k.charges > 60000]

# Scale costs between $0 and $100; round costs to 2 decimal places (for cents)
# Min-max scaling: the smallest charge maps to 0 and the largest to 100.
charge_min = df_LT60k.charges.min()
charge_max = df_LT60k.charges.max()
scaledcosts = 100.0 * (df_LT60k.charges - charge_min) / (charge_max - charge_min)

# Series.round replaces np.round_, which was removed in NumPy 2.0.
costs = scaledcosts.round(2)

# Print the scaled charges to check them
print(costs)

# Add a column with the scaled charges to the filtered dataframe
df_LT60k["scaled charges"] = costs

# View of the data without the full charges.  The original statement dropped
# the column from df (the unfiltered frame, which has no scaled column) and
# threw the result away.  The drop is applied to df_LT60k instead and kept in
# a separate frame: df_LT60k itself keeps "charges" because the scatterplots
# later in the notebook still plot that column.
df_scaled = df_LT60k.drop(columns=["charges"])
df_scaled

# Create a 2D array (one row per instance, one column) from the scaled
# charges and cluster it with KMeans using 4 clusters
x = np.array(costs)
n = len(x)
x = x.reshape(n, 1)

# random_state fixes the seeding so the cluster labels (and therefore the
# colors in the plots below) are the same on every run, matching the seeded
# calls earlier in the notebook.
km = KMeans(n_clusters=4, random_state=0)
km.fit(x)

# Show the cluster label KMeans assigned to each instance
cluster_labels = km.labels_
print(cluster_labels)

# Store the labels in the dataframe as a new "clusters" column
df_LT60k["clusters"] = cluster_labels

# Use Seaborn's white background with grid lines for the plots
sns.set_style(style="whitegrid")

# Scatterplot of charges vs bmi, with the cluster indicated by hue, using
# Seaborn's relplot.  bmi is on the x-axis (the author's preferred orientation).
bmi_plot = sns.relplot(data=df_LT60k, x="bmi", y="charges", hue="clusters")
bmi_plot.set(title="BMI vs Charges")

# Scatterplot of charges vs age, with the cluster indicated by hue, using
# Seaborn's relplot.  age is on the x-axis (the author's preferred orientation).
age_plot = sns.relplot(data=df_LT60k, x="age", y="charges", hue="clusters")
age_plot.set(title="Age vs Charges")