# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Create a data set in 2 dimensions using make_blobs
# The function make_blobs stores the data in x and the cluster each point belongs to in y
(x,y) = make_blobs(n_samples =200, n_features=2, centers=5, cluster_std=0.5,
shuffle=True, random_state=0)
plt.scatter(x[:,0], x[:,1], c='white', edgecolor='black', marker='o', s=50)
plt.show()

# Calculate centroids of blobs; remember that the array y contains cluster number
n = len(y)
n_blobs = 5
n_points_blob = 40
blob_mean = np.zeros((n_blobs, 2))
# Loop to sum up x and y coordinates of each point in blob
for i in range(0, n):
blob = y[i] # given blob index (0,1,2,3,4)
blob_mean[blob,0] = blob_mean[blob,0] + x[i,0]
blob_mean[blob,1] = blob_mean[blob,1] + x[i,1]
# Loop to divide by number of points in blob and print out
for k in range(0, n_blobs):
blob_mean[k,:] = blob_mean[k,:] / n_points_blob
print (f"Blob {k+1} has centroid: ( {blob_mean[k,0]}, {blob_mean[k,1]} ) " )

```
Blob 1 has centroid: ( 0.9148227503116141, 4.280336857833884 )
Blob 2 has centroid: ( 2.054873265582555, 1.1110568334393494 )
Blob 3 has centroid: ( -1.543516197703645, 2.8134551104965806 )
Blob 4 has centroid: ( -1.3675452152752192, 7.891534958387625 )
Blob 5 has centroid: ( 9.210599887923255, -2.473923314565855 )
```

# Create KMeans model and fit data in array x which is in the correct shape: 200 by 2
# n_clusters = 3 for 2D array
km = KMeans (n_clusters = 3, init = "random", random_state = 0)
km.fit(x)

# Plot clusters using colors and compare visually with blob plot above
# Original inputs
plt.scatter (x [:, 0], x [:, 1], marker = "o", c = km.labels_, cmap = "rainbow")
# Centroid inputs
plt.scatter (km.cluster_centers_ [:, 0], km.cluster_centers_ [:, 1], marker = "+", c = "black", label = "centroids")
# Show legend
plt.legend(scatterpoints = 1)
plt.show()
# Plot the original blob plot (copy from previous example)
plt.scatter(x[:,0], x[:,1], c='white', edgecolor='black', marker='o', s=50)
plt.show()

# Print out final clusters centroids and print out actual centroids to compare
print(f" KMeans Centroids Original Centroids")
print(f" {km.cluster_centers_} {blob_mean}")

```
KMeans Centroids Original Centroids
[[ 0.47539327 2.7349496 ]
[ 9.21059989 -2.47392331]
[-1.36754522 7.89153496]] [[ 0.91482275 4.28033686]
[ 2.05487327 1.11105683]
[-1.5435162 2.81345511]
[-1.36754522 7.89153496]
[ 9.21059989 -2.47392331]]
```

# 1 iteration
centroids = np.array([[-3,-2], [-2,12], [3,8], [12,0], [2,12]])
print()
print("******************************************")
print("Results for 1st iteration of KMeans")
km = KMeans (n_clusters = 5, init = centroids, max_iter = 1)
km.fit(x)
print(f"The centroids are {km.cluster_centers_}")
# Plot
plt.scatter ( x[:, 0], x[:, 1], c = km.labels_, cmap = "rainbow")
plt.title ("1st Iteration of KMeans")
plt.show()

```
******************************************
Results for 1st iteration of KMeans
The centroids are [[ 0.12781901 1.96326963]
[-1.58138962 8.0665792 ]
[ 0.53251796 4.92290316]
[ 9.21059989 -2.47392331]
[ 2.989047 1.35068599]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/cluster/_kmeans.py:1146: RuntimeWarning: Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.
self._check_params(X)
```

# 2 iterations
centroids = np.array( [ [-3,-2], [-2,12], [3,8], [12,0], [2,12] ])
print()
print("******************************************")
print("Results for 2nd iteration of KMeans")
km2 = KMeans(n_clusters = 5, init = centroids, max_iter = 2)
km2.fit(x)
# Plot
plt.scatter (x[:, 0], x[:, 1], c = km2.labels_, cmap= "rainbow")
plt.title("2nd Iteration of KMeans")
plt.show()

```
******************************************
Results for 2nd iteration of KMeans
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/cluster/_kmeans.py:1146: RuntimeWarning: Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.
self._check_params(X)
```

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
# Import models
from sklearn.cluster import KMeans

# Create dataframe from file insurance.csv
df = pd.read_csv("insurance.csv")
df.head(10)

ageint64

18 - 60

sexobject

female50%

male50%

0

19

female

1

18

male

2

28

male

3

33

male

4

32

male

5

31

female

6

46

female

7

37

female

8

37

male

9

60

female

#Create a new dataframe which only has the two features (age and bmi) and the charges
new_df = df.drop(columns = ["sex", "children", "smoker", "region"])

# Determine how many data instances there are
new_df.info()
# so 1338 data instances
print("***********************************")
print(f"There are 1338 data instances")

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1338 non-null int64
1 bmi 1338 non-null float64
2 charges 1338 non-null float64
dtypes: float64(2), int64(1)
memory usage: 31.5 KB
***********************************
There are 1338 data instances
```

# Find the maximum & minimum value of the charges
charges = new_df.charges
print(f" The minimum charge is {min(charges)}")
print(f" The maximum charge is {max(charges)}")

```
The minimum charge is 1121.8739
The maximum charge is 63770.42801
```

# Print out the data instance where the max occurs
df[new_df.charges == max(charges)]

ageint64

sexobject

543

54

female

# Print out all data instances where charges are >60,000
df[new_df.charges > 60000]

ageint64

sexobject

543

54

female

1230

52

male

1300

45

male

# Print out all data instances where bmi > 35 and charges > 50,000
df[(new_df.charges > 50000) & (new_df.bmi > 35)]

ageint64

sexobject

34

28

male

543

54

female

577

31

female

819

33

female

# Delete all data instances where the charges are more than $60,000
# LT60k = Less Than 60 k
df_LT60k = new_df.drop ([543, 1230, 1300]) # Rows to delete
# Print out all data instances where charges are >60000 to check
df_LT60k [df.charges > 60000]

```
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
import sys
```

ageint64

bmifloat64

# Scale costs between $0 and $100; round costs to 2 decimal places (for cents)
# print out the first few entries to check
charge_min = min(df_LT60k.charges)
charge_max = max(df_LT60k.charges)
scaledcosts = 100. * (df_LT60k.charges - charge_min) / (charge_max - charge_min)
costs = np.round_(scaledcosts, 2)
print(costs)
# Not sure if I did this correctly

```
0 27.44
1 1.05
2 5.79
3 36.31
4 4.78
...
1333 16.50
1334 1.89
1335 0.88
1336 1.54
1337 48.77
Name: charges, Length: 1335, dtype: float64
```

# Add column to dataframe with scaled charges and remove column with full charges
df_LT60k["scaled charges"] = costs
df.drop(columns = ["charges"])

ageint64

18 - 64

sexobject

male50.5%

female49.5%

0

19

female

1

18

male

2

28

male

3

33

male

4

32

male

5

31

female

6

46

female

7

37

female

8

37

male

9

60

female

# Create 2D array and use KMeans with 4 clusters
x = np.array(costs)
n = len(x)
x = np.reshape(x, (n, 1))
km = KMeans (n_clusters = 4)
km.fit(x)

# Print out the labels
print(km.labels_)
# add column to dataframe giving cluster labels
df_LT60k["clusters"] = km.labels_

```
[2 0 0 ... 0 0 1]
```

# Set white background grid for Seaborn plots
sns.set_style("whitegrid")

# Create scatterplot of charges vs bmi with cluster indiciated by hue using Seaborn's relplot
sns.relplot(x = "bmi", y = "charges", data = df_LT60k, hue = "clusters") .set(title = "BMI vs Charges")
# Made x-values bmi because it made more sense personally

# Create scatterplot of charges vs age with cluster indiciated by hue using Seaborn's relplot
sns.relplot(x = "age", y = "charges", data = df_LT60k, hue = "clusters") .set(title = "Age vs Charges")
# Made x-values age because it made more sense personally