# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Create a synthetic data set in 2 dimensions using make_blobs.
#
# make_blobs returns two arrays: x holds the point coordinates and y holds
# the index of the blob each point belongs to.
x, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=5,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)

# Show the raw points without any cluster colouring; x.T unpacks into the
# first-coordinate and second-coordinate arrays.
plt.scatter(*x.T, c='white', edgecolor='black', marker='o', s=50)
plt.show()
# Calculate the centroid of each blob; the array y contains the blob index
# (0..n_blobs-1) of every point in x.
n = len(y)
n_blobs = 5
# Count the members of each blob from the labels instead of assuming a fixed
# 40 points per blob, so the centroids stay correct if the sample size or the
# blob balance changes.
n_points_blob = np.bincount(y, minlength=n_blobs)
blob_mean = np.zeros((n_blobs, 2))
#
# For each blob: sum the coordinates of its members, divide by the member
# count, and print the result.
for k in range(n_blobs):
    blob_mean[k, :] = x[y == k].sum(axis=0) / n_points_blob[k]
    print(f"Blob {k+1} has centroid: ( {blob_mean[k, 0]}, {blob_mean[k, 1]} ) ")
Blob 1 has centroid: ( 0.9148227503116141, 4.280336857833884 )
Blob 2 has centroid: ( 2.054873265582555, 1.1110568334393494 )
Blob 3 has centroid: ( -1.543516197703645, 2.8134551104965806 )
Blob 4 has centroid: ( -1.3675452152752192, 7.891534958387625 )
Blob 5 has centroid: ( 9.210599887923255, -2.473923314565855 )
# Build a 3-cluster KMeans model with random initialisation and fit it to x,
# which already has the (samples, features) layout: 200 by 2.
kmeans = KMeans(n_clusters=3, init='random', random_state=0)
kmeans.fit(x)

# Colour each point by its assigned cluster and mark the fitted centroids,
# for a visual comparison with the blob plot.
fitted_centers = kmeans.cluster_centers_
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_, cmap='viridis')
plt.scatter(
    fitted_centers[:, 0],
    fitted_centers[:, 1],
    marker='x',
    c='black',
    label='centroids',
)
plt.legend(scatterpoints=1)
plt.show()

# Re-draw the uncoloured points so both pictures can be seen together.
plt.scatter(x[:, 0], x[:, 1], c='white', edgecolors='black', marker='o', s=50)
plt.show()

# Print the fitted centroids next to the centroids computed from the blob labels.
#
print(" KMeans Centroids ", " Original Centroids ")
print(f" {kmeans.cluster_centers_} {blob_mean}")
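# Note: in the output below, KMeans cluster 2 matches our blob 5 and KMeans cluster 3 matches our blob 4;
# KMeans cluster 1 is the mean of our blobs 1-3, because only 3 clusters were requested for 5 blobs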
KMeans Centroids Original Centroids
[[ 0.47539327 2.7349496 ]
[ 9.21059989 -2.47392331]
[-1.36754522 7.89153496]] [[ 0.91482275 4.28033686]
[ 2.05487327 1.11105683]
[-1.5435162 2.81345511]
[-1.36754522 7.89153496]
[ 9.21059989 -2.47392331]]
# 1 iteration
# Start KMeans from hand-picked centroids and cap it at a single iteration.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 1st iteration of KMeans")
# n_init=1: with an explicit init array only one initialisation is ever run;
# saying so avoids the "Explicit initial center position passed" RuntimeWarning.
kmeans_model = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=1)
kmeans_model.fit(x)
print(f'The Centroids are {kmeans_model.cluster_centers_}')
# Colour the points by the cluster assigned after this single iteration.
plt.scatter(x[:, 0], x[:, 1], c=kmeans_model.labels_, cmap='viridis')
plt.title('1 iteration')
plt.show()
******************************************
Results for 1st iteration of KMeans
Centroids are [[ 0.12781901 1.96326963]
[-1.58138962 8.0665792 ]
[ 0.53251796 4.92290316]
[ 9.21059989 -2.47392331]
[ 2.989047 1.35068599]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/cluster/_kmeans.py:1146: RuntimeWarning: Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.
self._check_params(X)
# 2 iterations
# Start KMeans from the same hand-picked centroids and cap it at two iterations.
centroids = np.array([[-3, -2], [-2, 12], [3, 8], [12, 0], [2, 12]])
print()
print("******************************************")
print("Results for 2nd iteration of KMeans")
# n_init=1: with an explicit init array only one initialisation is ever run;
# saying so avoids the "Explicit initial center position passed" RuntimeWarning.
kmeans_model_2 = KMeans(n_clusters=5, init=centroids, n_init=1, max_iter=2)
kmeans_model_2.fit(x)
print(f'The Centroids are {kmeans_model_2.cluster_centers_}')
# Colour the points by the cluster assigned after two iterations.
plt.scatter(x[:, 0], x[:, 1], c=kmeans_model_2.labels_, cmap='viridis')
plt.title('2 iterations')
plt.show()
******************************************
Results for 2nd iteration of KMeans
The Centroids are [[-1.25277421 2.68078382]
[-1.36754522 7.89153496]
[ 0.89038117 4.28715579]
[ 9.21059989 -2.47392331]
[ 2.12650031 1.07647868]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/cluster/_kmeans.py:1146: RuntimeWarning: Explicit initial center position passed: performing only one init in KMeans instead of n_init=10.
self._check_params(X)
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
# Import models
from sklearn.cluster import KMeans
# Load the insurance data from insurance.csv into a dataframe
df = pd.read_csv('insurance.csv')

# Keep an independent copy holding only the two features (age and bmi)
# together with the charges.
feature_columns = ['age', 'bmi', 'charges']
new_df = df.loc[:, feature_columns].copy()

# info() reports the number of entries, i.e. the number of data instances
new_df.info()
# so 1338 data instances
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1338 non-null int64
1 bmi 1338 non-null float64
2 charges 1338 non-null float64
dtypes: float64(2), int64(1)
memory usage: 31.5 KB
# Find the maximum & minimum value of the charges.
# Series.min()/max() are vectorised and skip missing values, unlike the
# builtin min()/max(), which iterate element by element in Python.
print(f'The minimum of charges is {new_df.charges.min()}')
print(f'The maximum of charges is {new_df.charges.max()}')
The minimum of charges is 1121.8739
The maximum of charges is 63770.42801
# Print out the data instance(s) where the max occurs: select the full row
# (age, bmi, charges) rather than printing only the maximum value itself.
print(new_df[new_df.charges == new_df.charges.max()])
63770.42801
# Print out all data instances where charges are >60,000
# The raw value arrays are kept as module-level names in case later cells use them.
charges = new_df.charges.values
bmi = new_df.bmi.values
# Select whole rows with a boolean mask so every column of each matching
# instance is shown, not just the charge.
print(new_df[new_df['charges'] > 60000])
63770.42801
60021.39897
62592.87309
# Show every row whose bmi exceeds 35 and whose charges exceed 50,000
high_bmi = new_df['bmi'] > 35
high_charges = new_df['charges'] > 50000
print(new_df[high_bmi & high_charges])
age bmi charges
34 28 36.400 51194.55914
543 54 47.410 63770.42801
577 31 38.095 58571.07448
819 33 35.530 55135.40209
# Delete all data instances where the charges are more than $60,000.
# Keep charges <= 60000 so the filter is the exact complement of the
# "> 60000" check below, and take an explicit copy so that adding columns to
# the result later does not raise SettingWithCopyWarning.
df_without_charge_outliers = new_df[new_df.charges <= 60000].copy()
new_charges_column = df_without_charge_outliers['charges']
# Print out all data instances where charges are >60000 to check;
# nothing is printed when the outliers were removed successfully.
for remaining_charge in new_charges_column[new_charges_column > 60000]:
    print(remaining_charge)
# Min-max scale the remaining charges onto the range $0 to $100, then round
# to 2 decimal places (cents); print the result to inspect the first and
# last entries.
charge_min = new_charges_column.min()
charge_max = new_charges_column.max()
charge_range = charge_max - charge_min
scaled_cost = 100 * (new_charges_column - charge_min) / charge_range
rounded_cost = scaled_cost.round(2)
print(rounded_cost)
0 27.44
1 1.05
2 5.79
3 36.31
4 4.78
...
1333 16.50
1334 1.89
1335 0.88
1336 1.54
1337 48.77
Name: charges, Length: 1335, dtype: float64
# Add column to dataframe with scaled charges.
# assign() returns a new dataframe, so the column is not written into a slice
# of new_df and no SettingWithCopyWarning is raised.
df_without_charge_outliers = df_without_charge_outliers.assign(
    **{'scaled and rounded costs': rounded_cost}
)
# NOTE(review): drop() returns a new frame and is called on df, so this only
# displays df without 'charges' (in a notebook); it removes nothing from
# df_without_charge_outliers. Left as-is because the scatterplots further
# down still read the 'charges' column.
df.drop(columns = ['charges'])
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
ageint64
18 - 64
sexobject
male50.5%
female49.5%
0
19
female
1
18
male
2
28
male
3
33
male
4
32
male
5
31
female
6
46
female
7
37
female
8
37
male
9
60
female
# Create 2D array and use KMeans with 4 clusters.
# KMeans expects a (samples, features) array, so the 1-D scaled costs are
# reshaped into a single-feature column.
x = np.array(rounded_cost)
n = len(x)
x = np.reshape(x, (n, 1))
km = KMeans(n_clusters=4)
km.fit(x)
# Print out the labels
print(km.labels_)
#
# Add column to dataframe giving cluster labels; assign() returns a new
# dataframe, so no SettingWithCopyWarning is raised.
df_without_charge_outliers = df_without_charge_outliers.assign(clusters=km.labels_)
[2 0 0 ... 0 0 2]
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Use Seaborn's white grid background for the plots below
sns.set_style("whitegrid")
# Scatterplot of charges vs bmi, with the cluster indicated by hue (Seaborn relplot)
sns.relplot(data=df_without_charge_outliers, x='charges', y='bmi', hue='clusters')
# Scatterplot of charges vs age, with the cluster indicated by hue (Seaborn relplot)
sns.relplot(data=df_without_charge_outliers, x='charges', y='age', hue='clusters')