Efficient K-means Clustering for Data-Driven Modelling

Read the full article from here.

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import plotly as py import plotly.graph_objs as go from sklearn.cluster import KMeans

# Load the cars dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/work/cars.csv')
df.head()

# Column labels exactly as they are spelled in the file.
df.columns

# Per-column dtype and non-null count.
df.info()

The brand column holds categorical values, so we need to convert them to numeric values.

# Translate the text labels in ' brand' into integer codes stored in a new
# 'Brand' column.
mapping = {
    " US.": 1,
    ' Europe.': 2,
    ' Japan.': 3,
}
brand_codes = df[' brand'].map(mapping)
df['Brand'] = brand_codes

# The text column is no longer needed once the numeric code exists.
df = df.drop(columns=[' brand'])

df.head()

df.info()

# Continue on a copy so that `df` keeps its current contents.
car = df.copy()

# Convert both columns to a numeric dtype; errors='coerce' turns entries that
# cannot be parsed into NaN instead of raising.
for column in (' weightlbs', ' cubicinches'):
    car[column] = pd.to_numeric(car[column], errors='coerce')

car.describe()

# Number of missing entries per column.
car.isnull().sum()

Some missing values were found. We fill them in the next code cell.

# Fill the missing entries of both columns with the respective column mean.
#
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on `car[column]`: that chained in-place form acts
# on an intermediate object, emits a FutureWarning on pandas 2.x and leaves
# `car` unmodified once Copy-on-Write is enabled.
for column in (' cubicinches', ' weightlbs'):
    car[column] = car[column].fillna(car[column].mean())

car.info()

car.columns

# Draw a histogram with a KDE overlay for every column of `car`, one subplot
# per column.
plt.figure(1, figsize=(15, 6))

grid_cols = 4
# Ceil division so that every column gets a cell; never fewer than the two
# rows of the original 2 x 4 layout.
grid_rows = max(2, -(-len(car.columns) // grid_cols))

n = 0
for x in car.columns:
    n += 1
    plt.subplot(grid_rows, grid_cols, n)
    # sns.distplot is deprecated; histplot with a density scale and
    # kde_kws={'cut': 3} is the replacement given in the seaborn migration
    # notes for a distplot call with default options.
    sns.histplot(car[x], bins=15, kde=True, stat='density', kde_kws={'cut': 3})
    plt.title('Distplot of {}'.format(x))

# Spacing is a figure-wide setting, so it is applied once instead of on every
# loop iteration.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Finding the Optimum Value of K

# Feature matrix: every row and every column of `car` as a plain array.
X3 = car.values

# Settings shared by all fits; only the number of clusters varies.
kmeans_options = dict(
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
)

# Fit K-means for k = 1..10 and collect the inertia reported by each fit.
inertia = []
for k in range(1, 11):
    model = KMeans(n_clusters=k, **kmeans_options)
    model.fit(X3)
    inertia.append(model.inertia_)

# Elbow plot: inertia against the number of clusters, drawn as markers with a
# semi-transparent connecting line on top.
cluster_counts = np.arange(1, 11)

plt.figure(1, figsize=(15, 6))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

The optimal number of clusters is 3, as the elbow of the inertia curve begins at that point.

from sklearn.decomposition import PCA

# Reduce the feature matrix to two principal components and store them in a
# DataFrame with one named column per component.
component_names = ['Component 1', 'Component 2']
pca = PCA(n_components=len(component_names))
principalComponents = pca.fit_transform(X3)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=component_names,
)

import matplotlib.pyplot as plt

# Scatter plot of the two principal components with generic axis labels and
# the tick marks hidden on both axes.
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)

label_size = 15
ax.set_xlabel('x-axis', fontsize=label_size)
ax.set_ylabel('y-axis', fontsize=label_size)

component_1 = principalDf['Component 1']
component_2 = principalDf['Component 2']
ax.scatter(component_1, component_2)

plt.xticks([])
plt.yticks([])

# sorted_car=principalDf.sort_values(['Component 1', 'Component 2'], ascending=[True, True])

# sorted_car

Applying Percentiles to Split the Dataset

We will create 3 clusters, so the dataset needs to be split into 3 equal parts. Percentiles range from 0 to 100, so placing 100/3 ≈ 33.3% of the data in each part is a reasonable choice.

# Cut points that divide the PCA-projected data into three equally populated
# parts: clusters[i] holds, for every principal component, the percentile that
# closes part i.
#
# Each percentile is computed as 100 * (i + 1) / 3 instead of accumulating
# 33.3. The accumulated values were 33.3 / 66.6 / 99.9, so the last cut point
# sat below the maximum and the parts were not exact thirds.
n_parts = 3
clusters = {}
for i in range(n_parts):
    percen = 100 * (i + 1) / n_parts
    clusters[i] = np.percentile(principalDf, percen, axis=0)

clusters

# PCA scatter plot with a dashed vertical line at each of the three cut points
# taken along the first component.
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)

ax.set_title('2D component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.scatter(principalDf['Component 1'], principalDf['Component 2'])

# Index 0 of every entry in `clusters` is the cut along 'Component 1'.
xposition = [clusters[part][0] for part in (0, 1, 2)]
for cut in xposition:
    plt.axvline(x=cut, color='k', linestyle='--')

plt.show()

Now, it's time to find the cluster centroids using the proposed method.

# Split the rows into three segments according to their 'Component 1' value,
# using the cut points stored in `clusters`.
component = principalDf['Component 1']
lower_cut = clusters[0][0]
upper_cut = clusters[1][0]

# Boolean masks replace the per-value Python loop and the isin() round-trip.
# The third segment is open-ended: bounding it by clusters[2][0] left every
# row above that cut point without a segment, silently dropping it from the
# centroid computation.
in_first = component <= lower_cut
in_second = (component > lower_cut) & (component <= upper_cut)
in_third = component > upper_cut

data1 = principalDf[in_first]
data2 = principalDf[in_second]
data3 = principalDf[in_third]

# Component values per segment, kept under their original names for any later
# cell that reads them.
cluster_value1 = list(data1['Component 1'])
cluster_value2 = list(data2['Component 1'])
cluster_value3 = list(data3['Component 1'])

# Join every segment back to `car` on the row index to recover the real data.
# Only the first 8 columns of the merged frame are kept.
# NOTE(review): assumes `car` has exactly 8 feature columns, so the slice
# removes the two component columns appended by the merge -- confirm.
first = pd.merge(car, data1, left_index=True, right_index=True).iloc[:, :8]
second = pd.merge(car, data2, left_index=True, right_index=True).iloc[:, :8]
third = pd.merge(car, data3, left_index=True, right_index=True).iloc[:, :8]

import time

# Benchmark K-means seeded with the per-segment means: 10 timed runs, each
# recording the number of iterations reported by the fit and the elapsed
# wall-clock time (centroid computation included).
iteration = []
time_execution = []
for run in range(10):
    start = time.time()

    # One initial centroid per segment: the mean of every column within it.
    # DataFrame.mean(axis=0) is used instead of np.mean(frame), which reduces
    # over both axes on pandas >= 2.0 and returns a single scalar, so the
    # subsequent `.values` access fails. A dedicated loop variable also avoids
    # overwriting the run counter, as the original inner loop did.
    centers = np.array(
        [segment.mean(axis=0).values for segment in (first, second, third)]
    )

    # Feed the centroids to the main K-means algorithm. n_init=1 because the
    # starting centroids are given explicitly, so there is nothing to vary
    # between restarts.
    kmeans = KMeans(n_clusters=3, init=centers, n_init=1, max_iter=100000)
    kmeans.fit(car)

    end = time.time()
    execution_time = end - start
    no_of_iteration = kmeans.n_iter_
    iteration.append(no_of_iteration)
    time_execution.append(execution_time)

iteration

time_execution

# Baseline benchmark: 10 timed runs of K-means with 3 clusters and no explicit
# initial centroids. Uses the `time` module imported by the earlier benchmark
# cell.
iteration_default = []
execution_time_default = []
for _ in range(10):
    start1 = time.time()
    kmeans = KMeans(3, max_iter=100000)
    kmeans.fit(car)
    iteration_default.append(kmeans.n_iter_)
    end1 = time.time()
    elapsed_default = end1 - start1
    execution_time_default.append(elapsed_default)

iteration_default

execution_time_default

Plotting the comparison graph

import matplotlib.pyplot as plt

# Two line charts comparing the seeded ("Improved") runs with the default
# runs: iteration counts first, execution times second.
comparisons = (
    ('Iteration Comparison', iteration, iteration_default),
    ('Execution Time Comparison', time_execution, execution_time_default),
)
for chart_title, improved_series, default_series in comparisons:
    plt.plot(improved_series, 'o-', color='green', label='Improved')
    plt.plot(default_series, 'o-', color='black', label='Default')
    plt.legend()
    plt.title(chart_title)
    plt.show()

# Assign every row of `car` the cluster predicted by the most recently fitted
# `kmeans` model, store it in a new 'cluster' column, and draw a
# parallel-coordinates plot grouped by that column.
predict = kmeans.predict(car)
car['cluster'] = predict
pd.plotting.parallel_coordinates(frame=car, class_column='cluster')