# Read the full article here. (The hyperlink was not preserved in this export.)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
from sklearn.cluster import KMeans
# Load the cars dataset and take a first look at its rows, column labels and
# dtypes.
csv_path = '/work/cars.csv'
df = pd.read_csv(csv_path)
df.head()
df.columns
df.info()
# The brand column holds categorical values, so we need to convert it to numeric codes.
# Encode the textual brand column as integer codes.  The keys are matched
# against the column values verbatim, leading space and trailing dot included.
mapping = {
    " US.": 1,
    ' Europe.': 2,
    ' Japan.': 3,
}
# Append the encoded column as 'Brand' and drop the original text column.
df = df.assign(Brand=df[' brand'].map(mapping)).drop(columns=[' brand'])
df.head()
df.info()

# Work on a copy of the encoded frame.  These two columns are forced to
# numeric; entries that cannot be parsed become NaN (errors='coerce').
car = df.copy()
for column in (' weightlbs', ' cubicinches'):
    car[column] = pd.to_numeric(car[column], errors='coerce')
car.describe()
car.isnull().sum()
# Some missing values were found. We fill them with the column mean in the next code cell.
# Impute missing values with the column mean.  The filled Series is assigned
# back to the column instead of calling ``fillna(..., inplace=True)`` on the
# selected column: that form is chained assignment, which newer pandas
# releases warn about and which does not update ``car`` under copy-on-write.
for column in (' cubicinches', ' weightlbs'):
    car[column] = car[column].fillna(car[column].mean())
car.info()
car.columns

# One distribution plot per column, four plots per row.  The row count is
# derived from the number of columns (2 rows for 8 columns) so the grid never
# has fewer cells than there are columns.
plots_per_row = 4
n_rows = -(-len(car.columns) // plots_per_row)  # ceiling division
plt.figure(1, figsize=(15, 6))
for n, x in enumerate(car.columns, start=1):
    plt.subplot(n_rows, plots_per_row, n)
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # consider histplot/displot once the installed version is confirmed.
    sns.distplot(car[x], bins=15)
    plt.title('Distplot of {}'.format(x))
# Spacing is a figure-wide setting, so apply it once after all axes exist
# rather than on every pass through the loop.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
# Finding the optimum value of K
# Feature matrix used for clustering: every column of ``car`` as an array.
X3 = car.values

# Fit k-means for k = 1..10 and keep the inertia of each fit, so the elbow
# can be read off the curve plotted below.
inertia = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300,
           tol=0.0001, random_state=111, algorithm='elkan').fit(X3).inertia_
    for k in range(1, 11)
]

# Elbow plot: markers plus a faint connecting line.
k_values = np.arange(1, 11)
plt.figure(1, figsize=(15, 6))
plt.plot(k_values, inertia, 'o')
plt.plot(k_values, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
# The optimal number of clusters is 3, since the elbow of the inertia curve begins at that point.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the feature matrix onto its first two principal components and
# wrap the result in a frame with one named column per component.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X3)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['Component 1', 'Component 2'],
)

# Plain scatter of the projection with the tick marks hidden.
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(principalDf['Component 1'], principalDf['Component 2'])
ax.set_xlabel('x-axis', fontsize=15)
ax.set_ylabel('y-axis', fontsize=15)
plt.xticks([])
plt.yticks([])
# Applying percentiles to split the dataset
# We will create 3 clusters, so we need to split the dataset into 3 equal parts. Percentiles range from 0 to 100, so placing roughly 100/3 ≈ 33.3% of the points in each part is a reasonable choice.
# Boundaries that cut the PCA projection into three equal-sized parts.
# ``clusters[i]`` is the upper boundary of part ``i`` for each column of
# ``principalDf`` (index 0 -> 'Component 1', index 1 -> 'Component 2').
#
# The cut points are exact thirds of 100 rather than a running sum of 33.3:
# that sum only reaches 99.9, which put the last boundary just below the
# maximum and left the largest point(s) outside every part.
n_parts = 3
clusters = {
    i: np.percentile(principalDf, 100.0 * (i + 1) / n_parts, axis=0)
    for i in range(n_parts)
}
clusters

# Scatter of the projection with the boundaries along the first component
# drawn as dashed vertical lines.
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2D component PCA', fontsize=20)
ax.scatter(principalDf['Component 1'], principalDf['Component 2'])
xposition = [clusters[i][0] for i in range(n_parts)]
for xc in xposition:
    plt.axvline(x=xc, color='k', linestyle='--')
plt.show()
# Now it is time to find the initial cluster centroids using the proposed method.
import time

# Split the rows into three segments according to where their first principal
# component falls relative to the percentile boundaries.  Boolean masks are
# used instead of collecting float values and matching them back with isin().
component_1 = principalDf['Component 1']
lower_cut = clusters[0][0]
upper_cut = clusters[1][0]
segment_masks = [
    component_1 <= lower_cut,
    (component_1 > lower_cut) & (component_1 <= upper_cut),
    # Open-ended on purpose: every remaining row belongs to the last segment,
    # so rows above the final boundary are no longer silently dropped.
    component_1 > upper_cut,
]
data1, data2, data3 = (principalDf[mask] for mask in segment_masks)

# Recover the original feature rows of each segment.  ``principalDf`` was
# built from ``car``'s values in row order, so the masks are applied to
# ``car`` by position; this replaces the index merge followed by a
# hard-coded "first 8 columns" slice.
first, second, third = (car.loc[mask.values] for mask in segment_masks)

# Benchmark: seed k-means with the per-segment feature means and record how
# many iterations and how much wall-clock time each of 10 runs takes.
iteration = []
time_execution = []
for _ in range(10):
    start = time.time()
    # One starting centroid per segment: the column-wise mean of its rows.
    # ``DataFrame.mean(axis=0)`` is used instead of ``np.mean(frame)``, which
    # passes axis=None and yields a single scalar on recent pandas versions.
    centers = np.array(
        [segment.mean(axis=0).values for segment in (first, second, third)]
    )
    # Feed the centroids to the standard k-means algorithm.  With explicit
    # centres only one initialisation is meaningful, so n_init is pinned to 1.
    kmeans = KMeans(n_clusters=3, init=centers, n_init=1, max_iter=100000)
    kmeans.fit(car)
    end = time.time()
    iteration.append(kmeans.n_iter_)
    time_execution.append(end - start)
iteration
time_execution
# Baseline benchmark: 10 runs of k-means with the library's default
# initialisation, recording iterations and wall-clock time per run.
# NOTE(review): n_init is left at the scikit-learn default here, which varies
# between versions - confirm the comparison with the seeded runs is like for
# like.
iteration_default = []
execution_time_default = []
for _ in range(10):
    start1 = time.time()
    kmeans = KMeans(n_clusters=3, max_iter=100000)
    kmeans.fit(car)
    iteration_default.append(kmeans.n_iter_)
    # The end timestamp is taken after the iteration count is stored.
    end1 = time.time()
    execution_time_default.append(end1 - start1)
iteration_default
execution_time_default
# Plotting the comparison graphs
import matplotlib.pyplot as plt

# One chart per metric, each overlaying the seeded ("Improved") runs and the
# default-initialisation runs.
comparisons = (
    ('Iteration Comparison', iteration, iteration_default),
    ('Execution Time Comparison', time_execution, execution_time_default),
)
for chart_title, improved_runs, default_runs in comparisons:
    plt.plot(improved_runs, 'o-', color='green', label='Improved')
    plt.plot(default_runs, 'o-', color='black', label='Default')
    plt.legend()
    plt.title(chart_title)
    plt.show()

# Label every row with the cluster assigned by ``kmeans`` (whichever model was
# fitted last before this point) and draw a parallel-coordinates plot coloured
# by that label.
predict = kmeans.predict(car)
car['cluster'] = predict
pd.plotting.parallel_coordinates(car, 'cluster')