import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# Load the crime dataset (one row per observation; numeric columns
# Murder, Assault, UrbanPop, Rape plus an unnamed label column).
# BUG FIX: the original read the CSV twice — first from '/work/', then
# immediately overwrote that with a read from the working directory.
# Keep the single read that actually took effect.
data = pd.read_csv("crime_data2.csv")
data.head()
# Quick sanity check for missing values before clustering.
data.isnull().any()
# Drop the non-numeric 'Unnamed: 0' column (presumably the state/record
# label — verify against the CSV) and keep only the numeric features.
mydata = data.drop(columns=["Unnamed: 0"])
mydata.head()
# Scale every feature to [0, 1] with MinMaxScaler so columns with large
# raw magnitudes (e.g. Assault) don't dominate KMeans' Euclidean distances.
scaler = MinMaxScaler()
norm_mydata = mydata.copy()

def minmaxscaler(x):
    """Min-max scale each column of DataFrame *x* to [0, 1], in place.

    Mutates *x* column by column using the module-level ``scaler``.
    """
    # BUG FIX: DataFrame.iteritems() was removed in pandas 2.0 — use items().
    for column_name, column_data in x.items():
        x[column_name] = scaler.fit_transform(np.array(column_data).reshape(-1, 1))

minmaxscaler(norm_mydata)
norm_mydata.head()
# Scree/elbow plot: fit KMeans for K = 2..10 and record the within-cluster
# sum of squares (inertia_); the "elbow" of the curve suggests a good K.
from matplotlib import style
style.use('fivethirtyeight')
k = list(range(2, 11))
sum_of_squared_distances = []
for i in k:
    # n_init and random_state pinned so the curve is reproducible and
    # modern scikit-learn does not warn about the changing n_init default.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(norm_mydata)
    sum_of_squared_distances.append(kmeans.inertia_)
plt.figure(figsize=(10, 5))
plt.plot(k, sum_of_squared_distances, 'go--')
plt.xlabel('Number of Clusters')
plt.ylabel('Within Cluster Sum of squares')
plt.title('Elbow Curve to find optimum K')
# Build the final model with K=4 (chosen from the elbow curve).
# n_init/random_state pinned so labels and centroids are reproducible.
kmeans4 = KMeans(n_clusters=4, n_init=10, random_state=42)
# BUG FIX: the original called fit() and then fit_predict(), training the
# model twice — and, without a fixed seed, the printed labels could come
# from a different random initialisation than the stored centroids.
# fit_predict() alone both trains the model and returns the labels.
y_pred = kmeans4.fit_predict(norm_mydata)
print(y_pred)
# Attach labels to the original (unscaled) data; +1 so clusters number 1..4.
data['Cluster'] = y_pred + 1
# Centroids live in the scaled [0, 1] feature space; index them 1..4 to
# match the cluster numbering above.
centroids = pd.DataFrame(kmeans4.cluster_centers_,
                         columns=['Murder', 'Assault', 'UrbanPop', 'Rape'])
centroids.index = np.arange(1, len(centroids) + 1)
centroids
# Sample visualisation: scatter two of the four features, coloured by the
# assigned cluster, to eyeball how well the observations separate.
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.set_palette("pastel")
sns.scatterplot(data=data, x='Murder', y='Assault',
                hue='Cluster', palette='bright')