import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import adjusted_rand_score
# Load the data set: the first four columns are the features and the last
# column holds the reference labels used for scoring.
irisdata = pd.read_csv("/work/Assignment5/iris.csv")
x = irisdata.iloc[:, :4]
y = irisdata.iloc[:, -1]

# Standardize the features before fitting the mixture model; the scaled
# values are wrapped back into a DataFrame that keeps the column names.
sc = StandardScaler()
std_array = sc.fit_transform(x)
X = pd.DataFrame(std_array, columns=x.columns)

# Fix the seed: GaussianMixture initialization is randomized, so without
# random_state the cluster assignments (and therefore the score below)
# can change from one run to the next.
cluster = GaussianMixture(n_components=3, random_state=0)
cluster.fit(X)
y_pred = cluster.predict(X)

# Adjusted Rand index between the reference labels and the predicted
# clusters; the bare expression displays the value in a notebook cell.
score = adjusted_rand_score(y, y_pred)
score
from sklearn.decomposition import PCA

# Project every column except 'species' onto two principal components
# and keep the result as a DataFrame with named component columns.
pca = PCA(n_components=2)
pca_array = pca.fit_transform(irisdata.drop(columns=['species']))
pca_df = pd.DataFrame(data=pca_array, columns=["PC1", "PC2"])

# Preview the first rows of the projection (notebook display).
pca_df.head()
# Output of pca_df.head() (columns PC1 and PC2 are float64):
#                     PC1                    PC2
# 0    -2.684125625969541     0.3193972465851031
# 1   -2.7141416872943274   -0.17700122506477925
# 2    -2.888990569059296   -0.14494942608555722
# 3    -2.745342855641409   -0.31829897925191586
# 4   -2.7287165365545287    0.32675451293492047
# Plot color for each predicted cluster id.
col_code = {0: "yellow", 1: "darkblue", 2: "green"}

# Cluster ids returned by the mixture model are arbitrary, so a fixed
# id -> species table can put the wrong name on a cluster. Instead, name
# each cluster after the reference label that occurs most often among its
# members (rows: predicted cluster id, columns: reference label).
label = pd.crosstab(y_pred, y.to_numpy()).idxmax(axis=1).to_dict()

# Attach the predicted cluster id to every projected sample.
pca_df["labels"] = y_pred

# Group the projected samples by predicted cluster id (not by the true
# species) and display the per-cluster mean of each component.
groups = pca_df.groupby('labels')
groups.mean()
# Output of groups.mean() (columns PC1 and PC2 are float64; one row per "labels" group):
#                     PC1                    PC2
# 0   -2.6424154639468496    0.19088504677007037
# 1    2.0319540748561025   0.029531467490063378
# 2   0.45251775733903987   -0.24818851223237914
# Scatter plot of the two principal components, drawn as one series per
# predicted cluster so each cluster gets its own color and legend entry.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
for name, group in groups:
    # linestyle='' suppresses the connecting line so only markers are drawn.
    ax.plot(
        group.PC1,
        group.PC2,
        color=col_code[name],
        label=label[name],
        marker='o',
        linestyle='',
        ms=10,
    )
# Build the legend once, after every cluster has been added to the axes.
ax.legend()
plt.show()