import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plotting theme to all subsequent figures.
sns.set()
# IPython/Jupyter magic (not plain Python syntax): render figures inline in the notebook.
%matplotlib inline
# Confirmation message that the setup statements above have run.
print('Setup Completed ^^')
# Load the abalone dataset from the current working directory.
data = pd.read_csv('abalone.csv')
# Preview the first rows, transposed so each column reads as a row.
data.head().T
# Number of rows and columns.
data.shape
# Data type of each column.
data.dtypes
# Relative frequency of each Sex category.
data['Sex'].value_counts(normalize=True)
# Distribution of ring counts, ordered by number of rings.
data['Rings'].value_counts().sort_index()
# seaborn styles
sns.set_context('notebook')
sns.set_style('white')

# One colour from the current seaborn palette per Sex category.
male = sns.color_palette()[1]
female = sns.color_palette()[4]
infant = sns.color_palette()[8]

# Integer bin edges 1..30, so each bin covers exactly one ring count.
bin_range = np.arange(1, 31)

# Overlaid histograms of ring counts, one per Sex category (M, F, I).
ax = plt.axes()
for sex, plot_color in zip(['M', 'F', 'I'], [male, female, infant]):
    rings = data.loc[data['Sex'] == sex, 'Rings']
    rings.hist(bins=bin_range, alpha=0.5, ax=ax,
               color=plot_color, label=sex)

ax.legend()
ax.set(xlabel='Number of Rings', ylabel='Frequency')

# Force tick labels to sit in the middle of each bin.
ax.set_xlim(3, 30)
ax.set_xticks(bin_range + 0.5)
ax.set_xticklabels(bin_range);
# Pass a real boolean: the string 'on' only worked because it is truthy.
ax.grid(True)
# Plain white background for the categorical plots.
sns.set_style('white')
# Number of samples per Sex category, drawn as horizontal bars.
sns.countplot(y='Sex', data=data)
# Ring counts aggregated per Sex category with seaborn's default estimator.
sns.barplot(data=data, x='Sex', y='Rings')
# Every column except the Sex label and the Rings count is used as a feature.
excluded = {'Sex', 'Rings'}
float_columns = [name for name in data.columns if name not in excluded]

# Correlation matrix of the feature columns.
corr_mat = data[float_columns].corr()

# Zero the diagonal so a feature's self-correlation cannot win the
# idxmax lookup below.
for feature in corr_mat.columns:
    corr_mat.loc[feature, feature] = 0.0
corr_mat
sns.heatmap(corr_mat, annot=True)

# For each feature, the other feature with the largest absolute correlation.
corr_mat.abs().idxmax()
# Skewness of each feature column, most positively skewed first.
skew_columns = data[float_columns].skew().sort_values(ascending=False)
# Keep only the features whose skew exceeds 0.75.
skew_columns = skew_columns[skew_columns > 0.75]
skew_columns
# Replace each of those columns with log(1 + x) of its values.
for col in skew_columns.index:
    data[col] = np.log1p(data[col])
from sklearn.preprocessing import StandardScaler
# Fit a StandardScaler on the feature columns, then overwrite them with
# the scaled values.
sc = StandardScaler()
sc.fit(data[float_columns])
data[float_columns] = sc.transform(data[float_columns])
data.head()
# Pairwise scatter plots of the scaled features.
sns.set_context('notebook')
sns.pairplot(data[float_columns])
from sklearn.cluster import KMeans
# Fit a 3-cluster KMeans model on the feature columns with a fixed seed.
kmeans = KMeans(n_clusters=3, random_state=42).fit(data[float_columns])
# Store each row's assigned cluster as a new 'kmeans' column.
data['kmeans'] = kmeans.predict(data[float_columns])
# Row counts for every (cluster, Sex) combination.
cluster_sizes = data[['Sex', 'kmeans']].groupby(['kmeans', 'Sex']).size()
cluster_sizes.to_frame(name='number')
# Create and fit a range of KMeans models, one per cluster count 1..20,
# recording the inertia of each fit.
max_clusters = 20
km_list = []
for clust in range(1, max_clusters + 1):
    km = KMeans(n_clusters=clust, random_state=42).fit(data[float_columns])
    km_list.append(pd.Series({'clusters': clust,
                              'inertia': km.inertia_,
                              'model': km}))

# Build the plotting frame from typed columns. Concatenating the mixed-type
# Series and transposing would leave 'clusters' and 'inertia' as object dtype,
# whereas the numeric ticks and limits set below need a numeric index.
plot_data = pd.DataFrame(
    {'inertia': [entry['inertia'] for entry in km_list]},
    index=pd.Index([entry['clusters'] for entry in km_list], name='clusters'),
)

# Inertia against number of clusters (elbow plot).
ax = plot_data.plot(marker='o', ls='-', color='purple')
ax.set_xticks(range(0, max_clusters + 1, 2))
ax.set_xlim(0, max_clusters + 1)
ax.set(xlabel='Cluster', ylabel='Inertia');
from sklearn.cluster import AgglomerativeClustering
# Ward-linkage agglomerative clustering of the feature columns into 3 clusters.
ag = AgglomerativeClustering(n_clusters=3, linkage='ward', compute_full_tree=True)
# fit_predict both fits the model and returns the labels, so a separate
# fit() call beforehand would only repeat the whole clustering.
data['agglom'] = ag.fit_predict(data[float_columns])

# Row counts for every (Sex, agglomerative cluster) combination.
(data[['Sex', 'agglom', 'kmeans']]
 .groupby(['Sex', 'agglom'])
 .size()
 .to_frame(name='number'))

# Row counts for every (Sex, agglomerative cluster, KMeans cluster)
# combination, to compare the two clusterings.
(data[['Sex', 'agglom', 'kmeans']]
 .groupby(['Sex', 'agglom', 'kmeans'])
 .size()
 .to_frame(name='number'))
# Summary of the object-dtype columns, one row per column.
data.describe(include='object').transpose()
# Default summary statistics, one row per column.
data.describe().transpose()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
# Binary target: 1 when a row has more than 9 rings, otherwise 0.
y = data['Rings'].gt(9).astype(int)
# Feature set including the KMeans label; the agglomerative label, Sex and
# the Rings column (source of the target) are removed.
X_with_kmeans = data.drop(columns=['agglom', 'Sex', 'Rings'])
# Same features without the KMeans label, for comparison.
X_without_kmeans = X_with_kmeans.drop(columns='kmeans')
# Ten stratified shuffle splits with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=10, random_state=6535)
def get_avg_roc_10splits(estimator, X, y):
    """Return the mean ROC-AUC of ``estimator`` over the splits of ``sss``.

    The estimator is refitted on the training rows of every split produced
    by the module-level splitter ``sss`` and scored on the held-out rows
    using the predicted probability of the second class.

    Args:
        estimator: Classifier exposing ``fit`` and ``predict_proba``.
        X: Feature DataFrame.
        y: Binary target Series aligned row-for-row with ``X``.

    Returns:
        The average of the per-split ROC-AUC scores.
    """
    roc_auc_list = []
    for train_index, test_index in sss.split(X, y):
        # split() yields positional indices, so both the train and the test
        # rows must be selected with .iloc (label-based .loc is only
        # equivalent when the index happens to be 0..n-1).
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        estimator.fit(X_train, y_train)
        # Probability of the positive class (second column).
        y_scored = estimator.predict_proba(X_test)[:, 1]
        roc_auc_list.append(roc_auc_score(y_test, y_scored))
    return np.mean(roc_auc_list)
# Seed the forest so the with/without-kmeans comparison is reproducible,
# in line with the fixed seeds used for KMeans and the splitter.
estimator = RandomForestClassifier(random_state=42)

# Average ROC-AUC with and without the KMeans label as an input feature.
roc_with_kmeans = get_avg_roc_10splits(estimator, X_with_kmeans, y)
roc_without_kmeans = get_avg_roc_10splits(estimator, X_without_kmeans, y)

print("Without kmeans cluster as input to Random Forest, roc-auc is \"{0}\"".format(roc_without_kmeans))
print("Using kmeans cluster as input to Random Forest, roc-auc is \"{0}\"".format(roc_with_kmeans))