!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
# Miscellaneous
from __future__ import print_function
# Import required libraries
import pandas as pd
import numpy as np
from itertools import product
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from yellowbrick.cluster import KElbowVisualizer
from ydata_profiling import ProfileReport
# Sklearn
from sklearn.preprocessing import StandardScaler
import sklearn.impute
import sklearn.compose
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, AffinityPropagation, MeanShift, SpectralClustering, estimate_bandwidth
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import NearestNeighbors
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb"
%run "pandas-missing-extension.ipynb"
# Install libraries with pip
# !pip install pyarrow
# !pip install pandarallel
# !pip install PivotTableJS
%%time
df_campaign = pd.read_csv('marketing_campaign.csv', sep='\t')
df_campaign.to_feather("marketing_campaign.feather")
%%time
df_campaign = pd.read_feather("marketing_campaign.feather")
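# Quick sanity check (assuming both files were written by the cells above):
# compare the on-disk size of the CSV and the Feather copy.
import os
for path in ["marketing_campaign.csv", "marketing_campaign.feather"]:
    print(f"{path}: {os.path.getsize(path) / 1024 ** 2:.2f} MB")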
df_campaign.describe()
df_campaign.info(memory_usage = "deep")
memory_usage = df_campaign.memory_usage(deep=True) / 1024 ** 2
print(f'memory usage of features:\n{memory_usage.head(7)}')
print('memory usage sum:',memory_usage.sum())
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that fits their value range."""
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Note: float16 trades precision for memory, acceptable for this EDA
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} MB ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
df_campaign = reduce_memory_usage(df_campaign, verbose=True)
df_campaign.info(memory_usage="deep")
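# A hedged alternative sketch: pandas' built-in downcasting with
# pd.to_numeric(..., downcast=...) gets a similar reduction without a
# hand-rolled function (floats stop at float32 rather than float16):
def downcast_with_pandas(df):
    df = df.copy()
    for col in df.select_dtypes(include="integer").columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    for col in df.select_dtypes(include="float").columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    return df
downcast_with_pandas(df_campaign).info(memory_usage="deep")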
df_campaign_transformed = df_campaign.copy(deep=True)
print(df_campaign_transformed['Education'].value_counts(), '\n')
print(df_campaign_transformed['Marital_Status'].value_counts())
df_campaign_transformed['Education'] = df_campaign_transformed['Education'].replace(['PhD','2n Cycle','Graduation', 'Master'],'Post Graduate')
df_campaign_transformed['Education'] = df_campaign_transformed['Education'].replace(['Basic'], 'Under Graduate')
df_campaign_transformed['Marital_Status'].value_counts()
df_campaign_transformed['Marital_Status'] = df_campaign_transformed['Marital_Status'].replace(['Married', 'Together'],'Relationship')
df_campaign_transformed['Marital_Status'] = df_campaign_transformed['Marital_Status'].replace(['Divorced', 'Widow', 'Alone', 'YOLO', 'Absurd'],'Single')
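# Equivalent one-step sketch: the same consolidation expressed as a nested
# replace mapping (applied to a fresh copy of the raw frame for illustration).
category_map = {
    'Education': {'PhD': 'Post Graduate', '2n Cycle': 'Post Graduate',
                  'Graduation': 'Post Graduate', 'Master': 'Post Graduate',
                  'Basic': 'Under Graduate'},
    'Marital_Status': {'Married': 'Relationship', 'Together': 'Relationship',
                       'Divorced': 'Single', 'Widow': 'Single', 'Alone': 'Single',
                       'YOLO': 'Single', 'Absurd': 'Single'},
}
df_campaign.copy().replace(category_map)[['Education', 'Marital_Status']].head()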
df_campaign_transformed['Dt_Customer'].head(1)
df_campaign_transformed['Dt_Customer'] = pd.to_datetime(df_campaign_transformed['Dt_Customer'], format='%d/%m/%Y')
df_campaign_transformed.sort_values(by='Dt_Customer', inplace=True)
df_campaign_transformed = df_campaign_transformed.set_index('Dt_Customer')
df_campaign_transformed['Year_Customer'] = df_campaign_transformed.index.year
df_campaign_transformed['Month_Customer'] = df_campaign_transformed.index.month
df_campaign_transformed
categorical_columns = df_campaign_transformed.select_dtypes(include='object').columns
categorical_columns
categorical_transformer = sklearn.compose.make_column_transformer(
(sklearn.preprocessing.OrdinalEncoder(), [categorical_columns[0]]),
(sklearn.preprocessing.OrdinalEncoder(), [categorical_columns[1]]),
remainder="passthrough"
)
df_campaign_transformed = (
pd.DataFrame(
categorical_transformer.fit_transform(df_campaign_transformed),
columns = categorical_transformer.get_feature_names_out(),
index = df_campaign_transformed.index
)
)
df_campaign_transformed.columns = df_campaign_transformed.columns.str.replace('remainder__', '')
df_campaign_transformed.columns = df_campaign_transformed.columns.str.replace('ordinalencoder-1__', '')
df_campaign_transformed.columns = df_campaign_transformed.columns.str.replace('ordinalencoder-2__', '')
df_campaign_transformed
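# Hedged alternative: since scikit-learn 1.0 the column transformer accepts
# verbose_feature_names_out=False, which keeps the original column names and
# makes the prefix stripping above unnecessary; a single OrdinalEncoder can
# also handle both categorical columns at once. Sketch only, not fitted here:
alt_transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), list(categorical_columns)),
    remainder="passthrough",
    verbose_feature_names_out=False,
)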
print(
categorical_transformer
.named_transformers_
.get("ordinalencoder-2")
.categories_
)
print(
categorical_transformer
.named_transformers_
.get('ordinalencoder-2')
.inverse_transform(
X = [[1], [0]]
)
)
df_campaign_transformed.info()
df_campaign_transformed['Kidhome'].value_counts()
df_campaign_transformed['Teenhome'].value_counts()
df_campaign_transformed['Children'] = df_campaign_transformed['Kidhome'] + df_campaign_transformed['Teenhome']
df_campaign_transformed = df_campaign_transformed.drop(columns=["Kidhome", "Teenhome", "ID"])
df_campaign_transformed.missing.missing_variable_summary()
missingno.bar(df=df_campaign_transformed)
missingno.matrix(df=df_campaign_transformed)
knn_imputer = sklearn.impute.KNNImputer()
df_campaign_imputeknn = df_campaign_transformed.copy(deep=True)
df_campaign_imputeknn.iloc[:, :] = knn_imputer.fit_transform(df_campaign_imputeknn).round()
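# For comparison, a hedged baseline: median imputation with SimpleImputer.
# The KNN-imputed frame above is what the rest of the notebook uses.
median_imputer = sklearn.impute.SimpleImputer(strategy="median")
df_campaign_imputemedian = df_campaign_transformed.copy(deep=True)
df_campaign_imputemedian.iloc[:, :] = median_imputer.fit_transform(df_campaign_imputemedian)
print('Income mean (KNN vs median imputation):',
      df_campaign_imputeknn['Income'].mean(),
      df_campaign_imputemedian['Income'].mean())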
(
pd.concat(
[
df_campaign_imputeknn,
df_campaign_transformed.missing.create_shadow_matrix(True, False, suffix="_imp", only_missing=True)
],
axis=1
)
.pipe(
lambda df: (
px.scatter(
df,
x="Income",
y="MntWines",
color='Income_imp',
marginal_x="box",
marginal_y="box"
)
)
)
)
df_campaign_imputeknn.missing.number_missing()
df_campaign_imputeknn[df_campaign_imputeknn.duplicated()]
df_campaign_imputeknn.duplicated().value_counts()
df_campaign_imputeknn.drop_duplicates(inplace=True)
df_campaign_imputeknn.nunique()
# Z_CostContact and Z_Revenue are constant (see nunique above), so they carry no clustering signal
df_campaign_imputeknn = df_campaign_imputeknn.drop(columns=["Z_CostContact", "Z_Revenue"])
df_campaign_imputeknn = reduce_memory_usage(df_campaign_imputeknn, verbose=True)
profile = ProfileReport(
df_campaign_imputeknn, title="Pandas Profiling Report", html={"style": {"primary_color": "#FA0087"}},
minimal=True
)
profile
columns_numeric = ['Year_Birth',
'Income', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
# 'pio' and the custom 'new_template' are defined in template_visualitation.ipynb (loaded with %run above)
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup
plot_rows=2
plot_cols=7
fig = make_subplots(rows=plot_rows, cols=plot_cols, shared_yaxes=False)
# add traces
x = 0
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig.add_trace(
            go.Box(
                y=df_campaign_imputeknn[columns_numeric[x]].values,
                name=columns_numeric[x],
            ),
            row=i,
            col=j,
        )
        x += 1
fig.update_layout(
width=1500,
height=800)
fig.show()
df_campaign_imputeknn[df_campaign_imputeknn['Income'] > 300000]
df_campaign_imputeknn['Income'].mean()
df_campaign_imputeknn['Income'][df_campaign_imputeknn['Income'] < 300000].mean()
df_campaign_imputeknn['Income'] = df_campaign_imputeknn['Income'].replace(666666.0, df_campaign_imputeknn['Income'][df_campaign_imputeknn['Income'] < 300000].mean())
df_campaign_imputeknn[df_campaign_imputeknn['Income'] > 300000]
g = sns.swarmplot(data=df_campaign_imputeknn, x='Income', s=4.5, orient="h")
plt.show()
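# The 666666 income above was spotted by eye; a more systematic (hedged)
# sketch is the 1.5 * IQR rule, which flags unusually high incomes for review
# rather than replacing them automatically.
q1, q3 = df_campaign_imputeknn['Income'].quantile([0.25, 0.75])
upper_fence = q3 + 1.5 * (q3 - q1)
print(f"{(df_campaign_imputeknn['Income'] > upper_fence).sum()} incomes above {upper_fence:.0f}")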
df_campaign_imputeknn[df_campaign_imputeknn['Year_Birth'] < 1920]
round(df_campaign_imputeknn.Year_Birth.astype("float32").mean(), 0)
df_campaign_imputeknn['Year_Birth'][df_campaign_imputeknn['Year_Birth'] > 1920].astype("float32").mean()
df_campaign_imputeknn['Year_Birth'] = df_campaign_imputeknn['Year_Birth'].replace([1899.0, 1900.0, 1893.0], round(df_campaign_imputeknn['Year_Birth'][df_campaign_imputeknn['Year_Birth'] > 1920].astype("float32").mean(), 0))
df_campaign_imputeknn[df_campaign_imputeknn['Year_Birth'] < 1920]
g = sns.swarmplot(data=df_campaign_imputeknn, x='Year_Birth', s=4.5, orient="h")
plt.show()
columns_categorical = ['Children', 'Education', 'Marital_Status', 'Response']
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup
plot_rows=1
plot_cols=4
fig = make_subplots(rows=plot_rows,
cols=plot_cols,
shared_yaxes=False,
vertical_spacing=0.1,
subplot_titles=('1', '2', '3', '4'),
specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}, {'type':'domain'}]]
)
# add traces
x = 0
names = {}
count = 1
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig1 = px.pie(df_campaign_imputeknn, values='Income', names=columns_categorical[x])
        fig.add_trace(fig1.data[0], row=i, col=j)
        names[str(count)] = columns_categorical[x]
        x += 1
        count += 1
fig.for_each_annotation(lambda a: a.update(text = names[a.text]))
fig.update_layout(width=600, height=400)
fig.update_annotations(y=0.8)
fig.show()
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 8, 'color': 'white'}
fig = px.imshow(
df_campaign_imputeknn[columns_numeric].corr(),
color_continuous_scale=px.colors.diverging.RdBu[::-1],
text_auto='.2f',
zmin=-1,
zmax=1,
)
fig.update_xaxes(tickangle=60, automargin='height')
fig.show()
mpl.rcParams["axes.labelsize"] = 8
g = sns.pairplot(df_campaign_imputeknn[columns_numeric],
corner=True,
diag_kind='kde',
plot_kws=dict(s=10),
)
g.map_lower(sns.kdeplot, levels=4, color="#FA0087")
g.fig.set_size_inches(24,16)
g.fig.suptitle('Comparative plot: all vs all')
plt.show()
marital_status = (
    categorical_transformer
    .named_transformers_
    .get('ordinalencoder-2')
    .inverse_transform(X=[[0], [1]])
    .ravel()  # flatten the (2, 1) array so titles show plain strings
)
education_status = (
    categorical_transformer
    .named_transformers_
    .get('ordinalencoder-1')
    .inverse_transform(X=[[0], [1]])
    .ravel()
)
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
fig = px.histogram(df_campaign_imputeknn, x="Year_Birth", color="Marital_Status", opacity=0.8, barmode='overlay')
fig.update_layout(
title=f"Histogram Marital Status, 0={marital_status[0]} and 1={marital_status[1]}",
)
fig.show()
fig = px.histogram(df_campaign_imputeknn, x="Year_Birth", color="Education", opacity=0.8, barmode='overlay')
fig.update_layout(
title=f"Histogram Education, 0={education_status[0]} and 1={education_status[1]}",
)
fig.show()
hist_data = [df_campaign_imputeknn["Year_Birth"]]
group_labels = ["distplot"] # name of the dataset
fig = ff.create_distplot(hist_data, group_labels, bin_size=1, colors=["#FA0087"])
fig.update_layout(
title="Year Birth costumers"
)
fig.show()
g = sns.catplot(
data=df_campaign_imputeknn, kind="swarm",
x="Marital_Status", y="Income",
hue='Education',
col="Children",
s=3.2,
palette="deep",
)
g.set_xticklabels(["Relationship", "Single"])
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle(f"In Catplot, Education is 0={education_status[0]} and 1={education_status[1]}",
fontsize=24, fontdict={"weight": "bold"})
plt.show(g)
g = sns.catplot(
data=df_campaign_imputeknn, kind="bar",
x="Education", y="Income",
hue='Marital_Status',
col="Children",
palette="deep",
)
g.set_xticklabels(["Post Graduate", "Under Graduate"])
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle(f"In Catplot, marital_status is 0={marital_status[0]} and 1={marital_status[1]}",
fontsize=24, fontdict={"weight": "bold"})
plt.show(g)
df_campaign_imputeknn[['MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds', 'Children']].groupby('Children').mean()
df_campaign_imputeknn[['MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds', 'Children']].groupby('Children').sum()
subplots_edad = df_campaign_imputeknn.groupby(['Year_Birth']).sum()
fig = make_subplots(
rows=4,
cols=1,
subplot_titles=(
"NumStorePurchases for Year_Birth",
"NumWebPurchases for Year_Birth",
"NumWebVisitsMonth for Year_Birth",
"NumCatalogPurchases for Year_Birth"
),
shared_xaxes=True,
vertical_spacing=0.1
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumStorePurchases, mode="lines", name="NumStorePurchases", opacity=0.8),
row=1,
col=1,
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumWebPurchases, mode="lines", name="NumWebPurchases", opacity=0.8),
row=2,
col=1,
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumWebVisitsMonth, mode="lines", name="NumWebVisitsMonth", opacity=0.8),
row=3,
col=1,
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumCatalogPurchases, mode="lines", name="NumCatalogPurchases", opacity=0.8),
row=4,
col=1,
)
fig.update_layout(title_text="Subplots variables Place of Dataset")
fig.show()
products = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntGoldProds']
for product_col in products:
    fig = px.histogram(df_campaign_imputeknn, x='Year_Customer', y=product_col, color="Month_Customer", barmode='relative', title=f"{product_col} for Year_Customer")
    fig.update_layout(width=800, height=600, bargap=0.2)
    fig.update_xaxes(tickvals=[2012, 2013, 2014])
    fig.show()
scaler = StandardScaler()
df_campaign_scaled = scaler.fit_transform(df_campaign_imputeknn)
df_campaign_scaled = pd.DataFrame(df_campaign_scaled, columns = df_campaign_imputeknn.columns )
df_campaign_scaled
df_campaign_scaled = reduce_memory_usage(df_campaign_scaled, verbose=True)
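# Sanity check on the scaling: every standardized column should have a mean
# near 0 and a standard deviation near 1 (tolerances are loose because
# reduce_memory_usage may have downcast columns to float16).
print('max |mean|:', df_campaign_scaled.mean().abs().max())
print('std min/max:', df_campaign_scaled.std().min(), df_campaign_scaled.std().max())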
n = 15
principal_components = {
'PCA': PCA(n_components=n , svd_solver="arpack"),
'IncrementalPCA': IncrementalPCA(n_components=n)
}
fig = go.Figure()
for name, principal_component in principal_components.items():
    principal_component.fit(df_campaign_scaled)
    cum_var = np.cumsum(np.round(principal_component.explained_variance_ratio_, decimals=4) * 100)
    # Add each method's cumulative-variance curve to the same figure
    fig.add_trace(go.Scatter(
        x=list(range(1, len(principal_component.explained_variance_) + 1)),
        y=cum_var,
        name=name,
        mode='lines+markers',
    ))
    # Report how much variance the n components capture with this method
    total_var = principal_component.explained_variance_ratio_.sum() * 100
    print(f"Total Explained Variance: {total_var:.2f}%, to {n} components with {name}")
# Render the combined figure
fig.update_layout(showlegend=True)
fig.show()
pca = PCA()
pca.fit(df_campaign_scaled)
pca_data_scaled = pca.transform(df_campaign_scaled)
# Bar plot of explained_variance
plt.bar(
range(1,len(pca.explained_variance_)+1),
pca.explained_variance_
)
# Horizontal reference line at eigenvalue = 1 (the Kaiser criterion)
plt.plot([1] * (len(pca.explained_variance_) + 1), 'r', linewidth=1)
plt.xlabel('PCA Feature')
plt.ylabel('Explained variance')
plt.title('Feature Explained Variance')
plt.show()
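# Instead of hand-picking the number of components, PCA also accepts a
# variance fraction (a hedged sketch): keep the fewest components that explain
# at least 90% of the variance. The cells below keep 8 components by hand.
pca_90 = PCA(n_components=0.90, svd_solver="full")
pca_90.fit(df_campaign_scaled)
print(f"{pca_90.n_components_} components explain >= 90% of the variance")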
pca_data_standard = pd.DataFrame(pca_data_scaled[:,0:8], columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8'])
pca_data_standard.head(1)
pca_data_standard = reduce_memory_usage(pca_data_standard, verbose=True)
fig_elb = KElbowVisualizer(KMeans(random_state=42, n_init=10, max_iter=10000), k=(2,10))
fig_elb.fit(pca_data_standard)
print(f'Elbow value= {fig_elb.elbow_value_}')
fig_elb.show()
Sum_of_squared_distances = []
silhouette_scores = []
K = range(2, 10)
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=42, max_iter=1000)
    y = km.fit_predict(pca_data_standard)
    Sum_of_squared_distances.append(km.inertia_)
    silhouette_scores.append(silhouette_score(pca_data_standard, y))
fig, ax1 = plt.subplots(figsize=(8,8))
color = 'tab:blue'
ax1.set_xlabel('K')
ax1.set_ylabel('Inertia', color=color)
ax1.plot(K, Sum_of_squared_distances, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Silhouette Score', color=color)
ax2.plot(K, silhouette_scores, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.show()
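# Optional, hedged sketch: with the third-party 'kneed' package the elbow can
# be located programmatically from the inertia curve computed above.
# !pip install kneed
# from kneed import KneeLocator
# knee = KneeLocator(list(K), Sum_of_squared_distances, curve="convex", direction="decreasing")
# print(f"Detected elbow at k = {knee.elbow}")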
pio.templates['new_template']['layout']['plot_bgcolor'] = 'white'
range_n_clusters = [2,3,4,5]
figures = []
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = make_subplots(rows=1, cols=2,
                        print_grid=False,
                        subplot_titles=('The silhouette plot for the various clusters.',
                                        'The visualization of the clustered data.'))
    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient ranges over [-1, 1], but here all values
    # lie within [-0.1, 1].
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])
    # The (n_clusters + 1) * 10 inserts blank space between the silhouette
    # plots of individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(pca_data_standard) + (n_clusters + 1) * 10])
    # Initialize the clusterer with this n_clusters value and a fixed random
    # seed for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=1000)
    cluster_labels = clusterer.fit_predict(pca_data_standard)
    # silhouette_score gives the average value over all samples, a view of
    # the density and separation of the formed clusters.
    silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Compute the silhouette score for each sample
    sample_silhouette_values = silhouette_samples(pca_data_standard, cluster_labels)
    y_lower = 10
    color = []
    for i in range(n_clusters):
        # Aggregate and sort the silhouette scores for samples in cluster i
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
        colors = 'rgb' + str(colors)
        color.append(colors)
        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5, color=colors),
                                 fill='tozerox',
                                 name='Silhouette')
        fig.add_trace(filled_area, row=1, col=1)
        # Compute the new y_lower for the next plot (10 for the gap)
        y_lower = y_upper + 10
    # Vertical line marking the average silhouette score of all samples
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                           y=[0, y_upper],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash', width=2))
    fig.add_trace(axis_line, row=1, col=1)
    # 2nd plot showing the actual clusters formed
    clusters = go.Scatter(x=pca_data_standard['PC1'],
                          y=pca_data_standard['PC2'],
                          showlegend=False,
                          mode='markers',
                          marker=dict(color=cluster_labels, size=4, colorscale=color),
                          name='Data')
    fig.add_trace(clusters, row=1, col=2)
    # Label the clusters: draw markers at the cluster centers
    centers_ = clusterer.cluster_centers_
    df_k_means_center = pd.DataFrame({'x1': centers_[:, 0], 'x2': centers_[:, 1]})
    centers = go.Scatter(x=df_k_means_center.x1,
                         y=df_k_means_center.x2,
                         showlegend=True,
                         text=list(range(1, n_clusters + 1)),
                         mode='markers',
                         name='k_means_centers',
                         marker=dict(color=color, size=20,
                                     symbol="x-dot",
                                     line=dict(width=2, color='rgb(175, 100, 88)')))
    fig.add_trace(centers, row=1, col=2)
    fig['layout']['xaxis2'].update(title='Feature space for the 1st feature', zeroline=False)
    fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature', zeroline=False)
    fig['layout'].update(title="Silhouette analysis for KMeans clustering on sample data "
                               "with n_clusters = %d" % n_clusters)
    fig.update_layout(showlegend=True)
    # figures.append(fig)
    fig.show()
# Fit a 5-NN model and plot the sorted neighbor distances (k-distance plot);
# the "knee" of this curve is a common starting point for DBSCAN's eps.
neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(pca_data_standard)
distances, indices = neighbors_fit.kneighbors(pca_data_standard)
distances = np.sort(distances, axis=0)
# Column 0 is each point's distance to itself (always 0); column 1 is the
# nearest-neighbor distance. A textbook k-distance plot would use the k-th
# column (distances[:, -1]) instead.
distances = distances[:, 1]
fig = plt.figure(figsize=(10, 10))
plt.plot(distances)
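# A rough, hedged heuristic for reading eps off the k-distance curve: the
# point of maximum curvature (largest second difference) of the sorted
# distances. Only a starting value; the grid search below picks eps properly.
knee_idx = np.argmax(np.diff(distances, 2))
print(f"Suggested eps near {distances[knee_idx]:.2f}")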
eps_values = np.arange(1, 3, 0.10)
min_samples = np.arange(1, 10)
dbscan_params = list(product(eps_values, min_samples))
no_of_clusters = []
sil_score = []
for p in dbscan_params:
    y_dbscan = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(pca_data_standard)
    try:
        sil_score.append(silhouette_score(pca_data_standard, y_dbscan))
    except ValueError:
        # silhouette_score raises when DBSCAN finds fewer than 2 clusters
        sil_score.append(0)
    no_of_clusters.append(len(np.unique(y_dbscan)))
df_param_tuning = pd.DataFrame.from_records(dbscan_params, columns=['Eps', 'Min_samples'])
df_param_tuning['sil_score'] = sil_score
df_param_tuning['n_clusters'] = no_of_clusters
pivot_1 = pd.pivot_table(df_param_tuning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_param_tuning, values='n_clusters', columns='Eps', index='Min_samples')
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()
dbscan_train = DBSCAN(eps=2.8, min_samples=3)
y_dbscan = dbscan_train.fit_predict(pca_data_standard)
print(silhouette_score(pca_data_standard, y_dbscan ))
figures = []
# Create a subplot with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2,
print_grid=False,
subplot_titles=('The silhouette plot for the various clusters.',
'The visualization of the clustered data.'))
# The 1st subplot is the silhouette plot
# The silhouette coefficient can range from -1, 1 but in this example all
# lie within [-0.1, 1]
fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
range=[-0.1, 1])
# Run DBSCAN with the eps selected above (DBSCAN is deterministic, so no
# random seed is needed here).
clusterer = DBSCAN(eps=2.8, min_samples=4)
cluster_labels = clusterer.fit_predict(pca_data_standard)
n_clusters = len(np.unique(cluster_labels))
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
fig['layout']['yaxis1'].update(title='Cluster label',
showticklabels=False,
range=[0, len(pca_data_standard) + (n_clusters + 1) * 10])
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
print(
"For n_clusters =",
n_clusters,
"The average silhouette_score is :",
silhouette_avg,
)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(pca_data_standard, cluster_labels)
y_lower = 10
color = []
for i in range(n_clusters):
    # DBSCAN labels noise as -1, so i - 1 walks the labels -1, 0, 1, ...
    # Aggregate and sort the silhouette scores for each group.
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i - 1]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
    colors = 'rgb' + str(colors)
    color.append(colors)
    filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                             x=ith_cluster_silhouette_values,
                             mode='lines',
                             showlegend=False,
                             line=dict(width=0.5, color=colors),
                             fill='tozerox',
                             name='Silhouette')
    fig.add_trace(filled_area, row=1, col=1)
    # Compute the new y_lower for the next plot (10 for the gap)
    y_lower = y_upper + 10
# The vertical line for average silhouette score of all the values
axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
y=[0, y_upper],
showlegend=False,
mode='lines',
name='Line for average silhouette',
line=dict(color="red", dash='dash',
width =2) )
fig.add_trace(axis_line, row=1, col=1)
# 2nd Plot showing the actual clusters formed
clusters = go.Scatter(x=pca_data_standard['PC1'],
y=pca_data_standard['PC2'],
showlegend=False,
mode='markers',
marker=dict(color=cluster_labels,
size=4, colorscale=color),
name='Data'
)
fig.add_trace(clusters, row=1, col=2)
fig['layout']['xaxis2'].update(title='Feature space for the 1st feature',
zeroline=False)
fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature',
zeroline=False)
fig['layout'].update(title="Silhouette analysis for DBSCAN clustering on sample data "
"with n_clusters = %d" % n_clusters)
fig.update_layout(showlegend=True)
figures.append(fig)
fig.show()
reg_clustering = {
'MeanShift': MeanShift,
'AffinityPropagation': AffinityPropagation,
'KMeans': KMeans,
'SpectralClustering': SpectralClustering,
'AgglomerativeClustering': AgglomerativeClustering,
'DBSCAN': DBSCAN,
}
params_clustering = {
'MeanShift':
{'max_iter': 10000, 'bandwidth': estimate_bandwidth(pca_data_standard, quantile=0.2, n_samples=500), 'bin_seeding': True},
'AffinityPropagation':
{'damping': 0.9, 'random_state': 42, 'max_iter': 10000},
'KMeans':
{'n_clusters': 4, 'n_init': 10, 'random_state': 42, 'max_iter': 10000},
'SpectralClustering':
{'n_clusters': 4, 'assign_labels': 'cluster_qr', 'random_state': 42},
'AgglomerativeClustering':
{'n_clusters':4, 'linkage': 'ward'},
'DBSCAN':
{'eps':2.8, 'min_samples': 3},
}
row_col_clustering = {
'MeanShift':
{'row': 1, 'col': 1},
'AffinityPropagation':
{'row': 1, 'col': 2},
'KMeans':
{'row': 1, 'col': 3},
'SpectralClustering':
{'row': 2, 'col': 1},
'AgglomerativeClustering':
{'row': 2, 'col': 2},
'DBSCAN':
{'row': 2, 'col': 3},
}
fig = make_subplots(rows=2,
cols=3,
shared_yaxes=False,
shared_xaxes=False,
vertical_spacing=0.1,
subplot_titles=('1', '2', '3', '4', '5', '6')
)
names = {}
count = 1
for name, reg in reg_clustering.items():
    cluster_labels = reg(**params_clustering.get(name)).fit_predict(pca_data_standard)
    n_clusters = len(np.unique(cluster_labels))
    silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
    print(
        f'For {name}:\n',
        f"For n_clusters = {n_clusters}\n",
        f"The average silhouette_score is : {silhouette_avg}\n"
    )
    color = []
    for i in range(n_clusters):
        colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
        colors = 'rgb' + str(colors)
        color.append(colors)
    # Add each method's trace to its subplot
    fig.add_trace(
        go.Scatter(
            x=pca_data_standard['PC1'],
            y=pca_data_standard['PC2'],
            showlegend=True,
            mode='markers',
            name=f'Data {name}',
            marker=dict(color=cluster_labels, size=4, colorscale=color),
        ),
        **row_col_clustering.get(name)
    )
    # Map each subplot's placeholder title to the clustering method's name
    names[str(count)] = name
    count += 1
fig.for_each_annotation(lambda a: a.update(text=names[a.text]))
fig.update_layout(title="Subplots clustering 2D", width=1200, height=800)
fig.show()
pio.templates['new_template']['layout']['plot_bgcolor'] = 'black'
fig = make_subplots(rows=2,
cols=3,
shared_yaxes=False,
shared_xaxes=False,
vertical_spacing=0.1,
subplot_titles=('1', '2', '3', '4', '5', '6'),
specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}, {'type': 'scatter3d'}],
[{'type': 'scatter3d'}, {'type': 'scatter3d'}, {'type': 'scatter3d'}]]
)
names = {}
count = 1
for name, reg in reg_clustering.items():
    cluster_labels = reg(**params_clustering.get(name)).fit_predict(pca_data_standard)
    n_clusters = len(np.unique(cluster_labels))
    silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
    color = []
    for i in range(n_clusters):
        colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
        colors = 'rgb' + str(colors)
        color.append(colors)
    # Add each method's 3D trace to its subplot
    fig.add_trace(
        go.Scatter3d(
            x=pca_data_standard['PC1'],
            y=pca_data_standard['PC2'],
            z=pca_data_standard['PC3'],
            showlegend=False,
            mode='markers',
            name=f'Data {name}',
            marker=dict(color=cluster_labels, size=4, colorscale=color),
        ),
        **row_col_clustering.get(name)
    )
    # Map each subplot's placeholder title to the clustering method's name
    names[str(count)] = name
    count += 1
fig.for_each_annotation(lambda a: a.update(text=names[a.text]))
fig.update_layout(title="Subplots clustering 3D",
                  width=1400,
                  height=1000,
                  template='seaborn'
                  )
fig.show()
km = KMeans(n_clusters=4, n_init=10, random_state=42, max_iter=10000)
y = km.fit_predict(pca_data_standard)
print(f"The average silhouette_score of KMeans is : {silhouette_score(pca_data_standard, y)}\n")
df_campaign_imputeknn['k_means_pca'] = y
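# Quick profile of the clusters just assigned: per-cluster means of a few
# income and spending columns (illustrative column choice, a sketch only).
df_campaign_imputeknn.groupby('k_means_pca')[
    ['Income', 'MntWines', 'MntMeatProducts', 'NumStorePurchases']
].mean().round(1)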
columns_numeric_clusters = ['Year_Birth',
                            'Income', 'Recency', 'MntWines', 'MntFruits',
                            'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                            'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
                            'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'k_means_pca']
g = sns.pairplot(data=df_campaign_imputeknn[columns_numeric_clusters],
hue='k_means_pca',
corner=True,
diag_kind='kde',
plot_kws=dict(s=10),
palette='coolwarm')
g.fig.set_size_inches(24,16)
g.fig.suptitle('Evaluation of results in Dataset')
plt.show()
fig = plt.figure(figsize=(10, 10))
sns.scatterplot(data=df_campaign_imputeknn, x='MntWines', y='Income', hue='k_means_pca', palette='coolwarm')
# Title the current figure ('g' above refers to the pairplot, not this plot)
plt.title('Evaluation of cluster in graphs 2D')
plt.show()
fig = px.scatter_3d(df_campaign_imputeknn, x='Income', y='MntWines', z='NumStorePurchases',
color='k_means_pca', color_continuous_scale=px.colors.diverging.RdBu[::-1])
fig.update_traces(marker_size=6)
fig.update_coloraxes(showscale=False)
fig.update_layout(title=f"Evaluation of cluster in graphs 3D")
fig.show()
g = sns.catplot(
data=df_campaign_imputeknn, kind="swarm",
x="Education", y="Income",
hue='k_means_pca',
col="Children",
s=3.2,
palette="deep",
)
g.set_xticklabels(["Post Graduate", "Under Graduate"])
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle(f"Evaluation of cluster in Catplot",
fontsize=24, fontdict={"weight": "bold"})
plt.show(g)