!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
# Import required libraries
import pandas as pd
import numpy as np
from itertools import product
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from yellowbrick.cluster import KElbowVisualizer
from ydata_profiling import ProfileReport
# Sklearn
from sklearn.preprocessing import StandardScaler
import sklearn.impute
import sklearn.compose
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, AffinityPropagation, MeanShift, SpectralClustering, estimate_bandwidth
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import NearestNeighbors
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb"
%run "pandas-missing-extension.ipynb"
# Install libraries with pip
# !pip install pyarrow
# !pip install pandarallel
# !pip install PivotTableJS
%%time
df_campaign = pd.read_csv('marketing_campaign.csv', sep='\t')
df_campaign.to_feather("marketing_campaign.feather")
%%time
df_campaign = pd.read_feather("marketing_campaign.feather")
df_campaign.describe()
df_campaign.info(memory_usage = "deep")
memory_usage = df_campaign.memory_usage(deep=True) / 1024 ** 2
print(f'Memory usage per column (MB):\n{memory_usage.head(7)}')
print('Total memory usage (MB):', memory_usage.sum())
def reduce_memory_usage(df, verbose=True):
    """Downcast each numeric column to the smallest dtype whose range fits its values."""
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            # Integers: choose the narrowest int type that covers [min, max]
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Floats: the checks below test range only, not precision
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
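# Caveat (illustrative note, not part of the original pipeline): the float16 branch
# above checks the value *range* only, not precision. float16 has a 10-bit mantissa,
# so integers above 2048 are rounded to the nearest representable value.
print(np.float16(4097.0))  # -> 4096.0, even though 4097 is well inside float16's range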
df_campaign = reduce_memory_usage(df_campaign, verbose=True)
df_campaign.info(memory_usage="deep")
df_campaign_transformed = df_campaign.copy(deep=True)
print(df_campaign_transformed['Education'].value_counts(), '\n')
print(df_campaign_transformed['Marital_Status'].value_counts())
df_campaign_transformed['Education'] = df_campaign_transformed['Education'].replace(['PhD','2n Cycle','Graduation', 'Master'],'Post Graduate')  
df_campaign_transformed['Education'] = df_campaign_transformed['Education'].replace(['Basic'], 'Under Graduate')
df_campaign_transformed['Marital_Status'].value_counts()
df_campaign_transformed['Marital_Status'] = df_campaign_transformed['Marital_Status'].replace(['Married', 'Together'],'Relationship')
df_campaign_transformed['Marital_Status'] = df_campaign_transformed['Marital_Status'].replace(['Divorced', 'Widow', 'Alone', 'YOLO', 'Absurd'],'Single')
df_campaign_transformed['Dt_Customer'].head(1)
df_campaign_transformed['Dt_Customer'] = pd.to_datetime(df_campaign_transformed['Dt_Customer'], dayfirst=True, format='%d/%m/%Y')
df_campaign_transformed.sort_values(by='Dt_Customer', inplace=True) 
df_campaign_transformed = df_campaign_transformed.set_index('Dt_Customer')
df_campaign_transformed['Year_Customer'] = df_campaign_transformed.index.year
df_campaign_transformed['Month_Customer'] = df_campaign_transformed.index.month
df_campaign_transformed
categorical_columns = df_campaign_transformed.select_dtypes(include='object').columns
categorical_columns
categorical_transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), [categorical_columns[0]]),
    (sklearn.preprocessing.OrdinalEncoder(), [categorical_columns[1]]),
    remainder="passthrough"
)
df_campaign_transformed = (
    pd.DataFrame(
        categorical_transformer.fit_transform(df_campaign_transformed),
        columns = categorical_transformer.get_feature_names_out(),
        index = df_campaign_transformed.index
    )
)
df_campaign_transformed.columns = df_campaign_transformed.columns.str.replace(r'(remainder|ordinalencoder-\d)__', '', regex=True)
df_campaign_transformed
print(
    categorical_transformer
    .named_transformers_
    .get("ordinalencoder-2") 
    .categories_
)
print(
    categorical_transformer
    .named_transformers_
    .get('ordinalencoder-2')
    .inverse_transform(
        X = [[1], [0]]
    )
)
df_campaign_transformed.info()
df_campaign_transformed['Kidhome'].value_counts()
df_campaign_transformed['Teenhome'].value_counts()
df_campaign_transformed['Children'] = df_campaign_transformed['Kidhome'] + df_campaign_transformed['Teenhome']
df_campaign_transformed = df_campaign_transformed.drop(columns=["Kidhome", "Teenhome", "ID"])
df_campaign_transformed.missing.missing_variable_summary()
missingno.bar(df=df_campaign_transformed)
missingno.matrix(df=df_campaign_transformed)
knn_imputer = sklearn.impute.KNNImputer()
df_campaign_imputeknn = df_campaign_transformed.copy(deep=True)
df_campaign_imputeknn.iloc[:, :] = knn_imputer.fit_transform(df_campaign_imputeknn).round()
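# Illustrative toy example (not part of the pipeline) of what KNNImputer does:
# the NaN is replaced by the mean of that feature over the k nearest rows, with
# distances computed on the observed features only.
toy = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0]])
print(sklearn.impute.KNNImputer(n_neighbors=2).fit_transform(toy))  # NaN -> mean(1, 3) = 2.0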
(
    pd.concat(
        [
            df_campaign_imputeknn,
            df_campaign_transformed.missing.create_shadow_matrix(True, False, suffix="_imp", only_missing=True)
        ],
        axis=1
    )
    .pipe(
        lambda df: (
            px.scatter(
                df,
                x="Income",
                y="MntWines",
                color='Income_imp',
                marginal_x="box", 
                marginal_y="box"
            )
        )
    )
)
df_campaign_imputeknn.missing.number_missing()
df_campaign_imputeknn[df_campaign_imputeknn.duplicated()]
df_campaign_imputeknn.duplicated().value_counts()
df_campaign_imputeknn.drop_duplicates(inplace=True)
df_campaign_imputeknn.nunique()
df_campaign_imputeknn = df_campaign_imputeknn.drop(columns=["Z_CostContact", "Z_Revenue"])
df_campaign_imputeknn = reduce_memory_usage(df_campaign_imputeknn, verbose=True)
profile = ProfileReport(
    df_campaign_imputeknn, title="Pandas Profiling Report", html={"style": {"primary_color": "#FA0087"}},
    minimal=True
)
profile
columns_numeric = ['Year_Birth',
               'Income', 'Recency', 'MntWines', 'MntFruits',
               'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
               'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
               'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
# 'pio' and the 'new_template' template are defined in template_visualitation.ipynb (loaded via %run above)
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup
plot_rows=2
plot_cols=7
fig = make_subplots(rows=plot_rows, cols=plot_cols, shared_yaxes=False)
# add traces
x = 0
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig.add_trace(go.Box(y=df_campaign_imputeknn[columns_numeric[x]].values,
                             name = df_campaign_imputeknn[columns_numeric].columns[x],
                            ),
                     row=i,
                     col=j)
        x=x+1
fig.update_layout(
    width=1500,
    height=800)
fig.show()
df_campaign_imputeknn[df_campaign_imputeknn['Income'] > 300000]
df_campaign_imputeknn['Income'].mean()
df_campaign_imputeknn['Income'][df_campaign_imputeknn['Income'] < 300000].mean()
df_campaign_imputeknn['Income'] = df_campaign_imputeknn['Income'].replace(666666.0, df_campaign_imputeknn['Income'][df_campaign_imputeknn['Income'] < 300000].mean())
df_campaign_imputeknn[df_campaign_imputeknn['Income'] > 300000]
g = sns.swarmplot(data=df_campaign_imputeknn, x='Income', s=4.5, orient="h")
plt.show()
df_campaign_imputeknn[df_campaign_imputeknn['Year_Birth'] < 1920]
round(df_campaign_imputeknn.Year_Birth.astype("float32").mean(), 0)
df_campaign_imputeknn['Year_Birth'][df_campaign_imputeknn['Year_Birth'] > 1920].astype("float32").mean()
df_campaign_imputeknn['Year_Birth'] = df_campaign_imputeknn['Year_Birth'].replace([1899.0, 1900.0, 1893.0], round(df_campaign_imputeknn['Year_Birth'][df_campaign_imputeknn['Year_Birth'] > 1920].astype("float32").mean(), 0))
df_campaign_imputeknn[df_campaign_imputeknn['Year_Birth'] < 1920]
g = sns.swarmplot(data=df_campaign_imputeknn, x='Year_Birth', s=4.5, orient="h")
plt.show()
columns_categorical = ['Children', 'Education', 'Marital_Status', 'Response']
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup
plot_rows=1
plot_cols=4
fig = make_subplots(rows=plot_rows, 
                    cols=plot_cols, 
                    shared_yaxes=False,
                    vertical_spacing=0.1,
                    subplot_titles=('1', '2', '3', '4'),
                    specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}, {'type':'domain'}]]
                   )
# add traces
x = 0
names = {}
count = 1
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig1 = px.pie(df_campaign_imputeknn, values='Income', names=columns_categorical[x])
        trace1 = fig1.data[0]
        fig.add_trace(trace1,
                     row=i,
                     col=j)
        names[str(count)]=columns_categorical[x]
        x=x+1
        count = count + 1
fig.for_each_annotation(lambda a: a.update(text = names[a.text]))
fig.update_layout(width=600, height=400)
fig.update_annotations(y=0.8)
fig.show()
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 8, 'color': 'white'}
fig = px.imshow(
    df_campaign_imputeknn[columns_numeric].corr(),
    color_continuous_scale=px.colors.diverging.RdBu[::-1],
    text_auto='.2f',
    zmin=-1,
    zmax=1,
)
fig.update_xaxes(tickangle=60, automargin='height')
fig.show()
mpl.rcParams["axes.labelsize"] = 8
g = sns.pairplot(df_campaign_imputeknn[columns_numeric],
                 corner=True,
                 diag_kind='kde',
                 plot_kws=dict(s=10),
                 )
g.map_lower(sns.kdeplot, levels=4, color="#FA0087")
g.fig.set_size_inches(24,16)
g.fig.suptitle('Comparative all-vs-all plot')
plt.show()
marital_status = (
    categorical_transformer
    .named_transformers_
    .get('ordinalencoder-2')
    .inverse_transform(
        X = [[0], [1]]
    )
)
education_status = (
    categorical_transformer
    .named_transformers_
    .get('ordinalencoder-1')
    .inverse_transform(
        X = [[0], [1]]
    )
)
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
fig = px.histogram(df_campaign_imputeknn, x="Year_Birth", color="Marital_Status", opacity=0.8, barmode='overlay')
fig.update_layout(
    title=f"Histogram Marital Status, 0={marital_status[0]} and 1={marital_status[1]}",
)
fig.show()
fig = px.histogram(df_campaign_imputeknn, x="Year_Birth", color="Education", opacity=0.8, barmode='overlay')
fig.update_layout(
    title=f"Histogram Education, 0={education_status[0]} and 1={education_status[1]}",
)
fig.show()
hist_data = [df_campaign_imputeknn["Year_Birth"]]
group_labels = ["distplot"]  # name of the dataset
fig = ff.create_distplot(hist_data, group_labels, bin_size=1, colors=["#FA0087"])
fig.update_layout(
    title="Year Birth costumers"
)
fig.show()
g = sns.catplot(
    data=df_campaign_imputeknn, kind="swarm",
    x="Marital_Status", y="Income",
    hue='Education',
    col="Children",
    s=3.2, 
    palette="deep", 
)
g.set_xticklabels(["Relationship", "Single"])
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle(f"In Catplot, Education is 0={education_status[0]} and 1={education_status[1]}",
                  fontsize=24, fontdict={"weight": "bold"})
plt.show()
g = sns.catplot(
    data=df_campaign_imputeknn, kind="bar",
    x="Education", y="Income",
    hue='Marital_Status',
    col="Children",
    palette="deep", 
)
g.set_xticklabels(["Post Graduate", "Under Graduate"])
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle(f"In Catplot, marital_status is 0={marital_status[0]} and 1={marital_status[1]}",
                  fontsize=24, fontdict={"weight": "bold"})
plt.show()
df_campaign_imputeknn[['MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds', 'Children']].groupby('Children').mean()
df_campaign_imputeknn[['MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds', 'Children']].groupby('Children').sum()
subplots_edad = df_campaign_imputeknn.groupby(['Year_Birth']).sum()
fig = make_subplots(
    rows=4,
    cols=1,
    subplot_titles=(
        "NumStorePurchases for Year_Birth",
        "NumWebPurchases for Year_Birth",
        "NumWebVisitsMonth for Year_Birth",
        "NumCatalogPurchases for Year_Birth"
    ),
    shared_xaxes=True,
    vertical_spacing=0.1
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumStorePurchases, mode="lines", name="NumStorePurchases", opacity=0.8),
    row=1,
    col=1,
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumWebPurchases, mode="lines", name="NumWebPurchases", opacity=0.8),
    row=2,
    col=1,
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumWebVisitsMonth, mode="lines", name="NumWebVisitsMonth", opacity=0.8),
    row=3,
    col=1,
)
fig.add_trace(go.Scatter(x=subplots_edad.index, y=subplots_edad.NumCatalogPurchases, mode="lines", name="NumCatalogPurchases", opacity=0.8),
    row=4,
    col=1,
)
fig.update_layout(title_text="Subplots variables Place of Dataset")
fig.show()
products = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntGoldProds']
for product_col in products:
    fig = px.histogram(df_campaign_imputeknn, x='Year_Customer', y=product_col, color="Month_Customer", barmode='relative', title="Mnts for Year_Customer")
    fig.update_layout(width=800, height=600, bargap=0.2)
    fig.update_xaxes(tickvals=[2012, 2013, 2014])
    fig.show()
scaler = StandardScaler()
df_campaign_scaled = scaler.fit_transform(df_campaign_imputeknn)
df_campaign_scaled = pd.DataFrame(df_campaign_scaled, columns = df_campaign_imputeknn.columns )
df_campaign_scaled
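# Sanity check (illustrative): StandardScaler is invertible, so inverse_transform
# should recover the original values up to floating-point error.
print(np.allclose(scaler.inverse_transform(df_campaign_scaled), df_campaign_imputeknn.astype("float64")))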
df_campaign_scaled = reduce_memory_usage(df_campaign_scaled, verbose=True)
n = 15
principal_components = {
    'PCA': PCA(n_components=n , svd_solver="arpack"),
    'IncrementalPCA': IncrementalPCA(n_components=n)
}
fig = go.Figure()
for name, principal_component in principal_components.items():
    pca = principal_component
    pca.fit(df_campaign_scaled)
    cum_var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100) 
    # Append each method's trace to the same figure
    fig.add_trace(go.Scatter(
        x=list(range(1, len(principal_component.explained_variance_) + 1)),
        y=cum_var,
        name=name,
        mode='lines+markers',
    ))
    # Check how much of the data's variance the method captures for our features
    components = principal_component.fit_transform(df_campaign_scaled)
    total_var = principal_component.explained_variance_ratio_.sum() * 100
    print(f"Total Explained Variance: {total_var:.2f}%, to {n} components with {name}")
# Render the figure
fig.update_layout(showlegend=True)
fig.show()
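# Quick read-off (illustrative, assuming the cumulative curve crosses 80% within
# the n components fitted above): smallest number of components explaining >= 80% of the variance.
print(f'Components needed for >= 80% of the variance: {int(np.argmax(cum_var >= 80)) + 1}')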
pca = PCA()
pca.fit(df_campaign_scaled)
pca_data_scaled = pca.transform(df_campaign_scaled)
# Bar plot of explained_variance
plt.bar(
    range(1, len(pca.explained_variance_) + 1),
    pca.explained_variance_
)
# Horizontal reference line at eigenvalue 1 (Kaiser criterion)
plt.plot([1] * (len(pca.explained_variance_) + 1), 'r', linewidth=1)
plt.xlabel('PCA Feature')
plt.ylabel('Explained variance')
plt.title('Feature Explained Variance')
plt.show()
pca_data_standard = pd.DataFrame(pca_data_scaled[:,0:8], columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8'])
pca_data_standard.head(1)
pca_data_standard = reduce_memory_usage(pca_data_standard, verbose=True)
fig_elb = KElbowVisualizer(KMeans(random_state=42, n_init=10, max_iter=10000), k=(2,10))
fig_elb.fit(pca_data_standard)
print(f'Elbow value= {fig_elb.elbow_value_}')
fig_elb.show()
Sum_of_squared_distances = []
silhouette_scores = []
K = range(2,10) 
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=42, max_iter=1000)
    y = km.fit_predict(pca_data_standard)
    Sum_of_squared_distances.append(km.inertia_)
    silhouette_scores.append(silhouette_score(pca_data_standard,y))
fig, ax1 = plt.subplots(figsize=(8,8))
color = 'tab:blue'
ax1.set_xlabel('K')
ax1.set_ylabel('Inertia', color=color)
ax1.plot(K, Sum_of_squared_distances, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()  
color = 'tab:red'
ax2.set_ylabel('Silhouette Score', color=color)  
ax2.plot(K, silhouette_scores, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.show()
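# One simple heuristic (illustrative): pick the k with the highest average
# silhouette score from the sweep above.
best_k = K[int(np.argmax(silhouette_scores))]
print(f'k with the highest silhouette score: {best_k}')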
pio.templates['new_template']['layout']['plot_bgcolor'] = 'white'
range_n_clusters = [2,3,4,5]
figures = []
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = make_subplots(rows=1, cols=2,
                        print_grid=False,
                        subplot_titles=('The silhouette plot for the various clusters.',
                                              'The visualization of the clustered data.'))
    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(pca_data_standard) + (n_clusters + 1) * 10])
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=1000)
    cluster_labels = clusterer.fit_predict(pca_data_standard)
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(pca_data_standard, cluster_labels)
    y_lower = 10
    color = []
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
        colors = 'rgb'+str(colors)
        color.append(colors)
        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                          color=colors),
                                 fill='tozerox',
                                 name='Silhouette')
        fig.add_traces(filled_area, 1, 1)
        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    # The vertical line for the average silhouette score of all the values,
    # drawn once per figure (after the per-cluster loop, not inside it)
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                           y=[0, y_upper],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash',
                                     width=2))
    fig.append_trace(axis_line, 1, 1)
    # 2nd plot: the actual clusters formed
    clusters = go.Scatter(x=pca_data_standard['PC1'], 
                          y=pca_data_standard['PC2'], 
                          showlegend=False,
                          mode='markers',
                          marker=dict(color=cluster_labels,
                                     size=4, colorscale=color),
                          name='Data'
                         )
    fig.append_trace(clusters, 1, 2)
    # Labeling the clusters
    centers_ = clusterer.cluster_centers_
    # Mark the cluster centers
    df_k_means_center = pd.DataFrame(
        {
            'x1':centers_[:,0],
            'x2':centers_[:,1]
        }
    )
    centers = go.Scatter(x=df_k_means_center.x1, 
                         y=df_k_means_center.x2,
                         showlegend=True,
                         text=list(range(1, n_clusters + 1)),
                         mode='markers',
                         name='k_means_centers',
                         marker=dict(color=color, size=20,
                                     symbol="x-dot",
                                     line=dict(width=2, 
                                               color='rgb(175, 100, 88)'))
                        )
    fig.append_trace(centers, 1, 2)
    fig['layout']['xaxis2'].update(title='Feature space for the 1st feature',
                                   zeroline=False)
    fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature',
                                  zeroline=False)
    fig['layout'].update(title="Silhouette analysis for KMeans clustering on sample data "
                         "with n_clusters = %d" % n_clusters)
    fig.update_layout(showlegend=True)
#     figures.append(fig)
    fig.show()
# k-distance plot: sorted distances to nearby neighbors help pick a candidate eps for DBSCAN
neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(pca_data_standard)
distances, indices = neighbors_fit.kneighbors(pca_data_standard)
distances = np.sort(distances, axis=0)
distances = distances[:, 1]  # distance to the nearest neighbor (column 0 is each point itself, distance 0)
fig = plt.figure(figsize=(10,10))
plt.plot(distances)
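# A rough, illustrative heuristic (not from the original analysis) to read a
# candidate eps off the k-distance curve: take a high percentile of the sorted
# nearest-neighbor distances, near the "knee" of the plot.
print(f'Candidate eps (95th percentile of 1-NN distances): {np.percentile(distances, 95):.2f}')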
eps_values = np.arange(1, 3, 0.10)
min_samples = np.arange(1,10)
dbscan_paramns = list(product(eps_values, min_samples))
no_of_clusters = []
sil_score = []
for p in dbscan_paramns:
    y_dbscan = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(pca_data_standard)
    try:
        sil_score.append(silhouette_score(pca_data_standard, y_dbscan))
    except ValueError:
        # silhouette_score raises ValueError when DBSCAN finds a single cluster
        sil_score.append(0)
    no_of_clusters.append(len(np.unique(y_dbscan)))
df_param_tunning = pd.DataFrame.from_records(dbscan_paramns, columns=['Eps','Min_samples'])
df_param_tunning['sil_score'] = sil_score
df_param_tunning['n_clusters'] = no_of_clusters
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='Eps', index='Min_samples')
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()
dbscan_train = DBSCAN(eps=2.8, min_samples=3)
y_dbscan = dbscan_train.fit_predict(pca_data_standard)
print(silhouette_score(pca_data_standard, y_dbscan ))
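# DBSCAN labels noise points as -1; a quick look at the cluster sizes (illustrative):
labels_dbscan, counts_dbscan = np.unique(y_dbscan, return_counts=True)
print(dict(zip(labels_dbscan.tolist(), counts_dbscan.tolist())))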
figures = []
# Create a subplot with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2,
                    print_grid=False,
                    subplot_titles=('The silhouette plot for the various clusters.',
                                          'The visualization of the clustered data.'))
# The 1st subplot is the silhouette plot
# The silhouette coefficient can range from -1, 1 but in this example all
# lie within [-0.1, 1]
fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                               range=[-0.1, 1])
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = DBSCAN(eps=2.8, min_samples=4)
cluster_labels = clusterer.fit_predict(pca_data_standard)
n_clusters = len(np.unique(cluster_labels))  # note: this count includes DBSCAN's noise label (-1)
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
fig['layout']['yaxis1'].update(title='Cluster label',
                               showticklabels=False,
                               range=[0, len(pca_data_standard) + (n_clusters + 1) * 10])
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
print(
    "For n_clusters =",
    n_clusters,
    "The average silhouette_score is :",
    silhouette_avg,
)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(pca_data_standard, cluster_labels)
y_lower = 10
color = []
for i in range(n_clusters):
    # Aggregate the silhouette scores for samples belonging to cluster i, and
    # sort them (the index is shifted by -1 because DBSCAN labels noise as -1)
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i - 1]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
    colors = 'rgb'+str(colors)
    color.append(colors)
    filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                             x=ith_cluster_silhouette_values,
                             mode='lines',
                             showlegend=False,
                             line=dict(width=0.5,
                                      color=colors),
                             fill='tozerox',
                             name='Silhouette')
    fig.add_traces(filled_area, 1, 1)
    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples
# The vertical line for average silhouette score of all the values
axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                       y=[0, y_upper],
                       showlegend=False,
                       mode='lines',
                       name='Line for average silhouette',
                       line=dict(color="red", dash='dash',
                                 width =2) )
fig.append_trace(axis_line, 1, 1)
# 2nd Plot showing the actual clusters formed
clusters = go.Scatter(x=pca_data_standard['PC1'], 
                      y=pca_data_standard['PC2'], 
                      showlegend=False,
                      mode='markers',
                      marker=dict(color=cluster_labels,
                                 size=4, colorscale=color),
                      name='Data'
                     )
fig.append_trace(clusters, 1, 2)
fig['layout']['xaxis2'].update(title='Feature space for the 1st feature',
                               zeroline=False)
fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature',
                              zeroline=False)
fig['layout'].update(title="Silhouette analysis for DBSCAN clustering on sample data "
                     "with n_clusters = %d" % n_clusters)
fig.update_layout(showlegend=True)
figures.append(fig)
fig.show()
reg_clustering = {
    'MeanShift': MeanShift,
    'AffinityPropagation': AffinityPropagation,
    'KMeans': KMeans,
    'SpectralClustering': SpectralClustering,
    'AgglomerativeClustering': AgglomerativeClustering,
    'DBSCAN': DBSCAN,
}
params_clustering = {
    'MeanShift':
        {'max_iter': 10000, 'bandwidth': estimate_bandwidth(pca_data_standard, quantile=0.2, n_samples=500), 'bin_seeding': True},
    'AffinityPropagation':
        {'damping': 0.9, 'random_state': 42, 'max_iter': 10000},
    'KMeans':
        {'n_clusters': 4, 'n_init': 10, 'random_state': 42, 'max_iter': 10000},
    'SpectralClustering':
        {'n_clusters': 4, 'assign_labels': 'cluster_qr', 'random_state': 42},
    'AgglomerativeClustering':
        {'n_clusters':4, 'linkage': 'ward'},
    'DBSCAN':
        {'eps':2.8, 'min_samples': 3},
}
row_col_clustering = {
    'MeanShift':
        {'row': 1, 'col': 1},
    'AffinityPropagation':
        {'row': 1, 'col': 2},
    'KMeans':
        {'row': 1, 'col': 3},
    'SpectralClustering':
        {'row': 2, 'col': 1},
    'AgglomerativeClustering':
        {'row': 2, 'col': 2},
    'DBSCAN':
        {'row': 2, 'col': 3},
}
fig = make_subplots(rows=2,
                    cols=3,
                    shared_yaxes=False,
                    shared_xaxes=False,
                    vertical_spacing=0.1,
                    subplot_titles=('1', '2', '3', '4', '5', '6')
                   )
names = {}
count = 1
for name, reg in reg_clustering.items():
    cluster_labels = reg(**params_clustering.get(name)).fit_predict(pca_data_standard)
    n_clusters = len(np.unique(cluster_labels))
    silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
    print(
        f'For {name}:\n',
        f"For n_clusters = {n_clusters}\n",
        f"The average silhouette_score is : {silhouette_avg}\n"
    )
    color = []
    for i in range(n_clusters):
        colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
        colors = 'rgb'+str(colors)
        color.append(colors)
    # add traces to subplots
    fig.add_trace(
        go.Scatter(
            x=pca_data_standard['PC1'], 
            y=pca_data_standard['PC2'], 
            showlegend=True,
            mode='markers',
            name=f'Data {name}',
            marker=dict(color=cluster_labels,
                        size=4, colorscale=color),
        ),
        **row_col_clustering.get(name)
    )
    # Fill the dictionary with each clustering method's name so the subplot titles are updated automatically
    names[str(count)]=name
    count = count + 1
fig.for_each_annotation(lambda a: a.update(text = names[a.text]))
fig.update_layout(title="Subplots clustering 2D", width=1200, height=800)
fig.show()
pio.templates['new_template']['layout']['plot_bgcolor'] = 'black'
fig = make_subplots(rows=2,
                    cols=3,
                    shared_yaxes=False,
                    shared_xaxes=False,
                    vertical_spacing=0.1,
                    subplot_titles=('1', '2', '3', '4', '5', '6'),
                    specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}, {'type': 'scatter3d'}],
                           [{'type': 'scatter3d'}, {'type': 'scatter3d'}, {'type': 'scatter3d'}]]
                   )
names = {}
count = 1
for name, reg in reg_clustering.items():
    cluster_labels = reg(**params_clustering.get(name)).fit_predict(pca_data_standard)
    n_clusters = len(np.unique(cluster_labels))
    silhouette_avg = silhouette_score(pca_data_standard, cluster_labels)
    color = []
    for i in range(n_clusters):
        colors = mpl.colors.colorConverter.to_rgb(cm.nipy_spectral(float(i) / n_clusters))
        colors = 'rgb'+str(colors)
        color.append(colors)
    # add traces to subplots
    fig.add_trace(
        go.Scatter3d(
            x=pca_data_standard['PC1'],
            y=pca_data_standard['PC2'],
            z=pca_data_standard['PC3'],
            showlegend=False,
            mode='markers',
            name=f'Data {name}',
            marker=dict(color=cluster_labels,
                        size=4, colorscale=color),
        ),
        **row_col_clustering.get(name)
    )
    # Fill the dictionary with each clustering method's name so the subplot titles are updated automatically
    names[str(count)]=name
    count = count + 1
fig.for_each_annotation(lambda a: a.update(text = names[a.text]))
fig.update_layout(title="Subplots clustering 3D",
                  width=1400,
                  height=1000,
                  template='seaborn'
                 )
fig.show()
km = KMeans(n_clusters=4, n_init=10, random_state=42, max_iter=10000)
y = km.fit_predict(pca_data_standard)
print(f"The average silhouette_score of KMeans is : {silhouette_score(pca_data_standard, y)}\n")
df_campaign_imputeknn['k_means_pca'] = y
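# A quick per-cluster profile (illustrative): mean of each numeric feature by cluster label.
df_campaign_imputeknn.groupby('k_means_pca')[columns_numeric].mean()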
columns_Numeric = ['Year_Birth',
               'Income', 'Recency', 'MntWines', 'MntFruits',
               'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
               'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
               'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'k_means_pca']
g = sns.pairplot(data=df_campaign_imputeknn[columns_Numeric], 
                 hue='k_means_pca', 
                 corner=True,
                 diag_kind='kde',
                 plot_kws=dict(s=10),
                 palette='coolwarm')
g.fig.set_size_inches(24,16)
g.fig.suptitle('Evaluation of clustering results on the dataset')
plt.show()
fig = plt.figure(figsize=(10,10))
sns.scatterplot(data=df_campaign_imputeknn, x='MntWines', y='Income',hue='k_means_pca', palette='coolwarm')
plt.title('Evaluation of clusters in 2D')
plt.show()
fig = px.scatter_3d(df_campaign_imputeknn, x='Income', y='MntWines', z='NumStorePurchases',
                    color='k_means_pca', color_continuous_scale=px.colors.diverging.RdBu[::-1])
fig.update_traces(marker_size=6)
fig.update_coloraxes(showscale=False)
fig.update_layout(title="Evaluation of clusters in 3D")
fig.show()
g = sns.catplot(
    data=df_campaign_imputeknn, kind="swarm",
    x="Education", y="Income",
    hue='k_means_pca',
    col="Children",
    s=3.2, 
    palette="deep", 
)
g.set_xticklabels(["Post Graduate", "Under Graduate"])
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle(f"Evaluation of cluster in Catplot",
                  fontsize=24, fontdict={"weight": "bold"})
plt.show()