#Solo correr en Deepnote
import os
path = r'/work/online_retail_analysis'
os.chdir(path)
!pip install --editable .
!pip install pyprojroot
# !pip install openpyxl
%load_ext autoreload
%autoreload
import online_retail_analysis.utils.paths as path
from online_retail_analysis.visualization.visualize import show_values
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Optional global seaborn styling, currently disabled:
# sns.set( rc = {'figure.figsize' : ( 17, 8 ),'axes.labelsize' : 12 })
# sns.set_theme(style="white")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display floats with a thousands separator and three decimal places.
pd.options.display.float_format = '{:,.3f}'.format
# Load the clustering table; the file location is resolved by the project's
# `data_processed_dir` path helper.
df_cluster = pd.read_csv(path.data_processed_dir('data_clustering.csv'))
# Column/dtype summary before the dtype change.
df_cluster.info()
# Store the cluster label as a categorical; the `.cat` accessor used later
# to attach segment names requires this dtype.
df_cluster['Cluster'] = df_cluster['Cluster'].astype('category')
# df_cluster['Total_Score'] = df_cluster['Total_Score'].astype('category')
# Column/dtype summary after the dtype change.
df_cluster.info()
# rfm_values(df_cluster).reset_index()
# Total_Score quantiles within each cluster, taken at steps of 0.166
# (roughly one sixth) up to the maximum (q=1). The result is transposed so
# that each (cluster, quantile) pair becomes a column.
df_cluster.groupby(['Cluster'])[['Total_Score']].quantile(q=[0.166,0.332, 0.498, 0.664, 0.83,1]).T
# df_cluster[['Total_Score','Cluster']]
# Cluster id -> segment meaning:
#   3 = loyal customers, 1 = potential customers,
#   5 = customers needing attention, 0 = can lose them,
#   2 = at-risk customers, 4 = lost customers.
# A list passed to `rename_categories` is applied positionally, so the labels
# below are written in the existing category order of `Cluster`
# (assumed to sort as 0..5 -- confirm against the data).
cluster_labels = [
    'Pueden perderlos',
    'Clientes potenciales',
    'Clientes en riesgo',
    'Clientes fieles',
    'Clientes perdidos',
    'Clientes que necesitan atención',
]
# Category order for the named segments: loyal, potential, needing attention,
# can lose them, at risk, lost.
segment_order = [
    'Clientes fieles',
    'Clientes potenciales',
    'Clientes que necesitan atención',
    'Pueden perderlos',
    'Clientes en riesgo',
    'Clientes perdidos',
]
df_cluster['Class_Cluster'] = (
    df_cluster['Cluster']
    .cat.rename_categories(cluster_labels)
    .cat.set_categories(segment_order)
)
# Show the numeric cluster next to its segment name.
df_cluster[['Cluster', 'Class_Cluster']]
# Number of customers per segment, largest first.
# `observed=False` spells out the behaviour this categorical groupby already
# has (segments with no customers are kept with a count of 0) and avoids the
# FutureWarning newer pandas versions emit when the argument is left implicit.
segmentation1 = (
    df_cluster.groupby('Class_Cluster', observed=False)['CustomerID']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
plt.figure(figsize=(20,8))
# NOTE(review): every segment is plotted. If the largest segment is meant to
# be left out of the chart, reassign `segmentation1` to the result of
# `segmentation1.drop([0])` before plotting.
sns.set(font_scale = 1.3)
p = sns.barplot(data=segmentation1, x="Class_Cluster", y="CustomerID")
# Project helper from the visualization module; presumably annotates each bar
# with its value -- confirm against its definition.
show_values(p)
p.set(xlabel='Segmento', ylabel='Numero de clientes',title="Numero de clientes por categoria")
# Save in the same cell as the plot so the figure is still current.
plt.savefig(path.reports_figures_dir('N_Costumer.png'), format='png', dpi=1000)
# Box plot of Recency for each named segment, saved as BOX_Cluster.png.
plt.figure(figsize=(20,8))
box_ax = sns.boxplot(data=df_cluster, x='Class_Cluster', y='Recency')
box_ax.set_title('Boxplot de cluster por Recency', fontsize=15)
box_ax.set_xlabel('Segmento', fontsize=15)
box_ax.set_ylabel('Recency', fontsize=15)
plt.savefig(path.reports_figures_dir('BOX_Cluster.png'), format='png', dpi=1000)
# Pie chart of the number of distinct customers in each segment.
# Create the 6x6 figure explicitly and hand its axes to pandas. When no `ax`
# is given, `Series.plot` opens a figure of its own, so a preceding
# `plt.figure(...)` would be left behind as an empty, unused figure.
fig_pie, ax_pie = plt.subplots(figsize=(6, 6))
sq1 = (
    # `observed=False` keeps the current behaviour (empty segments count as 0)
    # without relying on the deprecated implicit default.
    df_cluster.groupby('Class_Cluster', observed=False)['CustomerID']
    .nunique()
    .sort_values(ascending=False)
    .plot(kind='pie', cmap='Set2', autopct='%1.1f%%', ax=ax_pie)
)
ax_pie.set_title('Pie de cluster por Numero de clientes', fontsize=15)
# Hide the axis frame and labels.
ax_pie.axis('off')
fig_pie.savefig(path.reports_figures_dir('pie_Cluster.png'), format='png', dpi=1000)
# Scatter plot of Frequency against Recency; colour and marker style both
# encode the numeric cluster. Saved as scatter_Cluster.png.
plt.figure(figsize=(20,8))
scatter_ax = sns.scatterplot(
    data=df_cluster,
    x='Recency',
    y='Frequency',
    hue='Cluster',
    style="Cluster",
)
scatter_ax.set_title('Grafico de dispercion Recencia vs Frecuencia', fontsize=15)
scatter_ax.set_xlabel('Recencia', fontsize=15)
scatter_ax.set_ylabel('Frecuencia', fontsize=15)
plt.savefig(path.reports_figures_dir('scatter_Cluster.png'), format='png', dpi=1000)
def rfm_values(df):
    """Return the per-cluster mean of each metric, rounded to 0 decimals.

    Groups ``df`` by its 'Cluster' column and averages the 'Recency',
    'Frequency', 'MonetaryValue' and 'Variety' columns. The result is indexed
    by cluster and has two-level columns of the form ``(metric, 'mean')``.
    """
    metrics = ('Recency', 'Frequency', 'MonetaryValue', 'Variety')
    # One single-element list per metric: pandas produces two-level columns
    # as soon as any entry of the aggregation dict is a list.
    mean_spec = {metric: ['mean'] for metric in metrics}
    per_cluster = df.groupby(['Cluster']).agg(mean_spec)
    return per_cluster.round(0)
# Tabulate how many rows fall in each named segment.
segment_counts = df_cluster['Class_Cluster'].value_counts()
segment_counts.reset_index()