Segmentación de clientes con metodología RFM en Python
Este laboratorio de segmentación tiene como fin probar técnicas de ciencia de datos en transacciones reales de producción.
!curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
!curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list
!sudo apt-get update
!sudo ACCEPT_EULA=Y apt-get install -y msodbcsql17
!sudo ACCEPT_EULA=Y apt-get install -y mssql-tools
!echo 'export PATH="$PATH:/opt/mssql-tools/bin"' >> ~/.bashrc
!source ~/.bashrc
!sudo apt-get install -y unixodbc-dev
!sudo apt-get update
!sudo apt-get install -y unixodbc-dev
!pip install pyodbc
!pip install xlrd
!pip install openpyxl
import numpy as np
import pandas as pd
import datetime as dt
import pyodbc
A continuación, utilizamos Python para cargar el dataset desde el servidor SQL
driver='{ODBC Driver 17 for SQL Server}'
server_name = 'SECRET'
db_name = 'SECRET'
user = 'SECRET'
password = 'SECRET'
sql_conn = pyodbc.connect('''
DRIVER={};SERVER={};DATABASE={};UID={};PWD={};
'''.format(driver, server_name, db_name, user, password))
query_sql = '''
SELECT
[Rut] as customeraccount
,[CantFacturada] as qty
,[Afecto] as netprice
,Tipo as doctype
,[Depot] as store
,[CodProd] as sku
,[DesProd] as productname
,[FACTURA2] as invoice
,[Fecha] as invoicedate
FROM [dbo].[Is_Salida_Rep_Vtas] order by Fecha'''
df = pd.read_sql(query_sql, sql_conn)
sql_conn.close()
df
df["sku"].nunique()
df.groupby("sku").agg({"qty": "sum"}).sort_values("qty", ascending=False).head
df["invoice"].nunique()
Quitamos los productos vendidos que puedan ensuciar nuestro análisis:
df=df[~df["sku"].str.contains("DESPACHO", na=False)]
df=df[~df["sku"].str.contains("ENVIO", na=False)]
df=df[~df["sku"].str.contains("WHSSALEOFF", na=False)]
Como se puede observar a continuación, ya han sido removidos:
df.groupby("sku").agg({"qty": "sum"}).sort_values("qty", ascending=False).head
df.dropna(inplace=True)
df.isnull().sum()
df=df[~df["doctype"].str.contains("N", na=False)]
df=df[~df["customeraccount"].str.contains("99999999-9", na=False)]
df=df[~df["customeraccount"].str.contains("66666666-6", na=False)]
df=df[~df["customeraccount"].str.contains("55555555-5", na=False)]
df= df[(df["qty"]>0)]
df["invoicedate"].max()
base_date = dt.datetime(2022, 1, 31)
rfm = df.groupby('customeraccount').agg({'invoicedate': lambda date: (base_date - date.max()).days,
'invoice': lambda num: num.nunique(),
'netprice': lambda netprice: netprice.sum()})
rfm.columns = ['Recency', 'Frequency', 'Monetary']
rfm
rfm = rfm[(rfm["Monetary"]) > 0 & (rfm["Frequency"] > 0)]
rfm["RecencyScore"] = pd.qcut(rfm['Recency'], 5, labels=[5, 4, 3, 2, 1])
rfm["FrequencyScore"] = pd.qcut(rfm['Frequency'].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
rfm["MonetaryScore"] = pd.qcut(rfm['Monetary'], 5, labels=[1, 2, 3, 4, 5])
rfm
rfm["RFM_SCORE"] = (rfm['RecencyScore'].astype(str) +
rfm['FrequencyScore'].astype(str) +
rfm['MonetaryScore'].astype(str))
rfm
seg_map = {
r'[1-2][1-2]': 'Hibernando',
r'[1-2][3-4]': 'En riesgo',
r'[1-2]5': 'No se puede perder',
r'3[1-2]': 'A punto de dormirse',
r'33': 'Necesita atención',
r'[3-4][4-5]': 'Cliente leal',
r'41': 'Prometedor',
r'51': 'Nuevos clientes',
r'[4-5][2-3]': 'Potencial leales',
r'5[4-5]': 'Cliente VIP'
}
rfm['Segmento'] = rfm['RecencyScore'].astype(str) + rfm['FrequencyScore'].astype(str)
rfm['Segmento'] = rfm['Segmento'].replace(seg_map, regex=True)
rfm
Exportando el análisis
rfm.to_excel('Seg_Patagonia.xlsx')