# Imports
import pandas as pd
import datetime
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib import cm
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from matplotlib import colors
from matplotlib.colors import ListedColormap
import numpy as np
from numpy import nan
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Loading Data & Visualizations
# Read the tab-separated campaign export; records are split on '\r'.
# NOTE(review): the cleaning step further down discards an all-NaN trailing row,
# presumably a by-product of this line terminator choice -- confirm.
market_df = pd.read_csv(
    'marketing_campaign.csv',
    sep='\t',
    lineterminator='\r',
)
# Enrollment dates are written day-month-year; turn them into real datetimes.
market_df = market_df.assign(
    Dt_Customer=pd.to_datetime(market_df['Dt_Customer'], format="%d-%m-%Y")
)
# First look at the raw table: contents, dtypes, summary statistics and which
# columns contain missing values.
market_df
market_df.info()
market_df.describe(include='all')
market_df.isna().any()
# Data Cleaning
# Work on a copy so the raw frame stays untouched.
filtered_df = market_df.copy()
filtered_df.head(20)
# Dropping redundant columns
filtered_df = filtered_df.drop(columns=['ID', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'])
# The file yields a last row that is full of NaN values. Remove rows that carry
# no data at all instead of blindly cutting the tail, so a genuine last customer
# is kept if the file is ever read without that empty record.
filtered_df = filtered_df.dropna(how='all')
filtered_df
# Filling NaN incomes with the median income of customers that share the same
# Education and Marital_Status (the median is not pulled around by a few
# extreme incomes the way the mean is).
group_median_income = filtered_df.groupby(['Education', 'Marital_Status'])['Income'].transform('median')
filtered_df['Income'] = filtered_df['Income'].fillna(group_median_income)
# A group in which every income is missing has no median of its own; fall back
# to the overall median so no NaN survives the imputation.
filtered_df['Income'] = filtered_df['Income'].fillna(filtered_df['Income'].median())
filtered_df.isnull().sum()
# The NaN check above came back clean, so the derived features can be built directly.
# Convert year of birth to Age (measured relative to 2022)
filtered_df['Year_Birth'] = 2022 - filtered_df['Year_Birth']
filtered_df = filtered_df.rename(columns={'Year_Birth': 'Age'})
# Adding amount spent: one total replaces the six per-category amounts.
# skipna=False keeps the NaN propagation of a plain `+` chain.
spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
filtered_df['Amount_Spent'] = filtered_df[spend_cols].sum(axis=1, skipna=False)
filtered_df = filtered_df.drop(columns=spend_cols)
# Adding number of children: kids and teenagers at home counted together.
filtered_df['Children'] = filtered_df['Kidhome'].add(filtered_df['Teenhome'])
filtered_df = filtered_df.drop(columns=['Kidhome', 'Teenhome'])
# Adding number of purchases: web, catalog and store purchases combined.
purchase_cols = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
filtered_df['Purchases'] = filtered_df[purchase_cols].sum(axis=1, skipna=False)
filtered_df = filtered_df.drop(columns=purchase_cols)
filtered_df
# Grouping and dropping rows filled with absurd values
# Marital statuses collapse into two groups: 'Single' and 'Together'.
marital_groups = {'Alone': 'Single', 'Widow': 'Single', 'YOLO': 'Single', 'Divorced': 'Single', 'Married': 'Together'}
filtered_df['Marital_Status'] = filtered_df['Marital_Status'].replace(marital_groups)
absurd_rows = filtered_df.index[filtered_df['Marital_Status'] == 'Absurd']
filtered_df = filtered_df.drop(index=absurd_rows)
# Education levels collapse into three tiers.
education_groups = {'Basic': 'Undergraduate', '2n Cycle': 'Undergraduate', 'Graduation': 'Graduate', 'Master': 'Postgraduate', 'PhD': 'Postgraduate'}
filtered_df['Education'] = filtered_df['Education'].replace(education_groups)
# Using quantiles to delete outliers
# Income is trimmed on both sides: only the 1st..99th percentile range is kept.
income_lower_limit, income_upper_limit = filtered_df['Income'].quantile([0.01, 0.99])
filtered_df = filtered_df[filtered_df['Income'].between(income_lower_limit, income_upper_limit)]
# Age is trimmed on the upper side only. The bound is the 99th percentile, so it
# is named as an upper limit (it is not a lower bound).
age_upper_limit = filtered_df['Age'].quantile(0.99)
filtered_df = filtered_df[filtered_df['Age'] <= age_upper_limit]
# Drop some rows which were duplicates: rows agreeing on income, education,
# marital status and age are treated as the same customer. The index is rebuilt
# as 0..n-1.
filtered_df = filtered_df.drop_duplicates(subset=['Income', 'Education', 'Marital_Status', 'Age'], ignore_index=True)
filtered_df
# Checking correlation between features
# Only numeric columns take part: Education and Marital_Status still hold
# strings at this point, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently skipping them.
correlation_mat = filtered_df.select_dtypes(include='number').corr()
plt.figure(figsize=(20,20))
cmap = colors.ListedColormap(["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"])
sns.heatmap(correlation_mat, annot=True, cmap=cmap, center=0)
# Show the figure like every other plot in this script; this also keeps later
# plots that do not open their own figure from drawing onto the heatmap.
plt.show()
# Data Preprocessing
# One hot encoding using get_dummies on the object type features.
# dtype=float keeps X a homogeneous float matrix whatever pandas' default dummy
# dtype is (it differs between pandas versions).
object_cols = filtered_df.select_dtypes(['object']).columns
X = pd.get_dummies(filtered_df, columns=object_cols, dtype=float)
# Selecting all numerical features. 'number' covers integer columns too, so the
# selection does not depend on every column having been read as float.
numeric_cols = filtered_df.select_dtypes(include='number').columns
# Scaling data: each numeric feature is standardised; the dummy columns created
# above are left as 0/1 indicators.
standard_scaler = StandardScaler()
X[numeric_cols] = standard_scaler.fit_transform(X[numeric_cols])
X
# Dimensionality Reduction
# PCA is used for Dimensionality Reduction and removes Multicollinearity between features.
# The seed is pinned (same value the final KMeans model uses) so that a
# randomized SVD solver, if scikit-learn picks one, yields the same components
# on every run.
pca = PCA(n_components=3, random_state=42)
pca.fit(X)
PCA_X = pd.DataFrame(pca.transform(X), columns=['col1', 'col2', 'col3'])
# Elbow technique to choose the right number of clusters.
# KMeans starts from random centroids; seeding it keeps the elbow curve stable
# between runs.
elbow = KElbowVisualizer(KMeans(random_state=42), k=10)
elbow.fit(PCA_X)
elbow.show()
# Clustering
# KMeans clustering
# Split the PCA-reduced customers into four groups with a fixed seed.
kmeans = KMeans(random_state=42, n_clusters=4)
kmeans.fit(PCA_X)
predict = kmeans.labels_
# The labels are a plain array, so they are attached to the customers by
# position: row i of PCA_X must describe row i of filtered_df.
filtered_df['Clusters'] = predict
# Customer Analysis
# Share of customers in each of the four clusters, drawn as a donut chart.
labels = [f'Cluster {cluster_id}' for cluster_id in range(4)]
# Count rows per cluster id; reindex so ids 0..3 always appear in that order,
# with 0 for an id that has no rows.
cluster_sizes = filtered_df['Clusters'].value_counts().reindex(range(4), fill_value=0)
values = cluster_sizes.tolist()
donut = go.Pie(labels=labels, values=values, hole=0.5, title="Clusters")
fig = go.Figure(data=[donut])
fig.show()
# Income against total amount spent, one point per customer, coloured by cluster.
scatter_ax = sns.scatterplot(data=filtered_df, x='Amount_Spent', y='Income', hue='Clusters')
scatter_ax.set_title('Clusters based on income and amount spent')
plt.legend()
plt.show()
# Distribution of the total number of purchases inside each cluster.
plt.figure(figsize=(10, 6))
purchases_ax = sns.boxplot(data=filtered_df, x='Clusters', y='Purchases', hue='Clusters')
purchases_ax.set_title('Clusters purchases')
plt.legend()
plt.show()
# Purchases vs. purchases made with a deal, compared per cluster.
# The figures are aggregated to one mean value per cluster and the two series
# are drawn side by side. Drawing one bar per customer row would pile thousands
# of bars on the same x position and let the second series cover the first.
deal_stats = filtered_df.groupby('Clusters')[['Purchases', 'NumDealsPurchases']].mean()
cluster_pos = np.arange(len(deal_stats))
bar_width = 0.2
ax = plt.subplot()
# Shift each series by half a bar width so the pair is centred on its tick.
ax.bar(cluster_pos - bar_width / 2, deal_stats['Purchases'], width=bar_width, color='b', align='center', label='Purchases')
ax.bar(cluster_pos + bar_width / 2, deal_stats['NumDealsPurchases'], width=bar_width, color='g', align='center', label='Deals')
ax.set_xticks(cluster_pos)
ax.set_xticklabels(deal_stats.index)
ax.set_title('Purchases during deals')
ax.set_xlabel('Cluster')
ax.set_ylabel('Deals / Purchases')
ax.legend()
plt.show()
# Age distribution inside each cluster.
plt.figure(figsize=(10, 6))
age_ax = sns.boxplot(data=filtered_df, x='Clusters', y='Age', hue='Clusters')
age_ax.set_title('Clusters age repartition')
plt.legend()
plt.show()
# Number of children per cluster as a bar chart (seaborn's default aggregation).
plt.figure(figsize=(10, 6))
plot = sns.barplot(data=filtered_df, x='Clusters', y='Children', hue='Clusters')
plot.set_title('Clusters childs number')
plt.legend()
plt.show()