import pandas as pd
import numpy as np
Geocoding sellers and customers
# importing geolocation data and formating zip code
geolocation = pd.read_csv('olist_geolocation_dataset.csv')
geolocation.shape
#the data has dupplicates so it needs cleaning and formating to string
tmp = geolocation[['geolocation_zip_code_prefix']].apply(pd.value_counts).sum(1)
geolocation = geolocation.assign(zip_code_count = geolocation['geolocation_zip_code_prefix'].map(tmp))
geolocation = geolocation.drop_duplicates(['geolocation_zip_code_prefix'])
geolocation.sort_values(by = 'zip_code_count', ascending = False,inplace = True)
# formating to string
geolocation['geolocation_lat'] = geolocation['geolocation_lat'].astype(float)
geolocation['geolocation_lng'] = geolocation['geolocation_lng'].astype(float)
geolocation = geolocation.drop(columns=['geolocation_city', 'geolocation_state', 'zip_code_count'])
geolocation.shape
Add the geolocation to the database of orders
# Uploading the database of orders
df = pd.read_csv('brazil_ecom4.csv')
df.head()
# rename the column to match zip code of sellers
geolocation.rename(columns = {'geolocation_zip_code_prefix':'seller_zip_code_prefix'},inplace = True)
geolocation.dtypes
# addind the location columns for the sellers
df2 = pd.merge(
df,
geolocation,
on = 'seller_zip_code_prefix',
how = 'inner')
df2.shape
#renaming the lat and lgn columns
df2.rename(columns={
'geolocation_lat':'seller_lat',
'geolocation_lng': 'seller_lng'
},inplace=True)
df2.shape
# rename the column to match zip code of customers
geolocation.rename(columns = {'seller_zip_code_prefix':'customer_zip_code_prefix'},inplace = True)
geolocation.dtypes
# addind the location columns for the customers
df2 = pd.merge(
df2,
geolocation,
on = 'customer_zip_code_prefix',
how = 'inner')
df2.shape
#renaming the lat and lgn columns
df2.rename(columns={
'geolocation_lat':'customer_lat',
'geolocation_lng': 'customer_lng',
},inplace=True)
df2.shape
df2.head()
Calculating the harvesting distance
from math import radians, cos, sin, asin, sqrt
def geodistance(lng1,lat1,lng2,lat2):
lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
dlon=lng2-lng1
dlat=lat2-lat1
a=sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
distance=2*asin(sqrt(a))*6371*1000
distance=round(distance/1000,3)
return distance
# data_seller_customer['distance'] = data_seller_customer[['seller_lng','seller_lat','customer_lng',
# 'customer_lat']].apply(geodistance, axis = 1)
A = df2['seller_lng'].tolist()
B = df2['seller_lat'].tolist()
C = df2['customer_lng'].tolist()
D = df2['customer_lat'].tolist()
df2['distance'] = df2['seller_lat']
for i in range(len(A)):
df2.distance.loc[i] = geodistance(A[i],B[i],C[i],D[i])
df2.head()
df2.to_csv('Brazil_ecom_final.csv', index=False)
df = df2.drop(columns = ['order_id','price','customer_zip_code_prefix', 'seller_zip_code_prefix','seller_lat','seller_lng','customer_lat','customer_lng'])
df = df.dropna()
df.describe()
#creating a dictionary for payment types
print(df['payment_type'].unique())
pay_map = {
'credit_card':1,
'voucher':2,
'debit_card':3,
'boleto':4
}
#Replace in dataframe
df['payment_type'] = df.payment_type.replace(pay_map)
df.head()
Visualisation of the variables
import seaborn as sns
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
sns.histplot(data=df, x="freight", bins= 20, color="skyblue", ax=axs[0, 0])
sns.histplot(data=df, x="payment_value", bins= 20, color="olive", ax=axs[0, 1])
sns.histplot(data=df, x="delivery_gap", bins= 20, color="gold", ax=axs[1, 0])
sns.histplot(data=df, x="distance", bins= 20, color="teal", ax=axs[1, 1])
pay_type = ['credit_card', 'voucher', 'debit_card', 'boleto']
pay_type
reviews = [1,2,3,4,5]
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
sns.countplot(data=df, x= 'review_score', palette="crest", ax=axs[0])
sns.countplot(data=df, x='payment_type', palette="rocket",ax=axs[1])
axs[1].set_xticklabels(pay_type)
# https://dev.to/thalesbruno/subplotting-with-matplotlib-and-seaborn-5ei8
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(data=df, x= 'itens', color="olive", bins= 20, ax=axs[0])
sns.histplot(data=df, x='payment_installments', color="blue", bins= 20, ax=axs[1])
corrMatrix = df.corr()
print (corrMatrix)
# heatmap of the correlations among values of the database)
sns.set(rc = {'figure.figsize':(15,8)})
sn.heatmap(corrMatrix,
vmin=-1, vmax=1, center=0,
cmap=sns.diverging_palette(20, 220, n=200),
square=True,
annot=True)
plt.show()
corr_spear = df.corr(method="spearman")
# heatmap of the correlations among values of the database)
sns.set(rc = {'figure.figsize':(15,8)})
sn.heatmap(corr_spear,
vmin=-1, vmax=1, center=0,
cmap=sns.diverging_palette(20, 220, n=200),
square=True,
annot=True)
plt.show()
# !pip install kmodes
# Selecting the numerical features
numerical_features = list(df.select_dtypes(include=['int64','float64', 'uint8']).columns)
numerical_features
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
# Clustering Librairies import
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance
from kmodes.kprototypes import KPrototypes
from sklearn.pipeline import Pipeline, make_pipeline
# Scaling the numerical variables
scaler = MinMaxScaler()
preprocessor = ColumnTransformer([
('scaler', scaler, numerical_features)])
# Elbow method
X = df.copy()
kmeans_visualizer = Pipeline([
("preprocessor", preprocessor),
("kelbowvisualizer", KElbowVisualizer(KMeans(),K=(4,12)))])
kmeans_visualizer.fit(X)
kmeans_visualizer.named_steps['kelbowvisualizer'].show()
# Best K in Elbow
K = kmeans_visualizer.named_steps['kelbowvisualizer'].elbow_value_
# Silhouette Visualizer
silhouette_visualizer = Pipeline([
("preprocessor", preprocessor),
("silhouettevisualizer", SilhouetteVisualizer(KMeans(K)))])
silhouette_visualizer.fit(X)
silhouette_visualizer.named_steps['silhouettevisualizer'].show()
# Silhouette Visualizer for 5 clusters, aiming to visualise that is not as good than 4 clusters..
silhouette_visualizer = Pipeline([
("preprocessor", preprocessor),
("silhouettevisualizer", SilhouetteVisualizer(KMeans(5)))])
silhouette_visualizer.fit(X)
silhouette_visualizer.named_steps['silhouettevisualizer'].show()
# Intercluster distance Map with best k
distance_visualizer = Pipeline([
("preprocessor", preprocessor),
("distancevisualizer", InterclusterDistance(KMeans(K)))])
distance_visualizer.fit(X)
distance_visualizer.named_steps['distancevisualizer'].show()
# 1,2 - KMeans Pipeline with best K
kmeans_model = Pipeline([("preprocessor", preprocessor),
("kmeans", KMeans(K))])
kmeans_model.fit(X)
# 3 - Kmeans labels
kmeans_labels = kmeans_model.named_steps['kmeans'].labels_
df["kmeans_label"] = kmeans_labels
# 4 - Grouping averages by clusters
kmeans_clusters_means = df.groupby("kmeans_label").mean().reset_index()
kmeans_clusters_means
# 5 - Clustering the transformed data - X - with the model and grouping averages
X_scaled = preprocessor.fit_transform(X) #create the object with the preprocessor on X
X_scaled = pd.DataFrame(X_scaled, index=X.index, columns=X.columns) #create a dataframe with that
X_scaled["kmeans_label"] = kmeans_labels #label the data with the clusters
X_scaled_clusters = X_scaled.groupby("kmeans_label").mean() # now grouping the scaled object by average
X_scaled_clusters
def plot_radars(data, group):
# our data has multiple scales, so we use a scaler to make the comparison possible
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data),
index=data.index,
columns=data.columns).reset_index()
#starting the figure in plotly
fig = go.Figure()
#creating the areas inside the radar plot
for k in data[group]:
fig.add_trace(go.Scatterpolar(
r=data[data[group]==k].iloc[:,1:].values.reshape(-1), # the values of each k group
theta=data.columns[1:], #the name labels in the radar, from the columns names
#fill='toself', #it fills the areas
name='Cluster '+str(k) #naming each area with the cluster name
))
#updating the layout of the radar
fig.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])), #updates the range
showlegend=True,
title= {'text': "Comparison of means by cluster variable"},
title_font_size=18)
fig.show()
plot_radars(data=X_scaled_clusters,
group="kmeans_label")