import pandas as pd import numpy as np

Geocoding sellers and customers

# importing geolocation data and formating zip code geolocation = pd.read_csv('olist_geolocation_dataset.csv') geolocation.shape

#the data has dupplicates so it needs cleaning and formating to string tmp = geolocation[['geolocation_zip_code_prefix']].apply(pd.value_counts).sum(1)

geolocation = geolocation.assign(zip_code_count = geolocation['geolocation_zip_code_prefix'].map(tmp))

geolocation = geolocation.drop_duplicates(['geolocation_zip_code_prefix']) geolocation.sort_values(by = 'zip_code_count', ascending = False,inplace = True)

# formating to string geolocation['geolocation_lat'] = geolocation['geolocation_lat'].astype(float) geolocation['geolocation_lng'] = geolocation['geolocation_lng'].astype(float)

geolocation = geolocation.drop(columns=['geolocation_city', 'geolocation_state', 'zip_code_count'])

geolocation.shape

Add the geolocation to the database of orders

# Uploading the database of orders df = pd.read_csv('brazil_ecom4.csv') df.head()

# rename the column to match zip code of sellers geolocation.rename(columns = {'geolocation_zip_code_prefix':'seller_zip_code_prefix'},inplace = True) geolocation.dtypes

# addind the location columns for the sellers df2 = pd.merge( df, geolocation, on = 'seller_zip_code_prefix', how = 'inner') df2.shape

#renaming the lat and lgn columns df2.rename(columns={ 'geolocation_lat':'seller_lat', 'geolocation_lng': 'seller_lng' },inplace=True) df2.shape

# rename the column to match zip code of customers geolocation.rename(columns = {'seller_zip_code_prefix':'customer_zip_code_prefix'},inplace = True) geolocation.dtypes

# addind the location columns for the customers df2 = pd.merge( df2, geolocation, on = 'customer_zip_code_prefix', how = 'inner') df2.shape

#renaming the lat and lgn columns df2.rename(columns={ 'geolocation_lat':'customer_lat', 'geolocation_lng': 'customer_lng', },inplace=True) df2.shape

df2.head()

Calculating the harvesting distance

from math import radians, cos, sin, asin, sqrt def geodistance(lng1,lat1,lng2,lat2): lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)]) dlon=lng2-lng1 dlat=lat2-lat1 a=sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 distance=2*asin(sqrt(a))*6371*1000 distance=round(distance/1000,3) return distance

# data_seller_customer['distance'] = data_seller_customer[['seller_lng','seller_lat','customer_lng', # 'customer_lat']].apply(geodistance, axis = 1) A = df2['seller_lng'].tolist() B = df2['seller_lat'].tolist() C = df2['customer_lng'].tolist() D = df2['customer_lat'].tolist() df2['distance'] = df2['seller_lat'] for i in range(len(A)): df2.distance.loc[i] = geodistance(A[i],B[i],C[i],D[i])

df2.head()

df2.to_csv('Brazil_ecom_final.csv', index=False)

df = df2.drop(columns = ['order_id','price','customer_zip_code_prefix', 'seller_zip_code_prefix','seller_lat','seller_lng','customer_lat','customer_lng'])

df = df.dropna()

df.describe()

#creating a dictionary for payment types print(df['payment_type'].unique())

pay_map = { 'credit_card':1, 'voucher':2, 'debit_card':3, 'boleto':4 }

#Replace in dataframe df['payment_type'] = df.payment_type.replace(pay_map) df.head()

Visualisation of the variables

import seaborn as sns import matplotlib.pyplot as plt

fig, axs = plt.subplots(2, 2, figsize=(12, 10)) sns.histplot(data=df, x="freight", bins= 20, color="skyblue", ax=axs[0, 0]) sns.histplot(data=df, x="payment_value", bins= 20, color="olive", ax=axs[0, 1]) sns.histplot(data=df, x="delivery_gap", bins= 20, color="gold", ax=axs[1, 0]) sns.histplot(data=df, x="distance", bins= 20, color="teal", ax=axs[1, 1])

pay_type = ['credit_card', 'voucher', 'debit_card', 'boleto'] pay_type

reviews = [1,2,3,4,5]

fig, axs = plt.subplots(1, 2, figsize=(12, 5)) sns.countplot(data=df, x= 'review_score', palette="crest", ax=axs[0]) sns.countplot(data=df, x='payment_type', palette="rocket",ax=axs[1]) axs[1].set_xticklabels(pay_type) # https://dev.to/thalesbruno/subplotting-with-matplotlib-and-seaborn-5ei8

fig, axs = plt.subplots(1, 2, figsize=(12, 5)) sns.histplot(data=df, x= 'itens', color="olive", bins= 20, ax=axs[0]) sns.histplot(data=df, x='payment_installments', color="blue", bins= 20, ax=axs[1])

corrMatrix = df.corr() print (corrMatrix)

# heatmap of the correlations among values of the database) sns.set(rc = {'figure.figsize':(15,8)}) sn.heatmap(corrMatrix, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True, annot=True) plt.show()

corr_spear = df.corr(method="spearman")

# heatmap of the correlations among values of the database) sns.set(rc = {'figure.figsize':(15,8)}) sn.heatmap(corr_spear, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True, annot=True) plt.show()

# !pip install kmodes

# Selecting the numerical features numerical_features = list(df.select_dtypes(include=['int64','float64', 'uint8']).columns) numerical_features

from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.compose import ColumnTransformer, make_column_transformer

# Clustering Librairies import from sklearn.cluster import KMeans from sklearn.cluster import MeanShift, estimate_bandwidth from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance from kmodes.kprototypes import KPrototypes from sklearn.pipeline import Pipeline, make_pipeline

# Scaling the numerical variables scaler = MinMaxScaler() preprocessor = ColumnTransformer([ ('scaler', scaler, numerical_features)])

# Elbow method X = df.copy() kmeans_visualizer = Pipeline([ ("preprocessor", preprocessor), ("kelbowvisualizer", KElbowVisualizer(KMeans(),K=(4,12)))]) kmeans_visualizer.fit(X) kmeans_visualizer.named_steps['kelbowvisualizer'].show()

# Best K in Elbow K = kmeans_visualizer.named_steps['kelbowvisualizer'].elbow_value_ # Silhouette Visualizer silhouette_visualizer = Pipeline([ ("preprocessor", preprocessor), ("silhouettevisualizer", SilhouetteVisualizer(KMeans(K)))]) silhouette_visualizer.fit(X) silhouette_visualizer.named_steps['silhouettevisualizer'].show()

# Silhouette Visualizer for 5 clusters, aiming to visualise that is not as good than 4 clusters.. silhouette_visualizer = Pipeline([ ("preprocessor", preprocessor), ("silhouettevisualizer", SilhouetteVisualizer(KMeans(5)))]) silhouette_visualizer.fit(X) silhouette_visualizer.named_steps['silhouettevisualizer'].show()

# Intercluster distance Map with best k distance_visualizer = Pipeline([ ("preprocessor", preprocessor), ("distancevisualizer", InterclusterDistance(KMeans(K)))]) distance_visualizer.fit(X) distance_visualizer.named_steps['distancevisualizer'].show()

# 1,2 - KMeans Pipeline with best K kmeans_model = Pipeline([("preprocessor", preprocessor), ("kmeans", KMeans(K))]) kmeans_model.fit(X)

# 3 - Kmeans labels kmeans_labels = kmeans_model.named_steps['kmeans'].labels_ df["kmeans_label"] = kmeans_labels

# 4 - Grouping averages by clusters kmeans_clusters_means = df.groupby("kmeans_label").mean().reset_index() kmeans_clusters_means

# 5 - Clustering the transformed data - X - with the model and grouping averages X_scaled = preprocessor.fit_transform(X) #create the object with the preprocessor on X X_scaled = pd.DataFrame(X_scaled, index=X.index, columns=X.columns) #create a dataframe with that X_scaled["kmeans_label"] = kmeans_labels #label the data with the clusters X_scaled_clusters = X_scaled.groupby("kmeans_label").mean() # now grouping the scaled object by average X_scaled_clusters

def plot_radars(data, group): # our data has multiple scales, so we use a scaler to make the comparison possible scaler = MinMaxScaler() data = pd.DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns).reset_index() #starting the figure in plotly fig = go.Figure() #creating the areas inside the radar plot for k in data[group]: fig.add_trace(go.Scatterpolar( r=data[data[group]==k].iloc[:,1:].values.reshape(-1), # the values of each k group theta=data.columns[1:], #the name labels in the radar, from the columns names #fill='toself', #it fills the areas name='Cluster '+str(k) #naming each area with the cluster name )) #updating the layout of the radar fig.update_layout( polar=dict(radialaxis=dict(visible=True, range=[0, 1])), #updates the range showlegend=True, title= {'text': "Comparison of means by cluster variable"}, title_font_size=18) fig.show()

plot_radars(data=X_scaled_clusters, group="kmeans_label")

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Geocoding sellers and customers

Add the geolocation to the database of orders

Calculating the harvesting distance

Visualisation of the variables

Geocoding sellers and customers