pip install ydata-profiling
# pandas-profiling has been renamed to ydata-profiling; the ProfileReport import
# below follows the migration instructions from the deprecation error message.
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
df = pd.read_csv('data.csv', encoding='unicode_escape')
df.head()
df_check = ProfileReport(df)
df_check.to_notebook_iframe()
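The iframe renders only inside a notebook; for sharing, the same report can be written to a standalone HTML file (a minimal sketch using ydata_profiling's to_file; the output filename is illustrative):
# Optional: save the profiling report as standalone HTML
df_check.to_file("profiling_report.html")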
df.info()
df.isnull().sum()
df = df.dropna()
df.info()
# Stock codes that start with letters denote special (non-product) entries
list_special_codes = df[df['StockCode'].str.contains('^[a-zA-Z]+', regex=True)]['StockCode'].unique()
list_special_codes
df[df['StockCode'].isin(list_special_codes)]
df = df[~df['StockCode'].isin(list_special_codes)].sort_index()
df.head()
df['Total Price'] = df['Quantity'] * df['UnitPrice']
# pd.to_datetime parses the timestamp strings more robustly than astype
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df
df1 = df.groupby(["Description", "CustomerID", "Country"])["Total Price"].sum().reset_index()
df1
df_2 = df[["InvoiceNo", "StockCode", "Description", "InvoiceDate", "CustomerID", "Quantity"]].sort_values(by="CustomerID", ascending=True)
df_merge = pd.merge(df1, df_2, how='outer')
df_merge
df_negative = df1[df1["Total Price"] <= 0]
df_negative
# Drop orders whose totals are non-positive (returns/cancellations); .copy() avoids
# a SettingWithCopyWarning when new columns are added to df3 later
df3 = df_merge[~df_merge["Total Price"].isin(df_negative["Total Price"])].copy()
df3
general_trend = pd.DataFrame(data={'Date': pd.to_datetime(df["InvoiceDate"]).dt.date,
                                   'Total Price': df["Total Price"]})
general_trend = general_trend.groupby("Date")["Total Price"].sum()
general_trend = pd.DataFrame(general_trend)
# Month tick positions from December 2010 through December 2011
# (the range must end at 13 so that December 2011 is included)
dates = [pd.to_datetime("201012", format="%Y%m")]
dates += [pd.to_datetime("2011" + str(month), format="%Y%m") for month in range(1, 13)]
rolling_days = general_trend.copy()
# Smooth the daily totals with a 30-day rolling mean
rolling_days["Total Price"] = rolling_days["Total Price"].rolling(window=30).mean()
%matplotlib inline
plt.figure(figsize=(18, 5)).suptitle('The General Sales Trend', fontsize=20)
sns.lineplot(data=general_trend)
sns.lineplot(data=rolling_days, palette=['red'])
plt.xticks(dates, rotation=45)
plt.show()
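As a cross-check on the rolling average, the same daily series can be aggregated to calendar months (a small sketch; it assumes the Date index parses cleanly, and pandas versions before 2.2 use 'M' instead of 'ME' as the resample rule):
# Monthly totals from the same daily series
monthly_trend = general_trend.copy()
monthly_trend.index = pd.to_datetime(monthly_trend.index)
monthly_trend = monthly_trend.resample('ME').sum()  # use 'M' on pandas < 2.2
monthly_trend.head()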
df_dup = df3[df3["CustomerID"].duplicated(keep=False)]
# Combine all stock codes belonging to the same invoice into one comma-separated string
df3["All StockCode"] = df_dup.groupby(["InvoiceNo", "CustomerID"])["StockCode"].transform(', '.join)
df3.head()
df_dup = df3[[ "InvoiceNo", "CustomerID","All StockCode","InvoiceDate", "Country"]].drop_duplicates()
df_dup.head()
df4 = df3.groupby(["InvoiceNo"])["Total Price"].sum().reset_index()
df4.head()
df_dup2 = pd.merge(df_dup, df4, on='InvoiceNo')
# Strip the leading 'C' (the cancellation marker) so invoice numbers are uniform
df_dup2["InvoiceNo"] = df_dup2["InvoiceNo"].replace('C', '', regex=True)
df_dup2
most_recent_date = df_dup2["InvoiceDate"].max()
rfm_data = df_dup2.groupby(by='CustomerID').aggregate({
    'InvoiceDate': lambda x: (most_recent_date - x.max()).days,  # Recency: days since last purchase
    'InvoiceNo': 'count',                                        # Frequency: number of invoices
    'Total Price': 'sum'                                         # Monetary: total revenue
})
rfm_data.columns = ['Recency', 'Frequency', 'Monetary Total']
rfm_data.head()
from sklearn.preprocessing import StandardScaler
# Standardize the three RFM features to zero mean and unit variance
scaler = StandardScaler()
rfm_data_scale = scaler.fit_transform(rfm_data)
df_rfm_data_scale = pd.DataFrame(rfm_data_scale, columns=['Recency', 'Frequency', 'Monetary Total'])
df_rfm_data_scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
sse = {}
# Fit K-means for k = 1..10 and record the inertia (SSE) for the elbow plot
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=10)  # n_init=10 matches the older sklearn default
    kmeans.fit(df_rfm_data_scale)
    sse[k] = kmeans.inertia_
# Plot SSE for each value of k
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
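silhouette_score was imported above but never used; a quick sketch that scores each candidate k complements the elbow plot (silhouette is undefined for k=1, so the loop starts at 2):
# Silhouette score per candidate k (higher is better)
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=1, n_init=10).fit_predict(df_rfm_data_scale)
    print(k, silhouette_score(df_rfm_data_scale, labels))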
km = KMeans(n_clusters=3, random_state=1, n_init=10)
km
# Cluster on the two scaled features used in the scatterplots below
y_predicted = km.fit_predict(df_rfm_data_scale[['Frequency', 'Monetary Total']])
y_predicted
rfm_data['cluster_Kmeans'] = y_predicted
rfm_data.head()
rfm_data["cluster_Kmeans"].value_counts()
rfm_data = rfm_data.reset_index()
rfm_data0 = rfm_data[rfm_data["cluster_Kmeans"] == 0]
rfm_data1 = rfm_data[rfm_data["cluster_Kmeans"] == 1]
rfm_data2 = rfm_data[rfm_data["cluster_Kmeans"] == 2]
## Group 0
rfm_data0.head()
## Group 1
rfm_data1.head()
## Group 2
rfm_data2.head()
sns.scatterplot(rfm_data, x = "Frequency", y = "Monetary Total", hue = y_predicted, palette='Set1')
=> Most customers in group 2 have the most revenue. Besides that, group 0 is also a potential group can generate more revenue.
sns.scatterplot(rfm_data, x = "Recency", y = "Monetary Total", hue = y_predicted, palette='Set1')
=> The higher recency, the less revenue is generated. Both group 0 and 2 have good performance about less Recency and high Monetary Value.
sns.scatterplot(rfm_data, x = "Frequency", y = "Recency", hue = y_predicted, palette='Set1')
=> Obviously, when a customer purchases with a high Frequency, Recency will be inversely proportional to this.
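These relationships can also be checked numerically (a quick sketch; Pearson correlation over the raw RFM columns):
# Pairwise correlations between the three RFM measures
rfm_data[['Recency', 'Frequency', 'Monetary Total']].corr()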
import matplotlib
# matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9; use the colormaps registry
cmap = matplotlib.colormaps['Blues']
# Create a copy of the DataFrame
df_fpgrowth = df.copy()
# Drop columns with datetime64 type to avoid a TypeError in the aggregation
datetime_cols = df_fpgrowth.select_dtypes(include=['datetime64']).columns
df_fpgrowth = df_fpgrowth.drop(columns=datetime_cols)
# Count how often each item appears across all transactions; summing only
# incident_count also avoids concatenating the string columns
df_fpgrowth["incident_count"] = 1
df_fpgrowth = df_fpgrowth.groupby("Description", as_index=False)["incident_count"].sum().sort_values("incident_count", ascending=False)
df_fpgrowth
import plotly.express as px
df_fpgrowth["all"] = "Top 10 items"
fig = px.treemap(df_fpgrowth.head(10), path=['all', "Description"], values='incident_count',
                 color=df_fpgrowth["incident_count"].head(10), hover_data=['Description'],
                 color_continuous_scale='Blues')
fig
# Basket matrix: one row per customer, one column per item, values = total quantity purchased
df_preprocessing_fpgrowth = df3.groupby(['CustomerID', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('CustomerID')
df_preprocessing_fpgrowth
def num(x):
    # True if the customer bought the item at least once, False otherwise
    return x >= 1
# pandas >= 2.1 prefers DataFrame.map over the deprecated applymap
df_for_fpgrowth = df_preprocessing_fpgrowth.applymap(num)
df_for_fpgrowth.head(10)
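The same boolean encoding can be done without a Python-level function (an equivalent vectorized one-liner, assuming quantities are non-negative integers):
# Vectorized alternative to applymap
df_for_fpgrowth = df_preprocessing_fpgrowth >= 1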
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
res = fpgrowth(df_for_fpgrowth, min_support=0.05, use_colnames=True)
res.head(10)
# num_itemsets is the number of transactions in the input dataset, not the number of frequent itemsets
num_itemsets = len(df_for_fpgrowth)
final_association_rules = association_rules(res, num_itemsets=num_itemsets, metric="confidence", min_threshold=0.5).sort_values("confidence", ascending=False)
final_association_rules.head()
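Confidence alone can favor rules whose consequents are simply popular; filtering on lift > 1 keeps only rules where the antecedent genuinely raises the consequent's probability (a small follow-up using columns association_rules already returns):
# Keep only rules with a positive association (lift > 1)
strong_rules = final_association_rules[final_association_rules["lift"] > 1]
strong_rules.head()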
# Grouping data by clusters and calculating total revenue for each segment
revenue_by_segment = rfm_data.groupby('cluster_Kmeans')['Monetary Total'].sum().reset_index()
revenue_by_segment.columns = ['Customer Segment', 'Total Revenue']
revenue_by_segment
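A simple bar chart makes the revenue split between the segments easier to compare (a minimal sketch reusing the seaborn import from the top of the notebook):
# Visualize total revenue per customer segment
sns.barplot(data=revenue_by_segment, x='Customer Segment', y='Total Revenue')
plt.title('Total Revenue by Customer Segment')
plt.show()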