E-commerce analysis

import pandas as pd import datetime as dt import seaborn as sns import matplotlib.pyplot as plt from ydata_profiling import ProfileReport

# Read the raw transaction table from disk, decoding it with 'unicode_escape'.
df = pd.read_csv(
    'data.csv',
    encoding='unicode_escape',
)
df.head()

# Build the automated profiling report and embed it in the notebook output.
profile = ProfileReport(df, title="E-Commerce")
profile.to_notebook_iframe()

# Column dtypes and non-null counts before any cleaning.
df.info()

# Number of missing values in each column.
df.isna().sum()

# Discard every row that has at least one missing value, then re-inspect.
df = df.dropna(axis=0, how='any')
df.info()

# Collect the distinct stock codes that start with an ASCII letter; these are
# treated as "special" codes and removed from the analysis below.
list_special_codes = df.loc[
    df['StockCode'].str.contains('^[a-zA-Z]+', regex=True),
    'StockCode',
].unique()
list_special_codes

# Rows that carry one of the special codes (inspection only; result is not stored).
# isin() is vectorised, unlike a per-row `x in array` lambda.
df[df['StockCode'].isin(list_special_codes)]
# Keep only the rows whose stock code is NOT special, ordered by the original index.
df = df[~df['StockCode'].isin(list_special_codes)].sort_index()
df.head()

# Revenue of each row: quantity multiplied by unit price.
df['Total Price'] = df['Quantity'].mul(df['UnitPrice'])
# Cast the invoice timestamp column to datetime64[ns].
df['InvoiceDate'] = df['InvoiceDate'].astype('datetime64[ns]')
df

# Total spend per (product description, customer, country).
df1 = (
    df.groupby(["Description", "CustomerID", "Country"])["Total Price"]
    .sum()
    .reset_index()
)
df1

# Row-level invoice details, ordered by customer.
df2 = df[
    ["InvoiceNo", "StockCode", "Description", "InvoiceDate", "CustomerID", "Quantity"]
].sort_values(by="CustomerID", ascending=True)
# Outer join on the columns the two frames share (Description and CustomerID),
# so every detail row carries the aggregated "Total Price" from df1.
df_merge = df1.merge(df2, how='outer')
df_merge

# Aggregated groups whose total is zero or negative.
df_negative = df1[df1["Total Price"] <= 0]
df_negative

# Drop merged rows whose "Total Price" equals any of those non-positive totals.
df3 = df_merge[~df_merge["Total Price"].isin(df_negative["Total Price"])]
df3

# Daily revenue: reduce each invoice timestamp to its calendar date and sum
# "Total Price" per date. The result is a one-column DataFrame indexed by Date.
general_trend = (
    pd.DataFrame(
        data={
            'Date': pd.to_datetime(df["InvoiceDate"]).dt.date,
            'Total Price': df["Total Price"],
        }
    )
    .groupby("Date")["Total Price"]
    .sum()
    .to_frame()
)

# Month-start tick positions for the trend plot: December 2010 through
# November 2011 (12 timestamps). Generated with date_range instead of parsing
# hand-built, unpadded strings such as "20111" with "%Y%m".
# NOTE(review): December 2011 is not included, as in the original; confirm
# whether the data extends past November 2011 and needs a 13th tick.
dates = pd.date_range(start="2010-12-01", periods=12, freq="MS").tolist()

rolling_days = general_trend.copy() rolling_days["Total Price"] = rolling_days["Total Price"].rolling(window=30).mean()

%matplotlib inline plt.figure(figsize = (18,5)).suptitle('The General Sales Trend', fontsize=20) sns.lineplot(data=general_trend) sns.lineplot(data=rolling_days, palette=['red']) plt.xticks(dates,rotation = 45) plt.show()

# df3 was produced by boolean-indexing df_merge; take an explicit copy before
# adding a column so the assignment cannot trigger SettingWithCopyWarning or
# be mistaken for a write into df_merge.
df3 = df3.copy()

# Rows belonging to customers that appear more than once in df3.
df_dup = df3[df3["CustomerID"].duplicated(keep=False)]
# For each (invoice, customer) pair, join all of its stock codes into one
# comma-separated string. Rows absent from df_dup (customers with a single
# row) receive NaN through index alignment.
df3["All StockCode"] = (
    df_dup.groupby(["InvoiceNo", "CustomerID"])["StockCode"].transform(', '.join)
)
df3.head()

# One row per distinct (invoice, customer, code list, timestamp, country).
invoice_columns = ["InvoiceNo", "CustomerID", "All StockCode", "InvoiceDate", "Country"]
df_dup = df3[invoice_columns].drop_duplicates()
df_dup.head()

# Sum of "Total Price" over the rows of each invoice.
# NOTE(review): "Total Price" in df3 comes from df1 (a per description/customer/
# country total), not from the individual invoice line; confirm this is the
# intended monetary figure.
df4 = df3.groupby("InvoiceNo", as_index=False)["Total Price"].sum()
df4.head()

# Attach the invoice totals, then strip every 'C' from the invoice numbers.
df_dup2 = df_dup.merge(df4, on='InvoiceNo')
df_dup2["InvoiceNo"] = df_dup2["InvoiceNo"].replace('C', '', regex=True)
df_dup2

# Reference point for recency: the latest invoice timestamp in the data.
most_recent_date = df_dup2["InvoiceDate"].max()

# Per-customer RFM table:
#   Recency        - whole days between most_recent_date and the customer's last invoice
#   Frequency      - number of df_dup2 rows for the customer
#   Monetary Total - sum of the customer's "Total Price" values
rfm_data = (
    df_dup2.groupby('CustomerID')
    .agg(
        {
            'InvoiceDate': lambda x: (most_recent_date - x.max()).days,
            'InvoiceNo': lambda x: len(x),
            'Total Price': lambda x: sum(x),
        }
    )
    .rename(
        columns={
            'InvoiceDate': 'Recency',
            'InvoiceNo': 'Frequency',
            'Total Price': 'Monetary Total',
        }
    )
)
rfm_data.head()

from sklearn.preprocessing import StandardScaler

# Standardise the three RFM columns and keep the result as a list of rows.
rfm_data_scale = StandardScaler().fit_transform(rfm_data).tolist()

# Same values as a DataFrame. It is built from a plain list, so it has a
# default integer index rather than rfm_data's CustomerID index.
df_rfm_data_scale = pd.DataFrame(
    rfm_data_scale,
    columns=['Recency', 'Frequency', 'Monetary Total'],
)
df_rfm_data_scale

from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples, silhouette_score

# Fit K-Means for k = 1..10 on the scaled RFM features and record each
# model's inertia (sum of squared distances to the nearest centroid).
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(df_rfm_data_scale)
    sse[k] = kmeans.inertia_

# Plot SSE for each value of k
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse), y=list(sse.values()))
plt.show()

# Final 3-cluster model. The seed is fixed (same value as the elbow search) so
# that cluster labels, and therefore the per-cluster frames and plots derived
# from them, are reproducible across runs.
km = KMeans(n_clusters=3, random_state=1)
km

# Cluster label for every customer, fitted on the scaled Frequency and
# Monetary Total columns only.
# NOTE(review): the elbow search used all three RFM columns, while this fit
# leaves out Recency; confirm that the difference is intentional.
y_predicted = km.fit_predict(df_rfm_data_scale[['Frequency', 'Monetary Total']])
y_predicted

# Attach the predicted labels to the (unscaled) RFM table; the array is
# assigned positionally, row by row.
rfm_data['cluster_Kmeans'] = y_predicted
rfm_data.head()

# Number of customers in each cluster.
rfm_data["cluster_Kmeans"].value_counts()

# Turn the CustomerID index back into an ordinary column.
rfm_data = rfm_data.reset_index()

# One frame per cluster label 0, 1 and 2.
rfm_data0, rfm_data1, rfm_data2 = (
    rfm_data[rfm_data["cluster_Kmeans"] == label] for label in (0, 1, 2)
)
rfm_data0.head()

def _plot_rfm_group(group_data, title):
    """Print the first rows of one cluster and draw its bubble chart.

    The chart plots Frequency against Monetary Total; marker size encodes
    Monetary Total and marker colour encodes Recency.

    Args:
        group_data: DataFrame with 'Frequency', 'Monetary Total' and
            'Recency' columns for a single cluster.
        title: Text used as the plot title.
    """
    print(group_data.head())

    # Create the plot
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=group_data,
        x='Frequency',
        y='Monetary Total',
        size='Monetary Total',
        hue='Recency',
        sizes=(100, 1000),
        palette='cool',
        legend='brief',
        alpha=0.6,
    )

    # Customize the plot
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel('Monetary Total')
    plt.legend(title='Recency')
    plt.grid(True)
    plt.show()


# NOTE(review): the titles below do not match the cluster labels of the frames
# they describe (label 0 -> 'Group 1', label 1 -> 'Group 2', label 2 ->
# 'Group 0'). They are kept as written; confirm the intended mapping.

## Group 1
_plot_rfm_group(rfm_data0, 'Group 1')

## Group 2
_plot_rfm_group(rfm_data1, 'Group 2')

## Group 0
_plot_rfm_group(rfm_data2, 'Group 0')

# Pairwise views of the RFM table, each point coloured by its K-Means label.

# Frequency vs. Monetary Total
sns.scatterplot(
    data=rfm_data,
    x="Frequency",
    y="Monetary Total",
    hue=y_predicted,
    palette='Set1',
)

# Recency vs. Monetary Total
sns.scatterplot(
    data=rfm_data,
    x="Recency",
    y="Monetary Total",
    hue=y_predicted,
    palette='Set1',
)

# Frequency vs. Recency
sns.scatterplot(
    data=rfm_data,
    x="Frequency",
    y="Recency",
    hue=y_predicted,
    palette='Set1',
)

# Tag every transaction row with a constant 1 so that summing per product
# yields the number of rows in which the product appears. assign() returns a
# new frame, so the helper column is not added to df itself. (The previous
# bare `df_fpgrowth` display ran before the name existed and has been dropped.)
df_fpgrowth = df.assign(incident_count=1)

# Sum the numeric columns per product description and rank products by how
# often they occur, most frequent first.
df_fpgrowth = (
    df_fpgrowth.groupby("Description")
    .sum(numeric_only=True)
    .sort_values("incident_count", ascending=False)
    .reset_index()
)
df_fpgrowth.head(10).style.background_gradient(cmap='Blues')

import plotly.express as px

# Constant column that serves as the single root node of the treemap.
df_fpgrowth["all"] = "Top 10 items"

# Treemap of the ten most frequent products; tile area and colour both follow
# incident_count.
fig = px.treemap(
    df_fpgrowth.head(10),
    path=['all', "Description"],
    values='incident_count',
    color=df_fpgrowth["incident_count"].head(10),
    hover_data=['Description'],
    color_continuous_scale='Blues',
)
fig

# Customer x product matrix: one row per CustomerID, one column per
# Description, each cell holding the summed Quantity (0 where the customer
# has no rows for that product).
df_preprocessing_fpgrowth = (
    df3.groupby(['CustomerID', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('CustomerID')
)
df_preprocessing_fpgrowth

def num(x):
    """Return True when x is at least 1, otherwise False.

    Always returns a bool. The earlier version fell through and returned None
    for values that were neither 0 nor >= 1 (negative or fractional inputs).

    Args:
        x: A numeric cell value (a summed quantity).

    Returns:
        bool: True if x >= 1, else False.
    """
    return bool(x >= 1)


# Boolean purchase matrix for FP-Growth: True where the summed quantity is at
# least 1. The vectorised comparison gives the same result as applying num()
# to every cell, without the deprecated DataFrame.applymap.
df_for_fpgrowth = df_preprocessing_fpgrowth >= 1
df_for_fpgrowth.head(10)

pip install mlxtend

from mlxtend.frequent_patterns import association_rules, fpgrowth

# Frequent itemsets present in at least 5% of the rows of the boolean matrix,
# reported with product names rather than column positions.
res = fpgrowth(
    df_for_fpgrowth,
    min_support=0.05,
    use_colnames=True,
)
res.head(10)

# Rules with confidence of at least 0.5, strongest first.
final_association_rules = association_rules(
    res,
    metric="confidence",
    min_threshold=0.5,
).sort_values("confidence", ascending=False)
final_association_rules.head()