pip install ydata-profiling
# pandas-profiling has been renamed to ydata-profiling; the ProfileReport import
# below follows the migration instructions from the deprecation error message.
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
df = pd.read_csv('data.csv', encoding='unicode_escape')
df.head()
df_check = ProfileReport(df)
df_check.to_notebook_iframe()
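The iframe renders only inside a notebook; for sharing, the same report can be written to a standalone HTML file (a minimal sketch using ydata_profiling's to_file; the output filename is illustrative):
# Optional: save the profiling report as standalone HTML
df_check.to_file("profiling_report.html")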
df.info()
df.isnull().sum()
df = df.dropna()
df.info()
# Stock codes that start with letters denote special (non-product) entries
list_special_codes = df[df['StockCode'].str.contains('^[a-zA-Z]+', regex=True)]['StockCode'].unique()
list_special_codes
df[df['StockCode'].isin(list_special_codes)]
df = df[~df['StockCode'].isin(list_special_codes)].sort_index()
df.head()
df['Total Price'] = df['Quantity'] * df['UnitPrice']
# pd.to_datetime parses the timestamp strings more robustly than astype
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df
df1 = df.groupby(["Description", "CustomerID", "Country"])["Total Price"].sum().reset_index()
df1
df_2 = df[["InvoiceNo", "StockCode", "Description", "InvoiceDate", "CustomerID", "Quantity"]].sort_values(by="CustomerID", ascending=True)
df_merge = pd.merge(df1, df_2, how='outer')
df_merge
df_negative = df1[df1["Total Price"] <= 0]
df_negative
# Drop orders whose totals are non-positive (returns/cancellations); .copy() avoids
# a SettingWithCopyWarning when new columns are added to df3 later
df3 = df_merge[~df_merge["Total Price"].isin(df_negative["Total Price"])].copy()
df3
general_trend = pd.DataFrame(data={'Date': pd.to_datetime(df["InvoiceDate"]).dt.date,
                                   'Total Price': df["Total Price"]})
general_trend = general_trend.groupby("Date")["Total Price"].sum()
general_trend = pd.DataFrame(general_trend)
# Month tick positions from December 2010 through December 2011
# (the range must end at 13 so that December 2011 is included)
dates = [pd.to_datetime("201012", format="%Y%m")]
dates += [pd.to_datetime("2011" + str(month), format="%Y%m") for month in range(1, 13)]
rolling_days = general_trend.copy()
# Smooth the daily totals with a 30-day rolling mean
rolling_days["Total Price"] = rolling_days["Total Price"].rolling(window=30).mean()
%matplotlib inline
plt.figure(figsize=(18, 5)).suptitle('The General Sales Trend', fontsize=20)
sns.lineplot(data=general_trend)
sns.lineplot(data=rolling_days, palette=['red'])
plt.xticks(dates, rotation=45)
plt.show()
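As a cross-check on the rolling average, the same daily series can be aggregated to calendar months (a small sketch; it assumes the Date index parses cleanly, and pandas versions before 2.2 use 'M' instead of 'ME' as the resample rule):
# Monthly totals from the same daily series
monthly_trend = general_trend.copy()
monthly_trend.index = pd.to_datetime(monthly_trend.index)
monthly_trend = monthly_trend.resample('ME').sum()  # use 'M' on pandas < 2.2
monthly_trend.head()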
df_dup = df3[df3["CustomerID"].duplicated(keep=False)]
# Combine all stock codes belonging to the same invoice into one comma-separated string
df3["All StockCode"] = df_dup.groupby(["InvoiceNo", "CustomerID"])["StockCode"].transform(', '.join)
df3.head()
df_dup = df3[[ "InvoiceNo", "CustomerID","All StockCode","InvoiceDate", "Country"]].drop_duplicates()
df_dup.head()
df4 = df3.groupby(["InvoiceNo"])["Total Price"].sum().reset_index()
df4.head()
df_dup2 = pd.merge(df_dup, df4, on='InvoiceNo')
# Strip the leading 'C' (the cancellation marker) so invoice numbers are uniform
df_dup2["InvoiceNo"] = df_dup2["InvoiceNo"].replace('C', '', regex=True)
df_dup2
most_recent_date = df_dup2["InvoiceDate"].max()
rfm_data = df_dup2.groupby(by='CustomerID').aggregate({
    'InvoiceDate': lambda x: (most_recent_date - x.max()).days,  # Recency: days since last purchase
    'InvoiceNo': 'count',                                        # Frequency: number of invoices
    'Total Price': 'sum'                                         # Monetary: total revenue
})
rfm_data.columns = ['Recency', 'Frequency', 'Monetary Total']
rfm_data.head()
from sklearn.preprocessing import StandardScaler
# Standardize the three RFM features to zero mean and unit variance
scaler = StandardScaler()
rfm_data_scale = scaler.fit_transform(rfm_data)
df_rfm_data_scale = pd.DataFrame(rfm_data_scale, columns=['Recency', 'Frequency', 'Monetary Total'])
df_rfm_data_scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
sse = {}
# Fit K-means for k = 1..10 and record the inertia (SSE) for the elbow plot
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=10)  # n_init=10 matches the older sklearn default
    kmeans.fit(df_rfm_data_scale)
    sse[k] = kmeans.inertia_
# Plot SSE for each value of k
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
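silhouette_score was imported above but never used; a quick sketch that scores each candidate k complements the elbow plot (silhouette is undefined for k=1, so the loop starts at 2):
# Silhouette score per candidate k (higher is better)
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=1, n_init=10).fit_predict(df_rfm_data_scale)
    print(k, silhouette_score(df_rfm_data_scale, labels))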
km = KMeans(n_clusters=3, random_state=1, n_init=10)
km
# Cluster on the two scaled features used in the scatterplots below
y_predicted = km.fit_predict(df_rfm_data_scale[['Frequency', 'Monetary Total']])
y_predicted
rfm_data['cluster_Kmeans'] = y_predicted
rfm_data.head()
rfm_data["cluster_Kmeans"].value_counts()
rfm_data = rfm_data.reset_index()
rfm_data0 = rfm_data[rfm_data["cluster_Kmeans"] == 0]
rfm_data1 = rfm_data[rfm_data["cluster_Kmeans"] == 1]
rfm_data2 = rfm_data[rfm_data["cluster_Kmeans"] == 2]
## Group 0
rfm_data0.head()
## Group 1
rfm_data1.head()
## Group 2
rfm_data2.head()
sns.scatterplot(rfm_data, x = "Frequency", y = "Monetary Total", hue = y_predicted, palette='Set1')
=> Most customers in group 2 have the most revenue. Besides that, group 0 is also a potential group can generate more revenue.
sns.scatterplot(rfm_data, x = "Recency", y = "Monetary Total", hue = y_predicted, palette='Set1')
=> The higher recency, the less revenue is generated. Both group 0 and 2 have good performance about less Recency and high Monetary Value.
sns.scatterplot(rfm_data, x = "Frequency", y = "Recency", hue = y_predicted, palette='Set1')
=> Obviously, when a customer purchases with a high Frequency, Recency will be inversely proportional to this.
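These relationships can also be checked numerically (a quick sketch; Pearson correlation over the raw RFM columns):
# Pairwise correlations between the three RFM measures
rfm_data[['Recency', 'Frequency', 'Monetary Total']].corr()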
import matplotlib
# matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9; use the colormaps registry
cmap = matplotlib.colormaps['Blues']
# Create a copy of the DataFrame
df_fpgrowth = df.copy()
# Drop columns with datetime64 type to avoid a TypeError in the aggregation
datetime_cols = df_fpgrowth.select_dtypes(include=['datetime64']).columns
df_fpgrowth = df_fpgrowth.drop(columns=datetime_cols)
# Count how often each item appears across all transactions; summing only
# incident_count also avoids concatenating the string columns
df_fpgrowth["incident_count"] = 1
df_fpgrowth = df_fpgrowth.groupby("Description", as_index=False)["incident_count"].sum().sort_values("incident_count", ascending=False)
df_fpgrowth
import plotly.express as px
df_fpgrowth["all"] = "Top 10 items"
fig = px.treemap(df_fpgrowth.head(10), path=['all', "Description"], values='incident_count',
                 color=df_fpgrowth["incident_count"].head(10), hover_data=['Description'],
                 color_continuous_scale='Blues')
fig
# Basket matrix: one row per customer, one column per item, values = total quantity purchased
df_preprocessing_fpgrowth = df3.groupby(['CustomerID', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('CustomerID')
df_preprocessing_fpgrowth
def num(x):
    # True if the customer bought the item at least once, False otherwise
    return x >= 1
# pandas >= 2.1 prefers DataFrame.map over the deprecated applymap
df_for_fpgrowth = df_preprocessing_fpgrowth.applymap(num)
df_for_fpgrowth.head(10)
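The same boolean encoding can be done without a Python-level function (an equivalent vectorized one-liner, assuming quantities are non-negative integers):
# Vectorized alternative to applymap
df_for_fpgrowth = df_preprocessing_fpgrowth >= 1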
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
res = fpgrowth(df_for_fpgrowth, min_support=0.05, use_colnames=True)
res.head(10)
# num_itemsets is the number of transactions in the input dataset, not the number of frequent itemsets
num_itemsets = len(df_for_fpgrowth)
final_association_rules = association_rules(res, num_itemsets=num_itemsets, metric="confidence", min_threshold=0.5).sort_values("confidence", ascending=False)
final_association_rules.head()
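Confidence alone can favor rules whose consequents are simply popular; filtering on lift > 1 keeps only rules where the antecedent genuinely raises the consequent's probability (a small follow-up using columns association_rules already returns):
# Keep only rules with a positive association (lift > 1)
strong_rules = final_association_rules[final_association_rules["lift"] > 1]
strong_rules.head()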
# Grouping data by clusters and calculating total revenue for each segment
revenue_by_segment = rfm_data.groupby('cluster_Kmeans')['Monetary Total'].sum().reset_index()
revenue_by_segment.columns = ['Customer Segment', 'Total Revenue']
revenue_by_segment
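A simple bar chart makes the revenue split between the segments easier to compare (a minimal sketch reusing the seaborn import from the top of the notebook):
# Visualize total revenue per customer segment
sns.barplot(data=revenue_by_segment, x='Customer Segment', y='Total Revenue')
plt.title('Total Revenue by Customer Segment')
plt.show()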