import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
# Load the raw transaction export and take a first look at it.
df = pd.read_csv('data.csv', encoding='unicode_escape')
df.head()

# Automated profiling report, rendered inline in the notebook.
profile = ProfileReport(df, title="E-Commerce")
profile.to_notebook_iframe()

# Inspect missing values, then drop every row that has at least one.
df.info()
df.isnull().sum()
df = df.dropna()
df.info()

# Stock codes that begin with a letter are collected as "special" codes;
# every row carrying one of them is removed below.
starts_with_letter = df['StockCode'].str.contains('^[a-zA-Z]+', regex=True)
list_special_codes = df.loc[starts_with_letter, 'StockCode'].unique()
list_special_codes

# Preview the rows about to be removed, then keep only the remaining ones.
is_special_code = df['StockCode'].isin(list_special_codes)
df[is_special_code]
df = df[~is_special_code].sort_index()
df.head()

# Derived per-line total, and InvoiceDate converted to a datetime column.
df['Total Price'] = df['Quantity'] * df['UnitPrice']
df['InvoiceDate'] = df['InvoiceDate'].astype('datetime64[ns]')
df
# Total spend per (product description, customer, country).
df1 = (
    df.groupby(["Description", "CustomerID", "Country"])["Total Price"]
    .sum()
    .reset_index()
)
df1

# Line-level columns, ordered by customer.
line_item_columns = ["InvoiceNo", "StockCode", "Description", "InvoiceDate", "CustomerID", "Quantity"]
df2 = df[line_item_columns].sort_values(by="CustomerID", ascending=True)

# Outer merge on the two columns the frames share.
# NOTE(review): every line item receives the *grouped* "Total Price" of its
# (Description, CustomerID, Country) group, so summing that column per
# invoice later may count the same total more than once - confirm intended.
df_merge = df1.merge(df2, how='outer', on=["Description", "CustomerID"])
df_merge

# Grouped totals that are zero or negative.
df_negative = df1[df1["Total Price"] <= 0]
df_negative

# Drop every merged row whose total equals one of those non-positive totals.
non_positive_totals = df_negative["Total Price"]
df3 = df_merge[~df_merge["Total Price"].isin(non_positive_totals)]
df3
general_trend = pd.DataFrame(data={'Date':pd.to_datetime(df["InvoiceDate"]).dt.date,
'Total Price':df["Total Price"]})
general_trend = general_trend.groupby("Date")["Total Price"].sum()
general_trend = pd.DataFrame(general_trend)
dates = []
dates.append(pd.to_datetime("201012",format="%Y%m"))
dates += [pd.to_datetime("2011"+str(month),format="%Y%m") for month in range(1,12)]
rolling_days = general_trend.copy()
rolling_days["Total Price"] = rolling_days["Total Price"].rolling(window=30).mean()
%matplotlib inline
plt.figure(figsize = (18,5)).suptitle('The General Sales Trend', fontsize=20)
sns.lineplot(data=general_trend)
sns.lineplot(data=rolling_days, palette=['red'])
plt.xticks(dates,rotation = 45)
plt.show()
# df3 was produced by boolean filtering, so adding a column to it directly
# raises SettingWithCopyWarning; work on an independent copy instead.
df3 = df3.copy()

# Rows belonging to customers that appear more than once.
df_dup = df3[df3["CustomerID"].duplicated(keep=False)]

# Comma-separated list of stock codes per (invoice, customer). The result is
# aligned back by index, so rows not present in df_dup get NaN.
df3["All StockCode"] = df_dup.groupby(["InvoiceNo", "CustomerID"])["StockCode"].transform(', '.join)
df3.head()

# One row per distinct invoice header combination.
df_dup = df3[["InvoiceNo", "CustomerID", "All StockCode", "InvoiceDate", "Country"]].drop_duplicates()
df_dup.head()

# Sum of the "Total Price" column per invoice.
df4 = df3.groupby(["InvoiceNo"])["Total Price"].sum().reset_index()
df4.head()

# Attach the per-invoice sum to the invoice headers and strip every 'C'
# character from the invoice numbers.
df_dup2 = pd.merge(df_dup, df4, on='InvoiceNo')
df_dup2["InvoiceNo"] = df_dup2["InvoiceNo"].replace('C', '', regex=True)
df_dup2
# Reference point for recency: the latest invoice date in the data.
most_recent_date = df_dup2["InvoiceDate"].max()

# Per-customer RFM inputs:
#   InvoiceDate -> whole days between the customer's last invoice and the reference date
#   InvoiceNo   -> number of invoice rows for the customer
#   Total Price -> sum of the customer's invoice totals
rfm_aggregations = {
    'InvoiceDate': lambda invoice_dates: (most_recent_date - invoice_dates.max()).days,
    'InvoiceNo': lambda invoice_numbers: len(invoice_numbers),
    'Total Price': lambda invoice_totals: sum(invoice_totals),
}
rfm_data = df_dup2.groupby(by='CustomerID').aggregate(rfm_aggregations)
rfm_data = rfm_data.rename(columns={
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'Total Price': 'Monetary Total',
})
rfm_data.head()
from sklearn.preprocessing import StandardScaler

# Standardise the three RFM columns and keep the result as a plain list of rows.
rfm_data_scale = StandardScaler().fit_transform(rfm_data).tolist()

# Rebuild a DataFrame from the scaled rows. It gets a fresh default index,
# so rows correspond to rfm_data by position, not by CustomerID.
df_rfm_data_scale = pd.DataFrame(
    rfm_data_scale,
    columns=['Recency', 'Frequency', 'Monetary Total'],
)
df_rfm_data_scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Elbow search: fit KMeans for k = 1..10 on the scaled RFM features and
# record each model's inertia (SSE), keyed by k.
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(df_rfm_data_scale)
    sse[k] = kmeans.inertia_

# Plot SSE against k.
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse), y=list(sse.values()))
plt.show()
# Fixed seed so the cluster ids - and therefore the per-cluster frames and
# the plots built from them - are the same on every run. The value 1 matches
# the seed used in the elbow search.
km = KMeans(n_clusters=3, random_state=1)
km

# NOTE(review): the elbow search was fitted on all three scaled features,
# while the final clustering uses only Frequency and Monetary Total -
# confirm that leaving Recency out is intended.
y_predicted = km.fit_predict(df_rfm_data_scale[['Frequency', 'Monetary Total']])
y_predicted

# Attach the labels to the unscaled RFM table. The assignment is positional,
# which relies on df_rfm_data_scale having the same row order as rfm_data.
rfm_data['cluster_Kmeans'] = y_predicted
rfm_data.head()
rfm_data["cluster_Kmeans"].value_counts()

# Turn CustomerID back into a regular column and split the table per cluster.
rfm_data = rfm_data.reset_index()
rfm_data0 = rfm_data[rfm_data["cluster_Kmeans"] == 0]
rfm_data1 = rfm_data[rfm_data["cluster_Kmeans"] == 1]
rfm_data2 = rfm_data[rfm_data["cluster_Kmeans"] == 2]
rfm_data0.head()
def _plot_rfm_group(group_data, title):
    """Draw a Frequency vs. Monetary Total bubble chart for one cluster.

    Bubble size encodes 'Monetary Total' and colour encodes 'Recency'.

    Args:
        group_data: Frame with 'Frequency', 'Monetary Total' and 'Recency'
            columns (one of the per-cluster RFM frames).
        title: Text shown as the plot title.
    """
    # Create the plot
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=group_data,
        x='Frequency',
        y='Monetary Total',
        size='Monetary Total',
        hue='Recency',
        sizes=(100, 1000),
        palette='cool',
        legend='brief',
        alpha=0.6,
    )
    # Customize the plot
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel('Monetary Total')
    plt.legend(title='Recency')
    plt.grid(True)
    plt.show()


# Print a preview of each cluster's rows and plot it.
# NOTE(review): the titles do not follow the cluster ids (cluster 0 is
# "Group 1", cluster 1 is "Group 2", cluster 2 is "Group 0"); the mapping is
# kept exactly as it was - confirm it is deliberate.
for cluster_frame, group_title in (
    (rfm_data0, 'Group 1'),
    (rfm_data1, 'Group 2'),
    (rfm_data2, 'Group 0'),
):
    print(cluster_frame.head())
    _plot_rfm_group(cluster_frame, group_title)
# Pairwise views of the RFM features, coloured by predicted cluster.
# Each pair gets its own figure: without that, running the three calls back
# to back draws plots with different axes on top of each other on one Axes.
for x_column, y_column in (
    ("Frequency", "Monetary Total"),
    ("Recency", "Monetary Total"),
    ("Frequency", "Recency"),
):
    plt.figure()
    sns.scatterplot(rfm_data, x=x_column, y=y_column, hue=y_predicted, palette='Set1')
    plt.show()
# Start from a copy of the cleaned transactions. The original code aliased
# `df` here, so adding the helper column below also modified `df` itself;
# it also referenced df_fpgrowth before it was assigned (NameError).
df_fpgrowth = df.copy()

# Each transaction line counts as one occurrence of its item.
df_fpgrowth["incident_count"] = 1

# Sum the numeric columns per item description and rank by occurrence count.
df_fpgrowth = (
    df_fpgrowth.groupby("Description")
    .sum(numeric_only=True)
    .sort_values("incident_count", ascending=False)
    .reset_index()
)
df_fpgrowth.head(10).style.background_gradient(cmap='Blues')

import plotly.express as px

# Constant column used as the single root node of the treemap.
df_fpgrowth["all"] = "Top 10 items"
fig = px.treemap(
    df_fpgrowth.head(10),
    path=['all', "Description"],
    values='incident_count',
    color=df_fpgrowth["incident_count"].head(10),
    hover_data=['Description'],
    color_continuous_scale='Blues',
)
fig
# Customer x item matrix: total quantity of each item description bought by
# each customer, with 0 where a customer never bought the item.
quantity_per_customer_item = df3.groupby(['CustomerID', 'Description'])['Quantity'].sum()
df_preprocessing_fpgrowth = quantity_per_customer_item.unstack().fillna(0)
df_preprocessing_fpgrowth
def num(x):
    """Return True when quantity *x* is at least 1, otherwise False.

    The previous version fell through and returned None for negative values,
    values strictly between 0 and 1, and NaN, which left non-boolean cells in
    the encoded matrix. All of those inputs now map to False.
    """
    return bool(x >= 1)
# Boolean "bought at least one" matrix for FP-Growth. A vectorized comparison
# replaces the deprecated element-wise DataFrame.applymap call and always
# yields booleans: 0 -> False, >= 1 -> True, and any other value (for example
# a negative net quantity) -> False instead of None.
df_for_fpgrowth = df_preprocessing_fpgrowth >= 1
df_for_fpgrowth.head(10)
pip install mlxtend
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
res=fpgrowth(df_for_fpgrowth,min_support=0.05, use_colnames=True)
res.head(10)
final_association_rules = association_rules(res, metric="confidence", min_threshold=0.5).sort_values("confidence",ascending=False)
final_association_rules.head()