import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules
%matplotlib inline
# Toy example: five small baskets of single-letter items.
items = [
    tuple('MONKEY'),
    tuple('DONKEY'),
    tuple('MAKE'),
    tuple('MUCKY'),
    tuple('COOKIE'),
]

# Adapted from: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
# One-hot encode the baskets: one row per basket, one boolean column per item.
te = TransactionEncoder()
te.fit(items)
te_ary = te.transform(items)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Itemsets present in at least 60% of the baskets, reported by item name.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
frequent_itemsets
# Load the two order samples and the product catalogue, in that order.
# NOTE(review): the second file name uses the stem "orders_product__" while the
# first uses "order_products__" -- confirm it matches the file on disk.
order_small, order_med, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        "order_products__train_small.csv",
        "orders_product__train_med.csv",
        "products.csv",
    )
)
# Adapted from: https://stackoverflow.com/questions/16180946/drawing-average-line-in-histogram-matplotlib
# Number of rows per order_id, i.e. how many products each order contains
products_per_order_small = order_small['order_id'].value_counts()
# Computed once and reused for the marker line and its label
mean_small = products_per_order_small.mean()

# Creating histogram
fig = plt.figure(figsize=(12, 10))
plt.hist(products_per_order_small, bins=71)

# Plotting the mean as a dashed vertical line
plt.axvline(mean_small, color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = plt.ylim()
# The label sits slightly to the right of the line, near the top of the plot
plt.text(mean_small * 1.1, max_ylim * 0.9, 'Mean: {:.2f}'.format(mean_small))

# Setting axis and titles.
# The histogram bins the per-order product counts, so the x-axis is the
# number of products and the y-axis is how many orders fall in each bin.
plt.title("Number of products per Order for Small")
plt.xlabel("Number of Products")
plt.ylabel("Orders");
# Number of rows per order_id, i.e. how many products each order contains
products_per_order_med = order_med['order_id'].value_counts()
# Computed once and reused for the marker line and its label
mean_med = products_per_order_med.mean()

# Creating histogram
fig = plt.figure(figsize=(12, 10))
plt.hist(products_per_order_med, bins=71)

# Plotting the mean as a dashed vertical line
plt.axvline(mean_med, color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = plt.ylim()
# The label sits slightly to the right of the line, near the top of the plot
plt.text(mean_med * 1.1, max_ylim * 0.9, 'Mean: {:.2f}'.format(mean_med))

# Setting axis and titles.
# The histogram bins the per-order product counts, so the x-axis is the
# number of products and the y-axis is how many orders fall in each bin.
plt.title("Number of products per Order for Med")
plt.xlabel("Number of Products")
plt.ylabel("Orders");
# Gets the top 15 items for small: count rows per product_id and keep the
# first 15 entries of the resulting counts.
freq_purchase_small = order_small['product_id'].value_counts()
top15_small = freq_purchase_small.iloc[:15]

# Getting all product names, in the row order of the products table
product_name = list(products["product_name"])

# Assign product name to product id.
# NOTE(review): this is a positional lookup -- it assumes product_id k is
# stored at row k-1 of `products`; confirm the ids are contiguous from 1.
pid_pname = [product_name[ide - 1] for ide in top15_small.index]

# Creating bar plot
fig, ax = plt.subplots()
ax.bar(pid_pname, top15_small)
# Slant the tick labels so the product names do not overlap
fig.autofmt_xdate()
ax.set_title("Frequency of Top 15 Purchases")
ax.set_xlabel("Product Name")
ax.set_ylabel("Purchase Frequency");
# Input: order_id
# Output: all product names on order
def total_order(oid):
    """Return the product names of order ``oid`` from ``order_small``.

    The name of the product in the r-th row of the order (1-based, in the
    row order of ``order_small``) appears r times in the result. Names are
    looked up positionally in the module-level ``product_name`` list as
    ``product_name[product_id - 1]``.

    NOTE(review): the repetition count is the row position within the
    order, not a value read from any column -- confirm this is the
    intended notion of "amount" per product.
    """
    # Rows of the small sample that belong to this order
    order_rows = order_small.loc[order_small["order_id"] == oid]

    names = []
    for position, pid in enumerate(order_rows["product_id"], start=1):
        # Repeat the r-th product's name r times
        names.extend([product_name[int(pid) - 1]] * position)
    return names
# EXAMPLE USE:
# total_order(order_id[1])
# Distinct order ids, in order of first appearance in the small sample
order_id = order_small["order_id"].unique()

# One list of product names per order (the transactions fed to the encoder)
ords = [total_order(single_order) for single_order in order_id]
# Apriori with min_support = 0.003
# One-hot encode the per-order product lists before mining itemsets.
te = TransactionEncoder()
te.fit(ords)
te_ary = te.transform(ords)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.003, use_colnames=True)

# Rules with confidence of at least 0.5
association = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5,
)

# Keep the first 10 rules, restricted to the columns of interest.
# NOTE(review): these are the first 10 rows in the order returned by
# association_rules, not the 10 highest by any metric.
rule_columns = ("antecedents", "consequents", "support", "confidence", "lift")
d = {column: association[column].head(10) for column in rule_columns}
dat = pd.DataFrame(data=d)

# Shown as the cell output; the sorted copy is not stored, so `dat` keeps
# its original row order.
dat.sort_values(by='lift')
# Apriori with min_support = 0.0025
# One-hot encode the per-order product lists before mining itemsets.
te = TransactionEncoder()
te.fit(ords)
te_ary = te.transform(ords)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.0025, use_colnames=True)
# First 10 frequent itemsets, kept as their own frame
fi = pd.DataFrame(data=frequent_itemsets.head(10))

# Rules with confidence of at least 0.5
association = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5,
)

# Keep the first 10 rules, restricted to the columns of interest.
# NOTE(review): these are the first 10 rows in the order returned by
# association_rules, not the 10 highest by any metric.
rule_columns = ("antecedents", "consequents", "support", "confidence", "lift")
d = {column: association[column].head(10) for column in rule_columns}
dat = pd.DataFrame(data=d)

# Shown as the cell output; the sorted copy is not stored, so `dat` keeps
# its original row order.
dat.sort_values(by='lift')
# https://www.statology.org/matplotlib-scatterplot-color-by-value/
# Scatterplot of rules: support on x, confidence on y, colour taken from lift
x = dat["support"]
y = dat["confidence"]
z = dat["lift"]
n = dat["antecedents"]

fig, ax = plt.subplots(figsize=(9, 9))
ax.scatter(x, y, s=150, c=z)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Rules for Support vs. Confidence ')

# Not used by the plotting code below; kept because later cells may read it.
labs = str(dat["antecedents"])

# Label each point with its antecedent items. The three columns are walked
# in lockstep instead of looking values up with the loop counter, so the
# labels stay attached to the right points whatever index `dat` carries
# (e.g. after sorting or filtering).
for txt, support, confidence in zip(n, x, y):
    ax.annotate(str(list(txt)),
                # Small offset so the text does not sit on top of the marker
                (support + 0.00005, confidence + 0.0001),
                fontsize=12,
                rotation=0)