# Week 11
# Violy Lislianty Puisetya
# 00000036447
import numpy as np
import matplotlib.pyplot as plt
import cv2
from sklearn.cluster import DBSCAN
def cluster_gen(n_clusters, pts_minmax=(10, 100), x_mult=(1, 4), y_mult=(1, 3),
                x_off=(0, 50), y_off=(0, 50)):
    """Generate random 2-D clusters of normally distributed points.

    Every ``(low, high)`` range below is handed to ``np.random.randint``,
    so ``low`` is inclusive and ``high`` is exclusive.

    Args:
        n_clusters: Number of clusters to generate.
        pts_minmax: Range of the number of points per cluster.
        x_mult: Range of the integer multiplier that scales a cluster in
            the x-direction (it multiplies standard-normal samples).
        y_mult: Range of the integer multiplier that scales a cluster in
            the y-direction.
        x_off: Range of the cluster position offset in the x-direction.
        y_off: Range of the cluster position offset in the y-direction.

    Returns:
        A tuple ``(clusters_x, clusters_y)`` of two lists, each of length
        ``n_clusters``. Element ``i`` of each list is a 1-D array holding
        the x (respectively y) coordinates of the points of cluster ``i``.
    """
    # Initialize some empty lists to receive cluster member positions
    clusters_x = []
    clusters_y = []
    # Generate random values given parameter ranges.
    # The order of these draws is kept fixed so that seeding the global
    # NumPy generator reproduces the same clusters.
    n_points = np.random.randint(pts_minmax[0], pts_minmax[1], n_clusters)
    x_multipliers = np.random.randint(x_mult[0], x_mult[1], n_clusters)
    y_multipliers = np.random.randint(y_mult[0], y_mult[1], n_clusters)
    x_offsets = np.random.randint(x_off[0], x_off[1], n_clusters)
    y_offsets = np.random.randint(y_off[0], y_off[1], n_clusters)
    # Generate random clusters given parameter values
    for idx, npts in enumerate(n_points):
        xpts = np.random.randn(npts) * x_multipliers[idx] + x_offsets[idx]
        ypts = np.random.randn(npts) * y_multipliers[idx] + y_offsets[idx]
        clusters_x.append(xpts)
        clusters_y.append(ypts)
    # Return cluster positions
    return clusters_x, clusters_y
# Build the synthetic data set: request 50 random clusters and stack their
# coordinates into a single (n_points, 2) float32 array.
n_clusters = 50
clusters_x, clusters_y = cluster_gen(n_clusters)
data = np.vstack((np.concatenate(clusters_x),
                  np.concatenate(clusters_y))).astype(np.float32).T

# Run DBSCAN; `max_distance` is the neighbourhood radius (eps).
max_distance = 1
db = DBSCAN(min_samples=10, eps=max_distance).fit(data)
labels = db.labels_

# Boolean mask that is True exactly at the indices DBSCAN reports as core
# samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# From here on `n_clusters` holds the number of clusters DBSCAN found;
# the label -1 (noise) is not counted as a cluster.
unique_labels = set(labels)
n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)

# Bounding box of the data, reused as axis limits by the plots.
min_x, min_y = data.min(axis=0)
max_x, max_y = data.max(axis=0)
# Two-panel figure: raw points on the left, DBSCAN result on the right.
fig = plt.figure(figsize=(12, 6))

# Left panel: every point in black.
plt.subplot(121)
plt.plot(data[:, 0], data[:, 1], 'ko')
plt.xlim(min_x, max_x)
plt.ylim(min_y, max_y)
plt.title('Original Data', fontsize=20)

# Right panel: one colour per label.
plt.subplot(122)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core samples of this label: large markers.
    xy = data[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=7)
    # Remaining (non-core) members of this label: small markers.
    # The mask must be negated here; otherwise the core samples are drawn a
    # second time and the non-core points never appear.
    xy = data[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=3)
plt.xlim(min_x, max_x)
plt.ylim(min_y, max_y)
plt.title('DBSCAN: %d clusters found' % n_clusters, fontsize=20)
fig.tight_layout()
plt.subplots_adjust(left=0.03, right=0.98, top=0.9, bottom=0.05)
import numpy as np
import pandas as pd
import os
#print(os.listdir("../input"))
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings from here on.
warnings.filterwarnings('ignore')
# `%matplotlib inline` is IPython-only syntax and a SyntaxError in plain
# Python. Issue the same magic through the IPython API when a shell is
# available, and skip it otherwise.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython/Jupyter
# Load the credit data set (path is relative to the working directory).
df = pd.read_csv('german_credit_data.csv')
# The bare expressions below only display something when evaluated as the
# last statement of a notebook cell; in a script they have no visible effect.
df.head()
df.isnull().sum()
df.dtypes
# Side-by-side box plots of Duration and Credit amount, split by Sex.
fig, axes = plt.subplots(1, 2, figsize=(12, 8))
for ax, column in zip(axes, ("Duration", "Credit amount")):
    sns.boxplot(x="Sex", y=column, data=df, orient='v', ax=ax)

# Mean Credit amount and Duration for each Sex.
df_group_one = df[['Sex', 'Credit amount', 'Duration']]
df_group_one.groupby(['Sex'], as_index=False).mean()

# Keep only the rows whose Sex is "female" and report their share.
is_female = df['Sex'] == "female"
df_female = df[is_female]
print(df_female.shape)
df_female.head()
Percentage = (len(df_female) / len(df)) * 100
print('Female Percentage: ', round(Percentage), '%')
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
# Feature matrix for clustering: three numeric columns of the female subset,
# with NaNs replaced by np.nan_to_num, cast to float64 and then passed
# through StandardScaler.
Clus_dataSet = df_female[['Age', 'Credit amount', 'Duration']]
Clus_dataSet = np.nan_to_num(Clus_dataSet)
Clus_dataSet = np.array(Clus_dataSet, dtype=np.float64)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)
# Compute DBSCAN
db = DBSCAN(eps=0.5, min_samples=4).fit(Clus_dataSet)
# Boolean mask that is True at the indices DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# `df_female` was obtained by filtering `df`; assigning a column to it in
# place is chained assignment on a slice. Build a new frame with the label
# column instead.
df_female = df_female.assign(Clus_Db=labels)
# Number of clusters excluding the noise label (-1) ...
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
# ... and the number of distinct labels, noise included.
clusterNum = len(set(labels))
# A sample of clusters
print(df_female[['Age', 'Credit amount', 'Duration', 'Clus_Db']].head())
# Set of distinct labels (the printed caption says "number of labels")
print('number of labels: ', set(labels))
# Plot the clustering result. One colour per label; black is reserved for
# noise (label -1).
# Only columns 0 and 1 of the standardized matrix are drawn, i.e. the first
# two of the three clustering features.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core samples of this label: large markers.
    xy = Clus_dataSet[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
    # Non-core members of this label: small markers.
    xy = Clus_dataSet[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % realClusterNum)
plt.show()
# Report how many samples DBSCAN labelled as noise (-1).
n_noise_ = list(labels).count(-1)
print('number of noise(s): ', n_noise_)
# Per-cluster summary: rounded means of the three features plus the size.
for clust_number in set(labels):
    if clust_number == -1:
        continue  # noise is not a cluster
    clust_set = df_female[df_female.Clus_Db == clust_number]
    # The size is the number of rows. Counting non-zero index labels would
    # drop a row whenever index label 0 belongs to the cluster.
    print('Cluster ' + str(clust_number)
          + ', Avg Age: ' + str(round(np.mean(clust_set.Age)))
          + ', Avg Credit amount: ' + str(round(np.mean(clust_set['Credit amount'])))
          + ', Avg Duration: ' + str(round(np.mean(clust_set['Duration'])))
          + ', Count: ' + str(len(clust_set)))
# The code below needs mlxtend. A bare `pip install mlxtend` line is only
# understood by IPython (automagic) and is a SyntaxError in plain Python.
# Run the same magic through the IPython API when available; outside
# IPython, install mlxtend before running this file.
try:
    get_ipython().run_line_magic('pip', 'install mlxtend')
except NameError:
    pass  # not running under IPython/Jupyter
# Toy market-basket data: one list of items per transaction.
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice Cream', 'Eggs']]
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
# Encode the transactions as an array with one column per distinct item.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
# NOTE: this rebinds `df`, which previously held the credit data.
df = pd.DataFrame(te_ary, columns=te.columns_)
df
from mlxtend.frequent_patterns import apriori
# Frequent itemsets with support >= 0.6: first with the default item
# representation, then with item names.
apriori(df, min_support=0.6)
apriori(df, min_support=0.6, use_colnames=True)
# Keep the named result and record the size of every itemset.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets
# Itemsets of exactly two items whose support is at least 0.8.
is_pair = frequent_itemsets['length'] == 2
is_well_supported = frequent_itemsets['support'] >= 0.8
frequent_itemsets[is_pair & is_well_supported]
# Look up one specific itemset.
frequent_itemsets[frequent_itemsets['itemsets'] == {'Onion', 'Eggs'}]
from mlxtend.frequent_patterns import fpgrowth
# Same mining task with FP-Growth.
fpgrowth(df, min_support=0.6)
fpgrowth(df, min_support=0.6, use_colnames=True)
import timeit

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth

# Re-encode the transactions so the benchmark starts from a fresh frame.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# `%timeit -n 100 -r 10 ...` is IPython-only syntax. Use the standard
# library timer with the same settings: 100 calls per run, 10 runs.
n_loops = 100
n_repeats = 10
for miner in (apriori, fpgrowth):
    run_totals = timeit.repeat(lambda: miner(df, min_support=0.6),
                               number=n_loops, repeat=n_repeats)
    # Each entry of run_totals is the time of n_loops calls, in seconds.
    per_loop_ms = [1e3 * total / n_loops for total in run_totals]
    print('{}: best {:.3f} ms, mean {:.3f} ms per loop '
          '({} runs, {} loops each)'.format(
              miner.__name__, min(per_loop_ms),
              sum(per_loop_ms) / len(per_loop_ms), n_repeats, n_loops))