import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime, nltk, warnings
import matplotlib.cm as cm
import itertools
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from wordcloud import WordCloud, STOPWORDS
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from IPython.display import display, HTML
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
%matplotlib inline
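#____________________________________________________________________________
# The keyword extraction further below relies on NLTK's tokenizer and POS
# tagger; a minimal setup sketch, assuming these corpora may be missing
# locally (the resource names are standard NLTK identifiers):
for path, resource in [('tokenizers/punkt', 'punkt'),
                       ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)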
#__________________
# read the datafile
df_initial = pd.read_csv('../input/data.csv', encoding="ISO-8859-1",
                         dtype={'CustomerID': str, 'InvoiceNo': str})
print('Dataframe dimensions:', df_initial.shape)
#______
df_initial['InvoiceDate'] = pd.to_datetime(df_initial['InvoiceDate'])
#____________________________________________________________
# give some information on column types and number of null values
tab_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'})
tab_info = pd.concat([tab_info,
                      pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0: 'null values (nb)'}),
                      pd.DataFrame(df_initial.isnull().sum() / df_initial.shape[0] * 100).T
                        .rename(index={0: 'null values (%)'})])
display(tab_info)
#__________________
# show first lines
display(df_initial[:5])
df_initial.dropna(axis = 0, subset = ['CustomerID'], inplace = True)
print('Dataframe dimensions:', df_initial.shape)
#____________________________________________________________
# give some information on column types and number of null values
tab_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'})
tab_info = pd.concat([tab_info,
                      pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0: 'null values (nb)'}),
                      pd.DataFrame(df_initial.isnull().sum() / df_initial.shape[0] * 100).T
                        .rename(index={0: 'null values (%)'})])
display(tab_info)
print('Duplicate entries: {}'.format(df_initial.duplicated().sum()))
df_initial.drop_duplicates(inplace = True)
temp = df_initial[['CustomerID', 'InvoiceNo', 'Country']].groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
temp = temp.reset_index(drop = False)
countries = temp['Country'].value_counts()
print('Nb. of countries in the dataframe: {}'.format(len(countries)))
data = dict(type='choropleth',
locations = countries.index,
locationmode = 'country names', z = countries,
text = countries.index, colorbar = {'title':'Order nb.'},
colorscale=[[0, 'rgb(224,255,255)'],
[0.01, 'rgb(166,206,227)'], [0.02, 'rgb(31,120,180)'],
[0.03, 'rgb(178,223,138)'], [0.05, 'rgb(51,160,44)'],
[0.10, 'rgb(251,154,153)'], [0.20, 'rgb(255,255,0)'],
[1, 'rgb(227,26,28)']],
reversescale = False)
#_______________________
layout = dict(title='Number of orders per country',
geo = dict(showframe = True, projection={'type':'mercator'}))
#______________
choromap = go.Figure(data = [data], layout = layout)
iplot(choromap, validate=False)
pd.DataFrame([{'products': len(df_initial['StockCode'].value_counts()),
'transactions': len(df_initial['InvoiceNo'].value_counts()),
'customers': len(df_initial['CustomerID'].value_counts()),
}], columns = ['products', 'transactions', 'customers'], index = ['quantity'])
temp = df_initial.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()
nb_products_per_basket = temp.rename(columns = {'InvoiceDate':'Number of products'})
nb_products_per_basket[:10].sort_values('CustomerID')
nb_products_per_basket['order_canceled'] = nb_products_per_basket['InvoiceNo'].apply(lambda x:int('C' in x))
display(nb_products_per_basket[:5])
#______________________________________________________________________________________________
n1 = nb_products_per_basket['order_canceled'].sum()
n2 = nb_products_per_basket.shape[0]
print('Number of orders canceled: {}/{} ({:.2f}%) '.format(n1, n2, n1/n2*100))
display(df_initial.sort_values('CustomerID')[:5])
df_check = df_initial[df_initial['Quantity'] < 0][['CustomerID','Quantity',
'StockCode','Description','UnitPrice']]
for index, col in df_check.iterrows():
    if df_initial[(df_initial['CustomerID'] == col['CustomerID']) & (df_initial['Quantity'] == -col['Quantity'])
                & (df_initial['Description'] == col['Description'])].shape[0] == 0:
        print(df_check.loc[index])
        print(15*'-'+'>'+' HYPOTHESIS NOT FULFILLED')
        break
df_check = df_initial[(df_initial['Quantity'] < 0) & (df_initial['Description'] != 'Discount')][
['CustomerID','Quantity','StockCode',
'Description','UnitPrice']]
for index, col in df_check.iterrows():
    if df_initial[(df_initial['CustomerID'] == col['CustomerID']) & (df_initial['Quantity'] == -col['Quantity'])
                & (df_initial['Description'] == col['Description'])].shape[0] == 0:
        print(index, df_check.loc[index])
        print(15*'-'+'>'+' HYPOTHESIS NOT FULFILLED')
        break
df_cleaned = df_initial.copy(deep = True)
df_cleaned['QuantityCanceled'] = 0
entry_to_remove = [] ; doubtfull_entry = []
for index, col in df_initial.iterrows():
if (col['Quantity'] > 0) or col['Description'] == 'Discount': continue
df_test = df_initial[(df_initial['CustomerID'] == col['CustomerID']) &
(df_initial['StockCode'] == col['StockCode']) &
(df_initial['InvoiceDate'] < col['InvoiceDate']) &
(df_initial['Quantity'] > 0)].copy()
#_________________________________
# Cancelation WITHOUT counterpart
if (df_test.shape[0] == 0):
doubtfull_entry.append(index)
#________________________________
# Cancelation WITH a counterpart
elif (df_test.shape[0] == 1):
index_order = df_test.index[0]
df_cleaned.loc[index_order, 'QuantityCanceled'] = -col['Quantity']
entry_to_remove.append(index)
    #________________________________________________________________________
    # Several counterparts exist: credit the cancellation to the most recent
    # prior order that can absorb it
elif (df_test.shape[0] > 1):
        df_test.sort_index(axis=0, ascending=False, inplace=True)
for ind, val in df_test.iterrows():
if val['Quantity'] < -col['Quantity']: continue
df_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
entry_to_remove.append(index)
break
print("entry_to_remove: {}".format(len(entry_to_remove)))
print("doubtfull_entry: {}".format(len(doubtfull_entry)))
df_cleaned.drop(entry_to_remove, axis = 0, inplace = True)
df_cleaned.drop(doubtfull_entry, axis = 0, inplace = True)
remaining_entries = df_cleaned[(df_cleaned['Quantity'] < 0) & (df_cleaned['StockCode'] != 'D')]
print("nb of entries to delete: {}".format(remaining_entries.shape[0]))
remaining_entries[:5]
df_cleaned[(df_cleaned['CustomerID'] == '14048') & (df_cleaned['StockCode'] == '22464')]
list_special_codes = df_cleaned[df_cleaned['StockCode'].str.contains('^[a-zA-Z]+', regex=True)]['StockCode'].unique()
list_special_codes
for code in list_special_codes:
print("{:<15} -> {:<30}".format(code, df_cleaned[df_cleaned['StockCode'] == code]['Description'].unique()[0]))
df_cleaned['TotalPrice'] = df_cleaned['UnitPrice'] * (df_cleaned['Quantity'] - df_cleaned['QuantityCanceled'])
df_cleaned.sort_values('CustomerID')[:5]
#___________________________________________
# sum of purchases / user & order
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
basket_price = temp.rename(columns = {'TotalPrice':'Basket Price'})
#_____________________
# date of the order
df_cleaned['InvoiceDate_int'] = df_cleaned['InvoiceDate'].astype('int64')
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_cleaned.drop('InvoiceDate_int', axis = 1, inplace = True)
basket_price.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
#______________________________________
# selection of significant entries:
basket_price = basket_price[basket_price['Basket Price'] > 0]
basket_price.sort_values('CustomerID')[:6]
#____________________
# count of purchases by price range
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = []
for i, price in enumerate(price_range):
if i == 0: continue
val = basket_price[(basket_price['Basket Price'] < price) &
(basket_price['Basket Price'] > price_range[i-1])]['Basket Price'].count()
count_price.append(val)
#____________________________________________
# breakdown of the number of purchases by amount
plt.rc('font', weight='bold')
f, ax = plt.subplots(figsize=(11, 6))
colors = ['yellowgreen', 'gold', 'wheat', 'c', 'violet', 'royalblue','firebrick']
labels = [ '{}<.<{}'.format(price_range[i-1], s) for i,s in enumerate(price_range) if i != 0]
sizes = count_price
explode = [0.0] * len(sizes)
ax.pie(sizes, explode = explode, labels=labels, colors = colors,
autopct = lambda x:'{:1.0f}%'.format(x) if x > 1 else '',
shadow = False, startangle=0)
ax.axis('equal')
f.text(0.5, 1.01, "Breakdown of order amounts", ha='center', fontsize = 18);
is_noun = lambda pos: pos[:2] == 'NN'
def keywords_inventory(dataframe, colonne = 'Description'):
stemmer = nltk.stem.SnowballStemmer("english")
keywords_roots = dict() # collect the words / root
keywords_select = dict() # association: root <-> keyword
category_keys = []
count_keywords = dict()
icount = 0
for s in dataframe[colonne]:
if pd.isnull(s): continue
lines = s.lower()
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
for t in nouns:
t = t.lower() ; racine = stemmer.stem(t)
if racine in keywords_roots:
keywords_roots[racine].add(t)
count_keywords[racine] += 1
else:
keywords_roots[racine] = {t}
count_keywords[racine] = 1
for s in keywords_roots.keys():
if len(keywords_roots[s]) > 1:
min_length = 1000
for k in keywords_roots[s]:
if len(k) < min_length:
clef = k ; min_length = len(k)
category_keys.append(clef)
keywords_select[s] = clef
else:
category_keys.append(list(keywords_roots[s])[0])
keywords_select[s] = list(keywords_roots[s])[0]
print("Nb of keywords in variable '{}': {}".format(colonne,len(category_keys)))
return category_keys, keywords_roots, keywords_select, count_keywords
df_produits = pd.DataFrame(df_initial['Description'].unique()).rename(columns = {0:'Description'})
keywords, keywords_roots, keywords_select, count_keywords = keywords_inventory(df_produits)
list_products = []
for k,v in count_keywords.items():
list_products.append([keywords_select[k],v])
list_products.sort(key = lambda x:x[1], reverse = True)
liste = list_products
#_______________________________
plt.rc('font', weight='normal')
fig, ax = plt.subplots(figsize=(7, 25))
y_axis = [i[1] for i in liste[:125]]
x_axis = [k for k,i in enumerate(liste[:125])]
x_label = [i[0] for i in liste[:125]]
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 13)
plt.yticks(x_axis, x_label)
plt.xlabel("Nb. of occurences", fontsize = 18, labelpad = 10)
ax.barh(x_axis, y_axis, align = 'center')
ax = plt.gca()
ax.invert_yaxis()
#_______________________________________________________________________________________
plt.title("Words occurence",bbox={'facecolor':'k', 'pad':5}, color='w',fontsize = 25)
plt.show()
list_products = []
for k,v in count_keywords.items():
word = keywords_select[k]
if word in ['pink', 'blue', 'tag', 'green', 'orange']: continue
if len(word) < 3 or v < 13: continue
if ('+' in word) or ('/' in word): continue
list_products.append([word, v])
#______________________________________________________
list_products.sort(key = lambda x:x[1], reverse = True)
print('kept words:', len(list_products))
liste_produits = df_cleaned['Description'].unique()
X = pd.DataFrame()
for key, occurence in list_products:
X.loc[:, key] = list(map(lambda x:int(key.upper() in x), liste_produits))
threshold = [0, 1, 2, 3, 5, 10]
label_col = []
for i in range(len(threshold)):
if i == len(threshold)-1:
col = '.>{}'.format(threshold[i])
else:
col = '{}<.<{}'.format(threshold[i],threshold[i+1])
label_col.append(col)
X.loc[:, col] = 0
for i, prod in enumerate(liste_produits):
prix = df_cleaned[ df_cleaned['Description'] == prod]['UnitPrice'].mean()
j = 0
while prix > threshold[j]:
j+=1
if j == len(threshold): break
X.loc[i, label_col[j-1]] = 1
print("{:<8} {:<20} \n".format('gamme', 'nb. produits') + 20*'-')
for i in range(len(threshold)):
if i == len(threshold)-1:
col = '.>{}'.format(threshold[i])
else:
col = '{}<.<{}'.format(threshold[i],threshold[i+1])
print("{:<10} {:<20}".format(col, X.loc[:, col].sum()))
matrix = X.to_numpy()
for n_clusters in range(3,10):
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30)
kmeans.fit(matrix)
clusters = kmeans.predict(matrix)
silhouette_avg = silhouette_score(matrix, clusters)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
n_clusters = 5
silhouette_avg = -1
# re-run the clustering until the average silhouette reaches the target value
while silhouette_avg < 0.145:
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30)
kmeans.fit(matrix)
clusters = kmeans.predict(matrix)
silhouette_avg = silhouette_score(matrix, clusters)
#km = kmodes.KModes(n_clusters = n_clusters, init='Huang', n_init=2, verbose=0)
#clusters = km.fit_predict(matrix)
#silhouette_avg = silhouette_score(matrix, clusters)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
pd.Series(clusters).value_counts()
def graph_component_silhouette(n_clusters, lim_x, mat_size, sample_silhouette_values, clusters):
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
#____________________________
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(8, 8)
ax1.set_xlim([lim_x[0], lim_x[1]])
ax1.set_ylim([0, mat_size + (n_clusters + 1) * 10])
y_lower = 10
for i in range(n_clusters):
#___________________________________________________________________________________
# Aggregate the silhouette scores for samples belonging to cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
        cmap = plt.get_cmap("Spectral")
color = cmap(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.8)
#____________________________________________________________________
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.03, y_lower + 0.5 * size_cluster_i, str(i), color = 'red', fontweight = 'bold',
bbox=dict(facecolor='white', edgecolor='black', boxstyle='round, pad=0.3'))
#______________________________________
# Compute the new y_lower for next plot
y_lower = y_upper + 10
#____________________________________
# define individual silouhette scores
sample_silhouette_values = silhouette_samples(matrix, clusters)
#__________________
# and do the graph
graph_component_silhouette(n_clusters, [-0.07, 0.33], len(X), sample_silhouette_values, clusters)
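#____________________________________________________________________________
# a complementary check (added sketch): average silhouette score and size of
# each product cluster
for i in range(n_clusters):
    mask = clusters == i
    print('cluster {}: mean silhouette = {:.3f} ({} products)'.format(
          i, sample_silhouette_values[mask].mean(), mask.sum()))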
liste = pd.DataFrame(liste_produits)
liste_words = [word for (word, occurence) in list_products]
occurence = [dict() for _ in range(n_clusters)]
for i in range(n_clusters):
liste_cluster = liste.loc[clusters == i]
for word in liste_words:
if word in ['art', 'set', 'heart', 'pink', 'blue', 'tag']: continue
occurence[i][word] = sum(liste_cluster.loc[:, 0].str.contains(word.upper()))
#________________________________________________________________________
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    # relies on the global 'tone' set in the plotting loop below (one hue per cluster)
    h = int(360.0 * tone / 255.0)
    s = 100
    l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)
#________________________________________________________________________
def make_wordcloud(liste, increment):
ax1 = fig.add_subplot(4,2,increment)
words = dict()
trunc_occurences = liste[0:150]
for s in trunc_occurences:
words[s[0]] = s[1]
#________________________________________________________
wordcloud = WordCloud(width=1000,height=400, background_color='lightgrey',
max_words=1628,relative_scaling=1,
color_func = random_color_func,
normalize_plurals=False)
wordcloud.generate_from_frequencies(words)
ax1.imshow(wordcloud, interpolation="bilinear")
ax1.axis('off')
plt.title('cluster nº{}'.format(increment-1))
#________________________________________________________________________
fig = plt.figure(1, figsize=(14,14))
color = [0, 160, 130, 95, 280, 40, 330, 110, 25]
for i in range(n_clusters):
list_cluster_occurences = occurence[i]
tone = color[i] # define the color of the words
liste = []
for key, value in list_cluster_occurences.items():
liste.append([key, value])
liste.sort(key = lambda x:x[1], reverse = True)
make_wordcloud(liste, i+1)
pca = PCA()
pca.fit(matrix)
pca_samples = pca.transform(matrix)
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid',
label='cumulative explained variance')
sns.barplot(x=np.arange(1, matrix.shape[1]+1), y=pca.explained_variance_ratio_, alpha=0.5, color='g',
            label='individual explained variance')
plt.xlim(0, 100)
ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()])
plt.ylabel('Explained variance', fontsize = 14)
plt.xlabel('Principal components', fontsize = 14)
plt.legend(loc='upper left', fontsize = 13);
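#____________________________________________________________________________
# a quick cross-check (added sketch): number of components needed to retain
# ~90% of the variance, to motivate the reduction performed below
n_comp_90 = int(np.argmax(pca.explained_variance_ratio_.cumsum() > 0.90)) + 1
print('components needed for 90% of the explained variance:', n_comp_90)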
pca = PCA(n_components=50)
matrix_50D = pca.fit_transform(matrix)
mat = pd.DataFrame(matrix_50D)
mat['cluster'] = pd.Series(clusters)
import matplotlib.patches as mpatches
sns.set_style("white")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})
LABEL_COLOR_MAP = {0:'r', 1:'gold', 2:'b', 3:'k', 4:'c', 5:'g'}
label_color = [LABEL_COLOR_MAP[l] for l in mat['cluster']]
fig = plt.figure(figsize = (15,8))
increment = 0
for ix in range(4):
for iy in range(ix+1, 4):
increment += 1
ax = fig.add_subplot(2,3,increment)
ax.scatter(mat[ix], mat[iy], c= label_color, alpha=0.4)
plt.ylabel('PCA {}'.format(iy+1), fontsize = 12)
plt.xlabel('PCA {}'.format(ix+1), fontsize = 12)
ax.yaxis.grid(color='lightgray', linestyle=':')
ax.xaxis.grid(color='lightgray', linestyle=':')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
if increment == 9: break
if increment == 9: break
#_______________________________________
# set the legend: cluster index -> color
comp_handler = []
for i in range(5):
comp_handler.append(mpatches.Patch(color = LABEL_COLOR_MAP[i], label = i))
plt.legend(handles=comp_handler, bbox_to_anchor=(1.1, 0.97),
title='Cluster', facecolor = 'lightgrey',
shadow = True, frameon = True, framealpha = 1,
fontsize = 13, bbox_transform = plt.gcf().transFigure)
plt.show()
corresp = dict()
for key, val in zip (liste_produits, clusters):
corresp[key] = val
#__________________________________________________________________________
df_cleaned['categ_product'] = df_cleaned.loc[:, 'Description'].map(corresp)
for i in range(5):
col = 'categ_{}'.format(i)
df_temp = df_cleaned[df_cleaned['categ_product'] == i]
price_temp = df_temp['UnitPrice'] * (df_temp['Quantity'] - df_temp['QuantityCanceled'])
price_temp = price_temp.apply(lambda x:x if x > 0 else 0)
df_cleaned.loc[:, col] = price_temp
    df_cleaned[col] = df_cleaned[col].fillna(0)
#__________________________________________________________________________________________________
df_cleaned[['InvoiceNo', 'Description', 'categ_product', 'categ_0', 'categ_1', 'categ_2', 'categ_3','categ_4']][:5]
#___________________________________________
# sum of purchases / user & order
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['TotalPrice'].sum()
basket_price = temp.rename(columns = {'TotalPrice':'Basket Price'})
#____________________________________________________________
# percentage of the order price / product category
for i in range(5):
col = 'categ_{}'.format(i)
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)[col].sum()
    basket_price.loc[:, col] = temp[col]
#_____________________
# date of the order
df_cleaned['InvoiceDate_int'] = df_cleaned['InvoiceDate'].astype('int64')
temp = df_cleaned.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df_cleaned.drop('InvoiceDate_int', axis = 1, inplace = True)
basket_price.loc[:, 'InvoiceDate'] = pd.to_datetime(temp['InvoiceDate_int'])
#______________________________________
# selection of significant entries:
basket_price = basket_price[basket_price['Basket Price'] > 0]
basket_price.sort_values('CustomerID', ascending = True)[:5]
print(basket_price['InvoiceDate'].min(), '->', basket_price['InvoiceDate'].max())
set_entrainement = basket_price[basket_price['InvoiceDate'] < pd.Timestamp('2011-10-01')]
set_test = basket_price[basket_price['InvoiceDate'] >= pd.Timestamp('2011-10-01')]
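#____________________________________________________________________________
# sanity check of the temporal split (added sketch): sizes and periods of the
# training and test sets
print('train: {} baskets, test: {} baskets'.format(set_entrainement.shape[0], set_test.shape[0]))
print('train period:', set_entrainement['InvoiceDate'].min(), '->', set_entrainement['InvoiceDate'].max())
print('test period: ', set_test['InvoiceDate'].min(), '->', set_test['InvoiceDate'].max())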
basket_price = set_entrainement.copy(deep = True)
#________________________________________________________________
# nb of visits and stats on the basket amount / user
transactions_per_user=basket_price.groupby(by=['CustomerID'])['Basket Price'].agg(['count','min','max','mean','sum'])
for i in range(5):
col = 'categ_{}'.format(i)
transactions_per_user.loc[:,col] = basket_price.groupby(by=['CustomerID'])[col].sum() /\
transactions_per_user['sum']*100
transactions_per_user.reset_index(drop = False, inplace = True)
transactions_per_user.sort_values('CustomerID', ascending = True)[:5]
last_date = basket_price['InvoiceDate'].max().date()
first_registration = pd.DataFrame(basket_price.groupby(by=['CustomerID'])['InvoiceDate'].min())
last_purchase = pd.DataFrame(basket_price.groupby(by=['CustomerID'])['InvoiceDate'].max())
test = first_registration.applymap(lambda x:(last_date - x.date()).days)
test2 = last_purchase.applymap(lambda x:(last_date - x.date()).days)
transactions_per_user.loc[:, 'LastPurchase'] = test2.reset_index(drop = False)['InvoiceDate']
transactions_per_user.loc[:, 'FirstPurchase'] = test.reset_index(drop = False)['InvoiceDate']
transactions_per_user[:5]
n1 = transactions_per_user[transactions_per_user['count'] == 1].shape[0]
n2 = transactions_per_user.shape[0]
print("nb. de clients avec achat unique: {:<2}/{:<5} ({:<2.2f}%)".format(n1,n2,n1/n2*100))
list_cols = ['count','min','max','mean','categ_0','categ_1','categ_2','categ_3','categ_4']
#_____________________________________________________________
selected_customers = transactions_per_user.copy(deep = True)
matrix = selected_customers[list_cols].to_numpy()
scaler = StandardScaler()
scaler.fit(matrix)
print('variables mean values: \n' + 90*'-' + '\n' , scaler.mean_)
scaled_matrix = scaler.transform(matrix)
pca = PCA()
pca.fit(scaled_matrix)
pca_samples = pca.transform(scaled_matrix)
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid',
label='cumulative explained variance')
sns.barplot(x=np.arange(1, matrix.shape[1]+1), y=pca.explained_variance_ratio_, alpha=0.5, color='g',
            label='individual explained variance')
plt.xlim(0, 10)
ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()])
plt.ylabel('Explained variance', fontsize = 14)
plt.xlabel('Principal components', fontsize = 14)
plt.legend(loc='best', fontsize = 13);
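#____________________________________________________________________________
# before fixing the number of client clusters, a hedged sweep (added sketch)
# over a few values of k, scored with the average silhouette:
for k in range(8, 14):
    km = KMeans(init='k-means++', n_clusters=k, n_init=10)
    labels_k = km.fit_predict(scaled_matrix)
    print('k = {:<3} silhouette = {:.3f}'.format(k, silhouette_score(scaled_matrix, labels_k)))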
n_clusters = 11
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=100)
kmeans.fit(scaled_matrix)
clusters_clients = kmeans.predict(scaled_matrix)
silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
print('silhouette score: {:<.3f}'.format(silhouette_avg))
pd.DataFrame(pd.Series(clusters_clients).value_counts(), columns = ['nb. of customers']).T
pca = PCA(n_components=6)
matrix_6D = pca.fit_transform(scaled_matrix)
mat = pd.DataFrame(matrix_6D)
mat['cluster'] = pd.Series(clusters_clients)
import matplotlib.patches as mpatches
sns.set_style("white")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})
LABEL_COLOR_MAP = {0:'r', 1:'tan', 2:'b', 3:'k', 4:'c', 5:'g', 6:'deeppink', 7:'skyblue', 8:'darkcyan', 9:'orange',
10:'yellow', 11:'tomato', 12:'seagreen'}
label_color = [LABEL_COLOR_MAP[l] for l in mat['cluster']]
fig = plt.figure(figsize = (12,10))
increment = 0
for ix in range(6):
for iy in range(ix+1, 6):
increment += 1
ax = fig.add_subplot(4,3,increment)
ax.scatter(mat[ix], mat[iy], c= label_color, alpha=0.5)
plt.ylabel('PCA {}'.format(iy+1), fontsize = 12)
plt.xlabel('PCA {}'.format(ix+1), fontsize = 12)
ax.yaxis.grid(color='lightgray', linestyle=':')
ax.xaxis.grid(color='lightgray', linestyle=':')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
if increment == 12: break
if increment == 12: break
#_______________________________________
# set the legend: cluster index -> color
comp_handler = []
for i in range(n_clusters):
comp_handler.append(mpatches.Patch(color = LABEL_COLOR_MAP[i], label = i))
plt.legend(handles=comp_handler, bbox_to_anchor=(1.1, 0.9),
title='Cluster', facecolor = 'lightgrey',
shadow = True, frameon = True, framealpha = 1,
fontsize = 13, bbox_transform = plt.gcf().transFigure)
plt.tight_layout()
#____________________________________
# define individual silouhette scores
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)
#__________________
# and do the graph
graph_component_silhouette(n_clusters, [-0.15, 0.55], len(scaled_matrix), sample_silhouette_values, clusters_clients)
selected_customers.loc[:, 'cluster'] = clusters_clients
merged_df = pd.DataFrame()
for i in range(n_clusters):
    test = pd.DataFrame(selected_customers[selected_customers['cluster'] == i].mean(numeric_only=True))
    test = test.T.set_index('cluster', drop = True)
    test['size'] = selected_customers[selected_customers['cluster'] == i].shape[0]
    merged_df = pd.concat([merged_df, test])
#_____________________________________________________
merged_df.drop('CustomerID', axis = 1, inplace = True, errors = 'ignore')
print('number of customers:', merged_df['size'].sum())
merged_df = merged_df.sort_values('sum')
liste_index = []
for i in range(5):
column = 'categ_{}'.format(i)
liste_index.append(merged_df[merged_df[column] > 45].index.values[0])
#___________________________________
liste_index_reordered = liste_index + [s for s in merged_df.index if s not in liste_index]
#___________________________________________________________
merged_df = merged_df.reindex(index = liste_index_reordered)
merged_df = merged_df.reset_index(drop = False)
display(merged_df[['cluster', 'count', 'min', 'max', 'mean', 'sum', 'categ_0',
'categ_1', 'categ_2', 'categ_3', 'categ_4', 'size']])
def _scale_data(data, ranges):
(x1, x2) = ranges[0]
d = data[0]
return [(d - y1) / (y2 - y1) * (x2 - x1) + x1 for d, (y1, y2) in zip(data, ranges)]
class RadarChart():
def __init__(self, fig, location, sizes, variables, ranges, n_ordinate_levels = 6):
angles = np.arange(0, 360, 360./len(variables))
ix, iy = location[:] ; size_x, size_y = sizes[:]
axes = [fig.add_axes([ix, iy, size_x, size_y], polar = True,
label = "axes{}".format(i)) for i in range(len(variables))]
_, text = axes[0].set_thetagrids(angles, labels = variables)
for txt, angle in zip(text, angles):
if angle > -1 and angle < 181:
txt.set_rotation(angle - 90)
else:
txt.set_rotation(angle - 270)
for ax in axes[1:]:
ax.patch.set_visible(False)
ax.xaxis.set_visible(False)
ax.grid("off")
for i, ax in enumerate(axes):
grid = np.linspace(*ranges[i],num = n_ordinate_levels)
grid_label = [""]+["{:.0f}".format(x) for x in grid[1:-1]]
ax.set_rgrids(grid, labels = grid_label, angle = angles[i])
ax.set_ylim(*ranges[i])
self.angle = np.deg2rad(np.r_[angles, angles[0]])
self.ranges = ranges
self.ax = axes[0]
def plot(self, data, *args, **kw):
sdata = _scale_data(data, self.ranges)
self.ax.plot(self.angle, np.r_[sdata, sdata[0]], *args, **kw)
def fill(self, data, *args, **kw):
sdata = _scale_data(data, self.ranges)
self.ax.fill(self.angle, np.r_[sdata, sdata[0]], *args, **kw)
def legend(self, *args, **kw):
self.ax.legend(*args, **kw)
def title(self, title, *args, **kw):
self.ax.text(0.9, 1, title, transform = self.ax.transAxes, *args, **kw)
fig = plt.figure(figsize=(10,12))
attributes = ['count', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']
ranges = [[0.01, 10], [0.01, 1500], [0.01, 10000], [0.01, 75], [0.01, 75], [0.01, 75], [0.01, 75], [0.01, 75]]
index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
n_groups = n_clusters ; i_cols = 3
i_rows = n_groups//i_cols
size_x, size_y = (1/i_cols), (1/i_rows)
for ind in range(n_clusters):
ix = ind%3 ; iy = i_rows - ind//3
pos_x = ix*(size_x + 0.05) ; pos_y = iy*(size_y + 0.05)
location = [pos_x, pos_y] ; sizes = [size_x, size_y]
#______________________________________________________
data = np.array(merged_df.loc[index[ind], attributes])
radar = RadarChart(fig, location, sizes, attributes, ranges)
radar.plot(data, color = 'b', linewidth=2.0)
radar.fill(data, alpha = 0.2, color = 'b')
radar.title(title = 'cluster nº{}'.format(index[ind]), color = 'r')
class Class_Fit(object):
def __init__(self, clf, params=None):
if params:
self.clf = clf(**params)
else:
self.clf = clf()
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
def grid_search(self, parameters, Kfold):
self.grid = GridSearchCV(estimator = self.clf, param_grid = parameters, cv = Kfold)
def grid_fit(self, X, Y):
self.grid.fit(X, Y)
def grid_predict(self, X, Y):
self.predictions = self.grid.predict(X)
print("Precision: {:.2f} % ".format(100*metrics.accuracy_score(Y, self.predictions)))
columns = ['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4' ]
X = selected_customers[columns]
Y = selected_customers['cluster']
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, train_size = 0.8)
svc = Class_Fit(clf = svm.LinearSVC)
svc.grid_search(parameters = [{'C':np.logspace(-2,2,10)}], Kfold = 5)
svc.grid_fit(X = X_train, Y = Y_train)
svc.grid_predict(X_test, Y_test)
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
#_________________________________________________
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=0)
plt.yticks(tick_marks, classes)
#_________________________________________________
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
#_________________________________________________
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
class_names = [i for i in range(11)]
cnf_matrix = confusion_matrix(Y_test, svc.predictions)
np.set_printoptions(precision=2)
plt.figure(figsize = (8,8))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize = False, title='Confusion matrix')
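#____________________________________________________________________________
# the same helper can display class-wise rates; a short usage example reusing
# the matrix computed above:
plt.figure(figsize = (8,8))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')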
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10)):
"""Generate a simple plot of the test and training learning curve"""
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.legend(loc="best")
return plt
g = plot_learning_curve(svc.grid.best_estimator_,
"SVC learning curves", X_train, Y_train, ylim = [1.01, 0.6],
cv = 5, train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1])
lr = Class_Fit(clf = linear_model.LogisticRegression)
lr.grid_search(parameters = [{'C':np.logspace(-2,2,20)}], Kfold = 5)
lr.grid_fit(X = X_train, Y = Y_train)
lr.grid_predict(X_test, Y_test)
g = plot_learning_curve(lr.grid.best_estimator_, "Logistic Regression learning curves", X_train, Y_train,
                        ylim = [0.7, 1.01], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
knn = Class_Fit(clf = neighbors.KNeighborsClassifier)
knn.grid_search(parameters = [{'n_neighbors': np.arange(1,50,1)}], Kfold = 5)
knn.grid_fit(X = X_train, Y = Y_train)
knn.grid_predict(X_test, Y_test)
g = plot_learning_curve(knn.grid.best_estimator_, "Nearest Neighbors learning curves", X_train, Y_train,
                        ylim = [0.7, 1.01], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
tr = Class_Fit(clf = tree.DecisionTreeClassifier)
tr.grid_search(parameters = [{'criterion' : ['entropy', 'gini'], 'max_features' :['sqrt', 'log2']}], Kfold = 5)
tr.grid_fit(X = X_train, Y = Y_train)
tr.grid_predict(X_test, Y_test)
g = plot_learning_curve(tr.grid.best_estimator_, "Decision tree learning curves", X_train, Y_train,
                        ylim = [0.7, 1.01], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
rf = Class_Fit(clf = ensemble.RandomForestClassifier)
param_grid = {'criterion' : ['entropy', 'gini'], 'n_estimators' : [20, 40, 60, 80, 100],
'max_features' :['sqrt', 'log2']}
rf.grid_search(parameters = param_grid, Kfold = 5)
rf.grid_fit(X = X_train, Y = Y_train)
rf.grid_predict(X_test, Y_test)
g = plot_learning_curve(rf.grid.best_estimator_, "Random Forest learning curves", X_train, Y_train,
                        ylim = [0.7, 1.01], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
ada = Class_Fit(clf = AdaBoostClassifier)
param_grid = {'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
ada.grid_search(parameters = param_grid, Kfold = 5)
ada.grid_fit(X = X_train, Y = Y_train)
ada.grid_predict(X_test, Y_test)
g = plot_learning_curve(ada.grid.best_estimator_, "AdaBoost learning curves", X_train, Y_train,
                        ylim = [0.4, 1.01], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
gb = Class_Fit(clf = ensemble.GradientBoostingClassifier)
param_grid = {'n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
gb.grid_search(parameters = param_grid, Kfold = 5)
gb.grid_fit(X = X_train, Y = Y_train)
gb.grid_predict(X_test, Y_test)
g = plot_learning_curve(gb.grid.best_estimator_, "Gradient Boosting learning curves", X_train, Y_train,
                        ylim = [0.7, 1.01], cv = 5,
train_sizes = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
rf_best = ensemble.RandomForestClassifier(**rf.grid.best_params_)
gb_best = ensemble.GradientBoostingClassifier(**gb.grid.best_params_)
svc_best = svm.LinearSVC(**svc.grid.best_params_)
tr_best = tree.DecisionTreeClassifier(**tr.grid.best_params_)
knn_best = neighbors.KNeighborsClassifier(**knn.grid.best_params_)
lr_best = linear_model.LogisticRegression(**lr.grid.best_params_)
votingC = ensemble.VotingClassifier(estimators=[('rf', rf_best),('gb', gb_best),
('knn', knn_best)], voting='soft')
votingC = votingC.fit(X_train, Y_train)
predictions = votingC.predict(X_test)
print("Precision: {:.2f} % ".format(100*metrics.accuracy_score(Y_test, predictions)))
basket_price = set_test.copy(deep = True)
transactions_per_user=basket_price.groupby(by=['CustomerID'])['Basket Price'].agg(['count','min','max','mean','sum'])
for i in range(5):
col = 'categ_{}'.format(i)
transactions_per_user.loc[:,col] = basket_price.groupby(by=['CustomerID'])[col].sum() /\
transactions_per_user['sum']*100
transactions_per_user.reset_index(drop = False, inplace = True)
#______________________________________________________________________
# Correcting for the time range: the test period is roughly 5 times
# shorter than the training period, hence the factor of 5
transactions_per_user['count'] = 5 * transactions_per_user['count']
transactions_per_user['sum'] = transactions_per_user['count'] * transactions_per_user['mean']
transactions_per_user.sort_values('CustomerID', ascending = True)[:5]
list_cols = ['count','min','max','mean','categ_0','categ_1','categ_2','categ_3','categ_4']
#_____________________________________________________________
matrix_test = transactions_per_user[list_cols].to_numpy()
scaled_test_matrix = scaler.transform(matrix_test)
Y = kmeans.predict(scaled_test_matrix)
columns = ['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4' ]
X = transactions_per_user[columns]
classifiers = [(svc, 'Support Vector Machine'),
               (lr, 'Logistic Regression'),
(knn, 'k-Nearest Neighbors'),
(tr, 'Decision Tree'),
(rf, 'Random Forest'),
(gb, 'Gradient Boosting')]
#______________________________
for clf, label in classifiers:
print(30*'_', '\n{}'.format(label))
clf.grid_predict(X, Y)
predictions = votingC.predict(X)
print("Precision: {:.2f} % ".format(100*metrics.accuracy_score(Y, predictions)))