import pandas as pd

# Read the tab-separated campaign file; row 0 supplies the column names.
df = pd.read_csv('CampaignResponses.txt', sep="\t", header=0)

# Quick structural overview: per-column dtypes and overall dimensions.
print("Type of contained values:")
print(df.dtypes)
print(f"\nSize of dataframe is: {df.shape}")
Type of contained values:
Gender object
MoneySpend int64
Recency int64
Frequency int64
Age int64
Child int64
Teen int64
Cook int64
DIY int64
Reference int64
Art int64
History int64
PURCHASE_OF_SECRETS_OF_ITALIAN_KITCHEN int64
PURCHASE_OF_ATLAS_OF_ITALY int64
PURCHASE_OF_ITALIAN_ART int64
PURCHASE_OF_ART_HISTORY_OF_FLORENCE int64
AMOUNT_ART_HISTORY_OF_FLORENCE int64
dtype: object
Size of dataframe is: (27654, 17)
# Number of values in each column
print("Each column contains the same number of values (27564).")
print(df.count())
Each column contains the same number of values (27654).
Gender 27654
MoneySpend 27654
Recency 27654
Frequency 27654
Age 27654
Child 27654
Teen 27654
Cook 27654
DIY 27654
Reference 27654
Art 27654
History 27654
PURCHASE_OF_SECRETS_OF_ITALIAN_KITCHEN 27654
PURCHASE_OF_ATLAS_OF_ITALY 27654
PURCHASE_OF_ITALIAN_ART 27654
PURCHASE_OF_ART_HISTORY_OF_FLORENCE 27654
AMOUNT_ART_HISTORY_OF_FLORENCE 27654
dtype: int64
# First look at the numeric columns: count, mean, std and quartiles.
summary = df.describe()
print(summary)
MoneySpend Recency Frequency Age Child \
count 27654.000000 27654.000000 27654.000000 27654.000000 27654.000000
mean 211.207818 13.048022 3.996529 26.837890 0.865806
std 102.118276 8.111111 3.534926 18.560403 1.129438
min 15.000000 2.000000 1.000000 2.000000 0.000000
25% 132.000000 8.000000 1.000000 12.000000 0.000000
50% 212.000000 12.000000 2.000000 22.000000 1.000000
75% 287.000000 16.000000 7.000000 38.000000 1.000000
max 479.000000 36.000000 12.000000 99.000000 8.000000
Teen Cook DIY Reference Art \
count 27654.000000 27654.000000 27654.000000 27654.000000 27654.000000
mean 0.394988 0.936899 0.459536 0.324293 0.429956
std 0.688661 1.184509 0.762182 0.616312 0.724619
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 1.000000 1.000000 1.000000 1.000000
max 6.000000 8.000000 6.000000 5.000000 6.000000
History PURCHASE_OF_SECRETS_OF_ITALIAN_KITCHEN \
count 27654.000000 27654.000000
mean 0.585051 0.124069
std 0.876355 0.329666
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 1.000000 0.000000
max 7.000000 1.000000
PURCHASE_OF_ATLAS_OF_ITALY PURCHASE_OF_ITALIAN_ART \
count 27654.000000 27654.000000
mean 0.042092 0.063644
std 0.200802 0.244121
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
PURCHASE_OF_ART_HISTORY_OF_FLORENCE AMOUNT_ART_HISTORY_OF_FLORENCE
count 27654.000000 27654.000000
mean 0.165546 7.685977
std 0.371679 19.592759
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 117.000000
Quick analysis of some values
# Histogram of the purchase flag (0/1) for "Art History of Florence".
ax = df.hist(column='PURCHASE_OF_ART_HISTORY_OF_FLORENCE', bins=25, grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
# df.hist returns a 2-D array of axes; take the first row.
ax = ax[0]
for x in ax:
    # Despine
    x.spines['right'].set_visible(False)
    x.spines['top'].set_visible(False)
    # Switch off ticks.
    # FIX: tick_params takes booleans; the string values "off"/"on" were
    # deprecated in matplotlib 2.x and removed in 3.0.
    x.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)
    # Draw dashed horizontal grid lines behind the bars (zorder 1 < 2).
    vals = x.get_yticks()
    for tick in vals:
        x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)
    # Title and y-axis label.
    x.set_title("PURCHASE OF ART HISTORY OF FLORENCE")
    x.set_ylabel("Frequency", labelpad=20, weight='bold', size=12)
# Distribution of the amount spent on the same book.
df.boxplot(column=['AMOUNT_ART_HISTORY_OF_FLORENCE'], return_type='axes')
# Share of the first summed column (MoneySpend) by gender; y=0 selects
# column 0 of the grouped sum.
df.groupby(['Gender']).sum().plot(kind='pie', y=0, autopct='%1.0f%%')
Using a decision tree to predict a given class
# Encode Gender numerically (F -> 1, M -> 0) so the sklearn models below
# can consume it.
# FIX: assign the result back instead of `df.Gender.replace(..., inplace=True)`;
# in-place replace through attribute access is chained-assignment-prone and
# deprecated in recent pandas.
df['Gender'] = df['Gender'].replace(['F', 'M'], [1, 0])
df['Gender'].head()
from sklearn import tree
# Build the classification problem: predict whether a customer bought
# "Art History of Florence".  Drop the target flag and the amount column
# derived from it (both would leak the answer into the features).
target = df.PURCHASE_OF_ART_HISTORY_OF_FLORENCE
# FIX: pass axis=1 by keyword — the bare positional `1` raises the pandas
# FutureWarning seen in the original run output.
data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], axis=1)
clf = tree.DecisionTreeClassifier(max_depth=1)
clf = clf.fit(data, target)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
"""
# Render the fitted depth-1 tree (root split only) with color-filled nodes.
tree.plot_tree(clf,filled=True,feature_names = data.columns)
# Alternative text rendering of the tree, kept for reference:
# r = tree.export_text(clf, feature_names=list(data.columns.values))
# Displaying the beginning of the decision tree (too long otherwise!)
# print('\n'.join(r.split('\n')[:7]))
# Model learning error
from sklearn.metrics import confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Evaluate the depth-1 tree on its own training data (resubstitution error).
predictions = clf.predict(data)
display = ConfusionMatrixDisplay(
    confusion_matrix(target, predictions),
    display_labels=clf.classes_,
).plot()
display.ax_.set_title("Confusion Matrix \n")
plt.show()
print("The precision for this decision tree is ", precision_score(target, predictions))
The precision for this decision tree is 0.5029838022165388
# Grow trees of increasing depth (1, 3, 5, 7, 9) and show the training-set
# confusion matrix and precision for each.
for i in range(1, 10, 2):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    clf = clf.fit(data, target)
    y_test_predict = clf.predict(data)
    # FIX: the colormap must be passed to plot(); assigning `m.ax_.cmap`
    # afterwards (as the original did) has no effect on the rendered image.
    m = ConfusionMatrixDisplay(
        confusion_matrix(target, y_test_predict),
        display_labels=clf.classes_,
    ).plot(cmap='plasma')
    title = f"MAX-DEPTH = {i} - Confusion Matrix \n"
    m.ax_.set_title(title)
    plt.show()
    print("The precision for this decision tree is ", precision_score(target, y_test_predict))
The precision for this decision tree is 0.5029838022165388
The precision for this decision tree is 0.5802919708029197
The precision for this decision tree is 0.6799177518848526
The precision for this decision tree is 0.7124931805782869
The precision for this decision tree is 0.710378418883477
from sklearn.model_selection import train_test_split
# function to compute error rates
def get_error_rates(model, parameter, steps, internal_loops, print_interval):
    """Sweep one hyper-parameter of a classifier and return error rates.

    Parameters
    ----------
    model : estimator class (e.g. ``tree.DecisionTreeClassifier``).
    parameter : str, name of the hyper-parameter to sweep.
    steps : tuple, unpacked into ``range()`` to produce the values tried.
    internal_loops : int, number of random 75/25 splits averaged per value.
    print_interval : int, progress is printed every ``print_interval`` values.

    Returns
    -------
    (train_scores, test_scores) : dicts mapping parameter value to the
    average error rate ``1 - mean accuracy``.

    NOTE(review): reads the module-level ``data`` and ``target`` built
    earlier rather than taking them as arguments.
    """
    test_scores = {}
    train_scores = {}
    for i in range(*steps):
        # The swept hyper-parameter is invariant across the inner loop,
        # so build the kwargs dict once per value (the original rebuilt
        # it on every split).
        model_p = {parameter: i}
        train_score = 0.0
        test_score = 0.0
        for _ in range(internal_loops):
            X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
            clf = model(**model_p)
            clf = clf.fit(X_train, y_train)
            # FIX: dropped the redundant predict() calls — score() already
            # predicts internally, so they only doubled the work.
            test_score += clf.score(X_test, y_test)
            train_score += clf.score(X_train, y_train)
        if i % print_interval == 0:
            # FIX: label progress with the swept parameter; the original
            # hard-coded "N_NEIGHBORS" even when sweeping max_depth or
            # n_estimators.
            print(f"####### {parameter.upper()} = {i} #######")
            print(f"Average training precision is {train_score/internal_loops}")
            print(f"Average testing precision is {test_score/internal_loops}\n")
        test_scores[i] = 1 - test_score/internal_loops
        train_scores[i] = 1 - train_score/internal_loops
    return train_scores, test_scores
DT_train_scores, DT_test_scores = get_error_rates(tree.DecisionTreeClassifier, "max_depth", (1, 50), 3, 10)
####### N_NEIGHBORS = 10 #######
Average training precision is 0.8897942783670846
Average testing precision is 0.8543052743226304
####### N_NEIGHBORS = 20 #######
Average training precision is 0.9819832851173257
Average testing precision is 0.8808215215504772
####### N_NEIGHBORS = 30 #######
Average training precision is 0.9968981035036966
Average testing precision is 0.8893549320219843
####### N_NEIGHBORS = 40 #######
Average training precision is 0.9975731276117005
Average testing precision is 0.8831838781216854
####### N_NEIGHBORS = 50 #######
Average training precision is 0.9977659916425586
Average testing precision is 0.8841963166522033
# Plotting the error rates
import matplotlib.pylab as plt
def plot_error_rates(tr_scores, te_scores, title, reverse=False):
    """Plot train and test error curves against the swept parameter value.

    ``tr_scores`` / ``te_scores`` are dicts of parameter value -> score;
    entries are sorted by parameter before plotting.  With ``reverse`` the
    x-axis is inverted (used for k-NN, where larger k = simpler model).
    """
    xs_test, ys_test = zip(*sorted(te_scores.items()))
    xs_train, ys_train = zip(*sorted(tr_scores.items()))
    plt.plot(xs_test, ys_test, label="Test")
    plt.plot(xs_train, ys_train, label="Train")
    if reverse:
        plt.gca().invert_xaxis()
    plt.xlabel('PARAMETER')
    plt.ylabel('Error rates')
    plt.title(title)
    plt.legend()
    plt.show()
plot_error_rates(DT_train_scores, DT_test_scores,"Average accuracy in terms of the max-depth parameter")
from sklearn.neighbors import KNeighborsClassifier
# Sweep n_neighbors over 1..29 for k-NN, one random split each.
KNN_train_scores, KNN_test_scores = get_error_rates(KNeighborsClassifier, "n_neighbors", (1, 30), 1, 10)
####### N_NEIGHBORS = 10 #######
Average training precision is 0.8439729990356798
Average testing precision is 0.8312120335551055
####### N_NEIGHBORS = 20 #######
Average training precision is 0.8380424300867888
Average testing precision is 0.8358403239803298
####### N_NEIGHBORS = 30 #######
Average training precision is 0.8360655737704918
Average testing precision is 0.8352617876771767
plot_error_rates(KNN_train_scores, KNN_test_scores,"Average error rate in terms of the n_neighbors parameter",True)
from sklearn.ensemble import RandomForestClassifier
# Sweep n_estimators over 1..29 for the random forest, 2 random splits each.
RFC_train_scores, RFC_test_scores = get_error_rates(RandomForestClassifier, "n_estimators", (1, 30), 2, 10)
####### N_NEIGHBORS = 10 #######
Average training precision is 0.9907907425265188
Average testing precision is 0.918571015331212
####### N_NEIGHBORS = 20 #######
Average training precision is 0.9950337512054002
Average testing precision is 0.9218975990743419
####### N_NEIGHBORS = 30 #######
Average training precision is 0.9964802314368371
Average testing precision is 0.9263089383858838
plot_error_rates(RFC_train_scores, RFC_test_scores,"Average accuracy in terms of the n_estimators parameter")
from sklearn.metrics import mean_squared_error
# function to compute error rates
def get_error_rates_regressor(model, parameter, steps, internal_loops, print_interval):
    """Sweep one hyper-parameter of a regressor and return average MSEs.

    Same contract as ``get_error_rates`` but scores with mean squared
    error (lower is better) instead of classification accuracy, so the
    returned dicts hold raw averaged MSE, not ``1 - score``.

    NOTE(review): reads the module-level ``data`` and ``target`` built
    earlier rather than taking them as arguments.
    """
    test_scores = {}
    train_scores = {}
    for i in range(*steps):
        # Invariant across the inner loop — build once per swept value
        # (the original rebuilt it on every split).
        model_p = {parameter: i}
        train_score = 0.0
        test_score = 0.0
        for _ in range(internal_loops):
            X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
            clf = model(**model_p)
            clf = clf.fit(X_train, y_train)
            test_score += mean_squared_error(y_test, clf.predict(X_test))
            train_score += mean_squared_error(y_train, clf.predict(X_train))
        if i % print_interval == 0:
            # FIX: label progress with the swept parameter; the original
            # hard-coded "N_NEIGHBORS" even when sweeping max_depth or
            # n_estimators.
            print(f"####### {parameter.upper()} = {i} #######")
            print(f"Average mean squared error on training is {train_score/internal_loops}")
            print(f"Average mean squared error on testing is {test_score/internal_loops}\n")
        test_scores[i] = test_score/internal_loops
        train_scores[i] = train_score/internal_loops
    return train_scores, test_scores
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# Build the regression problem: predict the amount spent on
# "Art History of Florence".  Both leaky columns are removed from the
# features, as in the classification setup above.
target = df.AMOUNT_ART_HISTORY_OF_FLORENCE
# FIX: pass axis=1 by keyword — the bare positional `1` raises the pandas
# FutureWarning seen in the original run output.
data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], axis=1)
DT_train_scores, DT_test_scores = get_error_rates_regressor(DecisionTreeRegressor, "max_depth", (1, 50), 3, 10)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:6: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
####### N_NEIGHBORS = 10 #######
Average mean squared error on training is 123.99267970213187
Average mean squared error on testing is 210.81323498186725
####### N_NEIGHBORS = 20 #######
Average mean squared error on training is 9.676524352091862
Average mean squared error on testing is 196.1468177318886
####### N_NEIGHBORS = 30 #######
Average mean squared error on training is 1.4481008209098476
Average mean squared error on testing is 207.60693172612426
####### N_NEIGHBORS = 40 #######
Average mean squared error on training is 1.2810296796314153
Average mean squared error on testing is 213.23737799847868
# Plot train/test MSE against tree depth.
plot_error_rates(DT_train_scores, DT_test_scores,"Mean squared error in terms of the max-depth parameter")
from sklearn.neighbors import KNeighborsRegressor
# Sweep n_neighbors over 1..29 for the k-NN regressor, one random split each.
KNN_train_scores, KNN_test_scores = get_error_rates_regressor(KNeighborsRegressor, "n_neighbors", (1, 30), 1, 10)
####### N_NEIGHBORS = 10 #######
Average mean squared error on training is 265.5661972034715
Average mean squared error on testing is 328.5932716227943
####### N_NEIGHBORS = 20 #######
Average mean squared error on training is 303.8951024590164
Average mean squared error on testing is 328.8247082007521
# reverse=True: larger n_neighbors means a simpler model, so flip the x-axis.
plot_error_rates(KNN_train_scores, KNN_test_scores,"Mean squared error in terms of the n_neighbors parameter",True)
from sklearn.ensemble import RandomForestRegressor
# Sweep n_estimators over 1,3,...,39 for the forest regressor, 2 splits each.
RFR_train_scores, RFR_test_scores = get_error_rates_regressor(RandomForestRegressor, "n_estimators", (1, 40, 2), 2, 5)
####### N_NEIGHBORS = 5 #######
Average mean squared error on training is 35.225507943377714
Average mean squared error on testing is 154.57247984058986
####### N_NEIGHBORS = 15 #######
Average mean squared error on training is 23.52174914727293
Average mean squared error on testing is 140.69289496106447
####### N_NEIGHBORS = 25 #######
Average mean squared error on training is 21.570102420376198
Average mean squared error on testing is 130.93513276363845
####### N_NEIGHBORS = 35 #######
Average mean squared error on training is 20.375676663035012
Average mean squared error on testing is 125.95312026680352
plot_error_rates(RFR_train_scores, RFR_test_scores,"Mean squared error in terms of the n_neighbors parameter")