import pandas as pd
# Loading the data
df = pd.read_csv('CampaignResponses.txt', header=0, sep="\t")
print("Type of contained values:")
print(df.dtypes)
print("\nSize of dataframe is: {}".format(df.shape))
# Number of values in each column
print("Each column contains the same number of values (27564).")
print(df.count())
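# A quick extra check (an addition, not in the original flow): if every column
# really holds 27564 values, isna().sum() should report 0 missing values per column.
print(df.isna().sum())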
# First analysis of data contained
print(df.describe())
# Quick analysis of some values
ax = df.hist(column='PURCHASE_OF_ART_HISTORY_OF_FLORENCE', bins=25, grid=False,
             figsize=(12, 8), color='#86bf91', zorder=2, rwidth=0.9)
ax = ax[0]  # df.hist returns a 2-D array of axes; keep the first row
for x in ax:
    # Despine
    x.spines['right'].set_visible(False)
    x.spines['top'].set_visible(False)
    # Switch off ticks (newer matplotlib expects booleans, not "on"/"off" strings)
    x.tick_params(axis="both", which="both", bottom=False, top=False,
                  labelbottom=True, left=False, right=False, labelleft=True)
    # Draw horizontal axis lines
    vals = x.get_yticks()
    for tick in vals:
        x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)
    # Set title
    x.set_title("PURCHASE OF ART HISTORY OF FLORENCE")
    # Set y-axis label
    x.set_ylabel("Frequency", labelpad=20, weight='bold', size=12)
df.boxplot(column=['AMOUNT_ART_HISTORY_OF_FLORENCE'], return_type='axes')
df.groupby('Gender').sum(numeric_only=True).plot(kind='pie', y=0, autopct='%1.0f%%')
# Using a decision tree to predict a given class
# Converting Gender to numerical (avoid inplace=True on an attribute-accessed
# column, which modern pandas flags as chained assignment)
df['Gender'] = df['Gender'].replace(['F', 'M'], [1, 0])
df.Gender.head()
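# Optional sanity check (an added sketch): the column should now contain only
# the numeric codes 0 and 1.
print(df.Gender.value_counts())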
from sklearn import tree
# Building data for decision tree classifier
target = df.PURCHASE_OF_ART_HISTORY_OF_FLORENCE
data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], axis=1)
clf = tree.DecisionTreeClassifier(max_depth=1)
clf = clf.fit(data, target)
tree.plot_tree(clf, filled=True, feature_names=list(data.columns))
# r = tree.export_text(clf, feature_names=list(data.columns.values))
# Displaying the beginning of the decision tree (too long otherwise!)
# print('\n'.join(r.split('\n')[:7]))
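# An alternative sketch (assuming scikit-learn >= 0.21): export_text also accepts
# a max_depth argument, which truncates the printed tree directly instead of
# slicing the output string.
# print(tree.export_text(clf, feature_names=list(data.columns), max_depth=2))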
# Model learning error (evaluated on the training data itself, so it is optimistic)
from sklearn.metrics import confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Compute the confusion matrix
y_test_predict = clf.predict(data)
m = ConfusionMatrixDisplay(confusion_matrix(target, y_test_predict),
                           display_labels=clf.classes_).plot()
m.ax_.set_title("Confusion Matrix \n")
plt.show()
print("The precision for this decision tree is ", precision_score(target, y_test_predict))
for i in range(1, 10, 2):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    clf = clf.fit(data, target)
    y_test_predict = clf.predict(data)
    # Pass the colormap to .plot(); assigning to an attribute on the axes
    # afterwards has no effect on an already-drawn matrix.
    m = ConfusionMatrixDisplay(confusion_matrix(target, y_test_predict),
                               display_labels=clf.classes_).plot(cmap='plasma')
    m.ax_.set_title(f"MAX-DEPTH = {i} - Confusion Matrix \n")
    plt.show()
    print("The precision for this decision tree is ", precision_score(target, y_test_predict))
from sklearn.model_selection import train_test_split
# function to compute error rates
def get_error_rates(model, parameter, steps, internal_loops, print_interval):
    test_scores = {}
    train_scores = {}
    model_p = {}
    for i in range(*steps):
        loops = internal_loops
        train_score = 0
        test_score = 0
        for j in range(loops):
            X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
            model_p[parameter] = i
            clf = model(**model_p)
            clf = clf.fit(X_train, y_train)
            # clf.score returns mean accuracy, accumulated over the inner loops
            test_score += clf.score(X_test, y_test)
            train_score += clf.score(X_train, y_train)
        if i % print_interval == 0:
            print(f"####### {parameter.upper()} = {i} #######")
            print(f"Average training accuracy is {train_score/loops}")
            print(f"Average testing accuracy is {test_score/loops}\n")
        test_scores[i] = 1 - test_score/loops
        train_scores[i] = 1 - train_score/loops
    return train_scores, test_scores
DT_train_scores, DT_test_scores = get_error_rates(tree.DecisionTreeClassifier, "max_depth", (1, 50), 3, 10)
# Plotting the error rates
import matplotlib.pyplot as plt
def plot_error_rates(tr_scores, te_scores, title, reverse=False):
    test_list = sorted(te_scores.items())
    x1, y1 = zip(*test_list)
    train_list = sorted(tr_scores.items())
    x2, y2 = zip(*train_list)
    plt.plot(x1, y1, label="Test")
    plt.plot(x2, y2, label="Train")
    if reverse:
        plt.gca().invert_xaxis()
    plt.xlabel('PARAMETER')
    plt.ylabel('Error rates')
    plt.title(title)
    plt.legend()
    plt.show()
plot_error_rates(DT_train_scores, DT_test_scores, "Average error rate in terms of the max_depth parameter")
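# Added sketch: the depth with the lowest test error can be read straight off
# the scores dictionary returned by get_error_rates.
best_depth = min(DT_test_scores, key=DT_test_scores.get)
print(f"max_depth with lowest average test error: {best_depth} ({DT_test_scores[best_depth]:.4f})")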
from sklearn.neighbors import KNeighborsClassifier
KNN_train_scores, KNN_test_scores = get_error_rates(KNeighborsClassifier, "n_neighbors", (1, 30), 1, 10)
plot_error_rates(KNN_train_scores, KNN_test_scores,"Average error rate in terms of the n_neighbors parameter",True)
from sklearn.ensemble import RandomForestClassifier
RFC_train_scores, RFC_test_scores = get_error_rates(RandomForestClassifier, "n_estimators", (1, 30), 2, 10)
plot_error_rates(RFC_train_scores, RFC_test_scores, "Average error rate in terms of the n_estimators parameter")
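# The single random split repeated inside get_error_rates is noisy; a hedged
# alternative sketch using scikit-learn's cross_val_score averages accuracy
# over 5 folds for one candidate model (n_estimators=20 is an arbitrary choice).
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=20), data, target, cv=5)
print(f"5-fold CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")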
from sklearn.metrics import mean_squared_error
# function to compute mean squared errors for regressors
def get_error_rates_regressor(model, parameter, steps, internal_loops, print_interval):
    test_scores = {}
    train_scores = {}
    model_p = {}
    for i in range(*steps):
        loops = internal_loops
        train_score = 0
        test_score = 0
        for j in range(loops):
            X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
            model_p[parameter] = i
            clf = model(**model_p)
            clf = clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
            test_score += mean_squared_error(y_test, y_test_predict)
            y_train_predict = clf.predict(X_train)
            train_score += mean_squared_error(y_train, y_train_predict)
        if i % print_interval == 0:
            print(f"####### {parameter.upper()} = {i} #######")
            print(f"Average mean squared error on training is {train_score/loops}")
            print(f"Average mean squared error on testing is {test_score/loops}\n")
        test_scores[i] = test_score/loops
        train_scores[i] = train_score/loops
    return train_scores, test_scores
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# Building data for decision tree Regressor
target = df.AMOUNT_ART_HISTORY_OF_FLORENCE
data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], axis=1)
DT_train_scores, DT_test_scores = get_error_rates_regressor(DecisionTreeRegressor, "max_depth", (1, 50), 3, 10)
plot_error_rates(DT_train_scores, DT_test_scores, "Mean squared error in terms of the max_depth parameter")
from sklearn.neighbors import KNeighborsRegressor
KNN_train_scores, KNN_test_scores = get_error_rates_regressor(KNeighborsRegressor, "n_neighbors", (1, 30), 1, 10)
plot_error_rates(KNN_train_scores, KNN_test_scores,"Mean squared error in terms of the n_neighbors parameter",True)
from sklearn.ensemble import RandomForestRegressor
RFR_train_scores, RFR_test_scores = get_error_rates_regressor(RandomForestRegressor, "n_estimators", (1, 40, 2), 2, 5)
plot_error_rates(RFR_train_scores, RFR_test_scores, "Mean squared error in terms of the n_estimators parameter")
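# Closing sketch (an addition): a DummyRegressor that always predicts the
# training mean gives a baseline MSE against which the curves above can be judged.
from sklearn.dummy import DummyRegressor
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
print("Baseline (mean predictor) MSE:", mean_squared_error(y_test, dummy.predict(X_test)))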