import pandas as pd
# Loading the data
df = pd.read_csv('CampaignResponses.txt', header=0, sep='\t')
print("Type of contained values:")
print(df.dtypes)
print("\nSize of dataframe is: {}".format(df.shape))
# Number of values in each column
print("Each column contains the same number of values (27564).")
print(df.count())
# First analysis of data contained
print(df.describe())
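# A hedged addition to the first look: the class balance of the response column,
# which matters for judging the classifier precision reported further below.
print(df['PURCHASE_OF_ART_HISTORY_OF_FLORENCE'].value_counts(normalize=True))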
# Quick analysis of some values
ax = df.hist(column='PURCHASE_OF_ART_HISTORY_OF_FLORENCE', bins=25, grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
ax = ax[0]
for x in ax:
    # Despine
    x.spines['right'].set_visible(False)
    x.spines['top'].set_visible(False)
    # Switch off ticks
    x.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)
    # Draw horizontal axis lines
    vals = x.get_yticks()
    for tick in vals:
        x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)
    # Set a readable title
    x.set_title("PURCHASE OF ART HISTORY OF FLORENCE")
    # Set y-axis label
    x.set_ylabel("Frequency", labelpad=20, weight='bold', size=12)
df.boxplot(column=['AMOUNT_ART_HISTORY_OF_FLORENCE'], return_type='axes')
df.groupby('Gender').sum(numeric_only=True).plot(kind='pie', y=0, autopct='%1.0f%%')
# Using a decision tree to predict a given class
# Converting Gender to numerical
df['Gender'] = df['Gender'].replace(['F', 'M'], [1, 0])
df.Gender.head()
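# Hedged sanity check (an addition): after the Gender conversion every feature
# should be numeric for scikit-learn; this prints any column that still is not.
print(df.select_dtypes(exclude='number').columns.tolist())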
from sklearn import tree
# Building data for decision tree classifier
target = df.PURCHASE_OF_ART_HISTORY_OF_FLORENCE
data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], axis=1)
clf = tree.DecisionTreeClassifier(max_depth=1)
clf = clf.fit(data, target)
tree.plot_tree(clf, filled=True, feature_names=list(data.columns))
# r = tree.export_text(clf, feature_names=list(data.columns.values))
# Displaying the beginning of the decision tree (too long otherwise!)
# print('\n'.join(r.split('\n')[:7]))
# Model learning error
from sklearn.metrics import confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Compute the confusion matrix (note: predicting on the training data itself)
y_test_predict = clf.predict(data)
m = ConfusionMatrixDisplay(confusion_matrix(target, y_test_predict), display_labels=clf.classes_).plot()
m.ax_.set_title("Confusion Matrix\n")
plt.show()
print("The precision for this decision tree is ", precision_score(target, y_test_predict)) 
for i in range(1, 10, 2):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    clf = clf.fit(data, target)
    y_test_predict = clf.predict(data)
    m = ConfusionMatrixDisplay(confusion_matrix(target, y_test_predict), display_labels=clf.classes_).plot(cmap='plasma')
    title = f"MAX-DEPTH = {i} - Confusion Matrix\n"
    m.ax_.set_title(title)
    plt.show()
    print("The precision for this decision tree is ", precision_score(target, y_test_predict)) 
from sklearn.model_selection import train_test_split
# Function to compute average train/test error rates for a model while varying
# one hyperparameter: `steps` is unpacked into range(), and each setting is
# averaged over `internal_loops` random 75/25 train/test splits.
def get_error_rates(model, parameter, steps, internal_loops, print_interval):
    test_scores = {}
    train_scores = {}
    model_p = {}
    for i in range(*steps):
        loops = internal_loops
        train_score = 0
        test_score = 0
        for j in range(loops):
            X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
        
            model_p[parameter] = i
            clf = model(**model_p)
            clf = clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
            test_score += clf.score(X_test, y_test)
            y_train_predict = clf.predict(X_train)
            train_score += clf.score(X_train, y_train)
        if i % print_interval == 0:
            print(f"####### N_NEIGHBORS = {i} #######")
            print(f"Average training precision is {train_score/loops}")
            print(f"Average testing precision is {test_score/loops}\n")
        test_scores[i] = 1 - test_score/loops
        train_scores[i] = 1 - train_score/loops
    return train_scores, test_scores
DT_train_scores, DT_test_scores = get_error_rates(tree.DecisionTreeClassifier, "max_depth", (1, 50), 3, 10)
# Plotting the error rates
import matplotlib.pyplot as plt
def plot_error_rates(tr_scores, te_scores, title, reverse=False):
    test_list = te_scores.items()
    test_list = sorted(test_list) 
    x1, y1 = zip(*test_list) 
    train_list = tr_scores.items()
    train_list = sorted(train_list) 
    x2, y2 = zip(*train_list) 
    plt.plot(x1, y1, label="Test")
    plt.plot(x2, y2, label="Train")
    if (reverse):
        plt.gca().invert_xaxis()
    plt.xlabel('Parameter value')
    plt.ylabel('Error rates')
    plt.title(title)
    plt.legend()
    plt.show()
plot_error_rates(DT_train_scores, DT_test_scores, "Average error rate in terms of the max_depth parameter")
from sklearn.neighbors import KNeighborsClassifier
KNN_train_scores, KNN_test_scores = get_error_rates(KNeighborsClassifier, "n_neighbors", (1, 30), 1, 10)
plot_error_rates(KNN_train_scores, KNN_test_scores,"Average error rate in terms of the n_neighbors parameter",True)
from sklearn.ensemble import RandomForestClassifier
RFC_train_scores, RFC_test_scores = get_error_rates(RandomForestClassifier, "n_estimators", (1, 30), 2, 10)
plot_error_rates(RFC_train_scores, RFC_test_scores, "Average error rate in terms of the n_estimators parameter")
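# A hedged wrap-up (not in the original run): read off the parameter value with
# the lowest average test error for each classifier from the dicts above.
for name, scores in [("DecisionTree / max_depth", DT_test_scores),
                     ("KNN / n_neighbors", KNN_test_scores),
                     ("RandomForest / n_estimators", RFC_test_scores)]:
    best = min(scores, key=scores.get)
    print(f"{name}: best value = {best}, average test error = {scores[best]:.4f}")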
from sklearn.metrics import mean_squared_error
# Same looping scheme as get_error_rates, but scoring with mean squared error
# so it can be used with regressors.
def get_error_rates_regressor(model, parameter, steps, internal_loops, print_interval):
    test_scores = {}
    train_scores = {}
    model_p = {}
    for i in range(*steps):
        loops = internal_loops
        train_score = 0
        test_score = 0
        for j in range(loops):
            X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
        
            model_p[parameter] = i
            clf = model(**model_p)
            clf = clf.fit(X_train, y_train)
            y_test_predict = clf.predict(X_test)
            test_score += mean_squared_error(y_test, y_test_predict)
            y_train_predict = clf.predict(X_train)
            train_score += mean_squared_error(y_train, y_train_predict)
        if i % print_interval == 0:
            print(f"####### N_NEIGHBORS = {i} #######")
            print(f"Average mean squared error on training is {train_score/loops}")
            print(f"Average mean squared error on testing is {test_score/loops}\n")
        test_scores[i] = test_score/loops
        train_scores[i] = train_score/loops
    return train_scores, test_scores
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# Building data for decision tree Regressor
target = df.AMOUNT_ART_HISTORY_OF_FLORENCE
data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], axis=1)
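# A hedged reference point for the MSE curves below (an addition): the error of
# always predicting the mean amount. Useful regressors should land below this.
baseline_mse = mean_squared_error(target, [target.mean()] * len(target))
print(f"Baseline MSE (always predicting the mean): {baseline_mse:.4f}")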
DT_train_scores, DT_test_scores = get_error_rates_regressor(DecisionTreeRegressor, "max_depth", (1, 50), 3, 10)
plot_error_rates(DT_train_scores, DT_test_scores, "Mean squared error in terms of the max_depth parameter")
from sklearn.neighbors import KNeighborsRegressor
KNN_train_scores, KNN_test_scores = get_error_rates_regressor(KNeighborsRegressor, "n_neighbors", (1, 30), 1, 10)
plot_error_rates(KNN_train_scores, KNN_test_scores,"Mean squared error in terms of the n_neighbors parameter",True)
from sklearn.ensemble import RandomForestRegressor
RFR_train_scores, RFR_test_scores = get_error_rates_regressor(RandomForestRegressor, "n_estimators", (1, 40, 2), 2, 5)
plot_error_rates(RFR_train_scores, RFR_test_scores, "Mean squared error in terms of the n_estimators parameter")
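# Companion wrap-up for the regressors: the parameter value with the lowest
# average test MSE in each run above (a sketch reusing the score dicts).
for name, scores in [("DecisionTreeRegressor / max_depth", DT_test_scores),
                     ("KNeighborsRegressor / n_neighbors", KNN_test_scores),
                     ("RandomForestRegressor / n_estimators", RFR_test_scores)]:
    best = min(scores, key=scores.get)
    print(f"{name}: best value = {best}, average test MSE = {scores[best]:.4f}")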