TP1_ScikitLearn

import pandas as pd # Loading the data df = pd.read_csv('CampaignResponses.txt', header = 0, sep="\t") print("Type of contained values:") print(df.dtypes) print("\nSize of dataframe is: {}".format(df.shape))

# Number of values in each column print("Each column contains the same number of values (27564).") print(df.count())

# First analysis of data contained print(df.describe())

Quick analysis of some values

ax = df.hist(column='PURCHASE_OF_ART_HISTORY_OF_FLORENCE', bins=25, grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) ax = ax[0] for x in ax: # Despine x.spines['right'].set_visible(False) x.spines['top'].set_visible(False) # Switch off ticks x.tick_params(axis="both", which="both", bottom="off", top="off", labelbottom="on", left="off", right="off", labelleft="on") # Draw horizontal axis lines vals = x.get_yticks() for tick in vals: x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1) # Remove title x.set_title("PURCHASE OF ART HISTORY OF FLORENCE") # Set y-axis label x.set_ylabel("Frequency", labelpad=20, weight='bold', size=12)

df.boxplot(column=['AMOUNT_ART_HISTORY_OF_FLORENCE'], return_type='axes')

df.groupby(['Gender']).sum().plot(kind='pie', y=0, autopct='%1.0f%%')

Using a decision tree to predict a given class

# Converting Gender to numerical df.Gender.replace(['F', 'M'], [1, 0], inplace=True) df.Gender.head()

from sklearn import tree # Building data for decision tree classifier target = df.PURCHASE_OF_ART_HISTORY_OF_FLORENCE data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], 1) clf = tree.DecisionTreeClassifier(max_depth=1) clf = clf.fit(data, target)

tree.plot_tree(clf,filled=True,feature_names = data.columns) # r = tree.export_text(clf, feature_names=list(data.columns.values)) # Displaying the beginning of the decision tree (too long otherwise!) # print('\n'.join(r.split('\n')[:7]))

# Model learning error from sklearn.metrics import confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay import matplotlib.pyplot as plt # calc confusion matrix y_test_predict = clf.predict(data) m = ConfusionMatrixDisplay(confusion_matrix(target, y_test_predict),display_labels=clf.classes_).plot() m.ax_.set_title("Confusion Matrix \n") plt.show() print("The precision for this decision tree is ", precision_score(target, y_test_predict))

for i in range(1, 10, 2): clf = tree.DecisionTreeClassifier(max_depth=i) clf = clf.fit(data, target) y_test_predict = clf.predict(data) m = ConfusionMatrixDisplay(confusion_matrix(target, y_test_predict),display_labels=clf.classes_).plot() title = f"MAX-DEPTH = {i} - Confusion Matrix \n" m.ax_.set_title(title) m.ax_.cmap = 'plasma' plt.show() print("The precision for this decision tree is ", precision_score(target, y_test_predict))

from sklearn.model_selection import train_test_split

# function to compute error rates def get_error_rates(model, parameter, steps, internal_loops, print_interval): test_scores = {} train_scores = {} model_p = {} for i in range(*steps): loops = internal_loops train_score = 0 test_score = 0 for j in range(loops): X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25) model_p[parameter] = i clf = model(**model_p) clf = clf.fit(X_train, y_train) y_test_predict = clf.predict(X_test) test_score += clf.score(X_test, y_test) y_train_predict = clf.predict(X_train) train_score += clf.score(X_train, y_train) if i % print_interval == 0: print(f"####### N_NEIGHBORS = {i} #######") print(f"Average training precision is {train_score/loops}") print(f"Average testing precision is {test_score/loops}\n") test_scores[i] = 1 - test_score/loops train_scores[i] = 1 - train_score/loops return train_scores, test_scores

DT_train_scores, DT_test_scores = get_error_rates(tree.DecisionTreeClassifier, "max_depth", (1, 50), 3, 10)

# Plotting the error rates import matplotlib.pylab as plt def plot_error_rates(tr_scores, te_scores, title, reverse=False): test_list = te_scores.items() test_list = sorted(test_list) x1, y1 = zip(*test_list) train_list = tr_scores.items() train_list = sorted(train_list) x2, y2 = zip(*train_list) plt.plot(x1, y1, label="Test") plt.plot(x2, y2, label="Train") if (reverse): plt.gca().invert_xaxis() plt.xlabel('PARAMETER') plt.ylabel('Error rates') plt.title(title) plt.legend() plt.show()

plot_error_rates(DT_train_scores, DT_test_scores,"Average accuracy in terms of the max-depth parameter")

from sklearn.neighbors import KNeighborsClassifier KNN_train_scores, KNN_test_scores = get_error_rates(KNeighborsClassifier, "n_neighbors", (1, 30), 1, 10)

plot_error_rates(KNN_train_scores, KNN_test_scores,"Average error rate in terms of the n_neighbors parameter",True)

from sklearn.ensemble import RandomForestClassifier RFC_train_scores, RFC_test_scores = get_error_rates(RandomForestClassifier, "n_estimators", (1, 30), 2, 10)

plot_error_rates(RFC_train_scores, RFC_test_scores,"Average accuracy in terms of the n_estimators parameter")

from sklearn.metrics import mean_squared_error # function to compute error rates def get_error_rates_regressor(model, parameter, steps, internal_loops, print_interval): test_scores = {} train_scores = {} model_p = {} for i in range(*steps): loops = internal_loops train_score = 0 test_score = 0 for j in range(loops): X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25) model_p[parameter] = i clf = model(**model_p) clf = clf.fit(X_train, y_train) y_test_predict = clf.predict(X_test) test_score += mean_squared_error(y_test, y_test_predict) y_train_predict = clf.predict(X_train) train_score += mean_squared_error(y_train, y_train_predict) if i % print_interval == 0: print(f"####### N_NEIGHBORS = {i} #######") print(f"Average mean squared error on training is {train_score/loops}") print(f"Average mean squared error on testing is {test_score/loops}\n") test_scores[i] = test_score/loops train_scores[i] = train_score/loops return train_scores, test_scores

# import the regressor from sklearn.tree import DecisionTreeRegressor # Building data for decision tree Regressor target = df.AMOUNT_ART_HISTORY_OF_FLORENCE data = df.drop(['AMOUNT_ART_HISTORY_OF_FLORENCE', 'PURCHASE_OF_ART_HISTORY_OF_FLORENCE'], 1) DT_train_scores, DT_test_scores = get_error_rates_regressor(DecisionTreeRegressor, "max_depth", (1, 50), 3, 10)

plot_error_rates(DT_train_scores, DT_test_scores,"Mean squared error in terms of the max-depth parameter")

from sklearn.neighbors import KNeighborsRegressor KNN_train_scores, KNN_test_scores = get_error_rates_regressor(KNeighborsRegressor, "n_neighbors", (1, 30), 1, 10)

plot_error_rates(KNN_train_scores, KNN_test_scores,"Mean squared error in terms of the n_neighbors parameter",True)

from sklearn.ensemble import RandomForestRegressor RFR_train_scores, RFR_test_scores = get_error_rates_regressor(RandomForestRegressor, "n_estimators", (1, 40, 2), 2, 5)

plot_error_rates(RFR_train_scores, RFR_test_scores,"Mean squared error in terms of the n_neighbors parameter")

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Quick analysis of some values

Using a decision tree to predict a given class

Quick analysis of some values