[Figure: illustration of the paired-samples t-test, borrowed from https://www.statstest.com/paired-samples-t-test/]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from collections import Counter
data = pd.read_csv('data.csv')
data.head()
data_shuffled = data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe (drop=True avoids keeping the old index as a column)
X = data_shuffled[['x1', 'x2']].copy()
y = data_shuffled['y'].copy()
data['y'].value_counts()
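Before fitting any model it is worth checking how balanced the classes are; the small optional sketch below (using the seaborn import from the top, and assuming the label column is named 'y' as in the data loaded above) shows the same information as value_counts() graphically.
sns.countplot(x='y', data=data)  # bar chart of how often each class occurs
plt.title("Class distribution of y")
plt.show()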
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, test_size=0.2)
from sklearn.linear_model import LogisticRegression
# Create an instance of Logistic Regression classifier
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# Train the model with the train data
lr.fit(X_train, y_train)
# Predict the labels of the validation data
y_pred_lr = lr.predict(X_val)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Predict the labels of the validation data
y_pred_gnb = gnb.predict(X_val)
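As a quick optional check before the F1 implementation below, plain accuracy on the validation split can be printed for both models; accuracy alone can be misleading when the classes are imbalanced, which is why macro-F1 is used afterwards.
# fraction of validation samples where the prediction matches the true label
print("Validation accuracy (LR):  %.4f" % np.mean(y_pred_lr == np.asarray(y_val)))
print("Validation accuracy (GNB): %.4f" % np.mean(y_pred_gnb == np.asarray(y_val)))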
def f1_score(y_true, y_pred, labels=None, average=None):
    """
    y_true : 1d array-like. Ground truth (correct) target values.
    y_pred : 1d array-like. Estimated targets as returned by a classifier.
    labels : list of labels to report a per-label F1 score for.
    average : string, {'micro', 'macro'}. Averaging strategy to use when
        `labels` is not given.
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)
    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)
    
    if labels is None:
        if average == 'micro':
            tp_fp_all = len(y_pred)
            tp_fn_all = len(y_true)
            tp_all = 0
            for i in range(len(y_pred)):
                if y_true[i] == y_pred[i]:
                    tp_all += 1
            precision = tp_all/tp_fp_all
            recall = tp_all/tp_fn_all
            micro_f1 = 2 * (precision*recall) / (precision+recall)
            return micro_f1 # the score with micro averaging
        elif average == 'macro':
            score_list = []
            count_pred = Counter(y_pred)
            count_true = Counter(y_true)
            # use every label seen in either y_true or y_pred, so that
            # classes that are never predicted still count towards the average
            labels = sorted(set(count_true) | set(count_pred))
            for label in labels:
                tp_fp = count_pred[label]   # predicted positives for this label
                tp_fn = count_true[label]   # actual positives for this label
                tp = 0
                for i in range(len(y_true)):
                    if (y_true[i] == label) and (y_pred[i] == label):
                        tp += 1
                precision = tp/tp_fp if tp_fp else 0.0
                recall = tp/tp_fn if tp_fn else 0.0
                f1 = 0.0 if precision + recall == 0 else 2 * (precision*recall) / (precision+recall)
                score_list.append(f1)
            macro_score = np.average(score_list)
            return macro_score # the score with macro averaging
    else:
        score_list = []
        count_pred = Counter(y_pred)
        count_true = Counter(y_true)
        for label in labels:
            tp_fp = count_pred[label]
            tp_fn = count_true[label]
            tp = 0
            for i in range(len(y_true)):
                if (y_true[i] == label) and (y_pred[i] == label):
                    tp += 1
            precision = tp/tp_fp if tp_fp else 0.0
            recall = tp/tp_fn if tp_fn else 0.0
            f1 = 0.0 if precision + recall == 0 else 2 * (precision*recall) / (precision+recall)
            score_list.append(f1)

        return score_list # list with the F1 score of each label
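As a sanity check, the implementation above can be compared against sklearn.metrics.f1_score on a small made-up example (the toy labels below are invented purely for illustration); the two macro scores should agree.
from sklearn.metrics import f1_score as sk_f1_score   # aliased to avoid shadowing the custom function
y_true_demo = ['a', 'a', 'b', 'b', 'c']   # toy ground-truth labels
y_pred_demo = ['a', 'b', 'b', 'b', 'c']   # toy predictions
print(f1_score(y_true_demo, y_pred_demo, average='macro'))      # custom implementation
print(sk_f1_score(y_true_demo, y_pred_demo, average='macro'))   # scikit-learn reference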
f1_lr = f1_score(y_val, y_pred_lr, average='macro')
print("F1 score of the Logistic Regression Model: %f" %f1_lr)
f1_gnb = f1_score(y_val, y_pred_gnb, average='macro')
print("F1 score of the Gaussian Naive Bayes Model: %f" %f1_gnb)
data_shuffled = data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe
X = data_shuffled[['x1', 'x2']].copy()
y = data_shuffled['y'].copy()
macro_f1_lr = []
k = 10
for fold_nbr in range(k):
    split_point_1 = int(float(fold_nbr)/k*len(X))
    split_point_2 = int(float(fold_nbr+1)/k*len(X))
    X_train = pd.concat([X.iloc[:split_point_1], X.iloc[split_point_2:]])
    y_train = pd.concat([y.iloc[:split_point_1], y.iloc[split_point_2:]])
    X_val = X.iloc[split_point_1:split_point_2]
    y_val = list(y.iloc[split_point_1:split_point_2])
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    # Train the model with the train data
    lr.fit(X_train, y_train)
    # apply the classifier to validation data
    predicted = lr.predict(X_val)
    
    # calculate the macro-F1 score for this fold
    macro_f1_lr.append(f1_score(y_val, predicted, average='macro'))
macro_f1_gnb = []
k = 10
for fold_nbr in range(k):
    split_point_1 = int(float(fold_nbr)/k*len(X))
    split_point_2 = int(float(fold_nbr+1)/k*len(X))
    X_train = pd.concat([X.iloc[:split_point_1], X.iloc[split_point_2:]])
    y_train = pd.concat([y.iloc[:split_point_1], y.iloc[split_point_2:]])
    X_val = X.iloc[split_point_1:split_point_2]
    y_val = list(y.iloc[split_point_1:split_point_2])
    gnb = GaussianNB()
    # Train the model with the train data
    gnb.fit(X_train, y_train)
    # apply the classifier to validation data
    predicted = gnb.predict(X_val)
    
    # calculate the macro-F1 score for this fold
    macro_f1_gnb.append(f1_score(y_val, predicted, average='macro'))
plt.plot(macro_f1_gnb, label='Gaussian Naive Bayes', c='tab:blue')
plt.plot(macro_f1_lr, label='Logistic Regression', c='tab:orange')
plt.title("Macro-F1 score (Logistic Regression vs Gaussian Naive Bayes)")
plt.legend(loc='upper right', bbox_to_anchor=(1.5, 1))
plt.show()
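The per-fold curves are easier to compare with a single summary number; an optional addition is the mean and sample standard deviation of the ten macro-F1 scores computed above.
print("Logistic Regression:  %.4f +/- %.4f" % (np.mean(macro_f1_lr), np.std(macro_f1_lr, ddof=1)))
print("Gaussian Naive Bayes: %.4f +/- %.4f" % (np.mean(macro_f1_gnb), np.std(macro_f1_gnb, ddof=1)))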
def paired_ttest(data1, data2, alpha=0.05, alternative='two-sided'):
    """
    Calculate the t-test on TWO RELATED samples of scores, data1 and data2.
    This is a test for the null hypothesis that two related or
    repeated samples have identical average (expected) values.
    Parameters
    ----------
    data1, data2 : array_like
        The arrays must have the same shape.
    alpha : float, default: 0.05
        The significance level.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):
        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.
    Returns
    -------
        t_stat : The t-statistic.
        p : The p-value associated with the given alternative.
        df : The number of degrees of freedom used in calculation of the
            t-statistic; this is one less than the sample size.
        cv : The critical value(s) at the given significance level.
    """
    if len(data1) != len(data2):
        raise ValueError("data1 and data2 must have the same length")
        
    data1 = np.array(data1)
    data2 = np.array(data2)
    
    # sample size
    n = len(data1)
    
    # degrees of freedom
    df = n-1
    
    diff = data1 - data2
    mean_diff = np.average(diff)
    
    # standard deviation of the differences
    s_diff = np.sqrt((1/(n-1))*(sum((diff-mean_diff)**2)))
    # t-test statistic
    t_stat = mean_diff/(s_diff/np.sqrt(n))
    
    # calculate the p-value
    if alternative == 'two-sided':
        p = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df))
    elif alternative == 'greater':
        # one-sided: probability of a t-statistic at least this large
        p = 1.0 - stats.t.cdf(t_stat, df)
    elif alternative == 'less':
        # one-sided: probability of a t-statistic at most this small
        p = stats.t.cdf(t_stat, df)
    # calculate the critical value(s) for the given alternative
    if alternative == 'two-sided':
        cv = (stats.t.ppf(alpha/2.0, df), stats.t.ppf(1.0 - alpha/2.0, df))
    elif alternative == 'greater':
        cv = stats.t.ppf(1.0 - alpha, df)
    else:
        cv = stats.t.ppf(alpha, df)
    # return everything
    return t_stat, p, df, cv
ttest, p_value, df, cv = paired_ttest(macro_f1_gnb, macro_f1_lr, alpha=0.05, alternative='greater')
print("The test statistic: ", ttest)
print("The p value: %.8f" % p_value)
if p_value < 0.05:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")
ttest, p_value = stats.ttest_rel(macro_f1_gnb, macro_f1_lr, alternative='greater')
print("The test statistic: ", ttest)
print("The p value:%.8f" % p_value)
if p_value <0.05:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")