[Figure: illustration of the paired-samples t-test, borrowed from https://www.statstest.com/paired-samples-t-test/]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from collections import Counter
data = pd.read_csv('data.csv')
data.head()
data_shuffled = data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe (drop=True avoids keeping the old index as a column)
X = data_shuffled[['x1', 'x2']].copy()
y = data_shuffled['y'].copy()
data['y'].value_counts()
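Before fitting any model it is worth checking how balanced the classes are; the small optional sketch below (using the seaborn import from the top, and assuming the label column is named 'y' as in the data loaded above) shows the same information as value_counts() graphically.
sns.countplot(x='y', data=data)  # bar chart of how often each class occurs
plt.title("Class distribution of y")
plt.show()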
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, test_size=0.2)
from sklearn.linear_model import LogisticRegression
# Create an instance of Logistic Regression classifier
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# Train the model with the train data
lr.fit(X_train, y_train)
# Predict the labels of the validation data
y_pred_lr = lr.predict(X_val)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Predict the labels of the validation data
y_pred_gnb = gnb.predict(X_val)
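As a quick optional check before the F1 implementation below, plain accuracy on the validation split can be printed for both models; accuracy alone can be misleading when the classes are imbalanced, which is why macro-F1 is used afterwards.
# fraction of validation samples where the prediction matches the true label
print("Validation accuracy (LR):  %.4f" % np.mean(y_pred_lr == np.asarray(y_val)))
print("Validation accuracy (GNB): %.4f" % np.mean(y_pred_gnb == np.asarray(y_val)))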
def f1_score(y_true, y_pred, labels=None, average=None):
    """
    y_true : 1d array-like. Ground truth (correct) target values.
    y_pred : 1d array-like. Estimated targets as returned by a classifier.
    labels : list of labels to report a per-label F1 score for.
    average : string, {'micro', 'macro'}. Averaging strategy to use when
        `labels` is not given.
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)
    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)
    
    if labels is None:
        if average == 'micro':
            tp_fp_all = len(y_pred)
            tp_fn_all = len(y_true)
            tp_all = 0
            for i in range(len(y_pred)):
                if y_true[i] == y_pred[i]:
                    tp_all += 1
            precision = tp_all/tp_fp_all
            recall = tp_all/tp_fn_all
            micro_f1 = 2 * (precision*recall) / (precision+recall)
            return micro_f1 # the score with micro averaging
        elif average == 'macro':
            score_list = []
            count_pred = Counter(y_pred)
            count_true = Counter(y_true)
            # use every label seen in either y_true or y_pred, so that
            # classes that are never predicted still count towards the average
            labels = sorted(set(count_true) | set(count_pred))
            for label in labels:
                tp_fp = count_pred[label]   # predicted positives for this label
                tp_fn = count_true[label]   # actual positives for this label
                tp = 0
                for i in range(len(y_true)):
                    if (y_true[i] == label) and (y_pred[i] == label):
                        tp += 1
                precision = tp/tp_fp if tp_fp else 0.0
                recall = tp/tp_fn if tp_fn else 0.0
                f1 = 0.0 if precision + recall == 0 else 2 * (precision*recall) / (precision+recall)
                score_list.append(f1)
            macro_score = np.average(score_list)
            return macro_score # the score with macro averaging
    else:
        score_list = []
        count_pred = Counter(y_pred)
        count_true = Counter(y_true)
        for label in labels:
            tp_fp = count_pred[label]
            tp_fn = count_true[label]
            tp = 0
            for i in range(len(y_true)):
                if (y_true[i] == label) and (y_pred[i] == label):
                    tp += 1
            precision = tp/tp_fp if tp_fp else 0.0
            recall = tp/tp_fn if tp_fn else 0.0
            f1 = 0.0 if precision + recall == 0 else 2 * (precision*recall) / (precision+recall)
            score_list.append(f1)

        return score_list # list with the F1 score of each label
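As a sanity check, the implementation above can be compared against sklearn.metrics.f1_score on a small made-up example (the toy labels below are invented purely for illustration); the two macro scores should agree.
from sklearn.metrics import f1_score as sk_f1_score   # aliased to avoid shadowing the custom function
y_true_demo = ['a', 'a', 'b', 'b', 'c']   # toy ground-truth labels
y_pred_demo = ['a', 'b', 'b', 'b', 'c']   # toy predictions
print(f1_score(y_true_demo, y_pred_demo, average='macro'))      # custom implementation
print(sk_f1_score(y_true_demo, y_pred_demo, average='macro'))   # scikit-learn reference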
f1_lr = f1_score(y_val, y_pred_lr, average='macro')
print("F1 score of the Logistic Regression Model: %f" %f1_lr)
f1_gnb = f1_score(y_val, y_pred_gnb, average='macro')
print("F1 score of the Gaussian Naive Bayes Model: %f" %f1_gnb)
data_shuffled = data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe
X = data_shuffled[['x1', 'x2']].copy()
y = data_shuffled['y'].copy()
macro_f1_lr = []
k = 10
for fold_nbr in range(k):
    split_point_1 = int(float(fold_nbr)/k*len(X))
    split_point_2 = int(float(fold_nbr+1)/k*len(X))
    X_train = pd.concat([X.iloc[:split_point_1], X.iloc[split_point_2:]])
    y_train = pd.concat([y.iloc[:split_point_1], y.iloc[split_point_2:]])
    X_val = X.iloc[split_point_1:split_point_2]
    y_val = list(y.iloc[split_point_1:split_point_2])
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    # Train the model with the train data
    lr.fit(X_train, y_train)
    # apply the classifier to validation data
    predicted = lr.predict(X_val)
    
    # calculate the macro-F1 score for this fold
    macro_f1_lr.append(f1_score(y_val, predicted, average='macro'))
macro_f1_gnb = []
k = 10
for fold_nbr in range(k):
    split_point_1 = int(float(fold_nbr)/k*len(X))
    split_point_2 = int(float(fold_nbr+1)/k*len(X))
    X_train = pd.concat([X.iloc[:split_point_1], X.iloc[split_point_2:]])
    y_train = pd.concat([y.iloc[:split_point_1], y.iloc[split_point_2:]])
    X_val = X.iloc[split_point_1:split_point_2]
    y_val = list(y.iloc[split_point_1:split_point_2])
    gnb = GaussianNB()
    # Train the model with the train data
    gnb.fit(X_train, y_train)
    # apply the classifier to validation data
    predicted = gnb.predict(X_val)
    
    # calculate the macro-F1 score for this fold
    macro_f1_gnb.append(f1_score(y_val, predicted, average='macro'))
plt.plot(macro_f1_gnb, label='Gaussian Naive Bayes', c='tab:blue')
plt.plot(macro_f1_lr, label='Logistic Regression', c='tab:orange')
plt.title("Macro-F1 score (Logistic Regression vs Gaussian Naive Bayes)")
plt.legend(loc='upper right', bbox_to_anchor=(1.5, 1))
plt.show()
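The per-fold curves are easier to compare with a single summary number; an optional addition is the mean and sample standard deviation of the ten macro-F1 scores computed above.
print("Logistic Regression:  %.4f +/- %.4f" % (np.mean(macro_f1_lr), np.std(macro_f1_lr, ddof=1)))
print("Gaussian Naive Bayes: %.4f +/- %.4f" % (np.mean(macro_f1_gnb), np.std(macro_f1_gnb, ddof=1)))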
def paired_ttest(data1, data2, alpha=0.05, alternative='two-sided'):
    """
    Calculate the t-test on TWO RELATED samples of scores, data1 and data2.
    This is a test for the null hypothesis that two related or
    repeated samples have identical average (expected) values.
    Parameters
    ----------
    data1, data2 : array_like
        The arrays must have the same shape.
    alpha : float, default: 0.05
        The significance level.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):
        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.
    Returns
    -------
        t_stat : The t-statistic.
        p : The p-value associated with the given alternative.
        df : The number of degrees of freedom used in calculation of the
            t-statistic; this is one less than the sample size.
        cv : The critical value(s) at the given significance level.
    """
    if len(data1) != len(data2):
        raise ValueError("data1 and data2 must have the same length")
        
    data1 = np.array(data1)
    data2 = np.array(data2)
    
    # sample size
    n = len(data1)
    
    # degrees of freedom
    df = n-1
    
    diff = data1 - data2
    mean_diff = np.average(diff)
    
    # standard deviation of the differences
    s_diff = np.sqrt((1/(n-1))*(sum((diff-mean_diff)**2)))
    # t-test statistic
    t_stat = mean_diff/(s_diff/np.sqrt(n))
    
    # calculate the p-value
    if alternative == 'two-sided':
        p = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df))
    elif alternative == 'greater':
        # one-sided: probability of a t-statistic at least this large
        p = 1.0 - stats.t.cdf(t_stat, df)
    elif alternative == 'less':
        # one-sided: probability of a t-statistic at most this small
        p = stats.t.cdf(t_stat, df)
    # calculate the critical value(s) for the given alternative
    if alternative == 'two-sided':
        cv = (stats.t.ppf(alpha/2.0, df), stats.t.ppf(1.0 - alpha/2.0, df))
    elif alternative == 'greater':
        cv = stats.t.ppf(1.0 - alpha, df)
    else:
        cv = stats.t.ppf(alpha, df)
    # return everything
    return t_stat, p, df, cv
ttest, p_value, df, cv = paired_ttest(macro_f1_gnb, macro_f1_lr, alpha=0.05, alternative='greater')
print("The test statistic: ", ttest)
print("The p value: %.8f" % p_value)
if p_value < 0.05:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")
ttest, p_value = stats.ttest_rel(macro_f1_gnb, macro_f1_lr, alternative='greater')
print("The test statistic: ", ttest)
print("The p value:%.8f" % p_value)
if p_value <0.05:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")