Image borrowed from: https://www.statstest.com/paired-samples-t-test/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from collections import Counter
data = pd.read_csv('data.csv')
data.head()
data_shuffled = data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe
X = data_shuffled[['x1', 'x2']].copy()
y = data_shuffled['y'].copy()
data['y'].value_counts()
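Since matplotlib and seaborn are already imported, the class balance can also be checked visually; a minimal sketch (assuming the label column is named 'y', as above):
# quick visual check of how balanced the classes are
sns.countplot(x='y', data=data)
plt.title("Class distribution")
plt.show()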
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, test_size=0.2)
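For imbalanced labels it can be worth stratifying the split; this optional variant overwrites the split above with a stratified, reproducible one (stratify and random_state are standard train_test_split parameters):
# optional: preserve the class proportions in both subsets and fix the random seed
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, test_size=0.2, stratify=y, random_state=42)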
from sklearn.linear_model import LogisticRegression
# Create an instance of Logistic Regression classifier
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
# Train the model with the train data
lr.fit(X_train, y_train)
# Predict the labels of the validation data
y_pred_lr = lr.predict(X_val)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Predict the labels of the validation data
y_pred_gnb = gnb.predict(X_val)
def f1_score(y_true, y_pred, labels=None, average=None):
"""
y_true : 1d array-like Ground truth (correct) target values.
y_pred : 1d array-like Estimated targets as returned by a classifier.
labels : list of unique labels for sort out the result
average : string, ['micro', 'macro'] instead of defining labels.
"""
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)
    if labels is None:
        if average == 'micro':
            tp_fp_all = len(y_pred)
            tp_fn_all = len(y_true)
            tp_all = 0
            for i in range(len(y_pred)):
                if y_true[i] == y_pred[i]:
                    tp_all += 1
            precision = tp_all/tp_fp_all
            recall = tp_all/tp_fn_all
            micro_f1 = 2 * (precision*recall) / (precision+recall)
            return micro_f1  # the score with micro averaging
        elif average == 'macro':
            score_list = []
            count_pred = Counter(y_pred)
            count_true = Counter(y_true)
            # use every label that appears in either the true or the predicted values
            labels = set(count_true) | set(count_pred)
            for label in labels:
                tp_fp = count_pred[label]  # predicted positives
                tp_fn = count_true[label]  # actual positives
                tp = 0
                for i in range(len(y_true)):
                    if (y_true[i] == label) and (y_pred[i] == label):
                        tp += 1
                precision = tp/tp_fp if tp_fp else 0.0
                recall = tp/tp_fn if tp_fn else 0.0
                f1 = 2 * (precision*recall) / (precision+recall) if (precision+recall) else 0.0
                score_list.append(f1)
            macro_score = np.average(score_list)
            return macro_score  # the score with macro averaging
    else:
        score_list = []
        count_pred = Counter(y_pred)
        count_true = Counter(y_true)
        for label in labels:
            tp_fp = count_pred[label]  # predicted positives
            tp_fn = count_true[label]  # actual positives
            tp = 0
            for i in range(len(y_true)):
                if (y_true[i] == label) and (y_pred[i] == label):
                    tp += 1
            precision = tp/tp_fp if tp_fp else 0.0
            recall = tp/tp_fn if tp_fn else 0.0
            f1 = 2 * (precision*recall) / (precision+recall) if (precision+recall) else 0.0
            score_list.append(f1)
        return score_list  # list with one F1 score per requested label
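A tiny toy example of the per-label mode (the labels argument), which is not exercised elsewhere in this notebook; the inputs below are made up purely for illustration:
# returns one F1 score per listed label instead of a single averaged score
toy_true = ['a', 'a', 'b', 'b', 'c']
toy_pred = ['a', 'b', 'b', 'b', 'c']
print(f1_score(toy_true, toy_pred, labels=['a', 'b', 'c']))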
f1_lr = f1_score(y_val, y_pred_lr, average='macro')
print("F1 score of the Logistic Regression Model: %f" %f1_lr)
f1_gnb = f1_score(y_val, y_pred_gnb, average='macro')
print("F1 score of the Gaussian Naive Bayes Model: %f" %f1_gnb)
data_shuffled = data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe
X = data_shuffled[['x1', 'x2']].copy()
y = data_shuffled['y'].copy()
macro_f1_lr = []
k = 10
for fold_nbr in range(k):
    split_point_1 = int(float(fold_nbr)/k*len(X))
    split_point_2 = int(float(fold_nbr+1)/k*len(X))
    X_train = pd.concat([X.iloc[:split_point_1], X.iloc[split_point_2:]])
    y_train = pd.concat([y.iloc[:split_point_1], y.iloc[split_point_2:]])
    X_val = X.iloc[split_point_1:split_point_2]
    y_val = list(y.iloc[split_point_1:split_point_2])
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    # Train the model on the training folds
    lr.fit(X_train, y_train)
    # Apply the classifier to the validation fold
    predicted = lr.predict(X_val)
    # Calculate the macro-F1 score
    macro_f1_lr.append(f1_score(y_val, predicted, average='macro'))
macro_f1_gnb = []
k = 10
for fold_nbr in range(k):
    split_point_1 = int(float(fold_nbr)/k*len(X))
    split_point_2 = int(float(fold_nbr+1)/k*len(X))
    X_train = pd.concat([X.iloc[:split_point_1], X.iloc[split_point_2:]])
    y_train = pd.concat([y.iloc[:split_point_1], y.iloc[split_point_2:]])
    X_val = X.iloc[split_point_1:split_point_2]
    y_val = list(y.iloc[split_point_1:split_point_2])
    gnb = GaussianNB()
    # Train the model on the training folds
    gnb.fit(X_train, y_train)
    # Apply the classifier to the validation fold
    predicted = gnb.predict(X_val)
    # Calculate the macro-F1 score
    macro_f1_gnb.append(f1_score(y_val, predicted, average='macro'))
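The two manual loops above can also be expressed with scikit-learn's cross-validation utilities; a sketch assuming the same X, y and k (KFold and cross_val_score are standard sklearn.model_selection APIs, and scoring='f1_macro' uses scikit-learn's macro-F1 rather than the custom function):
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=k, shuffle=False)  # folds taken in order, as in the manual loops
cv_f1_lr = cross_val_score(LogisticRegression(multi_class='ovr', solver='liblinear'), X, y, cv=kf, scoring='f1_macro')
cv_f1_gnb = cross_val_score(GaussianNB(), X, y, cv=kf, scoring='f1_macro')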
plt.plot(macro_f1_gnb, label='Gaussian Naive Bayes', c='tab:blue')
plt.plot(macro_f1_lr, label='Logistic Regression', c='tab:orange')
plt.title("Macro-F1 score (Logistic Regression vs Gaussian Naive Bayes)")
plt.legend(loc='upper right', bbox_to_anchor=(1.5, 1))
plt.show()
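Before running the significance test, it can help to summarise the ten fold scores; a short sketch:
print("Logistic Regression:  mean=%.4f, std=%.4f" % (np.mean(macro_f1_lr), np.std(macro_f1_lr, ddof=1)))
print("Gaussian Naive Bayes: mean=%.4f, std=%.4f" % (np.mean(macro_f1_gnb), np.std(macro_f1_gnb, ddof=1)))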
def paired_ttest(data1, data2, alpha=0.05, alternative='two-sided'):
"""
Calculate the t-test on TWO RELATED samples of scores, data1 and data2.
This is a test for the null hypothesis that two related or
repeated samples have identical average (expected) values.
Parameters
----------
data1, data2 : array_like
The arrays must have the same shape.
alpha : int or float, default: 0.05
The significance level
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis.
The following options are available (default is 'two-sided'):
* 'two-sided': the means of the distributions underlying the samples
are unequal.
* 'less': the mean of the distribution underlying the first sample
is less than the mean of the distribution underlying the second
sample.
* 'greater': the mean of the distribution underlying the first
sample is greater than the mean of the distribution underlying
the second sample.
Returns
-------
t_stat : The t-statistic.
p : The p-value associated with the given alternative.
df : The number of degrees of freedom used in calculation of the
t-statistic; this is one less than the size of the sample
cv: critical value
"""
    if len(data1) != len(data2):
        raise Exception("The size of data1 and data2 should be the same")
    data1 = np.array(data1)
    data2 = np.array(data2)
    # calculate the means and standard deviations of data1 and data2
    mean1, mean2 = np.average(data1), np.average(data2)
    std1, std2 = np.std(data1, ddof=1), np.std(data2, ddof=1)
    # sample size
    n = len(data1)
    # degrees of freedom
    df = n-1
    diff = data1 - data2
    mean_diff = np.average(diff)
    # standard deviation of the differences
    s_diff = np.sqrt((1/(n-1))*(sum((diff-mean_diff)**2)))
    # t-test statistic
    t_stat = mean_diff/(s_diff/np.sqrt(n))
    # calculate the p-value for the requested alternative
    if alternative == 'two-sided':
        p = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df))
    elif alternative == 'greater':
        p = 1.0 - stats.t.cdf(t_stat, df)
    elif alternative == 'less':
        p = stats.t.cdf(t_stat, df)
    # calculate the lower- and upper-tail critical values
    cv = (stats.t.ppf(alpha, df), stats.t.ppf(1.0 - alpha, df))
    # return everything
    return t_stat, p, df, cv
ttest, p_value, df, cv = paired_ttest(macro_f1_gnb, macro_f1_lr, alpha=0.05, alternative='greater')
print("The test statistic: ", ttest)
print("The p value:%.8f" % p_value)
if p_value < 0.05:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")
ttest, p_value = stats.ttest_rel(macro_f1_gnb, macro_f1_lr, alternative='greater')
print("The test statistic: ", ttest)
print("The p value:%.8f" % p_value)
if p_value < 0.05:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")