# Activate big tests: when True, optimal_filter_limit runs its full (slow)
# grid search over min_df/max_df values; when False it is skipped entirely.
big_tests = True

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from operator import itemgetter
from os import path, scandir
from pathlib import Path
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Download and extract the SpamAssassin public corpus; skip if data is already present.
# NOTE: the `!`-prefixed lines are IPython/Jupyter shell magics — this cell only
# runs inside a notebook, not as plain Python.
# NOTE(review): the guard only checks for the spam tarball, but all three
# archives are downloaded — presumably they always arrive together; verify.
if not path.exists("20021010_spam.tar.bz2"):
    !wget https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2
    !wget https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2
    !wget https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2
    !tar -xjf 20021010_easy_ham.tar.bz2
    !tar -xjf 20021010_hard_ham.tar.bz2
    !tar -xjf 20021010_spam.tar.bz2

# List the working directory to confirm extraction (notebook shell magic).
!ls -lah

# Reads a folder and returns a list of the contents of all the files in that folder.
# Each entry of the list is the content of one file, decoded as latin-1.
# path: folder to scan. (The name shadows the module-level `os.path` import
#       inside this function; kept for backward compatibility with callers.)
# NOTE(review): assumes the folder contains only regular files — scandir would
# also yield subdirectories; confirm against the corpus layout.
def file_list(path):
    files = []
    for entry in scandir(path):
        # `with` guarantees the handle is closed; the original's bare
        # `f.closed` expression was a no-op, not a close call.
        with open(entry, 'r', encoding="latin1") as f:
            files.append(f.read())
    return files

# Read all the mails from the extracted corpus folders into lists of strings
# (one entry per mail file).
list_easy = file_list(r'./easy_ham')
list_hard = file_list(r'./hard_ham')
list_spam = file_list(r'./spam')
# Fixed random seed so every train/test split below is reproducible.
state = 0
# Fraction of each corpus held out for testing.
test = 0.3
# Split each list into training and test data, 70% to train and 30% to test.
easy_train, easy_test = train_test_split(list_easy, test_size = test, random_state=state)
hard_train, hard_test = train_test_split(list_hard, test_size = test, random_state=state)
spam_train, spam_test = train_test_split(list_spam, test_size = test, random_state=state)

# Creates a prediction model to train and predict whether given mails are spam or not.
# hamtrain/spamtrain: lists of mail bodies (strings) used for training
# hamtest/spamtest:   lists of mail bodies (strings) used for evaluation
# classificator: 'multi' for the Multinomial or 'bern' for the Bernoulli model
# min_df: minimum document frequency of a word for it to be used to classify mails
# max_df: maximum document frequency of a word for it to be used to classify mails
# Returns: 2x2 confusion matrix (rows = true ham/spam, cols = predicted ham/spam)
# Raises: NameError if classificator is neither 'multi' nor 'bern'
def probability(hamtrain, spamtrain, hamtest, spamtest, classificator, min_df, max_df):
    # Create vectorizer with the requested document-frequency filter.
    vect = CountVectorizer(min_df=min_df, max_df=max_df)
    # Label 0 = ham, 1 = spam; ordering must match the concatenations below.
    y_train = [0] * len(hamtrain) + [1] * len(spamtrain)
    x_train = vect.fit_transform(hamtrain + spamtrain)
    # Ground-truth labels for the test mails.
    y_truth = [0] * len(hamtest) + [1] * len(spamtest)
    # Select the model depending on keyword.
    if classificator == 'multi':
        model = MultinomialNB().fit(x_train, y_train)
    elif classificator == 'bern':
        model = BernoulliNB().fit(x_train, y_train)
    else:
        # Bug fix: the original referenced the undefined name `distribution`,
        # which itself raised a NameError for the wrong reason.
        raise NameError(classificator + ' is not a viable distribution')
    # Predict on the held-out mails and compare with the truth.
    y_predict = model.predict(vect.transform(hamtest + spamtest))
    return confusion_matrix(y_truth, y_predict)

# Draw a confusion matrix as an annotated heatmap on one subplot axis.
# cm:   2x2 confusion matrix to plot
# dist: name of the distribution, used in the title
# data: name of the dataset, used in the title
# ax:   subplot axis on which to draw
def print_cm(cm, dist, data, ax):
    # Raw count for each cell, one string per cell (row-major order).
    cell_counts = ["{0:0.0f}\n".format(v) for v in cm.flatten()]
    # Row-normalized percentages, two strings per row: the first column's
    # share of the row, then the remainder.
    cell_percents = []
    for row in range(2):
        left_share = cm[row][0] / np.sum(cm[row])
        cell_percents.append("{0:.2%}".format(left_share))
        cell_percents.append("{0:.2%}".format(1 - left_share))
    # Fuse count and percentage per cell and shape the labels like cm.
    annotations = np.asarray(
        [cnt + pct for cnt, pct in zip(cell_counts, cell_percents)]
    ).reshape(cm.shape[0], cm.shape[1])
    axis_labels = ['Mail', 'Spam']
    # Draw the heatmap with the combined labels.
    sns.heatmap(cm, annot=annotations, cmap='Blues', xticklabels=axis_labels,
                yticklabels=axis_labels, fmt="", ax=ax)
    # Axis labels and title.
    ax.set_ylabel('Actual label', fontsize=15)
    ax.set_xlabel('Predicted label', fontsize=15)
    ax.set_title("Using the " + dist + " distribution and the " + data + " dataset",
                 fontsize=15)

# The main program to run.
# Trains and evaluates 4 models — the multinomial and the bernoulli classifier,
# each on both the easy and the hard dataset — and plots one confusion matrix
# per model's predictions in a 2x2 grid.
# First 6 arguments are lists of mails represented as strings.
# min_df: minimum document frequency of a word for it to be used to classify mails
# max_df: maximum document frequency of a word for it to be used to classify mails
def spam_prob(e_ham_train, e_ham_test, h_ham_train, h_ham_test, spam_train, spam_test, min_df, max_df):
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Confusion matrices with predictions,\n normalized per row', fontsize=25)
    # (model keyword, title label, ham train set, ham test set, dataset label, axis)
    runs = [
        ('multi', "multinominal", e_ham_train, e_ham_test, "easy", axs[0][0]),
        ('multi', "multinominal", h_ham_train, h_ham_test, "hard", axs[0][1]),
        ('bern', "bernoulli", e_ham_train, e_ham_test, "easy", axs[1][0]),
        ('bern', "bernoulli", h_ham_train, h_ham_test, "hard", axs[1][1]),
    ]
    for keyword, label, ham_tr, ham_te, dataset, axis in runs:
        cm = probability(ham_tr, spam_train, ham_te, spam_test, keyword, min_df, max_df)
        print_cm(cm, label, dataset, axis)

# Baseline run with no document-frequency filtering (min_df=0.0, max_df=1.0).
spam_prob(easy_train, easy_test, hard_train, hard_test, spam_train, spam_test, 0.0, 1.0)

# Extract per-class accuracy from a 2x2 confusion matrix.
# cm: confusion matrix — row 0 is actual ham, row 1 is actual spam.
# Returns (fraction of ham labeled as ham, fraction of spam labeled as spam).
def prob_from_cm(cm):
    ham_row, spam_row = cm[0], cm[1]
    return ham_row[0] / np.sum(ham_row), spam_row[1] / np.sum(spam_row)

# Grid-searches for the optimal min_df / max_df word-filter values.
# Iterates over many (min_df, max_df) pairs, scoring each by the probability of
# a correct prediction across all 4 models (multinomial/bernoulli x easy/hard),
# aggregated with both the arithmetic and the geometric mean.
# First 6 arguments are lists of mails represented as strings.
# Prints the best settings for both means; returns the best (score, min_df,
# max_df) tuple by arithmetic mean, or None when big_tests is disabled.
def optimal_filter_limit(e_ham_train, e_ham_test, h_ham_train, h_ham_test, spam_train, spam_test):
    if not big_tests:
        # Search disabled; mirrors the original's implicit None return.
        return None
    a_t = []  # (arithmetic mean, min_df, max_df)
    g_t = []  # (geometric mean,  min_df, max_df)
    for l in np.arange(0.00, 0.20, 0.05):
        for h in np.arange(l + 0.05, 1.01, 0.05):
            # Bug fix: the original evaluated both hard-dataset models on
            # h_ham_train instead of h_ham_test, scoring them on training data.
            m_ham, m_spam = prob_from_cm(probability(e_ham_train, spam_train, e_ham_test, spam_test, 'multi', l, h))
            m_ham_hard, m_spam_hard = prob_from_cm(probability(h_ham_train, spam_train, h_ham_test, spam_test, 'multi', l, h))
            b_ham, b_spam = prob_from_cm(probability(e_ham_train, spam_train, e_ham_test, spam_test, 'bern', l, h))
            b_ham_hard, b_spam_hard = prob_from_cm(probability(h_ham_train, spam_train, h_ham_test, spam_test, 'bern', l, h))
            scores = (m_ham, m_spam, m_ham_hard, m_spam_hard, b_ham, b_spam, b_ham_hard, b_spam_hard)
            a_t.append((sum(scores) / 8, l, h))
            g_t.append((np.prod(scores) ** (1 / 8), l, h))
    t_at = max(a_t, key=itemgetter(0))
    t_gt = max(g_t, key=itemgetter(0))
    print("Arithmetic mean")
    print("Max probability found:", round(t_at[0], 2))
    print("using settings: ", "min_df =", round(t_at[1], 2), "and max_df =", round(t_at[2], 2))
    print("")
    print("Geometric mean")
    print("Max probability found:", round(t_gt[0], 2))
    print("Using settings: ", "min_df =", round(t_gt[1], 2), "and max_df =", round(t_gt[2], 2))
    # Reuse the already-computed maximum instead of recomputing it.
    return t_at

# Runs the function to calculate the best min and max df.
# TAKES A LONG TIME TO RUN, 4-5 minutes using deepnote.
# Below are the recorded results from a previous run.
df_values = optimal_filter_limit(easy_train, easy_test, hard_train, hard_test, spam_train, spam_test)
#Arithmetic mean
#Max probability found: 0.91
#using settings: min_df = 0.05 and max_df = 0.3
#
#Geometric mean
#Max probability found: 0.91
#Using settings: min_df = 0.05 and max_df = 0.3

# Re-run the four models with the best settings found;
# df_values is a (score, min_df, max_df) tuple.
spam_prob(easy_train, easy_test, hard_train, hard_test, spam_train, spam_test, df_values[1], df_values[2])

# Checks in sequence if 3 following words contain letters not used in the english alphabet, removes the sequences that do
def filter_string(string):
filtered_list =[]
for s in string:
s_r = s.replace('.', '').replace('!', '').replace('?', '').replace(',', '').replace('"', '')
new_string = ""
word_list = s_r.split()
for i in range(2, len(word_list), 3):
if (word_list[i].isalpha() or word_list[i].isnumeric()) and (word_list[i-1].isalpha() or word_list[i-1].isnumeric()) and (word_list[i-2].isalpha() or word_list[i-2].isnumeric()):
new_string = new_string + " " + word_list[i-2]+ " " + word_list[i-1] + " " + word_list[i]
filtered_list.append(new_string)
return filtered_list

# Show one raw mail for comparison with its filtered version below.
print(list_easy[100])

from itertools import takewhile, count, islice
def slice_iterable(iterable, chunk):
    """Return an iterator of tuples of at most `chunk` consecutive items,
    stopping as soon as the underlying iterable is exhausted."""
    it = iter(iterable)
    # iter(callable, sentinel) calls the lambda until it yields the empty
    # tuple — same cut-off as takewhile(bool, ...).
    return iter(lambda: tuple(islice(it, chunk)), ())
# Pretty-print filtered mail #100, 15 words per line.
for chunk in slice_iterable(filter_string(list_easy)[100].split(), 15):
    print(*chunk)

# Filter all three corpora, re-split them with the same seed/ratio as before,
# and redo the grid search on the filtered data.
filt_easy = filter_string(list_easy)
filt_hard = filter_string(list_hard)
filt_spam = filter_string(list_spam)
f_easy_train, f_easy_test = train_test_split(filt_easy, test_size = test, random_state=state)
f_hard_train, f_hard_test = train_test_split(filt_hard, test_size = test, random_state=state)
f_spam_train, f_spam_test = train_test_split(filt_spam, test_size = test, random_state=state)
f_df_values = optimal_filter_limit(f_easy_train, f_easy_test, f_hard_train, f_hard_test, f_spam_train, f_spam_test)
#Arithmetic mean
#Max probability found: 0.87
#using settings: min_df = 0.05 and max_df = 0.4
#
#Geometric mean
#Max probability found: 0.87
#Using settings: min_df = 0.05 and max_df = 0.4

# Bug fix: the original passed f_easy_train twice; the second argument must be
# the easy-ham TEST split, otherwise the easy models are evaluated on
# their own training data.
spam_prob(f_easy_train, f_easy_test, f_hard_train, f_hard_test, f_spam_train, f_spam_test, f_df_values[1], f_df_values[2])