Assignment 4

Johanna Wiberg (jwiberg): 19 hours

Oscar Forsberg (oscfors): 19 hours

1a,b)

import os

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB


def get_files(directory):
    """Read every file in ``directory`` and return their contents as a list.

    Args:
        directory: Path of the folder holding one email per file.

    Returns:
        A list of strings, one per file, in sorted file-name order.
        Undecodable bytes are dropped (``errors='ignore'``).
    """
    contents = []
    # os.listdir returns names in arbitrary order; sorting keeps the
    # fixed-seed train/test split below reproducible across machines.
    for file_name in sorted(os.listdir(directory)):
        # os.path.join works with or without a trailing slash, and the
        # context manager closes each file handle as soon as it is read.
        with open(os.path.join(directory, file_name), "r", errors="ignore") as handle:
            contents.append(handle.read())
    return contents


easy_ham = get_files("easy_ham/")
hard_ham = get_files("hard_ham/")
spam = get_files("spam/")

# Divide the data into X (email texts) and Y (labels).
Xhard = hard_ham + spam
Yhard = ['ham'] * len(hard_ham) + ['spam'] * len(spam)

Xeasy = easy_ham + spam
Yeasy = ['ham'] * len(easy_ham) + ['spam'] * len(spam)

# NOTE: train_test_split returns (X_train, X_test, y_train, y_test). Despite
# their names, the "...Ham" variables hold email texts (ham and spam mixed)
# and the "...Spam" variables hold the matching labels. The names are kept
# because later cells refer to them.

# Split hard_ham + spam into train and test sets
hardTrainHam, hardTestHam, hardTrainSpam, hardTestSpam = train_test_split(
    Xhard, Yhard, test_size=0.25, random_state=0)

# Split easy_ham + spam into train and test sets
easyTrainHam, easyTestHam, easyTrainSpam, easyTestSpam = train_test_split(
    Xeasy, Yeasy, test_size=0.25, random_state=0)

2)

# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
# https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Source: https://www.stackvidhya.com/plot-confusion-matrix-in-python-and-why/
def graph(confusionMatrix, title):
    """Plot a 2x2 confusion matrix as an annotated heatmap and show it.

    Args:
        confusionMatrix: Matrix to plot; the y axis is labelled "Actual"
            and the x axis "Prediction".
        title: Title drawn above the heatmap.

    The tick labels are hard-coded as Ham, Spam, so the matrix is assumed
    to be ordered ham first, spam second.
    """
    ax = sns.heatmap(confusionMatrix, annot=True, cmap='binary')
    ax.set_title(title)
    ax.set_xlabel('\nPrediction')
    ax.set_ylabel('Actual')
    ax.xaxis.set_ticklabels(['Ham', 'Spam'])
    ax.yaxis.set_ticklabels(['Ham', 'Spam'])
    plt.show()


def run(xTrain, xTest, yTrain, yTest, word):
    """Train a naive Bayes classifier and report how it does on the test set.

    The texts are turned into word counts with a CountVectorizer fitted on
    the training texts only. The confusion matrix is plotted and the
    accuracy is printed.

    Args:
        xTrain: Training email texts.
        xTest: Test email texts.
        yTrain: Labels for ``xTrain``.
        yTest: Labels for ``xTest``.
        word: "M" for the multinomial classifier, "B" for Bernoulli.
            Any other value prints a usage message and does nothing else.
    """
    if word == "M":
        classifier = MultinomialNB()
        title = "Multinomial Confusion Matrix"
    elif word == "B":
        # binarize is a numeric threshold, not an on/off flag: True is read
        # as 1.0, which would treat a word occurring exactly once as absent.
        # 0.0 gives the intended present/absent features.
        classifier = BernoulliNB(binarize=0.0)
        title = "Bernoulli Confusion Matrix"
    else:
        print("You need to choose either 'B' for bernoulli or 'M' for multionmial")
        return

    vectorizer = CountVectorizer()
    trainCounts = vectorizer.fit_transform(xTrain)
    # Only the texts are vectorized; the labels are passed to the
    # classifier and the metrics unchanged.
    testCounts = vectorizer.transform(xTest)

    # Predict once and reuse the result for both the plot and the score.
    predictions = classifier.fit(trainCounts, yTrain).predict(testCounts)
    graph(confusion_matrix(yTest, predictions), title)
    print(accuracy_score(yTest, predictions))

The main difference between the two classifiers is that Multinomial naive Bayes models how many times each word occurs in an email, while Bernoulli naive Bayes only uses binary features: whether a word is present or absent (True/False, 1/0). This means, for instance, that removing stop words with `stop_words='english'` (a built-in list of words such as "and", "then", "or", ...) mainly helps the multinomial model: such words have a high frequency simply because they are common in the English language, not because they indicate spam. Removing them has much less effect on Bernoulli naive Bayes, since it ignores how often a word occurs.

3)

# Evaluate both classifiers on both data sets: multinomial first, then
# Bernoulli, each on the easy split and then on the hard split.
for classifierCode in ("M", "B"):
    for split in (
        (easyTrainHam, easyTestHam, easyTrainSpam, easyTestSpam),
        (hardTrainHam, hardTestHam, hardTrainSpam, hardTestSpam),
    ):
        run(*split, classifierCode)

4a)

# source: https://stackoverflow.com/questions/56957498/find-most-common-words-from-set-of-sentences-in-python
ham = hard_ham + easy_ham
data = hard_ham + easy_ham + spam
n = 50


def _rank_vocabulary(texts):
    """Return (vocabulary, order) for ``texts``.

    ``order`` holds vocabulary indices sorted from the smallest to the
    largest total word count over all texts.
    """
    v = CountVectorizer()
    counts = v.fit_transform(texts)
    vocabulary = v.get_feature_names_out()
    # Sum the columns on the sparse matrix directly; converting the whole
    # document-term matrix to a dense array first needs
    # (documents x vocabulary) memory for no benefit.
    totals = np.asarray(counts.sum(axis=0)).ravel()
    return vocabulary, np.argsort(totals)


def most_common_words(x):
    """Return the ``n`` words with the highest total count in the texts ``x``.

    The words are ordered from less to more frequent. ``n`` is the
    module-level constant.
    """
    vocabulary, order = _rank_vocabulary(x)
    # The ranking is ascending, so the last n entries are the most common.
    return [vocabulary[a] for a in order[-n:]]


def least_common_words(x):
    """Return the ``n`` words with the lowest total count in the texts ``x``.

    The words are ordered from less to more frequent. ``n`` is the
    module-level constant.
    """
    vocabulary, order = _rank_vocabulary(x)
    # The ranking is ascending, so the first n entries are the least common.
    return [vocabulary[a] for a in order[:n]]


print("Most common words in ham: ", most_common_words(ham))
print("Most common words in spam: ", most_common_words(spam))
print("Least common words in spam: ", least_common_words(spam))
print("Least common words in ham : ", least_common_words(ham))
print("Least common words in both ham and spam: ", least_common_words(data))

With this method we print the 50 most common words in ham and in spam. If you look carefully at these lists, you can see that some words appear in both. If a word is much more common in one class, for example spam, we do not want to use it as a stop word, since removing it might lead to poorer predictions. Instead we compare the two lists and keep only the words that are common in both data sets, and we use these as stop words. This is the reason for using stop words: words that are frequent in both classes ("low-level" words) should not have any impact on the results, so we remove them.

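As the least common words for spam and for the combined data set show, most of them look like names or codes. It is therefore appropriate to also remove the words that occur only once. For this we use CountVectorizer's built-in `min_df` parameter, which sets the minimum number of emails a word must appear in to be kept. Note that `min_df = 1` is the default and keeps every word; a value of 2 is needed to drop words that appear in only one email.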

def same(x, y):
    """Return the set of items that occur in both ``x`` and ``y``."""
    return set(x) & set(y)


# Computed once: each most_common_words call fits a vectorizer on the whole
# corpus, so a second call whose result is thrown away is wasted work.
commonWordsinHamandSpam = same(most_common_words(ham), most_common_words(spam))

4b)

# Same method as above; the differences are that the words that are most
# common in both spam and ham are used as stop words, and that words
# appearing in only one training email are dropped.
def run(xTrain, xTest, yTrain, yTest, word):
    """Train a naive Bayes classifier on filtered word counts and report it.

    The vocabulary excludes the words in ``commonWordsinHamandSpam`` and
    every word that appears in fewer than two training emails. The
    confusion matrix is plotted and the accuracy is printed.

    Args:
        xTrain: Training email texts.
        xTest: Test email texts.
        yTrain: Labels for ``xTrain``.
        yTest: Labels for ``xTest``.
        word: "M" for the multinomial classifier, "B" for Bernoulli.
            Any other value prints a usage message and does nothing else.
    """
    if word == "M":
        classifier = MultinomialNB()
        title = "Multinomial Confusion Matrix"
    elif word == "B":
        # binarize is a numeric threshold, not an on/off flag: True is read
        # as 1.0, which would treat a word occurring exactly once as absent.
        # 0.0 gives the intended present/absent features.
        classifier = BernoulliNB(binarize=0.0)
        title = "Bernoulli Confusion Matrix"
    else:
        print("You need to choose either 'B' for bernoulli or 'M' for multionmial")
        return

    vectorizer = CountVectorizer(
        # stop_words is documented as 'english', a list, or None, so the
        # set of shared words is converted to a list.
        stop_words=sorted(commonWordsinHamandSpam),
        # min_df counts the documents a word appears in. The default of 1
        # keeps every word; 2 is the smallest value that drops words seen
        # in only one email.
        min_df=2,
    )
    trainCounts = vectorizer.fit_transform(xTrain)
    # Only the texts are vectorized; the labels are passed to the
    # classifier and the metrics unchanged.
    testCounts = vectorizer.transform(xTest)

    # Predict once and reuse the result for both the plot and the score.
    predictions = classifier.fit(trainCounts, yTrain).predict(testCounts)
    graph(confusion_matrix(yTest, predictions), title)
    print(accuracy_score(yTest, predictions))


run(easyTrainHam, easyTestHam, easyTrainSpam, easyTestSpam, "M")
run(hardTrainHam, hardTestHam, hardTrainSpam, hardTestSpam, "M")
run(easyTrainHam, easyTestHam, easyTrainSpam, easyTestSpam, "B")
run(hardTrainHam, hardTestHam, hardTrainSpam, hardTestSpam, "B")

Assignment 4

Johanna Wiberg (jwiberg): 19 hours

Oscar Forsberg (oscfors): 19 hours

1a,b)

2)

3)

4a)

4b)

Assignment 4