DAT405 - Assignment 4

Disclaimer

Imports and reading data files

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import BernoulliNB from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix from sklearn import metrics from sklearn.preprocessing import binarize import tarfile

Question 1

# Archives whose messages are labelled 'ham'; every other archive is labelled 'spam'.
HAM_ARCHIVES = (
    '/work/20030228_easy_ham_2.tar.bz2',
    '/work/20030228_hard_ham.tar.bz2',
)


def convert_file(file, ham_files=HAM_ARCHIVES):
    """Read every message stored in the given .tar.bz2 archives into a data frame.

    Args:
        file: Iterable of archive paths to read.
        ham_files: Collection of archive paths whose messages are labelled
            'ham'. Messages from any other archive are labelled 'spam'.

    Returns:
        A DataFrame with one row per archive member that has readable content
        and two columns: 'message' (the member's bytes decoded as latin-1)
        and 'type' ('ham' or 'spam').
    """
    lines = []
    for file_name in file:
        # The label depends only on the archive, so decide it once per archive
        # instead of once per message.
        label = 'ham' if file_name in ham_files else 'spam'
        # The context manager closes the archive even if reading a member fails.
        with tarfile.open(file_name, 'r:bz2') as tar_file:
            for member in tar_file.getmembers():
                f = tar_file.extractfile(member)
                if f is None:
                    # Members without file content (e.g. directories) are skipped.
                    continue
                # The whole content of one archived file becomes one row.
                lines.append({'message': f.read().decode('latin-1'), 'type': label})
    return pd.DataFrame(lines)


# Convert all three archives into a single data frame
df_all_mails = convert_file([
    '/work/20030228_easy_ham_2.tar.bz2',
    '/work/20030228_hard_ham.tar.bz2',
    '/work/20050311_spam_2.tar.bz2',
])

# Split the data into training and test sets, with 25 % held out for testing.
# NOTE: despite their names, ham_train/ham_test hold the message texts and
# spam_train/spam_test hold the matching 'ham'/'spam' labels, because
# train_test_split returns (X_train, X_test, y_train, y_test).
ham_train, ham_test, spam_train, spam_test = train_test_split(
    df_all_mails['message'],
    df_all_mails['type'],
    test_size=0.25,
    random_state=0,
)

Question 2

# Turn the raw message texts into word-count vectors.
cv = CountVectorizer()

# Learn the vocabulary from the training messages and vectorize them ...
ham_train = cv.fit_transform(ham_train)

# ... then vectorize the test messages with that same fitted vocabulary.
ham_test = cv.transform(ham_test)

# ---------- MultinomialNB ----------
# Create the classifier
mnb = MultinomialNB()

# Train the model on the training messages and their labels
mnb.fit(ham_train, spam_train)

# Predict the labels of the held-out test messages (not the entire data set)
predictions = mnb.predict(ham_test)

# Accuracy measures how well the model performs: correct predictions / number of test messages
score = round(metrics.accuracy_score(spam_test, predictions), 5)
print('Accuracy Score for MNB: ', score)

# Create and plot a confusion matrix, normalized over the actual (row) classes.
# The label order is passed explicitly so the rows/columns are guaranteed to
# line up with the 'Ham'/'Spam' tick labels set below.
cm = confusion_matrix(spam_test, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on MultinomialNB', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()

# ---------- BernoulliNB ----------
# Create the classifier; binarize = 0.0 is the default
bnb = BernoulliNB()

# Train the model on the training messages and their labels
bnb.fit(ham_train, spam_train)

# Predict the labels of the held-out test messages (not the entire data set)
predictions = bnb.predict(ham_test)

# Accuracy measures how well the model performs: correct predictions / number of test messages
score = round(metrics.accuracy_score(spam_test, predictions), 5)
print('Accuracy Score for BNB: ', score)

# Create and plot a confusion matrix, normalized over the actual (row) classes.
# The label order is passed explicitly so the rows/columns are guaranteed to
# line up with the 'Ham'/'Spam' tick labels set below.
cm = confusion_matrix(spam_test, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on BernoulliNB', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()

Question 3

Spam vs Easy ham

# Create a new data frame with spam and easy ham only
df_easy_ham_and_spam = convert_file([
    '/work/20030228_easy_ham_2.tar.bz2',
    '/work/20050311_spam_2.tar.bz2',
])

# Split the data into training and test sets (25 % test).
# NOTE: ham_*2 hold the message texts and spam_*2 hold the 'ham'/'spam' labels.
ham_train2, ham_test2, spam_train2, spam_test2 = train_test_split(
    df_easy_ham_and_spam['message'],
    df_easy_ham_and_spam['type'],
    test_size=0.25,
    random_state=0,
)

# Transform text into vectors with CountVectorizer.
# This refits the shared `cv` vectorizer on the new training messages.
ham_train2 = cv.fit_transform(ham_train2)
ham_test2 = cv.transform(ham_test2)

# MultinomialNB
mnb = MultinomialNB()

# Train the model on the training messages and their labels
mnb.fit(ham_train2, spam_train2)

# Predict the labels of the held-out test messages (not the entire data set)
predictions = mnb.predict(ham_test2)

# Accuracy measures how well the model performs: correct predictions / number of test messages
score = round(metrics.accuracy_score(spam_test2, predictions), 5)
print('Accuracy Score for MNB (Spam vs Easy hams): ', score)

# Confusion matrix for the results, normalized over the actual (row) classes.
# The label order is passed explicitly so the rows/columns are guaranteed to
# line up with the 'Ham'/'Spam' tick labels set below.
cm = confusion_matrix(spam_test2, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on MultinomialNB, Easy hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()

# BernoulliNB
bnb = BernoulliNB()

# Train the model on the training messages and their labels
bnb.fit(ham_train2, spam_train2)

# Predict the labels of the held-out test messages (not the entire data set)
predictions = bnb.predict(ham_test2)

# Accuracy measures how well the model performs: correct predictions / number of test messages
score = round(metrics.accuracy_score(spam_test2, predictions), 5)
print('Accuracy Score for BNB (Spam vs Easy hams): ', score)

# Confusion matrix for the results, normalized over the actual (row) classes.
# The label order is passed explicitly so the rows/columns are guaranteed to
# line up with the 'Ham'/'Spam' tick labels set below.
cm = confusion_matrix(spam_test2, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on BernoulliNB, Easy hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()

Spam vs Hard ham

# Create a new data frame with spam and hard ham only
df_hard_ham_and_spam = convert_file([
    '/work/20030228_hard_ham.tar.bz2',
    '/work/20050311_spam_2.tar.bz2',
])

# Split into training and test sets (25 % test).
# NOTE: ham_*3 hold the message texts and spam_*3 hold the 'ham'/'spam' labels.
ham_train3, ham_test3, spam_train3, spam_test3 = train_test_split(
    df_hard_ham_and_spam['message'],
    df_hard_ham_and_spam['type'],
    test_size=0.25,
    random_state=0,
)

# Transform text into vectors with CountVectorizer.
# This refits the shared `cv` vectorizer on the new training messages.
ham_train3 = cv.fit_transform(ham_train3)
ham_test3 = cv.transform(ham_test3)

# MultinomialNB
mnb = MultinomialNB()

# Train the model on the training messages and their labels
mnb.fit(ham_train3, spam_train3)

# Predict the labels of the held-out test messages (not the entire data set)
predictions = mnb.predict(ham_test3)

# Accuracy measures how well the model performs: correct predictions / number of test messages
score = round(metrics.accuracy_score(spam_test3, predictions), 5)
print('Accuracy Score for MNB (Spam vs Hard hams): ', score)

# Confusion matrix for the results, normalized over the actual (row) classes.
# The label order is passed explicitly so the rows/columns are guaranteed to
# line up with the 'Ham'/'Spam' tick labels set below.
cm = confusion_matrix(spam_test3, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on MultinomialNB, Hard hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()

# BernoulliNB
bnb = BernoulliNB()

# Train the model on the training messages and their labels
bnb.fit(ham_train3, spam_train3)

# Predict the labels of the held-out test messages (not the entire data set)
predictions = bnb.predict(ham_test3)

# Accuracy measures how well the model performs: correct predictions / number of test messages
score = round(metrics.accuracy_score(spam_test3, predictions), 5)
print('Accuracy Score for BNB (Spam vs Hard hams): ', score)

# Confusion matrix for the results, normalized over the actual (row) classes.
# The label order is passed explicitly so the rows/columns are guaranteed to
# line up with the 'Ham'/'Spam' tick labels set below.
cm = confusion_matrix(spam_test3, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on BernoulliNB, Hard hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()

DAT405 - Assignment 4

Disclaimer

Imports and reading data files

Question 1

Question 2

Question 3

Spam vs Easy ham

Spam vs Hard ham

i)

ii)

DAT405 - Assignment 4

Disclaimer

Imports and reading data files

Question 1

Question 2

Question 3

Spam vs Easy ham

Spam vs Hard ham

i)

ii)

DAT405 - Assignment 4