DAT405 - Assignment 4
Disclaimer
Imports and reading data files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import binarize
import tarfile
Question 1
def convert_file(file):
    """Read e-mail archives into a data frame with one row per message.

    Each path in *file* is opened as a bzip2-compressed tar archive; every
    member that can be extracted is read in full and decoded as latin-1.

    Parameters
    ----------
    file : iterable of str
        Paths to ``.tar.bz2`` archives. An archive is labelled ``'ham'`` when
        its base name contains ``"ham"`` (e.g. ``..._easy_ham_2.tar.bz2`` or
        ``..._hard_ham.tar.bz2``); every other archive is labelled ``'spam'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'message'`` (decoded content of one archive member) and
        ``'type'`` (``'ham'`` or ``'spam'``). Both columns exist even when no
        message was read.
    """
    import os  # local import: only needed to strip the directory from the path

    lines = []
    for file_name in file:
        # The label depends on the archive only, so decide it once per archive
        # rather than once per member. Using the base name instead of comparing
        # against hard-coded absolute paths keeps the labelling correct when
        # the archives live in a different directory.
        label = 'ham' if 'ham' in os.path.basename(file_name).lower() else 'spam'
        # The context manager closes the archive even if reading a member fails
        with tarfile.open(file_name, 'r:bz2') as tar_file:
            for member in tar_file.getmembers():
                f = tar_file.extractfile(member)
                if f is None:
                    # extractfile() returns None for members that are not
                    # regular files or links (e.g. directories)
                    continue
                # Store the whole content of the member as one row
                lines.append({'message': f.read().decode('latin-1'), 'type': label})
    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(lines, columns=['message', 'type'])
# Build one data frame from the easy-ham, hard-ham and spam archives
df_all_mails = convert_file([
    '/work/20030228_easy_ham_2.tar.bz2',
    '/work/20030228_hard_ham.tar.bz2',
    '/work/20050311_spam_2.tar.bz2',
])
# Hold out 25 % of the mails for testing; the fixed random_state makes the
# shuffle reproducible. Despite their names, ham_train/ham_test receive the
# message texts (of both classes) and spam_train/spam_test the matching
# 'ham'/'spam' labels.
ham_train, ham_test, spam_train, spam_test = train_test_split(
    df_all_mails['message'],
    df_all_mails['type'],
    test_size=0.25,
    random_state=0,
)
Question 2
# Turn the raw mail texts into token-count vectors. The vocabulary is fitted
# on the training mails only and then reused to encode the test mails.
cv = CountVectorizer()
ham_train = cv.fit_transform(ham_train)
ham_test = cv.transform(ham_test)

# ---------- MultinomialNB ----------
# Create the classifier and train it on the training vectors and their labels
mnb = MultinomialNB()
mnb.fit(ham_train, spam_train)
# Predict the labels of the held-out test mails
predictions = mnb.predict(ham_test)
# Accuracy = number of correct predictions / number of test mails
score = round(metrics.accuracy_score(spam_test, predictions), 5)
print('Accuracy Score for MNB: ', score)
# Confusion matrix normalised per actual class (each row sums to 1).
# The label order is passed explicitly so that rows/columns are guaranteed to
# line up with the hard-coded 'Ham'/'Spam' tick labels below.
cm = confusion_matrix(spam_test, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on MultinomialNB', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()
# ---------- BernoulliNB ----------
# Create the classifier with its defaults (binarize=0.0 is the default
# threshold) and train it on the training vectors and their labels
bnb = BernoulliNB()
bnb.fit(ham_train, spam_train)
# Predict the labels of the held-out test mails
predictions = bnb.predict(ham_test)
# Accuracy = number of correct predictions / number of test mails
score = round(metrics.accuracy_score(spam_test, predictions), 5)
print('Accuracy Score for BNB: ', score)
# Confusion matrix normalised per actual class (each row sums to 1).
# The label order is passed explicitly so that rows/columns are guaranteed to
# line up with the hard-coded 'Ham'/'Spam' tick labels below.
cm = confusion_matrix(spam_test, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on BernoulliNB', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()
Question 3
Spam vs Easy ham
# Build a data frame containing only the easy-ham and spam archives
df_easy_ham_and_spam = convert_file([
    '/work/20030228_easy_ham_2.tar.bz2',
    '/work/20050311_spam_2.tar.bz2',
])
# Hold out 25 % of the mails for testing (fixed random_state for a
# reproducible split). ham_*2 receive the message texts, spam_*2 the labels.
ham_train2, ham_test2, spam_train2, spam_test2 = train_test_split(
    df_easy_ham_and_spam['message'],
    df_easy_ham_and_spam['type'],
    test_size=0.25,
    random_state=0,
)
# Re-fit the vectorizer on this training set and encode the test mails with
# the resulting vocabulary
ham_train2 = cv.fit_transform(ham_train2)
ham_test2 = cv.transform(ham_test2)

# MultinomialNB
# Create the classifier and train it on the training vectors and their labels
mnb = MultinomialNB()
mnb.fit(ham_train2, spam_train2)
# Predict the labels of the held-out test mails
predictions = mnb.predict(ham_test2)
# Accuracy = number of correct predictions / number of test mails
score = round(metrics.accuracy_score(spam_test2, predictions), 5)
print('Accuracy Score for MNB (Spam vs Easy hams): ', score)
# Confusion matrix normalised per actual class (each row sums to 1).
# The label order is passed explicitly so that rows/columns are guaranteed to
# line up with the hard-coded 'Ham'/'Spam' tick labels below.
cm = confusion_matrix(spam_test2, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on MultinomialNB, Easy hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()
# BernoulliNB
# Create the classifier and train it on the training vectors and their labels
bnb = BernoulliNB()
bnb.fit(ham_train2, spam_train2)
# Predict the labels of the held-out test mails
predictions = bnb.predict(ham_test2)
# Accuracy = number of correct predictions / number of test mails
score = round(metrics.accuracy_score(spam_test2, predictions), 5)
print('Accuracy Score for BNB (Spam vs Easy hams): ', score)
# Confusion matrix normalised per actual class (each row sums to 1).
# The label order is passed explicitly so that rows/columns are guaranteed to
# line up with the hard-coded 'Ham'/'Spam' tick labels below.
cm = confusion_matrix(spam_test2, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on BernoulliNB, Easy hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()
Spam vs Hard ham
# Build a data frame containing only the hard-ham and spam archives
df_hard_ham_and_spam = convert_file([
    '/work/20030228_hard_ham.tar.bz2',
    '/work/20050311_spam_2.tar.bz2',
])
# Split into train and test sets (25 % test, fixed random_state for a
# reproducible split). ham_*3 receive the message texts, spam_*3 the labels.
ham_train3, ham_test3, spam_train3, spam_test3 = train_test_split(
    df_hard_ham_and_spam['message'],
    df_hard_ham_and_spam['type'],
    test_size=0.25,
    random_state=0,
)
# Re-fit the vectorizer on this training set and encode the test mails with
# the resulting vocabulary
ham_train3 = cv.fit_transform(ham_train3)
ham_test3 = cv.transform(ham_test3)

# MultinomialNB
# Create the classifier and train it on the training vectors and their labels
mnb = MultinomialNB()
mnb.fit(ham_train3, spam_train3)
# Predict the labels of the held-out test mails
predictions = mnb.predict(ham_test3)
# Accuracy = number of correct predictions / number of test mails
score = round(metrics.accuracy_score(spam_test3, predictions), 5)
print('Accuracy Score for MNB (Spam vs Hard hams): ', score)
# Confusion matrix normalised per actual class (each row sums to 1).
# The label order is passed explicitly so that rows/columns are guaranteed to
# line up with the hard-coded 'Ham'/'Spam' tick labels below.
cm = confusion_matrix(spam_test3, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on MultinomialNB, Hard hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()
# BernoulliNB
# Create the classifier and train it on the training vectors and their labels
bnb = BernoulliNB()
bnb.fit(ham_train3, spam_train3)
# Predict the labels of the held-out test mails
predictions = bnb.predict(ham_test3)
# Accuracy = number of correct predictions / number of test mails
score = round(metrics.accuracy_score(spam_test3, predictions), 5)
print('Accuracy Score for BNB (Spam vs Hard hams): ', score)
# Confusion matrix normalised per actual class (each row sums to 1).
# The label order is passed explicitly so that rows/columns are guaranteed to
# line up with the hard-coded 'Ham'/'Spam' tick labels below.
cm = confusion_matrix(spam_test3, predictions, labels=['ham', 'spam'], normalize='true')
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual type')
plt.xlabel('\nPredicted type')
plt.title('Confusion Matrix on BernoulliNB, Hard hams', size=15)
ax.xaxis.set_ticklabels(['Ham', 'Spam'])
ax.yaxis.set_ticklabels(['Ham', 'Spam'])
plt.show()