!pip install --upgrade pip
!pip install wordcloud
Spam detection using Machine Learning
Importing Libraries
import pandas as pd
import numpy as np
import re
import collections
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#from sklearn.naive_bayes import MultinomialNB
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
Importing Data
# Load the raw dataset from spam.csv (relative path, so it is resolved against
# the current working directory).
# NOTE(review): some distributions of this dataset are not UTF-8 encoded; if
# this call raises UnicodeDecodeError an explicit encoding= argument may be
# needed — confirm against the local file.
df = pd.read_csv('spam.csv')
Looking into data
# Quick look at the raw data: first rows, column dtypes / non-null counts,
# and summary statistics.
df.head()
df.info()
df.describe()
Data Preprocessing
Getting required columns
# Keep only the v1 and v2 columns. The explicit .copy() makes df an
# independent frame, so adding new columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = df[['v1','v2']].copy()
Changing column names
# Give the two remaining columns descriptive names, in positional order:
# first column -> Label, second column -> Text_data.
df.columns = ['Label','Text_data']
Creating a 'Target' column that holds a numerical representation of the 'Label' column (spam = 1, ham = 0)
# Encode the class label as an integer: 'spam' -> 1, 'ham' -> 0. A label that
# is not a key of the dict is mapped to NaN by Series.map.
df['Target'] = df['Label'].map({'spam':1,'ham':0})
Creating 'Text_length' column
# Length of each raw message: len() applied to every Text_data value.
df['Text_length'] = df['Text_data'].apply(len)
df
# Re-inspect the frame now that the derived columns exist.
df.head()
df.info()
df.describe()
# Class balance: number of rows per label.
df['Label'].value_counts()
# Download NLTK's stop-word corpus and keep the English list as a set, which
# is what process_text tests word membership against.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)
Cleaning data
def process_text(data):
    """Normalise a raw message for bag-of-words modelling.

    The text is lowercased, every character that is not an ASCII letter or
    digit is replaced by a space, runs of whitespace are collapsed, and
    tokens found in the module-level ``stop_words`` collection are dropped.

    Args:
        data: The raw message text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    lowered = data.lower()
    alnum_only = re.sub(r'[^0-9a-zA-Z]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', alnum_only)

    kept_tokens = []
    for token in collapsed.split():
        # stop_words is defined at module level, outside this function.
        if token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)
# Clean every message with process_text and store the result next to the raw text.
df['Cleaned_Text_data'] = df['Text_data'].apply(process_text)
df
# Length of each cleaned message: len() applied to every cleaned string.
df['Cleaned_text_length'] = df['Cleaned_Text_data'].apply(len)
df
Visualization
# Pie chart of the class balance. No explicit labels= list is passed, so the
# wedge labels come from the value_counts() index; each wedge is therefore
# always paired with the label it represents, whichever class is larger.
plt.figure(figsize=(8,8))
df['Label'].value_counts().plot(kind='pie', autopct='%1.0f%%')
plt.title('Ham vs Spam')
plt.show()
# Bar chart of the class balance.
plt.figure(figsize=(8,8))
plt.grid(True)
df['Label'].value_counts().plot(kind='bar')
# No legend is drawn: the x tick labels already name each bar. Note that
# assigning a list to plt.legend would not draw a legend either — it would
# overwrite the pyplot function for the rest of the session.
plt.xlabel("Label")
plt.ylabel("Count")
plt.title("Ham vs Spam")
plt.show()
# Distribution of Text_length, drawn as one 40-bin histogram per Label value.
df.hist(bins=40, figsize=(16,10), by='Label', column='Text_length', color=['Green'])
plt.show()
# Class balance again, this time via seaborn. The column is passed as x=
# because seaborn takes plot variables by keyword; a bare positional Series
# is interpreted as `data`, not as the x variable.
sns.countplot(x=df['Label'])
plt.show()
def plot_top_10_words(data, top_n=10):
    """Draw a horizontal bar chart of the most frequent words in *data*.

    Words are obtained by splitting each sentence on whitespace; no other
    normalisation is applied here.

    Args:
        data: Iterable of strings (for example a pandas Series of messages).
        top_n: How many of the most frequent words to plot. Defaults to 10,
            matching the function name and the chart title.

    Returns:
        None. The chart is drawn with pandas/matplotlib.
    """
    counter = collections.Counter(
        word for sentence in data for word in sentence.split()
    )
    # Sort ascending by count so that barh places the most frequent word at
    # the top of the chart.
    most_common_words_df = pd.DataFrame(
        counter.most_common(top_n), columns=["Word", "Count"]
    ).sort_values(by="Count")
    most_common_words_df.plot.barh(x="Word", y="Count", color="blue", figsize=(15, 20))
    plt.title('Top {} common words'.format(top_n))
    plt.grid(True)
    plt.xlabel("Count")
Top 10 common words before cleaning data
# Word frequencies in the raw, uncleaned messages.
plot_top_10_words(df["Text_data"])
Top 10 common words after cleaning data
# Word frequencies in the messages after process_text has been applied.
plot_top_10_words(df["Cleaned_Text_data"])
Splitting the data into spam and ham subsets so that each can be used separately later on if required
# Rows whose Label is 'spam'.
spam_data = df[df['Label']=='spam']
spam_data.head()
spam_data
# Rows whose Label is 'ham'.
ham_data = df[df['Label']=='ham']
ham_data.head()
ham_data
Implementation
Training and Testing set split
# Features are the cleaned message strings; the target is the 0/1 Target column.
X = df['Cleaned_Text_data']
y = df['Target']
# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
X_train.shape
X_test.shape
# Fit the CountVectorizer on the training messages only, then use that same
# fitted vectorizer to transform both splits.
cv = CountVectorizer().fit(X_train)
training_set = cv.transform(X_train)
training_set
testing_set = cv.transform(X_test)
testing_set
Trying with multiple models
def provide_classification_report(model):
    """Train *model* and print its evaluation on the held-out split.

    Relies on the module-level ``training_set`` / ``y_train`` for fitting and
    ``testing_set`` / ``y_test`` for scoring.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. The classification report and the individual scores are
        printed to standard output.
    """
    model.fit(training_set, y_train)
    predictions = model.predict(testing_set)

    print('Classification Report\n')
    print(classification_report(y_test, predictions))

    # Each entry pairs the printed heading with the metric function to call.
    scorers = (
        ('\n Accuracy', accuracy_score),
        ('\n Precision', precision_score),
        ('\n Recall', recall_score),
        ('\n F1 score', f1_score),
    )
    for heading, scorer in scorers:
        print(heading, scorer(y_test, predictions))
# Each model below is fitted and scored on the same train/test split via
# provide_classification_report. `model` is rebound every time, so only the
# last estimator remains referenced afterwards.
# NOTE(review): no random_state is passed to any estimator, so scores of the
# randomised ones (e.g. the tree-based models) may vary between runs — confirm.

# Logistic regression with default settings.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
provide_classification_report(model)
# Multinomial naive Bayes with default settings.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
provide_classification_report(model)
# Support vector classifier with C=3; all other settings are defaults.
from sklearn.svm import SVC
model = SVC(C=3)
provide_classification_report(model)
# Decision tree with default settings.
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
provide_classification_report(model)
# Random forest with default settings.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
provide_classification_report(model)
# k-nearest neighbours with default settings.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
provide_classification_report(model)
Creating a pandas DataFrame that summarizes the scores of all the models above
# Summary table of the scores, one row per model. The numbers are hard-coded
# literals rather than values computed in this file.
# NOTE(review): presumably transcribed from the reports printed by an earlier
# run — re-check them whenever the data, the split, or a model changes.
results = pd.DataFrame({'Model':['LogisticRegression','MultinomialNB','SVC','DecisionTreeClassifier','RandomForestClassifier','KNeighborsClassifier'],
'Accuracy':[0.98, 0.98, 0.98, 0.96, 0.97, 0.90],
'Precision':[1.0, 0.97, 0.99, 0.94, 1.0, 1.0],
'Recall':[0.84, 0.93, 0.87, 0.82, 0.81, 0.32],
'F1 score':[0.91, 0.95, 0.93, 0.87, 0.89, 0.49]})
results