!pip install --upgrade pip !pip install wordcloud

Spam detection using Machine Learning

Importing Libraries

import pandas as pd import numpy as np import re import collections import nltk from nltk.corpus import stopwords from sklearn.model_selection import train_test_split, cross_val_score from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline #from sklearn.naive_bayes import MultinomialNB # from sklearn.ensemble import RandomForestClassifier # from sklearn.svm import SVC # from sklearn.linear_model import LogisticRegression

Importing Data

# Load the raw messages from spam.csv into a DataFrame.
# NOTE(review): if spam.csv is not UTF-8 encoded, read_csv will need an
# explicit `encoding=` argument — confirm against the actual file.
df = pd.read_csv('spam.csv')

Looking into data

# Preview the first rows of the freshly loaded frame.
df.head()

# Column dtypes and non-null counts.
df.info()

# Summary statistics for the columns.
df.describe()

Data Preprocessing

Getting required columns

# Keep only the v1 and v2 columns. The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()

Changing column names

# Give the columns descriptive names (assigned positionally).
df.columns = ['Label','Text_data']

Creating 'Target' column to have numerical representation of 'Label' column

# Numeric encoding of the class label: spam -> 1, ham -> 0.
label_codes = {'spam': 1, 'ham': 0}
df['Target'] = df['Label'].map(label_codes)

Creating 'Text_length' column

# Character count of each message in Text_data.
df['Text_length'] = df['Text_data'].map(len)

# Preview the frame after the new columns were added.
df.head()

# Column dtypes and non-null counts.
df.info()

# Summary statistics for the numeric columns.
df.describe()

# Number of rows per class label.
df['Label'].value_counts()

# Fetch NLTK's stop-word corpus, then keep the English list as a set
# (set membership is what process_text tests against).
nltk.download('stopwords')
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

print(stop_words)

Cleaning data

def process_text(data, stopword_set=None):
    """Return *data* lower-cased, stripped of non-alphanumerics and stop words.

    Args:
        data: The raw message text (a ``str``).
        stopword_set: Optional collection of words to drop. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    if stopword_set is None:
        stopword_set = stop_words

    lowered = data.lower()
    # Every character outside 0-9/a-z/A-Z becomes a space.
    alphanumeric_only = re.sub(r'[^0-9a-zA-Z]', ' ', lowered)
    # split() with no arguments already discards runs of whitespace, so no
    # separate whitespace-collapsing pass is needed before tokenising.
    kept_words = [
        word for word in alphanumeric_only.split() if word not in stopword_set
    ]
    return " ".join(kept_words)

# Cleaned version of every message, plus its length in characters.
cleaned_messages = df['Text_data'].apply(process_text)
df['Cleaned_Text_data'] = cleaned_messages

df['Cleaned_text_length'] = df['Cleaned_Text_data'].apply(len)

Visualization

# Class balance as a pie chart. Slice labels are derived from the
# value_counts() index so each slice is always named after its own class,
# whichever class happens to be the more frequent one.
label_counts = df['Label'].value_counts()
plt.figure(figsize=(8, 8))
label_counts.plot(
    kind='pie',
    labels=[str(label).capitalize() for label in label_counts.index],
    autopct='%1.0f%%',
)
plt.title('Ham vs Spam')
plt.show()

# Class balance as a bar chart.
plt.figure(figsize=(8, 8))
plt.grid(True)
df['Label'].value_counts().plot(kind='bar')
# No legend is drawn: the x tick labels already name each class. Note that
# plt.legend is a function and must never be assigned to — doing so replaces
# it with the assigned value for the rest of the session.
plt.xlabel("Label")
plt.ylabel("Count")
plt.title("Ham vs Spam")
plt.show()

# Histograms of Text_length, grouped by the Label column.
df.hist(
    column='Text_length',
    by='Label',
    bins=40,
    figsize=(16, 10),
    color=['Green'],
)
plt.show()

# Number of messages per class. The column is passed as `x=` because current
# seaborn releases reserve the first positional slot for `data`.
sns.countplot(x=df['Label'])
plt.show()

def plot_top_10_words(data, top_n=10):
    """Draw a horizontal bar chart of the most frequent words in *data*.

    Args:
        data: Iterable of strings; each one is split on whitespace and every
            resulting token is counted.
        top_n: How many of the most frequent words to plot. Defaults to 10,
            in line with the function name and the chart title.

    Returns:
        None. The chart is drawn with matplotlib; the caller decides when to
        show it.
    """
    word_counts = collections.Counter(
        word for sentence in data for word in sentence.split()
    )
    top_words_df = pd.DataFrame(
        word_counts.most_common(top_n), columns=["Word", "Count"]
    ).sort_values(by="Count")  # ascending, so the largest bar ends up on top
    top_words_df.plot.barh(x="Word", y="Count", color="blue", figsize=(15, 20))
    plt.title(f'Top {top_n} common words')
    plt.grid(True)
    plt.xlabel("Count")

Top 10 common words before cleaning data

# Word-frequency chart for the original Text_data column.
plot_top_10_words(df["Text_data"])

Top 10 common words after cleaning data

# Word-frequency chart for the Cleaned_Text_data column produced by process_text.
plot_top_10_words(df["Cleaned_Text_data"])

Splitting the data into spam and ham subsets so that they can be used later on if required

# Rows labelled 'spam'.
spam_data = df[df['Label'].eq('spam')]
spam_data.head()

spam_data

# Rows labelled 'ham'.
ham_data = df[df['Label'].eq('ham')]
ham_data.head()

ham_data

Implementation

Training and Testing set split

# Features are the cleaned messages; the target is the 0/1 Target column.
X, y = df['Cleaned_Text_data'], df['Target']

# Hold out 25% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

X_train.shape

X_test.shape

# Fit the vectorizer on the training messages only.
cv = CountVectorizer()
cv.fit(X_train)

# Transform both splits with the fitted vectorizer.
training_set = cv.transform(X_train)
training_set

testing_set = cv.transform(X_test)
testing_set

Trying with multiple models

def provide_classification_report(model):
    """Fit *model* on the training matrix and print its test-set scores.

    Reads the module-level ``training_set``, ``y_train``, ``testing_set`` and
    ``y_test``. Prints the full classification report followed by accuracy,
    precision, recall and F1; nothing is returned.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(training_set, y_train)
    predictions = model.predict(testing_set)

    print('Classification Report\n')
    print(classification_report(y_test, predictions))

    # Headline metrics, printed in a fixed order.
    summary_metrics = (
        ('\n Accuracy', accuracy_score),
        ('\n Precision', precision_score),
        ('\n Recall', recall_score),
        ('\n F1 score', f1_score),
    )
    for heading, metric in summary_metrics:
        print(heading, metric(y_test, predictions))

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Fit and report on each candidate classifier in turn.
candidate_models = (
    LogisticRegression(),
    MultinomialNB(),
    SVC(C=3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
)
for model in candidate_models:
    provide_classification_report(model)

Creating a pandas data frame having scores of all above models

# Side-by-side scores for the six classifiers, one row per model.
# NOTE(review): these numbers are hard-coded rather than computed — presumably
# transcribed from the printed reports; re-check them after any re-run.
score_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score']
score_rows = [
    ('LogisticRegression', 0.98, 1.0, 0.84, 0.91),
    ('MultinomialNB', 0.98, 0.97, 0.93, 0.95),
    ('SVC', 0.98, 0.99, 0.87, 0.93),
    ('DecisionTreeClassifier', 0.96, 0.94, 0.82, 0.87),
    ('RandomForestClassifier', 0.97, 1.0, 0.81, 0.89),
    ('KNeighborsClassifier', 0.90, 1.0, 0.32, 0.49),
]
results = pd.DataFrame(score_rows, columns=score_columns)

results

Spam detection using Machine Learning