from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Load the SMS dataset, reading only the label ("v1") and message ("v2")
# columns. Selecting them with `usecols` replaces the explicit drop of the
# "Unnamed: 2/3/4" columns, so the load does not fail when those extra columns
# are absent from the file.
# NOTE(review): encoding="latin-1" assumes this is the commonly distributed
# spam.csv, which is not valid UTF-8 and fails with the default decoder --
# confirm against the actual file.
dataframe = pd.read_csv("spam.csv", usecols=["v1", "v2"], encoding="latin-1")
dataframe = dataframe.rename(columns={"v1": "Label", "v2": "Text"})

# A bare `dataframe.head()` discards its result when run as a script, so the
# preview is printed explicitly instead.
print(dataframe.head())

# Features are the raw message texts; targets are the class labels.
X = dataframe["Text"]
y = dataframe["Label"]
# Report how many messages fall into each class.
print(y.value_counts())

# Hold out 20% of the messages for evaluation. The split is shuffled with a
# fixed seed and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=13,
)
# Unigram count features feeding a multinomial Naive Bayes classifier.
# The token pattern matches runs of one or more word characters.
text_vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b\w+\b')
nb_classifier = MultinomialNB()
pipe = Pipeline(steps=[
    ('vectorize', text_vectorizer),
    ('classifier', nb_classifier),
])

# Train on the training split, then predict labels for the held-out messages.
pipe.fit(X_train, y_train)
y_predict = pipe.predict(X_test)

# Overall accuracy followed by the per-class report.
test_accuracy = accuracy_score(y_test, y_predict)
print(test_accuracy)
per_class_report = classification_report(y_test, y_predict)
print(per_class_report)
# Confusion matrix of the held-out predictions (rows: true labels, columns:
# predicted labels). The label order is pinned to the fitted pipeline's class
# order so the tick labels drawn below are guaranteed to line up with the cells.
class_names = pipe.classes_
mat = confusion_matrix(y_test, y_predict, labels=class_names)
sns.heatmap(
    mat,
    square=True,
    annot=True,
    fmt='d',
    cbar=True,
    cmap='coolwarm',
    linewidths=5,
    # Show the class names on both axes instead of bare positional indices.
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('predicted value')
plt.ylabel('true value')
plt.show()
# Hand-written sample messages for a quick sanity check of the trained model.
# Each message is wrapped in its own one-element list so it can be handed to
# `pipe.predict` directly as a batch of size one.
SMS = [
    ["You just won 50000 dollars worth cash prizes"],
    ["You can redeem 5000 dollars in cash"],
    ["I'll come within 5 minutes to meet you"],
    ["You just won 50 dollars to play games"],
    ["How are you doing my friend?"],
    ["You just won 50 dollars to have sex"],
    ["Greg, can you call me back once you get this?"],
    ["You just won 50 dollars to buy food"],
    ["Winner! To claim your gift call 0908878877"],
    ["Attend this free COVID webinar today: Book your session now"],
    ["Your online account has been locked. Please verify payment information"],
]

# Print the predicted label array next to each sample message.
# The loop body must be indented; without it the script does not parse.
for sms in SMS:
    print(pipe.predict(sms), sms)