import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import math
from wordcloud import WordCloud
import re
import string
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
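# quick NER sanity check on a stock example sentence (illustrative only);
# the large English model should tag "Apple" as an ORG entity here
[(ent.text, ent.label_) for ent in nlp("Apple is looking at buying a U.K. startup").ents]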
data = pd.read_csv("data_suicide_detection.csv")
data.head()
data.drop(columns=["Unnamed: 0"], inplace=True)
data.columns = ["text", "label"]
data.head()
data.isnull().values.any()
print("Rows: {}".format(data.shape[0]))
# count ORG entities mentioned in the first 10,000 suicide-labelled posts
orgs = dict()
for index, row in data[:10000].iterrows():
    if row['label'] == "suicide":
        doc = nlp(row['text'])
        for ent in doc.ents:
            if ent.label_ == "ORG":
                orgs[ent.text.lower()] = orgs.get(ent.text.lower(), 0) + 1
orgs = dict(sorted(orgs.items(), key=lambda item: item[1], reverse=True))  # most frequent first
orgs
# drop very short names and one-off mentions
orgs = {k: v for k, v in orgs.items() if len(k) > 3 and v > 1}
orgs
# build the word cloud straight from the frequency dict instead of
# materialising a repeated-word string
wordcloud = WordCloud(collocations=False, background_color='white').generate_from_frequencies(orgs)
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words("english")
custom_stopwords = ["RT"]
wn = nltk.WordNetLemmatizer()
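# WordNetLemmatizer reduces inflected forms to a base form, e.g.:
wn.lemmatize("feelings")  # -> 'feeling'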
def clean_text_string(text):
    text = re.sub(r"&#\d+;", "", text)     # remove numeric HTML entities (emoji)
    text = re.sub(r"@\w+[:\s]", "", text)  # remove mentioned usernames
    text = re.sub(r"@\w+;", "", text)      # remove leftover encoded-character artifacts
    text = re.sub(r"http\S+", "", text)    # remove URLs
    text = re.sub(r"\$\w+\$", "", text)    # remove $MENTION$ placeholders
    text = "".join(char for char in text if char not in string.punctuation)
    text = " ".join(re.split(r"\W+", text))  # collapse non-word separators into single spaces
    words = [
        word.lower()
        for word in text.split()
        if word.lower() not in stopwords and word not in custom_stopwords
    ]
    words = [wn.lemmatize(word) for word in words]
    # keep only lowercase ASCII letters and spaces
    return "".join(char for char in " ".join(words) if char in string.ascii_lowercase + " ")
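# sanity check of the cleaner on a made-up post (illustrative input only);
# the mention, URL, punctuation, and stopwords should all be stripped
clean_text_string("@user: I can't handle this anymore... http://example.com")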
data['text'] = data['text'].apply(clean_text_string)
data['label'] = data['label'].apply(lambda x: 0 if x == "non-suicide" else 1)
test_size = int(0.05*data.shape[0])
train_size = data.shape[0] - test_size
test_data = data.tail(test_size)
data = data.head(train_size)
print("Train Data Size: {}\nTest Data Size: {}".format(train_size, test_size))
google_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# embed the held-out test set once up front: the training loops below score
# the model against X_test_features after every partial_fit call
test_embeddings = google_encoder(test_data['text'].tolist())
X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())
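# USE v4 maps each input text to a 512-dimensional embedding, e.g.:
google_encoder(["hello world"]).shape  # TensorShape([1, 512])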
SGD_initial = SGDClassifier(loss='log_loss')  # logistic regression; use loss='log' on scikit-learn < 1.1
# fit repeatedly on the first 5,000 rows to see how test MSE evolves
# with the number of partial_fit iterations
start_initial = 0
end_initial = 5000
batchX_initial = data.iloc[start_initial:end_initial, 0]
batchY_initial = data.iloc[start_initial:end_initial, 1]
text_embeddings_initial = google_encoder(batchX_initial.tolist())
X_features_guse_initial = pd.DataFrame(np.array(text_embeddings_initial).tolist())
mse_initial = list()
for z in range(1000):
SGD_initial.partial_fit(X_features_guse_initial, batchY_initial, classes=[0, 1])
mse_initial.append(
mean_squared_error(test_data["label"], SGD_initial.predict(X_test_features))
)
fig, axs = plt.subplots(figsize=(20, 6))
axisX = range(0, 1000, 50)
axisY = np.abs(mse_initial[0:1000:50])
plt.plot(axisX, axisY, "o--")
for x, y in zip(axisX, axisY):
label = "{}".format(x)
plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 10), ha="center")
axs.set_xlabel("Partial fit ending iterations", size=10)
axs.set_ylabel("Mean squared error", size=10)
axs.set_title("Mean Squared Error vs Iteration", fontsize=17.5)
fig.set_size_inches(18.5, 10.5)  # resize and save before rendering so the file isn't blank
fig.savefig("Mean Squared Error vs Iteration.png", dpi=100)
plt.show()
final_num_iter = 150  # partial-fit passes per batch, chosen from the MSE-vs-iteration plot above
SGD = SGDClassifier(loss='log_loss')  # 'log' on scikit-learn < 1.1
startPoint = 0
mse_scores = list()
# stream the training data through a fresh model in 5,000-row batches,
# recording test MSE after every partial_fit call
for endPoint in range(5000, train_size, 5000):
batchX = data.iloc[startPoint:endPoint, 0]
batchY = data.iloc[startPoint:endPoint, 1]
text_embeddings = google_encoder(batchX.tolist())
X_features_guse = pd.DataFrame(np.array(text_embeddings).tolist())
mse_scores_curr = list()
for z in range(final_num_iter):
SGD.partial_fit(X_features_guse, batchY, classes=[0, 1])
mse_scores_curr.append(mean_squared_error(test_data['label'], SGD.predict(X_test_features)))
mse_scores.append(mse_scores_curr)
del text_embeddings
del X_features_guse
print(endPoint, end=" ")
startPoint = endPoint
# lowest test MSE reached within each batch's partial fits
minMseList = [min(x) for x in mse_scores]
fig = plt.figure(figsize=(20, 6))
plt.bar(range(len(minMseList)), minMseList, color='gold')
plt.xlabel("Batch index")
plt.ylabel("Minimum test MSE")
# iteration at which each batch reached its minimum test MSE
minMseIterNum = [int(np.argmin(scores)) for scores in mse_scores]
plt.figure(figsize=(10, 6))
plt.scatter(range(len(minMseIterNum)), minMseIterNum, marker="x", color='red')
plt.xlabel("Batch index")
plt.ylabel("Iteration of minimum MSE")
plt.show()
numIters = len(mse_scores)
cols = 3
rows = math.ceil(numIters / cols)
plt.figure(figsize=(20, 100))
# one panel per batch, showing how its test MSE evolved over the partial fits
for i in range(numIters):
    plt.subplot(rows, cols, i + 1)
    plt.plot(range(final_num_iter), mse_scores[i], color='green')
    plt.title("Batch {}".format(i))
plt.show()
print(classification_report(test_data['label'], SGD.predict(X_test_features)))
with open("suicide_analyser.pkl", "wb") as f:
    pickle.dump(SGD, f)
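# optional round-trip check: the reloaded model should reproduce the
# in-memory model's predictions exactly
reloaded = pickle.load(open("suicide_analyser.pkl", "rb"))
assert (reloaded.predict(X_test_features) == SGD.predict(X_test_features)).all()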
cf_matrix = confusion_matrix(test_data['label'], SGD.predict(X_test_features))
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='')
# reload the persisted classifier once and reuse it for every prediction
with open("suicide_analyser.pkl", "rb") as f:
    model = pickle.load(f)

def suicide_predict(text):
    text = clean_text_string(text)
    embeddings = google_encoder([text])
    X_features = pd.DataFrame(np.array(embeddings).tolist())
    return "Not Suicidal" if model.predict(X_features)[0] == 0 else "Suicidal"
suicide_predict("I don't see myself living past tonight")
suicide_predict("You're going to kill me with your funny jokes")