import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import math
from wordcloud import WordCloud
import re
import string
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
# Load spaCy's large English pipeline, installing it first when it is missing.
# The download goes through spaCy's own Python entry point rather than an
# IPython "!" shell escape, so this file is also valid outside a notebook and
# the package is installed for the interpreter that is actually running.
SPACY_MODEL = "en_core_web_lg"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the pipeline package is not installed.
    from spacy.cli import download as _spacy_download

    _spacy_download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)
# Read the raw dataset, discard the "Unnamed: 0" column that came with the
# CSV, and give the two remaining columns short names.
data = pd.read_csv("data_suicide_detection.csv")
data.head()
data = data.drop(columns=["Unnamed: 0"])
data.columns = ["text", "label"]
data.head()
# Notebook-style check: evaluates to True if any cell is missing.
data.isna().to_numpy().any()
print(f"Rows: {len(data)}")
# Output: Rows: 232074
# Tally how often each organisation (spaCy "ORG" entity, lower-cased) is
# mentioned in the suicide-labelled posts among the first 10,000 rows.
orgs = {}
for _, row in data[:10000].iterrows():
    if row["label"] != "suicide":
        continue
    for ent in nlp(row["text"]).ents:
        if ent.label_ != "ORG":
            continue
        org_name = ent.text.lower()
        orgs[org_name] = orgs.get(org_name, 0) + 1

# Rebuild the dict so the most frequently mentioned organisations come first.
orgs = dict(sorted(orgs.items(), key=lambda pair: pair[1], reverse=True))
orgs
# Drop noise from the tally: names of three characters or fewer, and names
# that were seen only once.
temp_orgs = dict(orgs)  # snapshot of the unfiltered counts
orgs = {
    org_name: count
    for org_name, count in temp_orgs.items()
    if len(org_name) > 3 and count != 1
}
orgs
# Build the word-cloud input: every organisation name repeated once per
# mention, each occurrence preceded by a single space. A single join avoids
# growing the string one "+=" at a time inside a nested loop.
text_wordcloud = "".join(
    (" " + org_name) * count for org_name, count in orgs.items()
)
text_wordcloud
# Render the organisation mentions as a word cloud on a white background and
# show it without axes.
cloud_builder = WordCloud(collocations=False, background_color="white")
wordcloud = cloud_builder.generate(text_wordcloud)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Fetch the NLTK resources used for text cleaning, then prepare the English
# stop-word list, the extra tokens to discard, and the WordNet lemmatiser.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

stopwords = nltk.corpus.stopwords.words("english")
custom_stopwords = ["RT"]
wn = nltk.WordNetLemmatizer()
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# The only characters allowed to survive cleaning: a-z and the space.
_ALLOWED_CHARS = frozenset(string.ascii_lowercase + " ")


def clean_text_string(text):
    """Normalise a raw post into lower-case, lemmatised, space-separated words.

    Steps, in order: strip numeric HTML entities, @mentions, URLs and
    ``$...$`` placeholders; delete ASCII punctuation; drop English stop
    words and the tokens listed in ``custom_stopwords``; lower-case and
    lemmatise what is left; finally discard every character other than
    ``a``-``z`` and the space.

    Stop words are matched case-insensitively, so capitalised forms such as
    "I" or "The" are removed as well. ``custom_stopwords`` entries are
    matched against the token exactly as written (e.g. "RT").

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned string. Because digits and other non a-z characters are
        removed only at the very end, the result may contain consecutive
        spaces where such tokens used to be.
    """
    # Raw strings: "\d", "\w" and "\$" are invalid escapes in plain literals.
    text = re.sub(r"&#\d+;", "", text)  # numeric HTML entities (encoded emoji)
    text = re.sub(r"@\w+[:\s]", "", text)  # mentioned usernames
    # NOTE(review): this was labelled "urlencoded characters", but the
    # pattern matches "@word;" — confirm whether "&\w+;" was intended.
    text = re.sub(r"@\w+;", "", text)
    text = re.sub(r"http\S+", "", text)  # URLs
    text = re.sub(r"\$\w+\$", "", text)  # $MENTION$ placeholders

    text = text.translate(_PUNCTUATION_TABLE)
    # Collapse every run of non-word characters into one separator.
    tokens = " ".join(re.split(r"\W+", text)).split()

    # Build the lookup set once per call instead of scanning the list per word.
    stop_set = set(stopwords)
    words = [
        token.lower()
        for token in tokens
        if token.lower() not in stop_set and token not in custom_stopwords
    ]

    text = " ".join(wn.lemmatize(word) for word in words)
    return "".join(char for char in text if char in _ALLOWED_CHARS)
# Clean every post and encode the label as an integer:
# 0 for "non-suicide", 1 for anything else.
data["text"] = data["text"].map(clean_text_string)
data["label"] = data["label"].map(lambda raw: int(raw != "non-suicide"))

# Hold out the last 5% of rows for testing; the leading rows stay in `data`
# and are used for training.
test_size = int(0.05 * len(data))
train_size = len(data) - test_size
test_data = data.iloc[train_size:]
data = data.iloc[:train_size]
print(f"Train Data Size: {train_size}\nTest Data Size: {test_size}")
# Output: Train Data Size: 220471
# Output: Test Data Size: 11603
# Probe run: fit repeatedly on the first 5,000 training rows and record the
# test-set error after every pass, to see how the error evolves with the
# number of partial_fit passes.
google_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Embed the held-out set before the loop. The loop scores against
# X_test_features, which is otherwise only created near the end of the file,
# so running the file top to bottom would stop here with a NameError.
test_embeddings = google_encoder(test_data["text"].tolist())
X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())

# NOTE(review): recent scikit-learn releases spell this loss "log_loss" —
# confirm against the installed version.
SGD_initial = SGDClassifier(loss='log')
start_initial = 0
end_initial = 5000
batchX_initial = data.iloc[start_initial:end_initial, 0]
batchY_initial = data.iloc[start_initial:end_initial, 1]
text_embeddings_initial = google_encoder(batchX_initial.tolist())
X_features_guse_initial = pd.DataFrame(np.array(text_embeddings_initial).tolist())

mse_initial = []
for _ in range(1000):
    SGD_initial.partial_fit(X_features_guse_initial, batchY_initial, classes=[0, 1])
    mse_initial.append(
        mean_squared_error(test_data["label"], SGD_initial.predict(X_test_features))
    )
# Plot every 50th error value of the probe run against its iteration number,
# labelling each marker with that iteration number.
fig, axs = plt.subplots(figsize=(20, 6))
axisX = range(0, 1000, 50)
axisY = np.abs(mse_initial[0:1000:50])
axs.plot(axisX, axisY, "o--")
for iteration, error in zip(axisX, axisY):
    axs.annotate(
        f"{iteration}",
        (iteration, error),
        textcoords="offset points",
        xytext=(0, 10),
        ha="center",
    )
axs.set_xlabel("Partial fit ending iterations", size=10)
axs.set_title("Mean Squared Error vs Iteration", fontsize=17.5)
plt.show()

# After displaying, resize the figure and write it to disk.
fig.set_size_inches(18.5, 10.5)
fig.savefig("Mean Squared Error vs Iteration.png", dpi=100)
# Main training run: walk through the training rows in batches of 5,000,
# embed each batch, and run `final_num_iter` partial_fit passes on it while
# recording the test-set error after every pass.
#
# NOTE: the loop scores against X_test_features (the embedded held-out set),
# which must already exist when this code runs.
final_num_iter = 150
# NOTE(review): recent scikit-learn releases spell this loss "log_loss" —
# confirm against the installed version.
SGD = SGDClassifier(loss='log')
batch_size = 5000
mse_scores = []
# Start indices are stepped and the end index clamped to train_size, so the
# trailing rows that do not fill a whole batch are trained on too, and an
# exact multiple of the batch size does not lose its last batch.
for startPoint in range(0, train_size, batch_size):
    endPoint = min(startPoint + batch_size, train_size)
    batchX = data.iloc[startPoint:endPoint, 0]
    batchY = data.iloc[startPoint:endPoint, 1]
    text_embeddings = google_encoder(batchX.tolist())
    X_features_guse = pd.DataFrame(np.array(text_embeddings).tolist())

    mse_scores_curr = []
    for _ in range(final_num_iter):
        SGD.partial_fit(X_features_guse, batchY, classes=[0, 1])
        mse_scores_curr.append(
            mean_squared_error(test_data['label'], SGD.predict(X_test_features))
        )
    mse_scores.append(mse_scores_curr)

    # Release the batch's embeddings before the next batch is encoded.
    del text_embeddings
    del X_features_guse
    print(endPoint, end=" ")
# Output: 5000 10000 15000 20000 25000 30000 35000 40000 45000 50000 55000 60000 65000 70000 75000 80000 85000 90000 95000 100000 105000 110000 115000 120000 125000 130000 135000 140000 145000 150000 155000 160000 165000 170000 175000 180000 185000 190000 195000 200000 205000 210000 215000 220000
# Bar chart: the lowest test MSE reached within each training batch.
minMseList = [min(batch_scores) for batch_scores in mse_scores]
fig = plt.figure(figsize=(20, 6))
plt.bar(range(len(minMseList)), minMseList, color='gold')

# Scatter plot: the 0-based pass at which each batch first hit that minimum.
minMseIterNum = [
    batch_scores.index(min(batch_scores)) for batch_scores in mse_scores
]
plt.figure(figsize=(10, 6))
plt.scatter(range(len(minMseIterNum)), minMseIterNum, marker="x", color='red')
plt.show()
# Grid of small plots, three per row: one error curve per training batch.
numIters = len(mse_scores)
cols = 3
rows = math.ceil(numIters / cols)
plt.figure(figsize=(20, 100))
for i, batch_scores in enumerate(mse_scores):
    ax = plt.subplot(rows, cols, i + 1)
    # Draw this batch's own curve. Plotting mse_initial here would repeat
    # the probe run's curve in every panel.
    ax.plot(range(len(batch_scores)), np.abs(batch_scores), color='green')
plt.show()
# Embed the held-out posts and print per-class precision, recall and F1 for
# the final model.
test_embeddings = google_encoder(test_data['text'].tolist())
X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())
test_predictions = SGD.predict(X_test_features)
print(classification_report(test_data['label'], test_predictions))
# Output:
#               precision    recall  f1-score   support
#            0       0.92      0.93      0.93      5811
#            1       0.93      0.92      0.92      5792
#     accuracy                           0.93     11603
#    macro avg       0.93      0.93      0.93     11603
# weighted avg       0.93      0.93      0.93     11603
# Persist the trained classifier. The context manager flushes and closes the
# file as soon as the dump finishes instead of leaving the handle open.
with open("suicide_analyser.pkl", 'wb') as model_file:
    pickle.dump(SGD, model_file)
# Confusion-matrix heatmap for the final model on the held-out set.
# cf_matrix has to be computed here; nothing else in the file defines it.
# labels=[0, 1] pins the matrix to 2x2 with rows = true class and
# columns = predicted class, which is the order group_names assumes.
cf_matrix = confusion_matrix(
    test_data['label'], SGD.predict(X_test_features), labels=[0, 1]
)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = [
    "{0:.2%}".format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)
]
# Each cell shows its name, its count and its share of all test rows.
labels = [
    f"{v1}\n{v2}\n{v3}"
    for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='')
def suicide_predict(text):
    """Classify one piece of text as "Suicidal" or "Not Suicidal".

    The text is cleaned with ``clean_text_string``, embedded with
    ``google_encoder``, and scored by the classifier stored in
    ``suicide_analyser.pkl``, which is re-read on every call.

    Args:
        text: The raw text to classify.

    Returns:
        "Not Suicidal" when the model predicts class 0, otherwise "Suicidal".
    """
    text = clean_text_string(text)
    embeddings = google_encoder([text])
    X_features = pd.DataFrame(np.array(embeddings).tolist())
    # NOTE: unpickling can execute arbitrary code; only load a model file
    # that this project wrote itself.
    # The context manager closes the file handle after loading.
    with open('suicide_analyser.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    pred = model.predict(X_features)[0]
    return "Not Suicidal" if pred == 0 else "Suicidal"
# Try the predictor on two sample sentences: a direct statement about not
# living past tonight, then a figure of speech that contains the word "kill".
suicide_predict("I don't see myself living past tonight")
suicide_predict("You're going to kill me with your funny jokes")