Suicide Detection

import spacy from spacy import displacy import pandas as pd import numpy as np import math from wordcloud import WordCloud import re import string import nltk import tensorflow as tf import tensorflow_hub as hub from sklearn.metrics import classification_report from sklearn.linear_model import SGDClassifier from sklearn.metrics import mean_squared_error from sklearn.metrics import confusion_matrix import seaborn as sns import matplotlib.pyplot as plt import pickle

# Download the `en_core_web_lg` spaCy pipeline. The leading "!" is an IPython
# shell escape, so this line only works when run inside a notebook.
!python -m spacy download en_core_web_lg

# Load the downloaded pipeline; `nlp` is used further down to extract named
# entities from the posts.
nlp = spacy.load("en_core_web_lg")

# Read the raw dataset and preview its first rows.
data = pd.read_csv("data_suicide_detection.csv")
data.head()

# Drop the "Unnamed: 0" column and give the two remaining columns fixed names.
data = data.drop(columns=["Unnamed: 0"])
data.columns = ["text", "label"]
data.head()

# True if any cell in the frame is missing.
data.isna().to_numpy().any()

print("Rows: {}".format(len(data)))

Rows: 232074

# Count ORG entity mentions (lower-cased) in the posts labelled "suicide"
# among the first 10,000 rows.
orgs = {}
for _, row in data[:10000].iterrows():
    if row["label"] != "suicide":
        continue
    for ent in nlp(row["text"]).ents:
        if ent.label_ != "ORG":
            continue
        org_name = ent.text.lower()
        orgs[org_name] = orgs.get(org_name, 0) + 1

# Order the counts from most to least frequent (ties keep their prior order).
orgs = dict(sorted(orgs.items(), key=lambda item: item[1], reverse=True))
orgs

# Prune noise: names of three characters or fewer, and names seen only once.
# Iterate over a snapshot so entries can be deleted from `orgs` while looping.
temp_orgs = orgs.copy()
for name, count in temp_orgs.items():
    if len(name) <= 3 or count == 1:
        del orgs[name]
orgs

# Build the word-cloud input: every organisation name is repeated once per
# counted occurrence, each repetition preceded by a single space.
text_wordcloud = "".join(
    " " + name for name, count in orgs.items() for _ in range(count)
)
text_wordcloud

# Generate the word cloud from the repeated organisation names and show it
# without axes on a 20x10 inch figure.
cloud_options = {"collocations": False, "background_color": "white"}
wordcloud = WordCloud(**cloud_options).generate(text_wordcloud)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

nltk.download("stopwords")
nltk.download("wordnet")

stopwords = nltk.corpus.stopwords.words("english")
custom_stopwords = ["RT"]
wn = nltk.WordNetLemmatizer()

# Lookup structures built once so the per-word and per-character checks in
# clean_text_string are constant-time instead of scanning lists.
_STOPWORD_SET = frozenset(stopwords)
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_ALLOWED_CHARS = frozenset(string.ascii_lowercase + " ")

# Patterns compiled once; raw strings avoid invalid-escape-sequence warnings.
_NUMERIC_ENTITY_RE = re.compile(r"&#\d+;")
_MENTION_RE = re.compile(r"@\w+[:\s]")
_AT_TOKEN_RE = re.compile(r"@\w+;")
_URL_RE = re.compile(r"http\S+")
_DOLLAR_TOKEN_RE = re.compile(r"\$\w+\$")
_NON_WORD_RE = re.compile(r"\W+")


def clean_text_string(text: str) -> str:
    """Normalise a raw post into lower-case, space-separated, lemmatised words.

    Steps, in order:
      1. strip numeric character references (``&#123;``), ``@name`` mentions,
         ``@name;`` tokens, URLs and ``$NAME$`` placeholders;
      2. delete ASCII punctuation and collapse every other non-word run into
         a single space;
      3. drop English stop words (compared case-insensitively) and the
         entries of ``custom_stopwords`` (compared exactly);
      4. lower-case and lemmatise what is left with the WordNet lemmatiser;
      5. keep only the characters ``a``-``z`` and the space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives the filtering.
    """
    text = _NUMERIC_ENTITY_RE.sub("", text)  # numeric character references
    text = _MENTION_RE.sub("", text)  # mentioned usernames
    # NOTE(review): the original comment described this as "removing
    # urlencoded characters", which suggests `&\w+;` (HTML entities) may have
    # been intended rather than `@\w+;` -- confirm before changing the pattern.
    text = _AT_TOKEN_RE.sub("", text)
    text = _URL_RE.sub("", text)  # URLs
    text = _DOLLAR_TOKEN_RE.sub("", text)  # $MENTION$ placeholders

    text = text.translate(_PUNCTUATION_TABLE)

    words = []
    # Replacing non-word runs by a space and splitting on whitespace yields
    # the same tokens as splitting on the runs and re-joining with spaces.
    for word in _NON_WORD_RE.sub(" ", text).split():
        lowered = word.lower()
        # Compare the lower-cased form against the (lower-case) NLTK list so
        # capitalised stop words such as "The" or "I" are removed as well.
        # `custom_stopwords` keeps its exact-match semantics ("RT" only).
        if lowered in _STOPWORD_SET or word in custom_stopwords:
            continue
        words.append(wn.lemmatize(lowered))

    text = " ".join(words)
    # Final whitelist removes digits, underscores and non-ASCII letters.
    return "".join(char for char in text if char in _ALLOWED_CHARS)

# Normalise every post with clean_text_string.
data["text"] = data["text"].apply(clean_text_string)

# Binary target: 0 for "non-suicide", 1 for every other label value.
data["label"] = data["label"].apply(lambda label: int(label != "non-suicide"))

# Hold out the last 5% of the rows as the test set; the leading rows stay in
# `data` and are used for training.
total_rows = data.shape[0]
test_size = int(0.05 * total_rows)
train_size = total_rows - test_size

test_data = data.iloc[train_size:]
data = data.iloc[:train_size]

print("Train Data Size: {}\nTest Data Size: {}".format(train_size, test_size))

Train Data Size: 220471
Test Data Size: 11603

# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub. It is
# called below with a list of strings to obtain the text embeddings.
google_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

import sklearn

# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and
# 1.3 removed the old spelling, so pick whichever name this install accepts.
_sk_major, _sk_minor = (int(part) for part in sklearn.__version__.split(".")[:2])
_logistic_loss = "log_loss" if (_sk_major, _sk_minor) >= (1, 1) else "log"

# Exploratory classifier, used only to see how the error evolves over
# repeated partial fits on the first batch.
SGD_initial = SGDClassifier(loss=_logistic_loss)

# The loop below scores against the embedded test set, so make sure it exists
# before the first use (it was previously only created further down, which
# raised NameError on a top-to-bottom run). Skipped when already computed.
if "X_test_features" not in globals():
    test_embeddings = google_encoder(test_data["text"].tolist())
    X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())

# Embed the first 5,000 training rows (column 0 = text, column 1 = label).
start_initial = 0
end_initial = 5000
batchX_initial = data.iloc[start_initial:end_initial, 0]
batchY_initial = data.iloc[start_initial:end_initial, 1]
text_embeddings_initial = google_encoder(batchX_initial.tolist())
X_features_guse_initial = pd.DataFrame(np.array(text_embeddings_initial).tolist())

# Repeatedly fit on that single batch and record the test-set MSE after each
# pass; the resulting curve is plotted to choose the per-batch iteration count.
test_labels_initial = test_data["label"]
mse_initial = []
for _ in range(1000):
    SGD_initial.partial_fit(X_features_guse_initial, batchY_initial, classes=[0, 1])
    mse_initial.append(
        mean_squared_error(test_labels_initial, SGD_initial.predict(X_test_features))
    )

# Plot every 50th recorded MSE of the exploratory run, label each marker with
# its iteration number, then save the figure to a PNG.
fig, axs = plt.subplots(figsize=(20, 6))
axisX = range(0, 1000, 50)
axisY = np.abs(mse_initial[:1000:50])
axs.plot(axisX, axisY, "o--")

for iteration, error in zip(axisX, axisY):
    axs.annotate(
        str(iteration),
        (iteration, error),
        textcoords="offset points",
        xytext=(0, 10),
        ha="center",
    )

axs.set_xlabel("Partial fit ending iterations", size=10)
axs.set_title("Mean Squared Error vs Iteration", fontsize=17.5)
plt.show()

# The saved image uses a larger canvas than the one shown inline.
fig.set_size_inches(18.5, 10.5)
fig.savefig("Mean Squared Error vs Iteration.png", dpi=100)

import sklearn

# Number of partial_fit passes performed on every training batch.
final_num_iter = 150

# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and
# 1.3 removed the old spelling, so pick whichever name this install accepts.
_sk_major, _sk_minor = (int(part) for part in sklearn.__version__.split(".")[:2])
_logistic_loss = "log_loss" if (_sk_major, _sk_minor) >= (1, 1) else "log"

# Final classifier, trained incrementally batch by batch below.
SGD = SGDClassifier(loss=_logistic_loss)

# The loop scores against the embedded test set, so make sure it exists
# before the first use (it was previously only created further down).
# Skipped when it has already been computed.
if "X_test_features" not in globals():
    test_embeddings = google_encoder(test_data["text"].tolist())
    X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())

# Train incrementally in batches of 5,000 rows. Iterating over batch *starts*
# and clamping the end to train_size also covers the final batch, which the
# previous `range(5000, train_size, 5000)` silently skipped.
batch_size = 5000
test_labels = test_data["label"]
mse_scores = []  # one list of per-iteration test MSE values per batch
for startPoint in range(0, train_size, batch_size):
    endPoint = min(startPoint + batch_size, train_size)
    batchX = data.iloc[startPoint:endPoint, 0]  # text column
    batchY = data.iloc[startPoint:endPoint, 1]  # label column

    text_embeddings = google_encoder(batchX.tolist())
    X_features_guse = pd.DataFrame(np.array(text_embeddings).tolist())

    mse_scores_curr = []
    for _ in range(final_num_iter):
        SGD.partial_fit(X_features_guse, batchY, classes=[0, 1])
        mse_scores_curr.append(
            mean_squared_error(test_labels, SGD.predict(X_test_features))
        )
    mse_scores.append(mse_scores_curr)

    # Release the batch embeddings before the next batch is encoded.
    del text_embeddings
    del X_features_guse
    print(endPoint, end=" ")

5000 10000 15000 20000 25000 30000 35000 40000 45000 50000 55000 60000 65000 70000 75000 80000 85000 90000 95000 100000 105000 110000 115000 120000 125000 130000 135000 140000 145000 150000 155000 160000 165000 170000 175000 180000 185000 190000 195000 200000 205000 210000 215000 220000

# Lowest test MSE reached while fitting each batch.
minMseList = list(map(min, mse_scores))

# One gold bar per batch, in training order.
fig = plt.figure(figsize=(20, 6))
plt.bar(list(range(len(minMseList))), minMseList, color="gold")

# For each batch, the iteration index at which its lowest MSE first occurred.
minMseIterNum = [
    batch_scores.index(min(batch_scores)) for batch_scores in mse_scores
]

plt.figure(figsize=(10, 6))
plt.scatter(range(len(minMseIterNum)), minMseIterNum, marker="x", color="red")
plt.show()

# Draw one small MSE-vs-iteration curve per training batch, three per row.
numIters = len(mse_scores)
cols = 3
rows = math.ceil(numIters / cols)

plt.figure(figsize=(20, 100))
for i, batch_scores in enumerate(mse_scores):
    ax = plt.subplot(rows, cols, i + 1)
    # Plot this batch's own scores. The previous code drew the exploratory
    # `mse_initial[0:150]` curve in every subplot, so all panels were identical.
    # The x-range follows the recorded length instead of a hard-coded 150.
    ax.plot(range(len(batch_scores)), np.abs(batch_scores), color="green")
plt.show()

# Embed the held-out texts and convert them into a DataFrame of features,
# using the same nested-list conversion as the training batches.
test_texts = test_data["text"].tolist()
test_embeddings = google_encoder(test_texts)
X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())

# Per-class precision, recall and F1 of the trained classifier on the test set.
test_predictions = SGD.predict(X_test_features)
print(classification_report(test_data["label"], test_predictions))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93      5811
           1       0.93      0.92      0.92      5792

    accuracy                           0.93     11603
   macro avg       0.93      0.93      0.93     11603
weighted avg       0.93      0.93      0.93     11603

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises; the bare open() call it replaces
# never closed the handle.
with open("suicide_analyser.pkl", "wb") as model_file:
    pickle.dump(SGD, model_file)

# Confusion matrix of the final classifier on the test set. `cf_matrix` was
# used below without ever being computed. Pinning labels=[0, 1] fixes the
# layout to [[TN, FP], [FN, TP]], which is the order of `group_names`.
cf_matrix = confusion_matrix(
    test_data["label"], SGD.predict(X_test_features), labels=[0, 1]
)

group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
flat_counts = cf_matrix.flatten()
group_counts = ["{0:0.0f}".format(value) for value in flat_counts]
group_percentages = [
    "{0:.2%}".format(value) for value in flat_counts / np.sum(cf_matrix)
]

# Each cell shows its name, its absolute count and its share of all samples.
labels = [
    f"{v1}\n{v2}\n{v3}"
    for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]
labels = np.asarray(labels).reshape(2, 2)

# fmt="" because the annotations are already formatted strings.
sns.heatmap(cf_matrix, annot=labels, fmt="")

def suicide_predict(text, model_path="suicide_analyser.pkl"):
    """Classify a single piece of text as "Suicidal" or "Not Suicidal".

    The text is cleaned with ``clean_text_string``, embedded with
    ``google_encoder`` and scored by the pickled classifier stored at
    ``model_path``.

    Args:
        text: The raw text to classify.
        model_path: Location of the pickled classifier. Defaults to the file
            written by the training code in this notebook.

    Returns:
        "Not Suicidal" when the model predicts class 0, otherwise "Suicidal".
    """
    cleaned = clean_text_string(text)
    embeddings = google_encoder([cleaned])
    X_features = pd.DataFrame(np.array(embeddings).tolist())

    # NOTE: unpickling can execute arbitrary code; only load model files that
    # come from a trusted source. The context manager closes the handle,
    # which the previous bare open() call leaked on every prediction.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    pred = model.predict(X_features)[0]
    return "Not Suicidal" if pred == 0 else "Suicidal"

# Example call: a first-person statement about not living past tonight.
suicide_predict("I don't see myself living past tonight")

# Example call: "kill" used figuratively -- presumably included to see how the
# classifier handles non-literal wording.
suicide_predict("You're going to kill me with your funny jokes")