import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import math
from wordcloud import WordCloud
import re
import string
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
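# quick NER sanity check on a stock example sentence (illustrative only);
# the large English model should tag "Apple" as an ORG entity here
[(ent.text, ent.label_) for ent in nlp("Apple is looking at buying a U.K. startup").ents]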
data = pd.read_csv("data_suicide_detection.csv")
data.head()
data.drop(columns=["Unnamed: 0"], inplace=True)
data.columns = ["text", "label"]
data.head()
data.isnull().values.any()
print("Rows: {}".format(data.shape[0]))
# count ORG entities mentioned in the first 10,000 suicide-labelled posts
orgs = dict()
for index, row in data[:10000].iterrows():
    if row['label'] == "suicide":
        doc = nlp(row['text'])
        for ent in doc.ents:
            if ent.label_ == "ORG":
                orgs[ent.text.lower()] = orgs.get(ent.text.lower(), 0) + 1
orgs = dict(sorted(orgs.items(), key=lambda item: item[1], reverse=True))  # most frequent first
orgs
# drop very short names and one-off mentions
orgs = {k: v for k, v in orgs.items() if len(k) > 3 and v > 1}
orgs
# build the word cloud straight from the frequency dict instead of
# materialising a repeated-word string
wordcloud = WordCloud(collocations=False, background_color='white').generate_from_frequencies(orgs)
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words("english")
custom_stopwords = ["RT"]
wn = nltk.WordNetLemmatizer()
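# WordNetLemmatizer reduces inflected forms to a base form, e.g.:
wn.lemmatize("feelings")  # -> 'feeling'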
def clean_text_string(text):
    text = re.sub(r"&#\d+;", "", text)     # remove numeric HTML entities (emoji)
    text = re.sub(r"@\w+[:\s]", "", text)  # remove mentioned usernames
    text = re.sub(r"@\w+;", "", text)      # remove leftover encoded-character artifacts
    text = re.sub(r"http\S+", "", text)    # remove URLs
    text = re.sub(r"\$\w+\$", "", text)    # remove $MENTION$ placeholders
    text = "".join(char for char in text if char not in string.punctuation)
    text = " ".join(re.split(r"\W+", text))  # collapse non-word separators into single spaces
    words = [
        word.lower()
        for word in text.split()
        if word.lower() not in stopwords and word not in custom_stopwords
    ]
    words = [wn.lemmatize(word) for word in words]
    # keep only lowercase ASCII letters and spaces
    return "".join(char for char in " ".join(words) if char in string.ascii_lowercase + " ")
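# sanity check of the cleaner on a made-up post (illustrative input only);
# the mention, URL, punctuation, and stopwords should all be stripped
clean_text_string("@user: I can't handle this anymore... http://example.com")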
data['text'] = data['text'].apply(clean_text_string)
data['label'] = data['label'].apply(lambda x: 0 if x == "non-suicide" else 1)
test_size = int(0.05*data.shape[0])
train_size = data.shape[0] - test_size
test_data = data.tail(test_size)
data = data.head(train_size)
print("Train Data Size: {}\nTest Data Size: {}".format(train_size, test_size))
google_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# embed the held-out test set once up front: the training loops below score
# the model against X_test_features after every partial_fit call
test_embeddings = google_encoder(test_data['text'].tolist())
X_test_features = pd.DataFrame(np.array(test_embeddings).tolist())
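# USE v4 maps each input text to a 512-dimensional embedding, e.g.:
google_encoder(["hello world"]).shape  # TensorShape([1, 512])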
SGD_initial = SGDClassifier(loss='log_loss')  # logistic regression; use loss='log' on scikit-learn < 1.1
# fit repeatedly on the first 5,000 rows to see how test MSE evolves
# with the number of partial_fit iterations
start_initial = 0
end_initial = 5000
batchX_initial = data.iloc[start_initial:end_initial, 0]
batchY_initial = data.iloc[start_initial:end_initial, 1]
text_embeddings_initial = google_encoder(batchX_initial.tolist())
X_features_guse_initial = pd.DataFrame(np.array(text_embeddings_initial).tolist())
mse_initial = list()
for z in range(1000):
SGD_initial.partial_fit(X_features_guse_initial, batchY_initial, classes=[0, 1])
mse_initial.append(
mean_squared_error(test_data["label"], SGD_initial.predict(X_test_features))
)
fig, axs = plt.subplots(figsize=(20, 6))
axisX = range(0, 1000, 50)
axisY = np.abs(mse_initial[0:1000:50])
plt.plot(axisX, axisY, "o--")
for x, y in zip(axisX, axisY):
label = "{}".format(x)
plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 10), ha="center")
axs.set_xlabel("Partial fit ending iterations", size=10)
axs.set_ylabel("Mean squared error", size=10)
axs.set_title("Mean Squared Error vs Iteration", fontsize=17.5)
fig.set_size_inches(18.5, 10.5)  # resize and save before rendering so the file isn't blank
fig.savefig("Mean Squared Error vs Iteration.png", dpi=100)
plt.show()
final_num_iter = 150  # partial-fit passes per batch, chosen from the MSE-vs-iteration plot above
SGD = SGDClassifier(loss='log_loss')  # 'log' on scikit-learn < 1.1
startPoint = 0
mse_scores = list()
# stream the training data through a fresh model in 5,000-row batches,
# recording test MSE after every partial_fit call
for endPoint in range(5000, train_size, 5000):
batchX = data.iloc[startPoint:endPoint, 0]
batchY = data.iloc[startPoint:endPoint, 1]
text_embeddings = google_encoder(batchX.tolist())
X_features_guse = pd.DataFrame(np.array(text_embeddings).tolist())
mse_scores_curr = list()
for z in range(final_num_iter):
SGD.partial_fit(X_features_guse, batchY, classes=[0, 1])
mse_scores_curr.append(mean_squared_error(test_data['label'], SGD.predict(X_test_features)))
mse_scores.append(mse_scores_curr)
del text_embeddings
del X_features_guse
print(endPoint, end=" ")
startPoint = endPoint
# lowest test MSE reached within each batch's partial fits
minMseList = [min(x) for x in mse_scores]
fig = plt.figure(figsize=(20, 6))
plt.bar(range(len(minMseList)), minMseList, color='gold')
plt.xlabel("Batch index")
plt.ylabel("Minimum test MSE")
# iteration at which each batch reached its minimum test MSE
minMseIterNum = [int(np.argmin(scores)) for scores in mse_scores]
plt.figure(figsize=(10, 6))
plt.scatter(range(len(minMseIterNum)), minMseIterNum, marker="x", color='red')
plt.xlabel("Batch index")
plt.ylabel("Iteration of minimum MSE")
plt.show()
numIters = len(mse_scores)
cols = 3
rows = math.ceil(numIters / cols)
plt.figure(figsize=(20, 100))
# one panel per batch, showing how its test MSE evolved over the partial fits
for i in range(numIters):
    plt.subplot(rows, cols, i + 1)
    plt.plot(range(final_num_iter), mse_scores[i], color='green')
    plt.title("Batch {}".format(i))
plt.show()
print(classification_report(test_data['label'], SGD.predict(X_test_features)))
with open("suicide_analyser.pkl", "wb") as f:
    pickle.dump(SGD, f)
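# optional round-trip check: the reloaded model should reproduce the
# in-memory model's predictions exactly
reloaded = pickle.load(open("suicide_analyser.pkl", "rb"))
assert (reloaded.predict(X_test_features) == SGD.predict(X_test_features)).all()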
cf_matrix = confusion_matrix(test_data['label'], SGD.predict(X_test_features))
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='')
# reload the persisted classifier once and reuse it for every prediction
with open("suicide_analyser.pkl", "rb") as f:
    model = pickle.load(f)

def suicide_predict(text):
    text = clean_text_string(text)
    embeddings = google_encoder([text])
    X_features = pd.DataFrame(np.array(embeddings).tolist())
    return "Not Suicidal" if model.predict(X_features)[0] == 0 else "Suicidal"
suicide_predict("I don't see myself living past tonight")
suicide_predict("You're going to kill me with your funny jokes")