from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
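# "all" makes the notebook display every bare expression in a cell, not just the last one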
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pprint
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
import sklearn
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
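nltk.download('omw-1.4')  # multilingual wordnet data; recent NLTK releases may need this for WordNetLemmatizer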
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
print("\nAll packages imported")
speech_data = {}
home_dir = './ts_edits'
fname = 'p001_transcript.txt'
with open(f'{home_dir}/{fname}','r') as fh:
    file_data = []
    content = fh.readlines()
    for line in content:
        txt_parts = line.split(":")
        if "CR" not in txt_parts:
            txt = txt_parts[-1].replace('\n','')
            file_data.append(txt)
            #print(txt)
speech_data[fname] = " ".join(file_data)
#pprint.pprint(speech_data)
# print(speech_data)
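# the single-file logic above is generalized into a reusable loader below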
def load_transcript_text(home_dir='./ts_edits', interviewer_id="CR"):
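    """Read every transcript in home_dir, keep only lines not spoken by the
    interviewer (speaker label given by interviewer_id), and return a dict
    with parallel 'filename' and 'text' lists."""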
    speech_data = {"filename": [], "text": []}
    for fname in os.listdir(home_dir):
        print(f"loading {fname}...")
        with open(f'{home_dir}/{fname}','r') as fh:
            file_data = []
            content = fh.readlines()
            for line in content:
                txt_parts = line.split(":")
                if interviewer_id not in txt_parts:
                    txt = txt_parts[-1].replace('\n','')
                    file_data.append(txt)
                    #print(txt)
        speech_data['filename'].append(fname)
        speech_data['text'].append(" ".join(file_data))
    return speech_data
interview_dict = load_transcript_text()
df = pd.DataFrame(interview_dict)
df['word_ct'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df.word_ct.describe()
# set counter for saved images
ct = 0
freq = pd.Series(' '.join(df['text']).split()).value_counts()
top_freq = freq[:20]
low_freq = freq[-20:]
print(f"most frequent words (pre-filtering):\n{top_freq}")
stop_words = set(stopwords.words("english"))
new_stop_words = ["yeah","so","cuz","get","sure","really","would","thing","think",
                  "could","also","stuff","good","much","maybe","kind","like","second","third","fourth",
                  "something","say","mean","month","lot","on","know","one","okay",
                  "well","might","going","go","two","to","little","bit","first","year","take",
                  "probably","actually","blah"]
stop_words2 = stop_words.union(new_stop_words)
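# quick sanity check (illustrative): the custom terms should now be part of the combined stop-word set
print(f"{len(stop_words)} default stop-words extended to {len(stop_words2)} after adding custom terms")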
# stop_words2 = set(stopwords.words("english"))
# new_stop_words2 = ["app","food","love","like","good","great","really"]
# stop_words2 = stop_words2.union(new_stop_words2)
corpus = []
n_interviews = df.shape[0]
n_interviews
for i in range(n_interviews):
    #Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    #Convert to lowercase
    text = text.lower()
    #remove tags
    text = re.sub("</?.*?>", " <> ", text)
    #remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    ##Convert to list from string
    text = text.split()
    ##Stemming (the stemmer is instantiated but not applied; lemmatisation is used instead)
    ps = PorterStemmer()
    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in stop_words2]
    text = " ".join(text)
    corpus.append(text)
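# illustrative peek (not in the original notebook) at the cleaned text for the first interview
corpus[0][:200]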
#Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
#% matplotlib inline
ct += 1
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words2,
    max_words=100,
    max_font_size=50,
    random_state=42,
).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig(f"./emerse_wordcloud{ct}.png", dpi=1500)
# using standard stopwords set
cv = CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))
X = cv.fit_transform(corpus)
# using custom stopwords set
cv2 = CountVectorizer(max_df=0.8,stop_words=stop_words2, max_features=10000, ngram_range=(1,3))
X2 = cv2.fit_transform(corpus)
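# illustrative check (not in the original analysis): compare vocabulary sizes under the two stop-word sets
print(f"vocabulary size with default stop-words: {len(cv.vocabulary_)}, with custom stop-words: {len(cv2.vocabulary_)}")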
k1 = list(cv.vocabulary_.keys())[0:10]
k2 = list(cv2.vocabulary_.keys())[0:10]
print(f"Count Vectorization with default stop-words: {k1}\n\nCount Vectorization with updated stop-words: {k2}")
#Most frequently occurring words
def get_top_n_words(corpus, stop_word_selection, n=None):
    vec = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, max_features=10000).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
#Convert most freq words (default stop-word set) to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, stop_words, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30);
#Convert most freq words (custom stop-word set) to dataframe for plotting bar plot
top_words2 = get_top_n_words(corpus, stop_words2, n=20)
top_df2 = pd.DataFrame(top_words2)
top_df2.columns=["Word", "Freq"]
#Barplot of most freq words
sns.set(rc={'figure.figsize':(13,8)})
g2 = sns.barplot(x="Word", y="Freq", data=top_df2)
g2.set_xticklabels(g2.get_xticklabels(), rotation=30);
#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, stop_word_selection, n=None):
    vec1 = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, ngram_range=(2,2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top2_words = get_top_n2_words(corpus, stop_words, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45);
top2_words = get_top_n2_words(corpus, stop_words2, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45);
#Most frequently occurring Tri-grams
def get_top_n3_words(corpus, stop_word_selection, n=None):
    vec1 = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, ngram_range=(3,3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top3_words = get_top_n3_words(corpus, stop_words, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
top3_words = get_top_n3_words(corpus, stop_words2, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
#Most frequently occurring 4-grams
def get_top_n4_words(corpus, stop_word_selection, n=None):
    vec1 = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, ngram_range=(4,4), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top4_words = get_top_n4_words(corpus, stop_words, n=20)
top4_df = pd.DataFrame(top4_words)
top4_df.columns=["4-gram", "Freq"]
print(top4_df)
#Barplot of most freq 4-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="4-gram", y="Freq", data=top4_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
top4_words = get_top_n4_words(corpus, stop_words2, n=20)
top4_df = pd.DataFrame(top4_words)
top4_df.columns=["4-gram", "Freq"]
print(top4_df)
#Barplot of most freq 4-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="4-gram", y="Freq", data=top4_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
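# The four get_top_n*_words helpers above differ only in their ngram_range; a single
# parametrized version (a sketch, not part of the original analysis) could replace them:
def get_top_ngrams(corpus, stop_word_selection, ngram_range=(1,1), max_features=2000, n=None):
    vec = CountVectorizer(max_df=0.8, stop_words=stop_word_selection,
                          ngram_range=ngram_range, max_features=max_features).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]
# e.g. get_top_ngrams(corpus, stop_words2, ngram_range=(2,2), n=20) reproduces the custom-stop-word bi-gram table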
import nltk.collocations as nc
bigram_measures = nc.BigramAssocMeasures()
trigram_measures = nc.TrigramAssocMeasures()
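# collocation finders score adjacent word pairs/triples; PMI favors combinations that co-occur
# far more often than their individual frequencies would predict, while raw_freq simply ranks by count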
finder = nc.BigramCollocationFinder.from_words(" ".join(corpus).split())
finder.apply_freq_filter(5)
bi_list = [" ".join(b) for b in finder.nbest(bigram_measures.pmi,20)]
bi_list
tfinder = nc.TrigramCollocationFinder.from_words(" ".join(corpus).split())
#tfinder.apply_word_filter(lambda x: x in stop_words2)
tfinder.apply_freq_filter(3)
# len(tfinder.nbest(trigram_measures.pmi,10000))
# print()
tri_list = [" ".join(x) for x in tfinder.nbest(trigram_measures.raw_freq,20)]
tri_list
# uncomment and manually edit the lists below to choose keyphrases; otherwise use the last line in this cell
# k_bigram = ['bone marrow',
# 'cardiac surgery',
# 'carpal tunnel',
# 'tunnel syndrome',
# 'machine redacted',
# 'virtual machine',
# 'infectious disease',
# 'uc irvine',
# 'blah blah',
# 'quality improvement',
# 'current status',
# 'faculty member',
# 'challenging aspect',
# 'icd code',
# 'irb approval',
# 'playing around',
# 'biomedical informatics',
# 'electronic health',
# 'pharmacy school',
# 'sql server']
# k_trigram = ['medical record system',
# 'done better make',
# 'temporary patient list',
# 'better make implementation',
# 'carpal tunnel syndrome',
# 'electronic health record',
# 'four year ago',
# 'free text search',
# 'full text search',
# 'open source tool',
# 'right right sound',
# 'clinical data warehouse',
# 'current status emerse',
# 'done done better',
# 'medical record data',
# 'self service tool',
# 'text search capability',
# 'back local patient',
# 'bone marrow transplant']
#keyphrases = k_bigram + k_trigram
# automatically use top bi-grams and tri-grams as keyphrases
keyphrases = bi_list + tri_list
# reorganize df
if 'word_ct' in list(df.keys()):
    df = df.drop('word_ct',axis=1)
# add new column for each factor category and init with 0-value
new_cols = keyphrases
df[new_cols] = 0
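# each keyphrase now has its own zero-initialized count column, filled in per interview below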
df.head()
df.loc[0,'text']
print("")
t = corpus[0]
t
print("")
test_phrase = 'nurse'
x = t.count(test_phrase)
print(f"{x} occurrences of the phrase `{test_phrase}` in {df.loc[0,'filename']}")
for k in keyphrases:
    print(f"searching for n-gram: {k}")
    for lin in range(len(corpus)):
        try:
            text = corpus[lin]
            k_ct = text.count(k)
            df.loc[lin,k] = k_ct
        except KeyError as ke:
            print(f"key not found: {ke}")
        except ValueError as ve:
            print(f"value of lin ({lin}) is not a recognized line in dataframe: {ve}")
        except IndexError as ie:
            print(f"value of lin ({lin}) is out of range of corpus: {ie}")
        else:
            print(f"found {k_ct} occurrences of {k} in vectorized text of {df.loc[lin,'filename']}")
df.to_csv('./emerse_text_features.csv',index=False)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('./emerse_text_features.csv')
factors_df = df.drop(labels=['filename','text'],axis=1)
factors_df.head()
plt.figure(figsize=(25,25))
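# pairwise correlations (Pearson by default) between the keyphrase count columns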
corr = factors_df.corr()
sns.heatmap(corr, annot=True, cmap="Blues")
print("\ntext feature correlations heatmap")
plt.show()