from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
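# "all" makes the notebook display every bare expression in a cell, not just the last one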
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pprint
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
import sklearn
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
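nltk.download('omw-1.4')  # multilingual wordnet data; recent NLTK releases may need this for WordNetLemmatizer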
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
print("\nAll packages imported")
speech_data = {}
home_dir = './ts_edits'
fname = 'p001_transcript.txt'
with open(f'{home_dir}/{fname}','r') as fh:
    file_data = []
    content = fh.readlines()
    for line in content:
        txt_parts = line.split(":")
        if "CR" not in txt_parts:
            txt = txt_parts[-1].replace('\n','')
            file_data.append(txt)
            #print(txt)
speech_data[fname] = " ".join(file_data)
#pprint.pprint(speech_data)
# print(speech_data)
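# the single-file logic above is generalized into a reusable loader below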
def load_transcript_text(home_dir='./ts_edits', interviewer_id="CR"):
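    """Read every transcript in home_dir, keep only lines not spoken by the
    interviewer (speaker label given by interviewer_id), and return a dict
    with parallel 'filename' and 'text' lists."""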
    speech_data = {"filename": [], "text": []}
    for fname in os.listdir(home_dir):
        print(f"loading {fname}...")
        with open(f'{home_dir}/{fname}','r') as fh:
            file_data = []
            content = fh.readlines()
            for line in content:
                txt_parts = line.split(":")
                if interviewer_id not in txt_parts:
                    txt = txt_parts[-1].replace('\n','')
                    file_data.append(txt)
                    #print(txt)
        speech_data['filename'].append(fname)
        speech_data['text'].append(" ".join(file_data))
    return speech_data
interview_dict = load_transcript_text()
df = pd.DataFrame(interview_dict)
df['word_ct'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df.word_ct.describe()
# set counter for saved images
ct = 0
freq = pd.Series(' '.join(df['text']).split()).value_counts()
top_freq = freq[:20]
low_freq = freq[-20:]
print(f"most frequent words (pre-filtering):\n{top_freq}")
stop_words = set(stopwords.words("english"))
new_stop_words = ["yeah","so","cuz","get","sure","really","would","thing","think",
                  "could","also","stuff","good","much","maybe","kind","like","second","third","fourth",
                  "something","say","mean","month","lot","on","know","one","okay",
                  "well","might","going","go","two","to","little","bit","first","year","take",
                  "probably","actually","blah"]
stop_words2 = stop_words.union(new_stop_words)
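# quick sanity check (illustrative): the custom terms should now be part of the combined stop-word set
print(f"{len(stop_words)} default stop-words extended to {len(stop_words2)} after adding custom terms")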
# stop_words2 = set(stopwords.words("english"))
# new_stop_words2 = ["app","food","love","like","good","great","really"]
# stop_words2 = stop_words2.union(new_stop_words2)
corpus = []
n_interviews = df.shape[0]
n_interviews
for i in range(n_interviews):
    #Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    #Convert to lowercase
    text = text.lower()
    #remove tags
    text = re.sub("</?.*?>", " <> ", text)
    #remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    ##Convert to list from string
    text = text.split()
    ##Stemming (the stemmer is instantiated but not applied; lemmatisation is used instead)
    ps = PorterStemmer()
    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in stop_words2]
    text = " ".join(text)
    corpus.append(text)
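# illustrative peek (not in the original notebook) at the cleaned text for the first interview
corpus[0][:200]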
#Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
#% matplotlib inline
ct += 1
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words2,
    max_words=100,
    max_font_size=50,
    random_state=42,
).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig(f"./emerse_wordcloud{ct}.png", dpi=1500)
# using standard stopwords set
cv = CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))
X = cv.fit_transform(corpus)
# using custom stopwords set
cv2 = CountVectorizer(max_df=0.8,stop_words=stop_words2, max_features=10000, ngram_range=(1,3))
X2 = cv2.fit_transform(corpus)
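# illustrative check (not in the original analysis): compare vocabulary sizes under the two stop-word sets
print(f"vocabulary size with default stop-words: {len(cv.vocabulary_)}, with custom stop-words: {len(cv2.vocabulary_)}")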
k1 = list(cv.vocabulary_.keys())[0:10]
k2 = list(cv2.vocabulary_.keys())[0:10]
print(f"Count Vectorization with default stop-words: {k1}\n\nCount Vectorization with updated stop-words: {k2}")
#Most frequently occurring words
def get_top_n_words(corpus, stop_word_selection, n=None):
    vec = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, max_features=10000).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
#Convert most freq words (default stop-word set) to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, stop_words, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30);
#Convert most freq words (custom stop-word set) to dataframe for plotting bar plot
top_words2 = get_top_n_words(corpus, stop_words2, n=20)
top_df2 = pd.DataFrame(top_words2)
top_df2.columns=["Word", "Freq"]
#Barplot of most freq words
sns.set(rc={'figure.figsize':(13,8)})
g2 = sns.barplot(x="Word", y="Freq", data=top_df2)
g2.set_xticklabels(g2.get_xticklabels(), rotation=30);
#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, stop_word_selection, n=None):
    vec1 = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, ngram_range=(2,2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top2_words = get_top_n2_words(corpus, stop_words, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45);
top2_words = get_top_n2_words(corpus, stop_words2, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45);
#Most frequently occurring Tri-grams
def get_top_n3_words(corpus, stop_word_selection, n=None):
    vec1 = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, ngram_range=(3,3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top3_words = get_top_n3_words(corpus, stop_words, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
top3_words = get_top_n3_words(corpus, stop_words2, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
#Most frequently occurring 4-grams
def get_top_n4_words(corpus, stop_word_selection, n=None):
    vec1 = CountVectorizer(max_df=0.8,stop_words=stop_word_selection, ngram_range=(4,4), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top4_words = get_top_n4_words(corpus, stop_words, n=20)
top4_df = pd.DataFrame(top4_words)
top4_df.columns=["4-gram", "Freq"]
print(top4_df)
#Barplot of most freq 4-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="4-gram", y="Freq", data=top4_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
top4_words = get_top_n4_words(corpus, stop_words2, n=20)
top4_df = pd.DataFrame(top4_words)
top4_df.columns=["4-gram", "Freq"]
print(top4_df)
#Barplot of most freq 4-grams
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="4-gram", y="Freq", data=top4_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
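# The four get_top_n*_words helpers above differ only in their ngram_range; a single
# parametrized version (a sketch, not part of the original analysis) could replace them:
def get_top_ngrams(corpus, stop_word_selection, ngram_range=(1,1), max_features=2000, n=None):
    vec = CountVectorizer(max_df=0.8, stop_words=stop_word_selection,
                          ngram_range=ngram_range, max_features=max_features).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]
# e.g. get_top_ngrams(corpus, stop_words2, ngram_range=(2,2), n=20) reproduces the custom-stop-word bi-gram table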
import nltk.collocations as nc
bigram_measures = nc.BigramAssocMeasures()
trigram_measures = nc.TrigramAssocMeasures()
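# collocation finders score adjacent word pairs/triples; PMI favors combinations that co-occur
# far more often than their individual frequencies would predict, while raw_freq simply ranks by count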
finder = nc.BigramCollocationFinder.from_words(" ".join(corpus).split())
finder.apply_freq_filter(5)
bi_list = [" ".join(b) for b in finder.nbest(bigram_measures.pmi,20)]
bi_list
tfinder = nc.TrigramCollocationFinder.from_words(" ".join(corpus).split())
#tfinder.apply_word_filter(lambda x: x in stop_words2)
tfinder.apply_freq_filter(3)
# len(tfinder.nbest(trigram_measures.pmi,10000))
# print()
tri_list = [" ".join(x) for x in tfinder.nbest(trigram_measures.raw_freq,20)]
tri_list
# uncomment and manually edit the lists below to choose keyphrases; otherwise use the last line in this cell
# k_bigram = ['bone marrow',
# 'cardiac surgery',
# 'carpal tunnel',
# 'tunnel syndrome',
# 'machine redacted',
# 'virtual machine',
# 'infectious disease',
# 'uc irvine',
# 'blah blah',
# 'quality improvement',
# 'current status',
# 'faculty member',
# 'challenging aspect',
# 'icd code',
# 'irb approval',
# 'playing around',
# 'biomedical informatics',
# 'electronic health',
# 'pharmacy school',
# 'sql server']
# k_trigram = ['medical record system',
# 'done better make',
# 'temporary patient list',
# 'better make implementation',
# 'carpal tunnel syndrome',
# 'electronic health record',
# 'four year ago',
# 'free text search',
# 'full text search',
# 'open source tool',
# 'right right sound',
# 'clinical data warehouse',
# 'current status emerse',
# 'done done better',
# 'medical record data',
# 'self service tool',
# 'text search capability',
# 'back local patient',
# 'bone marrow transplant']
#keyphrases = k_bigram + k_trigram
# automatically use top bi-grams and tri-grams as keyphrases
keyphrases = bi_list + tri_list
# reorganize df
if 'word_ct' in list(df.keys()):
    df = df.drop('word_ct',axis=1)
# add new column for each factor category and init with 0-value
new_cols = keyphrases
df[new_cols] = 0
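# each keyphrase now has its own zero-initialized count column, filled in per interview below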
df.head()
df.loc[0,'text']
print("")
t = corpus[0]
t
print("")
test_phrase = 'nurse'
x = t.count(test_phrase)
print(f"{x} occurrences of the phrase `{test_phrase}` in {df.loc[0,'filename']}")
for k in keyphrases:
    print(f"searching for n-gram: {k}")
    for lin in range(len(corpus)):
        try:
            text = corpus[lin]
            k_ct = text.count(k)
            df.loc[lin,k] = k_ct
        except KeyError as ke:
            print(f"key not found: {ke}")
        except ValueError as ve:
            print(f"value of lin ({lin}) is not a recognized line in dataframe: {ve}")
        except IndexError as ie:
            print(f"value of lin ({lin}) is out of range of corpus: {ie}")
        else:
            print(f"found {k_ct} occurrences of {k} in vectorized text of {df.loc[lin,'filename']}")
df.to_csv('./emerse_text_features.csv',index=False)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('./emerse_text_features.csv')
factors_df = df.drop(labels=['filename','text'],axis=1)
factors_df.head()
plt.figure(figsize=(25,25))
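# pairwise correlations (Pearson by default) between the keyphrase count columns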
corr = factors_df.corr()
sns.heatmap(corr, annot=True, cmap="Blues")
print("\ntext feature correlations heatmap")
plt.show()