import json
import pandas as pd
import spacy
modelSpacy = spacy.load('en_core_web_sm')  # small English model (the bare 'en' shortcut is deprecated)
with open("sentences.json") as sentencesFile, open("transcript.json") as transcriptFile:
    sentencesData = json.load(sentencesFile)
    transcriptData = json.load(transcriptFile)
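# Optional peek at the raw inputs. The field names relied on below ('data', 'speaker', 'sentence',
# 'monologues', 'elements', 'confidence') reflect how these particular JSON exports are structured.
print(list(sentencesData.keys()), list(transcriptData.keys()))
print(sentencesData['data'][0])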
sentences = {}
sentIdx = 0
docIdx = 0
currentSpeaker = sentencesData['data'][0]['speaker']
# A "document" here is a maximal run of consecutive sentences from the same speaker.
for line in sentencesData['data']:
    if line['speaker'] != currentSpeaker:
        docIdx += 1
        currentSpeaker = line['speaker']
    for sent in line['sentence']:
        sentences[sentIdx] = {"speaker": line['speaker'], "document": docIdx,
                              "sentence": sent, "sentWordCount": len(sent.split())}
        sentIdx += 1
sentencesDf = pd.DataFrame.from_dict(sentences, orient='index')
print("Number of sentences:", len(sentencesDf))
print("Number of documents:", len(sentencesDf.document.unique()))
print("Number of speaker:", len(sentencesDf.speaker.unique()), sentencesDf.speaker.unique())
sentencesDf.head()
sentencesDf.groupby(by='speaker').agg(documentQty=('document', 'nunique'), sentenceQty=('sentence', 'count'))
sentencesDf.groupby(by='document').agg(sentenceQty=('sentence', 'count')).sort_values(by='sentenceQty', ascending=False)
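# Optional extra view of the same DataFrame: average and longest sentence length per speaker,
# based on the sentWordCount column computed above.
sentencesDf.groupby(by='speaker').agg(avgWordCount=('sentWordCount', 'mean'), maxWordCount=('sentWordCount', 'max'))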
stopwords = modelSpacy.Defaults.stop_words
vocabulary = {}
# Build a word-level vocabulary from the raw transcript: frequency, summed ASR confidence
# and the list of speakers that used each lemma.
for line in transcriptData["monologues"]:
    for element in line["elements"]:
        if element["type"] == "text":
            value = modelSpacy(element["value"].lower())
            for word in value:
                lemmaWord = word.lemma_
                if lemmaWord not in vocabulary:
                    vocabulary[lemmaWord] = {"frequency": 1, "confidence": element["confidence"], "speakers": [line["speaker"]]}
                else:
                    vocabulary[lemmaWord]["frequency"] += 1
                    vocabulary[lemmaWord]["confidence"] += element["confidence"]
                    if line["speaker"] not in vocabulary[lemmaWord]["speakers"]:
                        vocabulary[lemmaWord]["speakers"].append(line["speaker"])
# Turn the summed confidences into averages and count distinct speakers per word.
for word in vocabulary:
    vocabulary[word]["confidence"] = vocabulary[word]["confidence"] / vocabulary[word]["frequency"]
    vocabulary[word]["speakerQty"] = len(vocabulary[word]["speakers"])
vocabularyDf = pd.DataFrame.from_dict(vocabulary, orient='index').reset_index().rename(columns={"index": "word"})
stopwords.add("-PRON-")  # spaCy 2.x lemmatises every pronoun to the placeholder "-PRON-"
vocabularyDf[~vocabularyDf["word"].isin(stopwords)].sort_values(by=['frequency'], ascending=False).head(10)
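# A possible follow-up on the same table: words that occur repeatedly but keep a low average ASR
# confidence are good candidates for the [MASK]-replacement step further down
# (the minimum frequency of 5 is an arbitrary choice).
vocabularyDf[(vocabularyDf["frequency"] >= 5) & (~vocabularyDf["word"].isin(stopwords))].sort_values(by='confidence').head(10)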
!pip install --upgrade gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases, LdaModel, CoherenceModel, nmf
def preprocessing(documents):
    # Lower-case, lemmatise, drop stopwords and punctuation, then append detected bigrams.
    documentsPreprocessed = []
    for doc in documents:
        docIn = modelSpacy(doc.lower())
        docOut = []
        for token in docIn:
            if token.lemma_ not in stopwords and not token.is_punct:
                docOut.append(token.lemma_)
        documentsPreprocessed.append(docOut)
    # Add bigrams that occur at least 5 times as extra tokens (e.g. "machine_learning").
    bigram = Phrases(documentsPreprocessed, min_count=5)
    for idx in range(len(documentsPreprocessed)):
        for token in bigram[documentsPreprocessed[idx]]:
            if '_' in token:
                documentsPreprocessed[idx].append(token)
    return documentsPreprocessed
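# Quick, purely illustrative spot-check of what preprocessing() returns for the first two
# sentences of the DataFrame built earlier.
print(preprocessing(sentencesDf['sentence'].head(2).to_list()))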
def computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary):
    # Compute perplexity (gensim's log_perplexity returns a per-word likelihood bound)
    print('Perplexity: ', ldaModel.log_perplexity(corpus))
    # Compute coherence score
    coherenceLdaModel = CoherenceModel(model=ldaModel, texts=documentsPreprocessed, dictionary=dictionary, coherence='c_v')
    coherenceLda = coherenceLdaModel.get_coherence()
    print('Coherence Score: ', coherenceLda)
    return ldaModel, documentsPreprocessed
def trainLDA(documents, num_topics=10, chunksize=2000, passes=20, iterations=100, eval_every=None):
    documentsPreprocessed = preprocessing(documents)
    dictionary = Dictionary(documentsPreprocessed)
    corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
    print('Number of unique tokens:', len(dictionary))
    print('Number of documents:', len(corpus))
    # Make an index-to-word dictionary (accessing dictionary[0] forces id2token to be built).
    temp = dictionary[0]
    id2word = dictionary.id2token
    ldaModel = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary)
    return ldaModel
def printBigram(documentsPreprocessed):
    # Collect and print the distinct bigram tokens (those containing "_") produced by Phrases.
    bigrams = []
    for doc in documentsPreprocessed:
        for token in doc:
            if len(token.split('_')) > 1 and token not in bigrams:
                bigrams.append(token)
    print(bigrams)
# Based on Documents
print("Topics based on documents:")
documents = sentencesDf[['document','sentence']].groupby(by='document')['sentence'].apply(lambda x: ' '.join(x))
ldaModel = trainLDA(documents)
ldaModel.print_topics(num_topics=-1, num_words=5)
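# printBigram() is defined above but never called; to inspect which bigrams Phrases detected in the
# document-level corpus you could run it like this (it re-runs preprocessing, so it is illustrative only).
printBigram(preprocessing(documents))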
# Based on Sentences
print("Topics based on sentences:")
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
ldaModel.print_topics(num_topics=-1, num_words=5)
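# A common next step (not part of the original flow): sweep num_topics and compare c_v coherence to
# pick a topic count. This sketch reuses preprocessing(), Dictionary, LdaModel and CoherenceModel from
# above; the topic range and the passes/iterations values are arbitrary choices.
def coherenceSweep(documents, topicRange=range(4, 16, 2)):
    documentsPreprocessed = preprocessing(documents)
    dictionary = Dictionary(documentsPreprocessed)
    corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
    temp = dictionary[0]  # force id2token to be built
    scores = {}
    for k in topicRange:
        model = LdaModel(corpus=corpus, id2word=dictionary.id2token, num_topics=k,
                         passes=10, iterations=100, alpha='auto', eta='auto')
        coherence = CoherenceModel(model=model, texts=documentsPreprocessed,
                                   dictionary=dictionary, coherence='c_v').get_coherence()
        scores[k] = coherence
        print("num_topics=%d  coherence=%.3f" % (k, coherence))
    return scores
# coherenceSweep(documentsBySentence)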
pos = ["NOUN", "PROPN", "ADV", "ADJ"]
def preprocessing(documents):
documentsPreprocessed = []
for _ in documents:
docIn = modelSpacy(_.lower())
docOut = []
for token in docIn:
if token.lemma_ not in stopwords and not token.is_punct and token.pos_ in pos:
docOut.append(token.lemma_)
documentsPreprocessed.append(docOut)
bigram = Phrases(documentsPreprocessed, min_count=5)
for idx in range(len(documentsPreprocessed)):
for token in bigram[documentsPreprocessed[idx]]:
if '_' in token:
documentsPreprocessed[idx].append(token)
return documentsPreprocessed
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
print("Topics:")
ldaModel.print_topics(num_topics=-1, num_words=5)
from gensim.models import EnsembleLda
documentsBySentence = sentencesDf['sentence'].to_list()
documentsPreprocessed = preprocessing(documentsBySentence)
dictionary = Dictionary(documentsPreprocessed)
corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
num_topics = 10
chunksize = 2000
passes = 20
iterations = 100
epsilon = 0.5
temp = dictionary[0]  # force id2token to be built
id2word = dictionary.id2token
num_models = 10
# Train an ensemble of LDA models and keep only the topics that are stable across them.
ensemble = EnsembleLda(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    epsilon=epsilon,
    num_models=num_models,
    topic_model_class='lda',
    iterations=iterations
)
print("Number of topics: ", len(ensemble.ttda))
print("Number of stable topics: ", len(ensemble.get_topics()))
ensemble.print_topics(num_topics=-1, num_words=10)
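# If too few or too many stable topics survive, the ensemble can be re-clustered with a different
# epsilon without retraining the underlying models via EnsembleLda.recluster(); the value 0.35 below
# is only an example.
# ensemble.recluster(eps=0.35)
# print("Number of stable topics: ", len(ensemble.get_topics()))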
import re
# Flatten the ensemble's topic descriptors into a list of unique words
# (bigram tokens such as "word_pair" are split back into their parts).
topics = []
for topic in ensemble.print_topics(num_topics=-1, num_words=20):
    for word in topic[1].split('+'):
        match = re.search('[a-z_]+', word)
        if match is None:
            continue
        for token in match[0].split('_'):
            if token not in topics:
                topics.append(token)
print(topics)
vocabularyDf[vocabularyDf["word"].isin(topics)].sort_values(by=['frequency'], ascending=False)
!pip install transformers
from transformers import pipeline
unmasker = pipeline('fill-mask', model='distilbert-base-cased')
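# Small sanity check of the fill-mask pipeline: it returns a ranked list of dicts containing
# 'sequence', 'token_str' and 'score' (the example sentence here is arbitrary).
unmasker("The meeting was mostly about the [MASK] of the project.")[:2]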
sentencesProcessed = []
threshold = 0.90
# Rebuild sentences from the raw transcript elements; whenever a word's ASR confidence falls below
# the threshold, mask it and ask the fill-mask model for alternative sentences.
for line in transcriptData["monologues"]:
    sentence = []
    confidence = []
    for element in line["elements"]:
        if element["type"] == 'punct' and element["value"] == '.':
            # End of sentence: keep the original and generate replacements for low-confidence words.
            sentence.append(element['value'])
            confidence.append(1.0)
            sentences = [" ".join(sentence)]
            for idx, conf in enumerate(confidence):
                if conf < threshold:
                    maskedSentence = sentence.copy()
                    maskedSentence[idx] = "[MASK]"
                    newSentences = unmasker(" ".join(maskedSentence))
                    for _ in newSentences[:2]:
                        sentences.append(_['sequence'])
            sentencesProcessed.append(sentences)
            sentence = []
            confidence = []
        else:
            if element["type"] == 'punct':
                # Keep non-space punctuation with full confidence.
                if element["value"] != ' ':
                    sentence.append(element['value'])
                    confidence.append(1.0)
            else:
                # Keep words together with the confidence reported by the transcription service.
                if element["type"] != 'unknown':
                    sentence.append(element['value'])
                    confidence.append(element['confidence'])
# Show, among the first 20 rebuilt sentences, those that received at least one suggested replacement.
for idx, sent in enumerate(sentencesProcessed[:20]):
    if len(sent) > 1:
        print("Sentence %d:" % idx, sent)
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
print("Original 0:", sentencesProcessed[4][0])
for idx, sent in enumerate(sentencesProcessed[4][1:]):
    print("Replacement %d: " % (idx+1), sent)
neighborsName = []
neighbors = []
# Use the other sentences among the first 10 (excluding sentence 4 itself) as the comparison set.
for idx, sent in enumerate(sentencesProcessed[0:10]):
    if idx != 4:
        print("Sentence %d:" % idx, sent)
        neighbors.append(sent[0])
        neighborsName.append("Sent %d" % idx)
sentenceEmb = model.encode(sentencesProcessed[4])
neighborsEmb = model.encode(neighbors)
from sklearn.metrics.pairwise import cosine_similarity
simDf = pd.DataFrame(cosine_similarity(sentenceEmb, neighborsEmb), columns=neighborsName)
simDf['sumSimilarity'] = simDf[neighborsName].sum(axis=1)
simDf
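# One way to read the similarity table (an interpretation, not part of the original pipeline):
# keep the candidate whose embeddings are, summed over the neighbouring sentences, the most similar.
bestIdx = simDf['sumSimilarity'].idxmax()
print("Most context-consistent candidate (row %d):" % bestIdx, sentencesProcessed[4][bestIdx])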