import json
import pandas as pd
import spacy
modelSpacy = spacy.load('en_core_web_sm')  # small English model (the bare 'en' shortcut is deprecated)
with open("sentences.json") as sentencesFile, open("transcript.json") as transcriptFile:
    sentencesData = json.load(sentencesFile)
    transcriptData = json.load(transcriptFile)
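# Optional peek at the raw inputs. The field names relied on below ('data', 'speaker', 'sentence',
# 'monologues', 'elements', 'confidence') reflect how these particular JSON exports are structured.
print(list(sentencesData.keys()), list(transcriptData.keys()))
print(sentencesData['data'][0])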
sentences = {}
sentIdx = 0
docIdx = 0
currentSpeaker = sentencesData['data'][0]['speaker']
# A "document" here is a maximal run of consecutive sentences from the same speaker.
for line in sentencesData['data']:
    if line['speaker'] != currentSpeaker:
        docIdx += 1
        currentSpeaker = line['speaker']
    for sent in line['sentence']:
        sentences[sentIdx] = {"speaker": line['speaker'], "document": docIdx,
                              "sentence": sent, "sentWordCount": len(sent.split())}
        sentIdx += 1
sentencesDf = pd.DataFrame.from_dict(sentences, orient='index')
print("Number of sentences:", len(sentencesDf))
print("Number of documents:", len(sentencesDf.document.unique()))
print("Number of speaker:", len(sentencesDf.speaker.unique()), sentencesDf.speaker.unique())
sentencesDf.head()
sentencesDf.groupby(by='speaker').agg(documentQty=('document', 'nunique'), sentenceQty=('sentence', 'count'))
sentencesDf.groupby(by='document').agg(sentenceQty=('sentence', 'count')).sort_values(by='sentenceQty', ascending=False)
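# Optional extra view of the same DataFrame: average and longest sentence length per speaker,
# based on the sentWordCount column computed above.
sentencesDf.groupby(by='speaker').agg(avgWordCount=('sentWordCount', 'mean'), maxWordCount=('sentWordCount', 'max'))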
stopwords = modelSpacy.Defaults.stop_words
vocabulary = {}
# Build a word-level vocabulary from the raw transcript: frequency, summed ASR confidence
# and the list of speakers that used each lemma.
for line in transcriptData["monologues"]:
    for element in line["elements"]:
        if element["type"] == "text":
            value = modelSpacy(element["value"].lower())
            for word in value:
                lemmaWord = word.lemma_
                if lemmaWord not in vocabulary:
                    vocabulary[lemmaWord] = {"frequency": 1, "confidence": element["confidence"], "speakers": [line["speaker"]]}
                else:
                    vocabulary[lemmaWord]["frequency"] += 1
                    vocabulary[lemmaWord]["confidence"] += element["confidence"]
                    if line["speaker"] not in vocabulary[lemmaWord]["speakers"]:
                        vocabulary[lemmaWord]["speakers"].append(line["speaker"])
# Turn the summed confidences into averages and count distinct speakers per word.
for word in vocabulary:
    vocabulary[word]["confidence"] = vocabulary[word]["confidence"] / vocabulary[word]["frequency"]
    vocabulary[word]["speakerQty"] = len(vocabulary[word]["speakers"])
vocabularyDf = pd.DataFrame.from_dict(vocabulary, orient='index').reset_index().rename(columns={"index": "word"})
stopwords.add("-PRON-")  # spaCy 2.x lemmatises every pronoun to the placeholder "-PRON-"
vocabularyDf[~vocabularyDf["word"].isin(stopwords)].sort_values(by=['frequency'], ascending=False).head(10)
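# A possible follow-up on the same table: words that occur repeatedly but keep a low average ASR
# confidence are good candidates for the [MASK]-replacement step further down
# (the minimum frequency of 5 is an arbitrary choice).
vocabularyDf[(vocabularyDf["frequency"] >= 5) & (~vocabularyDf["word"].isin(stopwords))].sort_values(by='confidence').head(10)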
!pip install --upgrade gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases, LdaModel, CoherenceModel, nmf
def preprocessing(documents):
    # Lower-case, lemmatise, drop stopwords and punctuation, then append detected bigrams.
    documentsPreprocessed = []
    for doc in documents:
        docIn = modelSpacy(doc.lower())
        docOut = []
        for token in docIn:
            if token.lemma_ not in stopwords and not token.is_punct:
                docOut.append(token.lemma_)
        documentsPreprocessed.append(docOut)
    # Add bigrams that occur at least 5 times as extra tokens (e.g. "machine_learning").
    bigram = Phrases(documentsPreprocessed, min_count=5)
    for idx in range(len(documentsPreprocessed)):
        for token in bigram[documentsPreprocessed[idx]]:
            if '_' in token:
                documentsPreprocessed[idx].append(token)
    return documentsPreprocessed
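# Quick, purely illustrative spot-check of what preprocessing() returns for the first two
# sentences of the DataFrame built earlier.
print(preprocessing(sentencesDf['sentence'].head(2).to_list()))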
def computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary):
    # Compute perplexity (gensim's log_perplexity returns a per-word likelihood bound)
    print('Perplexity: ', ldaModel.log_perplexity(corpus))
    # Compute coherence score
    coherenceLdaModel = CoherenceModel(model=ldaModel, texts=documentsPreprocessed, dictionary=dictionary, coherence='c_v')
    coherenceLda = coherenceLdaModel.get_coherence()
    print('Coherence Score: ', coherenceLda)
    return ldaModel, documentsPreprocessed
def trainLDA(documents, num_topics=10, chunksize=2000, passes=20, iterations=100, eval_every=None):
    documentsPreprocessed = preprocessing(documents)
    dictionary = Dictionary(documentsPreprocessed)
    corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
    print('Number of unique tokens:', len(dictionary))
    print('Number of documents:', len(corpus))
    # Make an index-to-word dictionary (accessing dictionary[0] forces id2token to be built).
    temp = dictionary[0]
    id2word = dictionary.id2token
    ldaModel = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary)
    return ldaModel
def printBigram(documentsPreprocessed):
    # Collect and print the distinct bigram tokens (those containing "_") produced by Phrases.
    bigrams = []
    for doc in documentsPreprocessed:
        for token in doc:
            if len(token.split('_')) > 1 and token not in bigrams:
                bigrams.append(token)
    print(bigrams)
# Based on Documents
print("Topics based on documents:")
documents = sentencesDf[['document','sentence']].groupby(by='document')['sentence'].apply(lambda x: ' '.join(x))
ldaModel = trainLDA(documents)
ldaModel.print_topics(num_topics=-1, num_words=5)
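# printBigram() is defined above but never called; to inspect which bigrams Phrases detected in the
# document-level corpus you could run it like this (it re-runs preprocessing, so it is illustrative only).
printBigram(preprocessing(documents))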
# Based on Sentences
print("Topics based on sentences:")
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
ldaModel.print_topics(num_topics=-1, num_words=5)
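# A common next step (not part of the original flow): sweep num_topics and compare c_v coherence to
# pick a topic count. This sketch reuses preprocessing(), Dictionary, LdaModel and CoherenceModel from
# above; the topic range and the passes/iterations values are arbitrary choices.
def coherenceSweep(documents, topicRange=range(4, 16, 2)):
    documentsPreprocessed = preprocessing(documents)
    dictionary = Dictionary(documentsPreprocessed)
    corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
    temp = dictionary[0]  # force id2token to be built
    scores = {}
    for k in topicRange:
        model = LdaModel(corpus=corpus, id2word=dictionary.id2token, num_topics=k,
                         passes=10, iterations=100, alpha='auto', eta='auto')
        coherence = CoherenceModel(model=model, texts=documentsPreprocessed,
                                   dictionary=dictionary, coherence='c_v').get_coherence()
        scores[k] = coherence
        print("num_topics=%d  coherence=%.3f" % (k, coherence))
    return scores
# coherenceSweep(documentsBySentence)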
pos = ["NOUN", "PROPN", "ADV", "ADJ"]
def preprocessing(documents):
documentsPreprocessed = []
for _ in documents:
docIn = modelSpacy(_.lower())
docOut = []
for token in docIn:
if token.lemma_ not in stopwords and not token.is_punct and token.pos_ in pos:
docOut.append(token.lemma_)
documentsPreprocessed.append(docOut)
bigram = Phrases(documentsPreprocessed, min_count=5)
for idx in range(len(documentsPreprocessed)):
for token in bigram[documentsPreprocessed[idx]]:
if '_' in token:
documentsPreprocessed[idx].append(token)
return documentsPreprocessed
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
print("Topics:")
ldaModel.print_topics(num_topics=-1, num_words=5)
from gensim.models import EnsembleLda
documentsBySentence = sentencesDf['sentence'].to_list()
documentsPreprocessed = preprocessing(documentsBySentence)
dictionary = Dictionary(documentsPreprocessed)
corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
num_topics = 10
chunksize = 2000
passes = 20
iterations = 100
epsilon = 0.5
temp = dictionary[0]  # force id2token to be built
id2word = dictionary.id2token
num_models = 10
# Train an ensemble of LDA models and keep only the topics that are stable across them.
ensemble = EnsembleLda(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    epsilon=epsilon,
    num_models=num_models,
    topic_model_class='lda',
    iterations=iterations
)
print("Number of topics: ", len(ensemble.ttda))
print("Number of stable topics: ", len(ensemble.get_topics()))
ensemble.print_topics(num_topics=-1, num_words=10)
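# If too few or too many stable topics survive, the ensemble can be re-clustered with a different
# epsilon without retraining the underlying models via EnsembleLda.recluster(); the value 0.35 below
# is only an example.
# ensemble.recluster(eps=0.35)
# print("Number of stable topics: ", len(ensemble.get_topics()))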
import re
# Flatten the ensemble's topic descriptors into a list of unique words
# (bigram tokens such as "word_pair" are split back into their parts).
topics = []
for topic in ensemble.print_topics(num_topics=-1, num_words=20):
    for word in topic[1].split('+'):
        match = re.search('[a-z_]+', word)
        if match is None:
            continue
        for token in match[0].split('_'):
            if token not in topics:
                topics.append(token)
print(topics)
vocabularyDf[vocabularyDf["word"].isin(topics)].sort_values(by=['frequency'], ascending=False)
!pip install transformers
from transformers import pipeline
unmasker = pipeline('fill-mask', model='distilbert-base-cased')
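# Small sanity check of the fill-mask pipeline: it returns a ranked list of dicts containing
# 'sequence', 'token_str' and 'score' (the example sentence here is arbitrary).
unmasker("The meeting was mostly about the [MASK] of the project.")[:2]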
sentencesProcessed = []
threshold = 0.90
# Rebuild sentences from the raw transcript elements; whenever a word's ASR confidence falls below
# the threshold, mask it and ask the fill-mask model for alternative sentences.
for line in transcriptData["monologues"]:
    sentence = []
    confidence = []
    for element in line["elements"]:
        if element["type"] == 'punct' and element["value"] == '.':
            # End of sentence: keep the original and generate replacements for low-confidence words.
            sentence.append(element['value'])
            confidence.append(1.0)
            sentences = [" ".join(sentence)]
            for idx, conf in enumerate(confidence):
                if conf < threshold:
                    maskedSentence = sentence.copy()
                    maskedSentence[idx] = "[MASK]"
                    newSentences = unmasker(" ".join(maskedSentence))
                    for _ in newSentences[:2]:
                        sentences.append(_['sequence'])
            sentencesProcessed.append(sentences)
            sentence = []
            confidence = []
        else:
            if element["type"] == 'punct':
                # Keep non-space punctuation with full confidence.
                if element["value"] != ' ':
                    sentence.append(element['value'])
                    confidence.append(1.0)
            else:
                # Keep words together with the confidence reported by the transcription service.
                if element["type"] != 'unknown':
                    sentence.append(element['value'])
                    confidence.append(element['confidence'])
# Show, among the first 20 rebuilt sentences, those that received at least one suggested replacement.
for idx, sent in enumerate(sentencesProcessed[:20]):
    if len(sent) > 1:
        print("Sentence %d:" % idx, sent)
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
print("Original 0:", sentencesProcessed[4][0])
for idx, sent in enumerate(sentencesProcessed[4][1:]):
    print("Replacement %d: " % (idx+1), sent)
neighborsName = []
neighbors = []
# Use the other sentences among the first 10 (excluding sentence 4 itself) as the comparison set.
for idx, sent in enumerate(sentencesProcessed[0:10]):
    if idx != 4:
        print("Sentence %d:" % idx, sent)
        neighbors.append(sent[0])
        neighborsName.append("Sent %d" % idx)
sentenceEmb = model.encode(sentencesProcessed[4])
neighborsEmb = model.encode(neighbors)
from sklearn.metrics.pairwise import cosine_similarity
simDf = pd.DataFrame(cosine_similarity(sentenceEmb, neighborsEmb), columns=neighborsName)
simDf['sumSimilarity'] = simDf[neighborsName].sum(axis=1)
simDf
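# One way to read the similarity table (an interpretation, not part of the original pipeline):
# keep the candidate whose embeddings are, summed over the neighbouring sentences, the most similar.
bestIdx = simDf['sumSimilarity'].idxmax()
print("Most context-consistent candidate (row %d):" % bestIdx, sentencesProcessed[4][bestIdx])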