import json
import pandas as pd
import spacy
modelSpacy = spacy.load('en_core_web_sm')  # small English model; the bare 'en' shortcut link is deprecated
with open("sentences.json") as sentencesFile:
    sentencesData = json.load(sentencesFile)
with open("transcript.json") as transcriptFile:
    transcriptData = json.load(transcriptFile)
sentences = {}
sentIdx = 0
docIdx = 0
currentSpeaker = sentencesData['data'][0]['speaker']
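# A new document starts whenever the speaker changes between consecutive lines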
for line in sentencesData['data']:
    if line['speaker'] != currentSpeaker:
        docIdx+=1
        currentSpeaker = line['speaker']
    for sent in line['sentence']:
        sentences[sentIdx] = {
            "speaker": line['speaker'],
            "document": docIdx,
            "sentence": sent,
            "sentWordCount": len(sent.split())
        }
        sentIdx+=1
sentencesDf = pd.DataFrame.from_dict(sentences, orient='index')
print("Number of sentences:", len(sentencesDf))
print("Number of documents:", len(sentencesDf.document.unique()))
print("Number of speaker:", len(sentencesDf.speaker.unique()), sentencesDf.speaker.unique())
sentencesDf.head()
sentencesDf.groupby(by='speaker').agg(documentQty=('document', 'nunique'), sentenceQty=('sentence', 'count'))
sentencesDf.groupby(by='document').agg(sentenceQty=('sentence', 'count')).sort_values(by='sentenceQty', ascending=False)
stopwords = modelSpacy.Defaults.stop_words
vocabulary = {}
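# Build a lemma-level vocabulary: frequency, cumulative ASR confidence, and the speakers who used each lemma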
for line in transcriptData["monologues"]:
    for element in line["elements"]:
        if element["type"]=="text":
            value = modelSpacy(element["value"].lower())
            for word in value:
                lemmaWord = word.lemma_
                if lemmaWord not in vocabulary:
                    vocabulary[lemmaWord] = {"frequency":1, "confidence":element["confidence"], "speakers":[line["speaker"]]}
                else:
                    vocabulary[lemmaWord]["frequency"]+=1
                    vocabulary[lemmaWord]["confidence"]+=element["confidence"]
                    if line["speaker"] not in vocabulary[lemmaWord]["speakers"]:
                        vocabulary[lemmaWord]["speakers"].append(line["speaker"])
for word in vocabulary:
    vocabulary[word]["confidence"] = vocabulary[word]["confidence"]/vocabulary[word]["frequency"]
    vocabulary[word]["speakerQty"] = len(vocabulary[word]["speakers"])
vocabularyDf = pd.DataFrame.from_dict(vocabulary, orient='index').reset_index().rename(columns={"index": "word"})
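# In spaCy 2.x every pronoun is lemmatized to the placeholder '-PRON-', so treat it as a stopword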
stopwords.add("-PRON-")
vocabularyDf[~vocabularyDf["word"].isin(stopwords)].sort_values(by=['frequency'], ascending=False).head(10)
!pip install --upgrade gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases, LdaModel, CoherenceModel
def preprocessing(documents):
    # Lowercase, lemmatize and drop stopwords/punctuation for each document
    documentsPreprocessed = []
    for doc in documents:
        docIn = modelSpacy(doc.lower())
        docOut = []
        for token in docIn:
            if token.lemma_ not in stopwords and not token.is_punct:
                docOut.append(token.lemma_)
        documentsPreprocessed.append(docOut)

    # Detect frequent bigrams and append them (as "word_word" tokens) to each document
    bigram = Phrases(documentsPreprocessed, min_count=5)
    for idx in range(len(documentsPreprocessed)):
        for token in bigram[documentsPreprocessed[idx]]:
            if '_' in token:
                documentsPreprocessed[idx].append(token)

    return documentsPreprocessed
def computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary):
    # Compute perplexity (gensim reports the per-word log-perplexity bound)
    print('Log perplexity: ', ldaModel.log_perplexity(corpus))
    # Compute coherence score (c_v)
    coherenceLdaModel = CoherenceModel(model=ldaModel, texts=documentsPreprocessed, dictionary=dictionary, coherence='c_v')
    coherenceLda = coherenceLdaModel.get_coherence()
    print('Coherence Score: ', coherenceLda)
    return coherenceLda
def trainLDA(documents, num_topics=10, chunksize=2000, passes=20, iterations=100, eval_every=None):
    documentsPreprocessed = preprocessing(documents)
    
    dictionary = Dictionary(documentsPreprocessed)
    corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
    print('Number of unique tokens:', len(dictionary))
    print('Number of documents:', len(corpus))
    # Make an index-to-word dictionary.
    temp = dictionary[0]  # accessing an item forces the lazy id2token mapping to be built
    id2word = dictionary.id2token
    ldaModel = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary)
    return ldaModel
def printBigram(documentsPreprocessed):
    bigrams = []
    for doc in documentsPreprocessed:
        for token in doc:
            if len(token.split('_')) > 1 and token not in bigrams:
                bigrams.append(token)
    print(bigrams)
# Based on Documents
print("Topics based on documents:")
documents = sentencesDf[['document','sentence']].groupby(by='document')['sentence'].apply(lambda x: ' '.join(x))
ldaModel = trainLDA(documents)
ldaModel.print_topics(num_topics=-1, num_words=5)
# Based on Sentences
print("Topics based on sentences:")
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
ldaModel.print_topics(num_topics=-1, num_words=5)
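# Restrict preprocessing to content-bearing parts of speech (nouns, proper nouns, adverbs, adjectives)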
pos = ["NOUN", "PROPN", "ADV", "ADJ"]
def preprocessing(documents):
    documentsPreprocessed = []
    for doc in documents:
        docIn = modelSpacy(doc.lower())
        docOut = []
        for token in docIn:
            if token.lemma_ not in stopwords and not token.is_punct and token.pos_ in pos:
                docOut.append(token.lemma_)
        documentsPreprocessed.append(docOut)
    
    bigram = Phrases(documentsPreprocessed, min_count=5)
    for idx in range(len(documentsPreprocessed)):
        for token in bigram[documentsPreprocessed[idx]]:
            if '_' in token:
                documentsPreprocessed[idx].append(token)
    
    return documentsPreprocessed
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
print("Topics:")
ldaModel.print_topics(num_topics=-1, num_words=5)
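# Ensemble LDA: train several LDA models and keep only the topics that remain stable across them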
from gensim.models import EnsembleLda
documentsBySentence = sentencesDf['sentence'].to_list()
documentsPreprocessed = preprocessing(documentsBySentence)
dictionary = Dictionary(documentsPreprocessed)
corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
num_topics = 10
chunksize = 2000
passes = 20
iterations = 100
epsilon = 0.5
temp = dictionary[0]
id2word = dictionary.id2token
num_models = 10
ensemble = EnsembleLda(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    epsilon=epsilon,
    num_models=num_models,
    topic_model_class='lda',
    iterations=iterations
)
print("Number of topics: ", len(ensemble.ttda))
print("Number of stable topics: ", len(ensemble.get_topics()))
ensemble.print_topics(num_topics=-1, num_words=10)
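# Collect the individual words that appear in the stable topics (splitting bigrams back into single tokens)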
import re
topics = []
for topic in ensemble.print_topics(num_topics=-1, num_words=20):
    for word in topic[1].split('+'):
        for token in re.search('[a-z_]+', word)[0].split('_'):
            if token not in topics:
                topics.append(token)
print(topics)
vocabularyDf[vocabularyDf["word"].isin(topics)].sort_values(by=['frequency'], ascending=False)
!pip install transformers
from transformers import pipeline
unmasker = pipeline('fill-mask', model='distilbert-base-cased')
sentencesProcessed = []
threshold = 0.90
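# Rebuild each sentence from the transcript elements; words recognized below the confidence threshold are
# masked and DistilBERT's top suggestions are kept as alternative versions of the sentence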
for line in transcriptData["monologues"]:
    sentence = []
    confidence = []
    
    for element in line["elements"]:
        if element["type"] == 'punct' and element["value"] == '.':
            sentence.append(element['value'])
            confidence.append(1.0)
            sentences = [" ".join(sentence)]
            for idx, conf in enumerate(confidence):
                if conf < threshold:
                    maskedSentence = sentence.copy()
                    maskedSentence[idx] = "[MASK]"
                    newSentences = unmasker(" ".join(maskedSentence))
                    for _ in newSentences[:2]:
                        sentences.append(_['sequence'])
            sentencesProcessed.append(sentences)
            sentence = []
            confidence = []
        elif element["type"] == 'punct':
            if element["value"] != ' ':
                sentence.append(element['value'])
                confidence.append(1.0)
        elif element["type"] != 'unknown':
            sentence.append(element['value'])
            confidence.append(element['confidence'])
for idx, sent in enumerate(sentencesProcessed[:20]):
    if len(sent) > 1:
        print("Sentence %d:" % idx, sent)
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
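# Compare sentence 4 (original plus masked-word replacements) against its neighboring sentences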
print("Original    0:", sentencesProcessed[4][0])
for idx, sent in enumerate(sentencesProcessed[4][1:]):
    print("Replacement %d: " % (idx+1), sent)
neighborsName = []
neighbors = []
for idx, sent in enumerate(sentencesProcessed[0:10]):
    if idx != 4:
        print("Sentence %d:" % idx, sent)
        neighbors.append(sent[0])
        neighborsName.append("Sent %d" % idx)
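# Embed all variants of sentence 4 and the neighboring sentences, then score each variant by its total cosine similarity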
sentenceEmb = model.encode(sentencesProcessed[4])
neighborsEmb = model.encode(neighbors)
from sklearn.metrics.pairwise import cosine_similarity
simDf = pd.DataFrame(cosine_similarity(sentenceEmb, neighborsEmb), columns=neighborsName)
simDf['sumSimilarity'] = simDf[neighborsName].sum(axis=1)
simDf