import json
import pandas as pd
import spacy
# Load the spaCy English pipeline.
# NOTE(review): the 'en' shortcut was removed in spaCy v3 — newer installs
# need spacy.load('en_core_web_sm'); kept as-is assuming a v2 environment.
modelSpacy = spacy.load('en')

# Parse both JSON inputs; the context managers close the files (the original
# code leaked both handles by never calling close()).
with open("sentences.json") as sentencesFile:
    sentencesData = json.load(sentencesFile)
with open("transcript.json") as transcriptFile:
    transcriptData = json.load(transcriptFile)

sentences = {}  # sentIdx -> per-sentence record (speaker, document, text, word count)
sentIdx = 0     # running sentence counter
docIdx = 0      # incremented on every speaker change: one "document" per turn
# Speaker of the first entry, used to detect speaker turns below.
currentSpeaker = sentencesData['data'][0]['speaker']
# Walk the transcript entries; every time the speaker changes, start a new
# "document" (one speaker turn), and record each sentence with its metadata.
for line in sentencesData['data']:
    # FIX: the original compared int(line['speaker']) against a value stored
    # WITHOUT the cast (currentSpeaker), which would mis-count documents if
    # speakers were strings. Compare the raw values consistently instead
    # (the data's speakers are ints, so behavior on this data is unchanged).
    if line['speaker'] != currentSpeaker:
        docIdx += 1
        currentSpeaker = line['speaker']
    for sent in line['sentence']:
        sentences[sentIdx] = {"speaker": line['speaker'], "document": docIdx, "sentence": sent, "sentWordCount": len(sent.split())}
        sentIdx += 1
# Materialise the per-sentence records as a DataFrame (one row per sentence)
# and report corpus-level statistics.
sentencesDf = pd.DataFrame.from_dict(sentences, orient='index')
uniqueDocuments = sentencesDf['document'].unique()
uniqueSpeakers = sentencesDf['speaker'].unique()
print("Number of sentences:", len(sentencesDf))
print("Number of documents:", len(uniqueDocuments))
print("Number of speaker:", len(uniqueSpeakers), uniqueSpeakers)
Number of sentences: 644
Number of documents: 35
Number of speaker: 7 [1 0 2 3 4 5 6]
# Peek at the first rows (notebook display expression).
sentencesDf.head()
# Sentence count and distinct-document count per speaker.
sentencesDf.groupby('speaker').agg(documentQty=('document', 'nunique'), sentenceQty=('sentence', 'count'))
# Documents ranked by how many sentences they contain.
sentencesDf.groupby('document').agg(sentenceQty=('sentence', 'count')).sort_values('sentenceQty', ascending=False)
# spaCy's built-in stop-word set; reused by the preprocessing steps below.
stopwords = modelSpacy.Defaults.stop_words

# Build a lemma-level vocabulary over the raw transcript: for every lemma keep
# its frequency, the summed ASR confidence (averaged afterwards) and the
# distinct speakers that used it (in first-seen order).
vocabulary = {}
for line in transcriptData["monologues"]:
    for element in line["elements"]:
        if element["type"] != "text":
            continue
        for word in modelSpacy(element["value"].lower()):
            lemma = word.lemma_
            entry = vocabulary.get(lemma)
            if entry is None:
                vocabulary[lemma] = {"frequency": 1, "confidence": element["confidence"], "speakers": [line["speaker"]]}
            else:
                entry["frequency"] += 1
                entry["confidence"] += element["confidence"]
                if line["speaker"] not in entry["speakers"]:
                    entry["speakers"].append(line["speaker"])
# Turn the summed confidences into averages and record the speaker count.
for word in vocabulary:
    vocabulary[word]["confidence"] /= vocabulary[word]["frequency"]
    vocabulary[word]["speakerQty"] = len(vocabulary[word]["speakers"])

# One row per lemma, with the lemma itself exposed as a "word" column.
vocabularyDf = (
    pd.DataFrame.from_dict(vocabulary, orient='index')
    .reset_index()
    .rename(columns={"index": "word"})
)
# spaCy v2 lemmatises all pronouns to the placeholder "-PRON-"; treat it as a stop word.
stopwords.add("-PRON-")
# Top-10 most frequent non-stop-word lemmas (notebook display expression).
vocabularyDf[~vocabularyDf["word"].isin(stopwords)].sort_values(by=['frequency'], ascending=False).head(10)
!pip install --upgrade gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases, LdaModel, CoherenceModel, nmf
def preprocessing(documents):
    """Lowercase, lemmatise and filter each document, then append corpus-level
    bigram tokens.

    Each input string becomes a list of lemmas with stop words and punctuation
    removed; frequent bigrams (tokens containing '_') detected over the whole
    corpus are appended to the document they occur in, alongside the unigrams.
    """
    documentsPreprocessed = []
    for document in documents:
        lemmas = [token.lemma_
                  for token in modelSpacy(document.lower())
                  if token.lemma_ not in stopwords and not token.is_punct]
        documentsPreprocessed.append(lemmas)
    # Learn bigrams over the whole corpus (min_count=5 occurrences).
    bigram = Phrases(documentsPreprocessed, min_count=5)
    for tokens in documentsPreprocessed:
        for candidate in bigram[tokens]:
            if '_' in candidate:
                tokens.append(candidate)
    return documentsPreprocessed
def computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary):
    """Print perplexity and c_v coherence for a trained LDA model.

    Returns the (model, preprocessed documents) pair unchanged, so callers
    may chain on it; the visible callers ignore the return value.
    """
    # gensim's log_perplexity is a per-word likelihood bound (more negative = worse).
    print('Perplexity: ', ldaModel.log_perplexity(corpus))
    # 'c_v' coherence needs the tokenised texts and the dictionary.
    cm = CoherenceModel(model=ldaModel, texts=documentsPreprocessed, dictionary=dictionary, coherence='c_v')
    print('Coherence Score: ', cm.get_coherence())
    return ldaModel, documentsPreprocessed
def trainLDA(documents, num_topics = 10, chunksize = 2000, passes = 20, iterations = 100, eval_every = None):
    """Preprocess *documents*, build a bag-of-words corpus and train LdaModel.

    Prints corpus statistics plus perplexity/coherence, and returns the
    trained model. Hyper-parameters are passed straight through to gensim.
    """
    documentsPreprocessed = preprocessing(documents)
    dictionary = Dictionary(documentsPreprocessed)
    corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
    print('Number of unique tokens:', len(dictionary))
    print('Number of documents:', len(corpus))
    # Accessing any entry forces gensim to populate dictionary.id2token.
    _ = dictionary[0]
    ldaModel = LdaModel(
        corpus=corpus,
        id2word=dictionary.id2token,
        chunksize=chunksize,
        alpha='auto',          # learn an asymmetric document-topic prior
        eta='auto',            # learn the topic-word prior as well
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
    )
    computePerformance(ldaModel, corpus, documentsPreprocessed, dictionary)
    return ldaModel
def printBigram(documentsPreprocessed):
    """Print every distinct bigram token, in first-seen order.

    A bigram token is any token containing '_' (the joiner gensim Phrases
    uses). The original used `token not in bigrams` on a growing list —
    O(n) per lookup; a companion set makes membership O(1) while the list
    preserves the original first-seen print order.
    """
    seen = set()      # O(1) membership check
    bigrams = []      # keeps first-seen order for printing
    for doc in documentsPreprocessed:
        for token in doc:
            # '_' in token is equivalent to len(token.split('_')) > 1.
            if '_' in token and token not in seen:
                seen.add(token)
                bigrams.append(token)
    print(bigrams)
# Based on Documents
# Topic modelling at "document" granularity: concatenate all sentences of a
# speaker turn into one text (35 documents), then train LDA on them.
print("Topics based on documents:")
documents = sentencesDf[['document','sentence']].groupby(by='document')['sentence'].apply(lambda x: ' '.join(x))
ldaModel = trainLDA(documents)
# Bare expression: shows the topics when evaluated in a notebook cell.
ldaModel.print_topics(num_topics=-1, num_words=5)
Topics based on documents:
Number of unique tokens: 1045
Number of documents: 35
Perplexity: -6.5617880875509105
Coherence Score: 0.23608711616131087
# Based on Sentences
# Same experiment at sentence granularity (644 much shorter documents) —
# the output shows higher coherence than the per-document run above.
print("Topics based on sentences:")
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
ldaModel.print_topics(num_topics=-1, num_words=5)
Topics based on sentences:
Number of unique tokens: 1045
Number of documents: 644
Perplexity: -6.935174328913833
Coherence Score: 0.3939808402838773
# Keep only content-bearing parts of speech in the next preprocessing pass.
pos = ["NOUN", "PROPN", "ADV", "ADJ"]

def preprocessing(documents):
    """POS-filtered variant of preprocessing() (this redefinition shadows the
    earlier one): additionally keeps only tokens whose POS tag is in *pos*.
    """
    documentsPreprocessed = []
    for document in documents:
        lemmas = [token.lemma_
                  for token in modelSpacy(document.lower())
                  if token.lemma_ not in stopwords and not token.is_punct and token.pos_ in pos]
        documentsPreprocessed.append(lemmas)
    # Learn and append corpus-level bigram tokens, same as before.
    bigram = Phrases(documentsPreprocessed, min_count=5)
    for tokens in documentsPreprocessed:
        for candidate in bigram[tokens]:
            if '_' in candidate:
                tokens.append(candidate)
    return documentsPreprocessed
# Retrain on sentences using the POS-filtered preprocessing; the output shows
# a smaller vocabulary (803 tokens) and a higher coherence score.
documentsBySentence = sentencesDf['sentence'].to_list()
ldaModel = trainLDA(documentsBySentence)
print("Topics:")
ldaModel.print_topics(num_topics=-1, num_words=5)
Number of unique tokens: 803
Number of documents: 644
Perplexity: -6.862700005709316
Coherence Score: 0.5115125855999766
Topics:
from gensim.models import EnsembleLda

# Ensemble LDA: train several LDA models and keep only the topics that stay
# stable across the ensemble.
documentsBySentence = sentencesDf['sentence'].to_list()
documentsPreprocessed = preprocessing(documentsBySentence)
dictionary = Dictionary(documentsPreprocessed)
corpus = [dictionary.doc2bow(doc) for doc in documentsPreprocessed]
# Hyper-parameters mirroring trainLDA's defaults.
# NOTE(review): chunksize is defined but never passed to EnsembleLda below —
# confirm whether that was intended.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 100
epsilon = 0.5  # distance threshold used when clustering candidate topics
# Accessing any entry forces gensim to populate dictionary.id2token.
temp = dictionary[0]
id2word = dictionary.id2token
num_models = 10  # ensemble size: num_models * num_topics candidate topics
ensemble = EnsembleLda(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    epsilon=epsilon,
    num_models=num_models,
    topic_model_class='lda',
    iterations=iterations
)
# ttda holds every candidate topic; get_topics() only the stable ones.
print("Number of topics: ", len(ensemble.ttda))
print("Number of stable topics: ", len(ensemble.get_topics()))
Number of topics: 100
Number of stable topics: 3
# Show the stable topics found by the ensemble (notebook display expression).
ensemble.print_topics(num_topics=-1, num_words=10)
import re

# Collect the distinct word tokens appearing in the ensemble's stable topics.
# Each topic string looks like '0.015*"word_one" + 0.012*"other" + ...'; the
# regex pulls out the alphabetic part, and '_'-joined bigrams are split back
# into their component words.
topics = []
# FIX: the original did re.search(...)[0] with no guard — re.search returns
# None when a term has no [a-z_] run, which would raise TypeError. The
# pattern is also compiled once instead of per term.
topicPattern = re.compile('[a-z_]+')
for topic in ensemble.print_topics(num_topics=-1, num_words=20):
    for word in topic[1].split('+'):
        match = topicPattern.search(word)
        if match is None:
            continue  # term with no alphabetic part (e.g. pure weight) — skip
        for token in match[0].split('_'):
            if token not in topics:
                topics.append(token)
print(topics)
# Show the vocabulary rows for the topic words, most frequent first
# (notebook display expression).
vocabularyDf[vocabularyDf["word"].isin(topics)].sort_values(by=['frequency'], ascending=False)
['ai', 'datum', 'actually', 'bias', 'intelligence', 'example', 'solution', 'lot', 'course', 'kind', 'time', 'important', 'business', 'specific', 'people', 'system', 'human', 'artificial', 'certain', 'big', 'learning', 'deep', 'machine', 'model', 'aspect', 'security', 'usually', 'good', 'way', 'question', 'maybe', 'ethical', 'hand', 'bit', 'ethic', 'practical', 'user', 'moment', 'technical']
!pip install transformers
from transformers import pipeline

# DistilBERT fill-mask pipeline, used to propose replacements for words the
# ASR transcribed with low confidence.
unmasker = pipeline('fill-mask', model='distilbert-base-cased')
sentencesProcessed = []  # one list per sentence: [original, alternative1, ...]
threshold = 0.90         # words with ASR confidence below this get masked
# NOTE(review): this rebinds the module-level name `sentences` (previously a
# dict of records) to a per-sentence list inside the loop below — confirm
# nothing later relies on the dict.
for line in transcriptData["monologues"]:
    sentence = []    # words/punctuation of the sentence being accumulated
    confidence = []  # parallel list of per-token ASR confidences
    for element in line["elements"]:
        if element["type"] == 'punct' and element["value"] == '.':
            # A '.' closes the current sentence: record it, then generate
            # alternatives by masking each low-confidence word in turn.
            sentence.append(element['value'])
            confidence.append(1.0)
            sentences = [" ".join(sentence)]
            for idx, conf in enumerate(confidence):
                if conf < threshold:
                    maskedSentence = sentence.copy()
                    maskedSentence[idx] = "[MASK]"
                    newSentences = unmasker(" ".join(maskedSentence))
                    # Keep only the top-2 predictions per masked position.
                    for _ in newSentences[:2]:
                        sentences.append(_['sequence'])
            sentencesProcessed.append(sentences)
            sentence = []
            confidence = []
        else:
            if element["type"] == 'punct':
                # Other punctuation is kept verbatim with full confidence.
                if element["value"] != ' ':
                    sentence.append(element['value'])
                    confidence.append(1.0)
            else:
                # Regular words keep their ASR confidence; 'unknown' elements
                # are dropped.
                if element["type"] != 'unknown':
                    sentence.append(element['value'])
                    confidence.append(element['confidence'])
# NOTE(review): any words after a monologue's last '.' are silently
# discarded when the loop moves to the next monologue — confirm intended.
# Show the first 20 sentences that received at least one fill-mask alternative.
for idx, sent in enumerate(sentencesProcessed[:20]):
    if len(sent) <= 1:
        continue  # no low-confidence word was replaced in this sentence
    print("Sentence %d:" % idx, sent)
Sentence 4: ["It was the mother young Maya , she's the CEO and founder of Skype form .", "she was the mother young Maya, she's the CEO and founder of Skype form.", "She was the mother young Maya, she's the CEO and founder of Skype form.", "It is the mother young Maya, she's the CEO and founder of Skype form.", "It : the mother young Maya, she's the CEO and founder of Skype form.", "It was her mother young Maya, she's the CEO and founder of Skype form.", "It was my mother young Maya, she's the CEO and founder of Skype form.", "It was the mother of Maya, she's the CEO and founder of Skype form.", "It was the mother for Maya, she's the CEO and founder of Skype form.", "It was the mother young lady, she's the CEO and founder of Skype form.", "It was the mother young entrepreneur, she's the CEO and founder of Skype form.", 'It was the mother young Maya, also the CEO and founder of Skype form.', 'It was the mother young Maya, later the CEO and founder of Skype form.', "It was the mother young Maya, she's the CEO and founder of Skype entertainment.", "It was the mother young Maya, she's the CEO and founder of Skypevision."]
Sentence 6: ["And for all of you who really want to know a little bit more about who she is , I'm just like quoting her from her website .", "And for all of you who really want to know a little bit more about who she is, I'm just like quoting her from her website.", "And for all of you who really need to know a little bit more about who she is, I'm just like quoting her from her website."]
Sentence 7: ["She's a computer national science scientist engineer , and then award-winning serial entrepreneur .", 'became a computer national science scientist engineer, and then award - winning serial entrepreneur.', 'was a computer national science scientist engineer, and then award - winning serial entrepreneur.', "She's a computer and science scientist engineer, and then award - winning serial entrepreneur.", "She's a computer - science scientist engineer, and then award - winning serial entrepreneur."]
Sentence 9: ['And she gained an in-depth education and experience in distributed computing and applied mathematics , as well as interdisciplinary plenary , collaboration and leadership across various business domains .', 'And she gained an extensive education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an academic education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience spanning distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in quantum computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in computational computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in distributed computing and computational mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.']
Sentence 10: ["All's last strongly believes that if science and tech is developed within with human beings and the environment in mind , it solves a lot of burning issues .", 'the last strongly believes that if science and tech is developed within with human beings and the environment in mind, it solves a lot of burning issues.', 'he last strongly believes that if science and tech is developed within with human beings and the environment in mind, it solves a lot of burning issues.', "All's foundation strongly believes that if science and tech is developed within with human beings and the environment in mind, it solves a lot of burning issues.", "All's leadership strongly believes that if science and tech is developed within with human beings and the environment in mind, it solves a lot of burning issues.", "All's last strongly believes that if science and tech is developed within both human beings and the environment in mind, it solves a lot of burning issues.", "All's last strongly believes that if science and tech is developed within ordinary human beings and the environment in mind, it solves a lot of burning issues.", "All's last strongly believes that if science and tech is developed within with human beings and the environment in mind, it raises a lot of burning issues.", "All's last strongly believes that if science and tech is developed within with human beings and the environment in mind, it poses a lot of burning issues."]
Sentence 11: ['We face currently as a society .', 'We face currently becoming a society.', 'We face currently being a society.', 'We face currently asian society.', 'We face currently as a society.']
Sentence 12: ['So welcome with me again , Oslo , Medea , Maya , an expert in AI , big data , high performance computing , and quantum quantum computing welcome a lot .', 'So come with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So meet with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, honey, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, yes, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, scientist, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, physicist, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, low performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, high speed computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome your lot.', 'So welcome with me again, Oslo, Medea, Maya, an expert 
in AI, big data, high performance computing, and quantum quantum computing welcome a welcome.', 'So welcome with me again, Oslo, Medea, Maya, an expert in AI, big data, high performance computing, and quantum quantum computing welcome a guest.']
Sentence 14: ['Yeah , I was .', 'Yeah, it was.', 'Yeah, he was.', 'Yeah, I thought.', 'Yeah, I said.']
Sentence 15: ['Thank you very much , Petra for this very , very coming in production .', 'Thank you very much, thanks for this very, very coming in production.', 'Thank you very much, thank for this very, very coming in production.', 'Thank you very much, Petra for this very, exciting coming in production.', 'Thank you very much, Petra for this very, special coming in production.']
Sentence 16: ["I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically , which I think is really , uh , very , very important that it also helps and supports , um , all of us .", "• it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, uh, very, very important that it also helps and supports, um, all of us.", "“ it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, uh, very, very important that it also helps and supports, um, all of us.", "I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is very, uh, very, very important that it also helps and supports, um, all of us.", "I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, uh, very, very important that it also helps and supports, um, all of us.", "I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, very, very, very important that it also helps and supports, um, all of us.", "I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, really, very, very important that it also helps and supports, um, all of us.", "I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, uh, very, very important that it also helps and supports, hopefully, all of us.", "I it's really a pleasure to be here and to speak about AI in business and how to develop it ethically, which I think is really, uh, very, very important that it also helps and supports, really, all of us."]
Sentence 17: ["Well , it's not , may I jump in for one second ? Sorry for that .", "Well, it's okay, may I jump in for one second? Sorry for that.", "Well, it's alright, may I jump in for one second? Sorry for that."]
Sentence 18: ['For all of you , you can rise questions while it was talking .', 'For all of you, you can ask questions while it was talking.', 'For all of you, you can answer questions while it was talking.', 'For all of you, you can rise questions while everyone was talking.', 'For all of you, you can rise questions while everybody was talking.', 'For all of you, you can rise questions while it keeps talking.', 'For all of you, you can rise questions while it is talking.']
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

# Sentence-BERT encoder used below to embed sentences for similarity ranking.
model = SentenceTransformer('bert-base-nli-mean-tokens')
# Show sentence 4's original transcription and every proposed replacement.
print("Original 0:", sentencesProcessed[4][0])
for idx, sent in enumerate(sentencesProcessed[4][1:]):
    print("Replacement %d: " % (idx+1), sent)
Original 0: It was the mother young Maya , she's the CEO and founder of Skype form .
Replacement 1: she was the mother young Maya, she's the CEO and founder of Skype form.
Replacement 2: She was the mother young Maya, she's the CEO and founder of Skype form.
Replacement 3: It is the mother young Maya, she's the CEO and founder of Skype form.
Replacement 4: It : the mother young Maya, she's the CEO and founder of Skype form.
Replacement 5: It was her mother young Maya, she's the CEO and founder of Skype form.
Replacement 6: It was my mother young Maya, she's the CEO and founder of Skype form.
Replacement 7: It was the mother of Maya, she's the CEO and founder of Skype form.
Replacement 8: It was the mother for Maya, she's the CEO and founder of Skype form.
Replacement 9: It was the mother young lady, she's the CEO and founder of Skype form.
Replacement 10: It was the mother young entrepreneur, she's the CEO and founder of Skype form.
Replacement 11: It was the mother young Maya, also the CEO and founder of Skype form.
Replacement 12: It was the mother young Maya, later the CEO and founder of Skype form.
Replacement 13: It was the mother young Maya, she's the CEO and founder of Skype entertainment.
Replacement 14: It was the mother young Maya, she's the CEO and founder of Skypevision.
# Collect context for sentence 4: the (original) text of its neighbouring
# sentences, skipping sentence 4 itself, plus matching column labels.
neighborsName = []
neighbors = []
for idx, sent in enumerate(sentencesProcessed[0:10]):
    if idx == 4:
        continue  # sentence 4 is the one being disambiguated
    print("Sentence %d:" % idx, sent)
    neighbors.append(sent[0])  # only the original transcription, not the variants
    neighborsName.append("Sent %d" % idx)
Sentence 0: ['Yes .']
Sentence 1: ['Hello everyone .']
Sentence 2: ["And welcome to today's master class ."]
Sentence 3: ["It's about how to develop ethical AI in business and what come with me ."]
Sentence 5: ['And she basically sits in Zurich .']
Sentence 6: ["And for all of you who really want to know a little bit more about who she is , I'm just like quoting her from her website .", "And for all of you who really want to know a little bit more about who she is, I'm just like quoting her from her website.", "And for all of you who really need to know a little bit more about who she is, I'm just like quoting her from her website."]
Sentence 7: ["She's a computer national science scientist engineer , and then award-winning serial entrepreneur .", 'became a computer national science scientist engineer, and then award - winning serial entrepreneur.', 'was a computer national science scientist engineer, and then award - winning serial entrepreneur.', "She's a computer and science scientist engineer, and then award - winning serial entrepreneur.", "She's a computer - science scientist engineer, and then award - winning serial entrepreneur."]
Sentence 8: ["She's fascinated by almost any cutting edge science and tech topics ."]
Sentence 9: ['And she gained an in-depth education and experience in distributed computing and applied mathematics , as well as interdisciplinary plenary , collaboration and leadership across various business domains .', 'And she gained an extensive education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an academic education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience spanning distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in quantum computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in computational computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in distributed computing and applied mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.', 'And she gained an in - depth education and experience in distributed computing and computational mathematics, as well as interdisciplinary plenary, collaboration and leadership across various business domains.']
# Embed every variant of sentence 4 and the neighbouring sentences.
sentenceEmb = model.encode(sentencesProcessed[4])
neighborsEmb = model.encode(neighbors)
from sklearn.metrics.pairwise import cosine_similarity
# Rows = variants of sentence 4, columns = neighbouring sentences.
simDf = pd.DataFrame(cosine_similarity(sentenceEmb,neighborsEmb), columns =neighborsName)
# Sum across neighbours: a higher total means the variant fits the context better.
simDf['sumSimilarity'] = simDf[list(simDf.columns)].sum(axis=1)
# Notebook display expression.
simDf