import seaborn as sns
import string
# Set of words in a text (the filter keeps only non-empty strings)
def set_of_words(text):
    return set(word for word in text.lower().split(' ') if word)
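# Quick sanity check on a toy string (hypothetical input, not one of the
# poem files): duplicates collapse and empty tokens from double spaces drop.
assert set_of_words('O Captain my  Captain') == {'o', 'captain', 'my'}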
# Parse a document: open the file, replace newlines with spaces,
# and strip punctuation.
def parse_document(document_name):
    with open(document_name, "r") as f:
        doc_read = f.read()
    doc_read = doc_read.replace('\n', ' ')
    doc_read = doc_read.translate(str.maketrans('', '', string.punctuation))
    return doc_read
# Vectorize a document (string) according to a basis set.
# Splitting first and counting whole tokens avoids the substring trap of
# str.count (e.g. 'the' matching inside 'there').
def vectorize_document(basis, document):
    words = document.lower().split()
    return [words.count(word.lower()) for word in basis]
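# Toy example (hypothetical basis, not the one built below): each position
# counts whole-word occurrences, so 'singing' does not match 'sing'.
assert vectorize_document(basis=['sing', 'america', 'body'],
                          document='I hear America singing') == [0, 1, 0]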
# We choose three extracts of Walt Whitman's poems, two dummy inputs, and
# half of the search document as input documents.
input_documents = ['I_Hear_America_Singing.txt',
'I_Sing_the_Body_Electric.txt',
# 'Song_Of_Myself.txt',
'Come_Up_from_the_Fields_Father.txt',
'dummy1.txt',
'dummy2.txt',
'WhenLilacsCutted.txt']
# We define a search document
search_document = 'When_Lilacs_Last_in_the_Dooryard_Bloom.txt'
from stopwords import stopwords
# We store the set of words from each input document in one list
input_documents_sets = []
for doc in input_documents:
    doc_set = set_of_words(parse_document(doc))
    input_documents_sets.append(doc_set)
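# Optional notebook-style inspection (not required by the pipeline):
# vocabulary size of each input document.
for doc, words in zip(input_documents, input_documents_sets):
    print(f"{doc}: {len(words)} distinct words")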
# Build the basis set: the union of the words of all documents.
def basis_of_documents(*args):
    all_words = []
    for word_set in args:
        all_words += word_set
    return set(all_words)
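# Tiny illustration with hypothetical sets: the union keeps each word once.
assert basis_of_documents({'i', 'hear'}, {'i', 'sing'}) == {'i', 'hear', 'sing'}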
# Merge all the sets into a single set, which we will call the "basis"
basis = basis_of_documents(*input_documents_sets)
# Count the original basis len
original_len_basis = len(basis)
# Filter and keep only words that are not in stopwords
basis = set(filter(lambda x: x not in stopwords, basis))
print(f"{original_len_basis - len(basis)} stopwords removed from basis.")
search_document = parse_document(document_name=search_document)
search_document_vector = vectorize_document(basis=basis, document=search_document)
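# Sanity check: every vector lives in the same space, one component per
# basis word (a lightweight assumption check, not part of the original flow).
assert len(search_document_vector) == len(basis)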
# Vectorize every input document against the same basis
results = {}
for basis_document in input_documents:
    results[basis_document] = {}
    results[basis_document]['Vector'] = vectorize_document(basis=basis, document=parse_document(basis_document))
def dot_product(v1, v2):
    # Vector size check
    if len(v1) != len(v2):
        raise ValueError("Lists/vectors provided should have the same dimension")
    total_sum = 0
    for n in range(len(v1)):
        # On each iteration, add the product of the components at position n
        total_sum += v1[n] * v2[n]
    return total_sum
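# Worked example with hypothetical vectors:
# 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
assert dot_product([1, 2, 3], [4, 5, 6]) == 32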
def euclidean_distance(v1, v2):
    # Vector size check
    if len(v1) != len(v2):
        raise ValueError("Lists/vectors provided should have the same dimension")
    total_sum = 0
    for n in range(len(v1)):
        # On each iteration, add the squared difference between
        # the two positions in the vector
        total_sum += (v1[n] - v2[n])**2
    # Return the square root of the total sum (always non-negative)
    return total_sum**0.5
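# Worked example with hypothetical vectors (a 3-4-5 right triangle):
# sqrt((4-1)**2 + (6-2)**2) = sqrt(9 + 16) = 5.0
assert euclidean_distance([1, 2], [4, 6]) == 5.0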
def cosine_similarity_distance(v1, v2):
    # Vector size check
    if len(v1) != len(v2):
        raise ValueError("Lists/vectors provided should have the same dimension")
    # Zero vector in the appropriate dimension
    zero_vec = [0 for i in range(len(v1))]
    # The cosine, as shown before, is the dot product divided by the product
    # of the norms. The norm of a vector is its euclidean distance to the
    # zero vector. (Note: this divides by zero if either vector is all zeros.)
    return dot_product(v1, v2) / (euclidean_distance(v1, zero_vec) * euclidean_distance(v2, zero_vec))
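# Worked examples with hypothetical vectors: parallel vectors score 1,
# orthogonal vectors score 0 (non-negative count vectors always land in [0, 1]).
assert cosine_similarity_distance([1, 0], [2, 0]) == 1.0
assert cosine_similarity_distance([1, 0], [0, 3]) == 0.0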
# Results of the vector-to-vector operations are stored in a dictionary.
for basis_document in input_documents:
    results[basis_document]['Similarity'] = {
        'DotProduct': dot_product(v1=results[basis_document]['Vector'], v2=search_document_vector),
        'Euclidean Distance': euclidean_distance(v1=results[basis_document]['Vector'], v2=search_document_vector),
        'Cosine Similarity': cosine_similarity_distance(v1=results[basis_document]['Vector'], v2=search_document_vector)
    }
import pandas as pd
dot_product_list = [results[i]['Similarity']['DotProduct'] for i in input_documents]
euclidean_distance_list = [results[i]['Similarity']['Euclidean Distance'] for i in input_documents]
cosine_similarity_list = [results[i]['Similarity']['Cosine Similarity'] for i in input_documents]
df = pd.DataFrame({'Documents': input_documents,
                   'Dot Product': dot_product_list,
                   'Euclidean Distance': euclidean_distance_list,
                   'Cosine Similarity': cosine_similarity_list})
df
import plotly.graph_objects as go
# Bar chart of each similarity measure against the search document
for y in ['Euclidean Distance', 'Dot Product', 'Cosine Similarity']:
    fig = go.Figure(data=go.Bar(name=y, x=df['Documents'], y=df[y]))
    fig.update_layout(xaxis={'categoryorder': 'total descending'}, title=f'{y} to Search Document')
    fig.show()
df_corr = df[['Dot Product', 'Euclidean Distance', 'Cosine Similarity']]
corr = df_corr.corr()
# Plot the heatmap of the correlations between the three measures
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=True)
# Styler.set_precision was deprecated in pandas 1.3; format(precision=...) replaces it
corr.style.background_gradient(cmap='cividis').format(precision=2)
fig = go.Figure(data=go.Scatter(x=df['Dot Product'], y=df['Euclidean Distance'], mode='markers', text=df['Documents']))
fig.update_layout(title="Similarity of search document to input documents", xaxis_title="Dot Product", yaxis_title="Euclidean Distance")
fig.show()
fig = go.Figure(data=go.Scatter(x=df['Cosine Similarity'], y=df['Dot Product'], mode='markers', text=df['Documents']))
fig.update_layout(title="Similarity of search document to input documents", xaxis_title="Cosine Similarity", yaxis_title="Dot Product")
fig.show()
fig = go.Figure(data=go.Scatter(x=df['Cosine Similarity'], y=df['Euclidean Distance'], mode='markers', text=df['Documents']))
fig.update_layout(title="Similarity of search document to input documents", xaxis_title="Cosine Similarity", yaxis_title="Euclidean Distance")
fig.show()