import seaborn as sns
import string
# Set of words in a text (the filter keeps only non-empty strings)
def set_of_words(text):
    return set(word for word in text.lower().split(' ') if word)
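# Quick sanity check on a toy string (hypothetical input, not one of the
# poem files): duplicates collapse and empty tokens from double spaces drop.
assert set_of_words('O Captain my  Captain') == {'o', 'captain', 'my'}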
# Parse a document: open the file, replace newlines with spaces,
# and strip punctuation.
def parse_document(document_name):
    with open(document_name, "r") as f:
        doc_read = f.read()
    doc_read = doc_read.replace('\n', ' ')
    doc_read = doc_read.translate(str.maketrans('', '', string.punctuation))
    return doc_read
# Vectorize a document (string) according to a basis set.
# Splitting first and counting whole tokens avoids the substring trap of
# str.count (e.g. 'the' matching inside 'there').
def vectorize_document(basis, document):
    words = document.lower().split()
    return [words.count(word.lower()) for word in basis]
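# Toy example (hypothetical basis, not the one built below): each position
# counts whole-word occurrences, so 'singing' does not match 'sing'.
assert vectorize_document(basis=['sing', 'america', 'body'],
                          document='I hear America singing') == [0, 1, 0]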
# We choose three extracts of Walt Whitman's poems, two dummy inputs, and
# half of the search document as input documents.
input_documents = ['I_Hear_America_Singing.txt',
'I_Sing_the_Body_Electric.txt',
# 'Song_Of_Myself.txt',
'Come_Up_from_the_Fields_Father.txt',
'dummy1.txt',
'dummy2.txt',
'WhenLilacsCutted.txt']
# We define a search document
search_document = 'When_Lilacs_Last_in_the_Dooryard_Bloom.txt'
from stopwords import stopwords
# We store the set of words from each input document in one list
input_documents_sets = []
for doc in input_documents:
    doc_set = set_of_words(parse_document(doc))
    input_documents_sets.append(doc_set)
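# Optional notebook-style inspection (not required by the pipeline):
# vocabulary size of each input document.
for doc, words in zip(input_documents, input_documents_sets):
    print(f"{doc}: {len(words)} distinct words")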
# Build the basis set: the union of the words of all documents.
def basis_of_documents(*args):
    all_words = []
    for word_set in args:
        all_words += word_set
    return set(all_words)
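# Tiny illustration with hypothetical sets: the union keeps each word once.
assert basis_of_documents({'i', 'hear'}, {'i', 'sing'}) == {'i', 'hear', 'sing'}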
# Merge all the sets into a single set, which we will call the "basis"
basis = basis_of_documents(*input_documents_sets)
# Count the original basis len
original_len_basis = len(basis)
# Filter and keep only words that are not in stopwords
basis = set(filter(lambda x: x not in stopwords, basis))
print(f"{original_len_basis - len(basis)} stopwords removed from basis.")
search_document = parse_document(document_name=search_document)
search_document_vector = vectorize_document(basis=basis, document=search_document)
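# Sanity check: every vector lives in the same space, one component per
# basis word (a lightweight assumption check, not part of the original flow).
assert len(search_document_vector) == len(basis)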
# Vectorize every input document against the same basis
results = {}
for basis_document in input_documents:
    results[basis_document] = {}
    results[basis_document]['Vector'] = vectorize_document(basis=basis, document=parse_document(basis_document))
def dot_product(v1, v2):
    # Vector size check
    if len(v1) != len(v2):
        raise ValueError("Lists/vectors provided should have the same dimension")
    total_sum = 0
    for n in range(len(v1)):
        # On each iteration, add the product of the components at position n
        total_sum += v1[n] * v2[n]
    return total_sum
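# Worked example with hypothetical vectors:
# 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
assert dot_product([1, 2, 3], [4, 5, 6]) == 32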
def euclidean_distance(v1, v2):
    # Vector size check
    if len(v1) != len(v2):
        raise ValueError("Lists/vectors provided should have the same dimension")
    total_sum = 0
    for n in range(len(v1)):
        # On each iteration, add the squared difference between
        # the two positions in the vector
        total_sum += (v1[n] - v2[n])**2
    # Return the square root of the total sum (always non-negative)
    return total_sum**0.5
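# Worked example with hypothetical vectors (a 3-4-5 right triangle):
# sqrt((4-1)**2 + (6-2)**2) = sqrt(9 + 16) = 5.0
assert euclidean_distance([1, 2], [4, 6]) == 5.0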
def cosine_similarity_distance(v1, v2):
    # Vector size check
    if len(v1) != len(v2):
        raise ValueError("Lists/vectors provided should have the same dimension")
    # Zero vector in the appropriate dimension
    zero_vec = [0 for i in range(len(v1))]
    # The cosine, as shown before, is the dot product divided by the product
    # of the norms. The norm of a vector is its euclidean distance to the
    # zero vector. (Note: this divides by zero if either vector is all zeros.)
    return dot_product(v1, v2) / (euclidean_distance(v1, zero_vec) * euclidean_distance(v2, zero_vec))
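# Worked examples with hypothetical vectors: parallel vectors score 1,
# orthogonal vectors score 0 (non-negative count vectors always land in [0, 1]).
assert cosine_similarity_distance([1, 0], [2, 0]) == 1.0
assert cosine_similarity_distance([1, 0], [0, 3]) == 0.0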
# Results of the vector-to-vector operations are stored in a dictionary.
for basis_document in input_documents:
    results[basis_document]['Similarity'] = {
        'DotProduct': dot_product(v1=results[basis_document]['Vector'], v2=search_document_vector),
        'Euclidean Distance': euclidean_distance(v1=results[basis_document]['Vector'], v2=search_document_vector),
        'Cosine Similarity': cosine_similarity_distance(v1=results[basis_document]['Vector'], v2=search_document_vector)
    }
import pandas as pd
dot_product_list = [results[i]['Similarity']['DotProduct'] for i in input_documents]
euclidean_distance_list = [results[i]['Similarity']['Euclidean Distance'] for i in input_documents]
cosine_similarity_list = [results[i]['Similarity']['Cosine Similarity'] for i in input_documents]
df = pd.DataFrame({'Documents': input_documents,
                   'Dot Product': dot_product_list,
                   'Euclidean Distance': euclidean_distance_list,
                   'Cosine Similarity': cosine_similarity_list})
df
import plotly.graph_objects as go
# Bar chart of each similarity measure against the search document
for y in ['Euclidean Distance', 'Dot Product', 'Cosine Similarity']:
    fig = go.Figure(data=go.Bar(name=y, x=df['Documents'], y=df[y]))
    fig.update_layout(xaxis={'categoryorder': 'total descending'}, title=f'{y} to Search Document')
    fig.show()
df_corr = df[['Dot Product', 'Euclidean Distance', 'Cosine Similarity']]
corr = df_corr.corr()
# Plot the heatmap of the correlations between the three measures
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=True)
# Styler.set_precision was deprecated in pandas 1.3; format(precision=...) replaces it
corr.style.background_gradient(cmap='cividis').format(precision=2)
fig = go.Figure(data=go.Scatter(x=df['Dot Product'], y=df['Euclidean Distance'], mode='markers', text=df['Documents']))
fig.update_layout(title="Similarity of search document to input documents", xaxis_title="Dot Product", yaxis_title="Euclidean Distance")
fig.show()
fig = go.Figure(data=go.Scatter(x=df['Cosine Similarity'], y=df['Dot Product'], mode='markers', text=df['Documents']))
fig.update_layout(title="Similarity of search document to input documents", xaxis_title="Cosine Similarity", yaxis_title="Dot Product")
fig.show()
fig = go.Figure(data=go.Scatter(x=df['Cosine Similarity'], y=df['Euclidean Distance'], mode='markers', text=df['Documents']))
fig.update_layout(title="Similarity of search document to input documents", xaxis_title="Cosine Similarity", yaxis_title="Euclidean Distance")
fig.show()