P3- Matriz de similitudes

1-Preprocesamiento

import os

import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources needed by word_tokenize.
nltk.download('punkt')

# Stemmer configured for Spanish.
stemmer = SnowballStemmer("spanish")


def cargar_stopwords(archivo_path):
    """Load a stopword set from a UTF-8 text file.

    Args:
        archivo_path: Path to a file with whitespace-separated stopwords.

    Returns:
        A ``set`` with every whitespace-delimited token found in the file.
    """
    with open(archivo_path, 'r', encoding='utf-8') as archivo:
        return set(archivo.read().split())


def preprocesamiento(texto, stopwords_set=None):
    """Tokenize, remove stopwords, stem, and re-join a Spanish text.

    Args:
        texto: Raw text to process.
        stopwords_set: Collection of stopwords; a token is discarded when its
            lowercase form is in this collection. When omitted, no token is
            filtered out.

    Returns:
        A single string with the stemmed tokens separated by one space.
    """
    if stopwords_set is None:
        # Allows callers that have no stopword list to use the function.
        stopwords_set = frozenset()

    # Tokenization.
    tokens = word_tokenize(texto, language='spanish')

    # Stopword filtering and stemming (word reduction).
    tokens_procesados = [
        stemmer.stem(token)
        for token in tokens
        if token.lower() not in stopwords_set
    ]

    return ' '.join(tokens_procesados)


def compute_tfidf(collection):
    """Fit a fresh ``TfidfVectorizer`` on *collection* and vectorize it.

    Args:
        collection: Iterable of document strings.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform``. The fitted
        vectorizer itself is not returned.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(collection)
    return tfidf_matrix


def cosine_sim(Q, Doc):
    """Return the cosine similarity between two vectors as a scalar.

    Both arguments are reshaped to a single row before being compared.
    """
    return cosine_similarity(Q.reshape(1, -1), Doc.reshape(1, -1))[0][0]

2-Similitud de coseno

# File names of the book summaries to compare.
textos = ["libro1.txt", "libro2.txt", "libro3.txt", "libro4.txt", "libro5.txt", "libro6.txt"]

# Load the Spanish stopwords from their file.
stopwords_espanol = cargar_stopwords("stop_words_spanish.txt")

# Read and preprocess every book summary.
textos_procesados = []
for file_name in textos:
    with open(os.path.join("resumenes_libros", file_name), 'r', encoding='utf-8') as file:
        texto = file.read().rstrip()
    texto_procesado = preprocesamiento(texto, stopwords_espanol)
    textos_procesados.append(texto_procesado)

# Compute the TF-IDF matrix (one row per document).
textos_tfidf = compute_tfidf(textos_procesados)

# Pairwise cosine similarity between all documents in a single vectorized
# call, instead of one cosine_similarity call per (i, j) pair.
matriz_similitud = cosine_similarity(textos_tfidf)

print(matriz_similitud)

4- Índice invertido con similitud de coseno

1- Estructura del índice invertido en Python

""" index = { w1 : [(doc1, tf_w1_doc1), (doc3, tf_w1_doc3),(doc4, tf_w1_doc4),(doc10, tf_w1_doc10)], w2 : [(doc1, tf_w2_doc1 ), (doc2, tf_w2_doc2)], w3 : [(doc2, tf_w3_doc2), (doc3, tf_w3_doc3),(doc7, tf_w3_doc7)], } idf = { w1 : idf_w1, w2 : idf_w2, w3 : idf_w3, } length ={ doc1: norm_doc1, doc2: norm_doc2, doc3: norm_doc3, ... } """

2- Algoritmo para construir el índice

3- Función de recuperación usando similitud de coseno

class InvertIndex:
    """Inverted index over a text collection with TF-IDF cosine ranking.

    Document ids are the 0-based positions of the files in the collection
    passed to ``building``.

    Attributes:
        index_file: Path of the JSON file used to persist the index.
        stopwords: Stopword set handed to the preprocessing function.
        textos_procesados: Preprocessed text of every indexed document.
        index: ``{term: [(doc_id, tf), ...]}`` posting lists.
        idf: ``{term: idf}``.
        tfidf: One ``{term: tf * idf}`` dict per document, by doc id.
        length: Euclidean norm of each document vector, by doc id.
    """

    def __init__(self, index_file, stopwords=None, preprocess=None):
        """Create an empty index.

        Args:
            index_file: Path where ``building`` saves the index and from
                which ``retrieval`` loads it when nothing is in memory.
            stopwords: Optional iterable of stopwords passed to the
                preprocessing function. Defaults to an empty set.
            preprocess: Optional callable ``(text, stopwords) -> str`` that
                returns space-separated terms. Defaults to the module-level
                ``preprocesamiento`` function.
        """
        self.index_file = index_file
        self.stopwords = set(stopwords) if stopwords else set()
        self._preprocess = preprocess
        self.textos_procesados = []
        self.index = {}
        self.idf = {}
        self.tfidf = []
        self.length = []

    def _terms(self, texto):
        """Return ``(processed_text, terms)`` for a raw text."""
        preprocess = self._preprocess
        if preprocess is None:
            # Looked up lazily so the class can be defined before the helper.
            preprocess = preprocesamiento
        procesado = preprocess(texto.strip().lower(), self.stopwords)
        # Split into terms (the processed text is one joined string) and
        # drop tokens made only of punctuation.
        terms = [
            term for term in procesado.split()
            if any(ch.isalnum() for ch in term)
        ]
        return procesado, terms

    @staticmethod
    def _count(terms):
        """Return ``{term: raw frequency}`` for a list of terms."""
        counts = {}
        for term in terms:
            counts[term] = counts.get(term, 0) + 1
        return counts

    def building(self, collection_text):
        """Build the inverted index from the files in *collection_text*.

        Any previously built state is discarded. When ``index_file`` is set,
        the finished index is also written to that file.

        Args:
            collection_text: Sequence of paths to UTF-8 text files.
        """
        self.textos_procesados = []
        self.index = {}
        self.idf = {}
        self.tfidf = []
        self.length = []

        # Posting lists with raw term frequencies.
        doc_counts = []
        for doc_id, file_name in enumerate(collection_text):
            with open(file_name, "r", encoding="utf-8") as handle:
                texto = handle.read()
            procesado, terms = self._terms(texto)
            self.textos_procesados.append(procesado)
            counts = self._count(terms)
            doc_counts.append(counts)
            for term, tf in counts.items():
                self.index.setdefault(term, []).append((doc_id, tf))

        # Smoothed idf, ln((1 + N) / (1 + df)) + 1: the default formula of
        # scikit-learn's TfidfVectorizer, so a term present in every
        # document still gets a non-zero weight.
        n_docs = len(doc_counts)
        for term, postings in self.index.items():
            ratio = (1 + n_docs) / (1 + len(postings))
            self.idf[term] = float(np.log(ratio)) + 1.0

        # Per-document weights and vector lengths (norms).
        for counts in doc_counts:
            weights = {term: tf * self.idf[term] for term, tf in counts.items()}
            self.tfidf.append(weights)
            squared = sum(w * w for w in weights.values())
            self.length.append(float(np.sqrt(squared)))

        if self.index_file:
            self.save_index(self.index_file)

    def save_index(self, index_file):
        """Write the index, idf, weights and norms to *index_file* as JSON."""
        import json

        payload = {
            "index": self.index,
            "idf": self.idf,
            "tfidf": self.tfidf,
            "length": self.length,
            "textos_procesados": self.textos_procesados,
        }
        with open(index_file, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False)

    def load_index(self, index_file):
        """Replace the in-memory state with the JSON stored in *index_file*.

        Raises:
            FileNotFoundError: If *index_file* does not exist.
        """
        import json

        with open(index_file, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        # JSON stores postings as lists; restore the (doc_id, tf) tuples.
        self.index = {
            term: [(doc_id, tf) for doc_id, tf in postings]
            for term, postings in payload["index"].items()
        }
        self.idf = payload["idf"]
        self.tfidf = payload["tfidf"]
        self.length = payload["length"]
        self.textos_procesados = payload["textos_procesados"]

    def retrieval(self, query, k):
        """Return the *k* documents most similar to *query*.

        The query is preprocessed like the documents and weighted with the
        collection's idf; query terms unknown to the index are ignored.
        Only documents sharing at least one term with the query are scored.

        Args:
            query: Free-text query.
            k: Maximum number of results.

        Returns:
            A list of ``(doc_id, cosine_similarity)`` tuples sorted by
            descending similarity (ties broken by ascending doc id). Empty
            when no query term is in the index.

        Raises:
            FileNotFoundError: If nothing is in memory and ``index_file``
                does not exist.
        """
        if not self.index:
            self.load_index(self.index_file)

        # Query vector restricted to the indexed vocabulary.
        _, terms = self._terms(query)
        query_weights = {
            term: tf * self.idf[term]
            for term, tf in self._count(terms).items()
            if term in self.idf
        }
        query_norm = float(np.sqrt(sum(w * w for w in query_weights.values())))
        if query_norm == 0.0:
            return []

        # Accumulate dot products by walking only the relevant postings.
        score = {}
        for term, query_weight in query_weights.items():
            term_idf = self.idf[term]
            for doc_id, tf in self.index[term]:
                score[doc_id] = score.get(doc_id, 0.0) + query_weight * tf * term_idf

        # Normalize to cosine similarity. A scored document has at least one
        # term, so its norm is non-zero.
        for doc_id in score:
            score[doc_id] /= query_norm * self.length[doc_id]

        result = sorted(score.items(), key=lambda tup: (-tup[1], tup[0]))
        return result[:k]

P3- Matriz de similitudes

1-Preprocesamiento

2-Similitud de coseno

4-Indice invertido con similitud de coseno

1- Estructura del indice invertido en Python

2- Algoritmo para construir el indice

3- Funcion de recuperación usando similitud de coseno

P3- Matriz de similitudes