# Import nltk and download the punkt tokenizer models
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
# Consider a corpus of three documents/sentences
text = "This is a text also known as a corpus (corpus). This text is made of multiple sentences, which we also call documents, which we have to tokenize. Each sentence is made up of multiple words."
# Tokenize the sentences from the text corpus
tokenized_sentence = sent_tokenize(text)
print(tokenized_sentence)
['This is a text also known as a corpus (corpus).', 'This text is made of multiple sentences, which we also call documents, which we have to tokenize.', 'Each sentence is made up of multiple words.']
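# word_tokenize (imported above) goes one level deeper and splits a sentence
# into word and punctuation tokens; a quick illustration on the first sentence:
print(word_tokenize(tokenized_sentence[0]))
# expected: ['This', 'is', 'a', 'text', 'also', 'known', 'as', 'a', 'corpus', '(', 'corpus', ')', '.']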
# Create a CountVectorizer object that lowercases text and removes English stop words
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
# Create a bag-of-words (BoW) matrix: each row is the bag-of-words vector for one sentence
vector_counts = vectorizer.fit_transform(tokenized_sentence)
# Extract the unique word tokens that occur in these documents
print(vectorizer.get_feature_names_out())
# Print out the list of words used, and their index in the vectors
print(vectorizer.vocabulary_)
# Print the shape of the matrix
print(vector_counts.shape)
['corpus' 'documents' 'known' 'multiple' 'sentence' 'sentences' 'text'
 'tokenize' 'words']
{'text': 6, 'known': 2, 'corpus': 0, 'multiple': 3, 'sentences': 5, 'documents': 1, 'tokenize': 7, 'sentence': 4, 'words': 8}
(3, 9)
print(vector_counts.toarray())
[[2 0 1 0 0 0 1 0 0]
 [0 1 0 1 0 1 1 1 0]
 [0 0 0 1 1 0 0 0 1]]
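# The fitted vectorizer can also encode unseen text via transform(); words
# outside the learned vocabulary are simply ignored. A minimal sketch with a
# made-up sentence:
print(vectorizer.transform(["This sentence mentions the corpus text."]).toarray())
# expected: [[1 0 0 0 1 0 1 0 0]] -- one count each for 'corpus', 'sentence', 'text'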
import numpy as np
# Worked tf-idf example: a term that appears 5 times in a 100-word document
# (tf = 5/100) and occurs in 10 of the 100 documents (idf = log(100/10))
print((5/100)*np.log(100/10))
0.1151292546497023
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['Today, we will learn about Tfidf',
          'Tfidf stands for term frequency–inverse document frequency',
          'tfidf tells us how important a word is in a corpus or document.']
# Create TfidfVectorizer object
Tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
# Generate the matrix of tf-idf vectors; each dimension acts as a feature in an ML problem
tfidf_matrix = Tfidf_vectorizer.fit_transform(corpus)
# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)
# Extract the unique word tokens that occur in these documents
print(Tfidf_vectorizer.get_feature_names_out())
# Print out the list of words used, and their index in the vectors
print(Tfidf_vectorizer.vocabulary_)
(3, 12)
['corpus' 'document' 'frequency' 'important' 'inverse' 'learn' 'stands'
'tells' 'term' 'tfidf' 'today' 'word']
{'today': 10, 'learn': 5, 'tfidf': 9, 'stands': 6, 'term': 8, 'frequency': 2, 'inverse': 4, 'document': 1, 'tells': 7, 'important': 3, 'word': 11, 'corpus': 0}
# Print out the tf-idf vectors of the sentences
print(tfidf_matrix.toarray())
[[0. 0. 0. 0. 0. 0.65249088
0. 0. 0. 0.38537163 0.65249088 0. ]
[0. 0.27011786 0.71034504 0. 0.35517252 0.
0.35517252 0. 0.35517252 0.20977061 0. 0. ]
[0.45050407 0.34261996 0. 0.45050407 0. 0.
0. 0.45050407 0. 0.26607496 0. 0.45050407]]
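# The fitted vectorizer also exposes the learned idf weights through its idf_
# attribute (scikit-learn uses the smoothed form ln((1 + n) / (1 + df)) + 1 by
# default); a minimal sketch for inspecting them:
for word, idx in sorted(Tfidf_vectorizer.vocabulary_.items(), key=lambda kv: kv[1]):
    print(word, Tfidf_vectorizer.idf_[idx])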
# Initialize numpy vectors
A = np.array([10,40])
B = np.array([-4, 4])
# Calculate the dot product
dot_prod = np.dot(A, B)
# Print dot product
print(dot_prod)
120
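# The dot product is the core of cosine similarity:
# cos(A, B) = (A . B) / (||A|| * ||B||); a quick check by hand:
print(dot_prod / (np.linalg.norm(A) * np.linalg.norm(B)))
# expected: roughly 0.514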
from sklearn.metrics.pairwise import cosine_similarity
# Initialize a TfidfVectorizer instance (default settings, so no stop-word removal this time)
tfidf_vectorizer = TfidfVectorizer()
# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
print(cosine_sim)
[[1. 0.05048042 0.04787043]
[0.05048042 1. 0.0984949 ]
[0.04787043 0.0984949 1. ]]
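# A common use of these vectors: score a new query (made up here for
# illustration) against the corpus with the already-fitted vectorizer; the
# highest-scoring entry points to the most similar document.
query_vec = tfidf_vectorizer.transform(["what does tfidf stand for?"])
print(cosine_similarity(query_vec, tfidf_matrix))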
corpus=["the movie was great and not bad.","the movie was not great and bad."]
# Generate bigrams (n-grams with n=2)
bi_vectorizer = CountVectorizer(ngram_range=(2, 2), lowercase=True)
bi_matrix = bi_vectorizer.fit_transform(corpus)
# Print the shape of the bigram count matrix
print(bi_matrix.shape)
# Extract the unique bigram tokens that occur in these documents
print(bi_vectorizer.get_feature_names_out())
# Print out the list of words used, and their index in the vectors
print(bi_vectorizer.vocabulary_)
(2, 9)
['and bad' 'and not' 'great and' 'movie was' 'not bad' 'not great'
'the movie' 'was great' 'was not']
{'the movie': 6, 'movie was': 3, 'was great': 7, 'great and': 2, 'and not': 1, 'not bad': 4, 'was not': 8, 'not great': 5, 'and bad': 0}
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(bi_matrix,bi_matrix)
print(cosine_sim)
[[1. 0.5]
[0.5 1. ]]
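# Why bigrams matter here: both sentences contain exactly the same unigrams,
# so a plain bag-of-words model cannot tell them apart at all.
uni_vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=True)
uni_matrix = uni_vectorizer.fit_transform(corpus)
print(cosine_similarity(uni_matrix, uni_matrix))
# expected: [[1. 1.]
#            [1. 1.]] -- identical word counts despite opposite meanings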