#Import nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
# Consider a corpus of 3 documents/sentences
text = "This is a text, also known as a corpus (corpus). This text is made of multiple sentences, which we also call documents, which we have to tokenize. Each sentence is made up of multiple words."
#Tokenize the sentences from the text corpus
tokenized_sentences = sent_tokenize(text)
print(tokenized_sentences)
# Create a CountVectorizer object, using the lowercase and stop_words parameters
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
# Create a BOW matrix of word vectors: each row is the bag-of-words vector for a sentence
vector_counts = vectorizer.fit_transform(tokenized_sentences)
# Extract the unique word tokens that occur in these documents
print(vectorizer.get_feature_names_out())
# Print the vocabulary mapping: each word and its column index in the vectors
print(vectorizer.vocabulary_)
#Print the shape of the matrix
print(vector_counts.shape)
print(vector_counts.toarray())
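# Optional sketch (not part of the original exercise): pair each sentence with its
# counts keyed by feature name, so the rows of the array above are easier to read.
for sentence, row in zip(tokenized_sentences, vector_counts.toarray()):
    print(sentence)
    print(dict(zip(vectorizer.get_feature_names_out(), row)))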
import numpy as np
# Worked TF-IDF example: a term appearing 5 times in a 100-word document (TF = 5/100)
# that occurs in 10 out of 100 documents (IDF = log(100/10))
print((5/100)*np.log(100/10))
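# A hypothetical helper (an illustrative sketch only; scikit-learn's TfidfVectorizer
# uses a smoothed IDF and L2 normalization, so its numbers will differ) that mirrors
# the arithmetic above: tf * log(N / df).
def tf_idf(term_count, doc_length, n_docs, docs_with_term):
    return (term_count / doc_length) * np.log(n_docs / docs_with_term)

print(tf_idf(5, 100, 100, 10))  # same value as the line above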
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['Today, we will learn about Tfidf',
          'Tfidf stands for term frequency–inverse document frequency',
          'tfidf tells us how important a word is in a corpus or document.']
# Create TfidfVectorizer object
Tfidf_vectorizer = TfidfVectorizer(lowercase=True,stop_words='english')
# Generate the matrix of TF-IDF word vectors; each one can serve as a feature vector in an ML problem
tfidf_matrix = Tfidf_vectorizer.fit_transform(corpus)
# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)
# Extract the unique word tokens that occur in these documents
print(Tfidf_vectorizer.get_feature_names_out())
# Print the vocabulary mapping: each word and its column index in the vectors
print(Tfidf_vectorizer.vocabulary_)
# Print out the TF-IDF vectors of the sentences
print(tfidf_matrix.toarray())
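# Optional sketch (not part of the original exercise): report each document's
# highest-weighted term under this TF-IDF model.
feature_names = Tfidf_vectorizer.get_feature_names_out()
for doc, row in zip(corpus, tfidf_matrix.toarray()):
    print(doc, '->', feature_names[row.argmax()])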
# Initialize numpy vectors
A = np.array([10,40])
B = np.array([-4, 4])
# Calculate the dot product
dot_prod = np.dot(A, B)
# Print dot product
print(dot_prod)
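# A minimal sketch connecting the dot product to cosine similarity (used next):
# cosine similarity is the dot product divided by the product of the vector norms.
manual_cos = dot_prod / (np.linalg.norm(A) * np.linalg.norm(B))
print(manual_cos)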
from sklearn.metrics.pairwise import cosine_similarity
# Initialize an instance of TfidfVectorizer (default settings, no stop word removal)
tfidf_vectorizer = TfidfVectorizer()
# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
print(cosine_sim)
corpus=["the movie was great and not bad.","the movie was not great and bad."]
# Generate n-grams upto n=2
bi_vectorizer = CountVectorizer(ngram_range=(2,2),lowercase=True)
bi_matrix = bi_vectorizer.fit_transform(corpus)
# Print the shape of the bigram count matrix
print(bi_matrix.shape)
# Extract the unique bigram tokens that occur in these documents
print(bi_vectorizer.get_feature_names_out())
# Print the vocabulary mapping: each bigram and its column index in the vectors
print(bi_vectorizer.vocabulary_)
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(bi_matrix,bi_matrix)
print(cosine_sim)
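# For contrast (a minimal sketch, not part of the original exercise): with plain
# unigram counts the two sentences contain exactly the same words, so their cosine
# similarity is 1.0, whereas the bigram vectors above can tell them apart.
uni_vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=True)
uni_matrix = uni_vectorizer.fit_transform(corpus)
print(cosine_similarity(uni_matrix, uni_matrix))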