# Import nltk and download the punkt tokenizer models
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
# Consider a corpus of three documents/sentences
text = "This is a text also known as a corpus (corpus). This text is made of multiple sentences, which we also call documents, which we have to tokenize. Each sentence is made up of multiple words."
# Tokenize the sentences from the text corpus
tokenized_sentence = sent_tokenize(text)
print(tokenized_sentence)
['This is a text also known as a corpus (corpus).', 'This text is made of multiple sentences, which we also call documents, which we have to tokenize.', 'Each sentence is made up of multiple words.']
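# word_tokenize (imported above) goes one level deeper and splits a sentence
# into word and punctuation tokens; a quick illustration on the first sentence:
print(word_tokenize(tokenized_sentence[0]))
# expected: ['This', 'is', 'a', 'text', 'also', 'known', 'as', 'a', 'corpus', '(', 'corpus', ')', '.']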
# Create a CountVectorizer object that lowercases text and removes English stop words
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
# Create a bag-of-words (BoW) matrix: each row is the bag-of-words vector for one sentence
vector_counts = vectorizer.fit_transform(tokenized_sentence)
# Extract the unique word tokens that occur in these documents
print(vectorizer.get_feature_names_out())
# Print out the list of words used, and their index in the vectors
print(vectorizer.vocabulary_)
# Print the shape of the matrix
print(vector_counts.shape)
['corpus' 'documents' 'known' 'multiple' 'sentence' 'sentences' 'text'
 'tokenize' 'words']
{'text': 6, 'known': 2, 'corpus': 0, 'multiple': 3, 'sentences': 5, 'documents': 1, 'tokenize': 7, 'sentence': 4, 'words': 8}
(3, 9)
print(vector_counts.toarray())
[[2 0 1 0 0 0 1 0 0]
 [0 1 0 1 0 1 1 1 0]
 [0 0 0 1 1 0 0 0 1]]
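# The fitted vectorizer can also encode unseen text via transform(); words
# outside the learned vocabulary are simply ignored. A minimal sketch with a
# made-up sentence:
print(vectorizer.transform(["This sentence mentions the corpus text."]).toarray())
# expected: [[1 0 0 0 1 0 1 0 0]] -- one count each for 'corpus', 'sentence', 'text'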
import numpy as np
# Worked tf-idf example: a term that appears 5 times in a 100-word document
# (tf = 5/100) and occurs in 10 of the 100 documents (idf = log(100/10))
print((5/100)*np.log(100/10))
0.1151292546497023
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['Today, we will learn about Tfidf',
          'Tfidf stands for term frequency–inverse document frequency',
          'tfidf tells us how important a word is in a corpus or document.']
# Create TfidfVectorizer object
Tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
# Generate the matrix of tf-idf vectors; each dimension acts as a feature in an ML problem
tfidf_matrix = Tfidf_vectorizer.fit_transform(corpus)
# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)
# Extract the unique word tokens that occur in these documents
print(Tfidf_vectorizer.get_feature_names_out())
# Print out the list of words used, and their index in the vectors
print(Tfidf_vectorizer.vocabulary_)
(3, 12)
['corpus' 'document' 'frequency' 'important' 'inverse' 'learn' 'stands'
'tells' 'term' 'tfidf' 'today' 'word']
{'today': 10, 'learn': 5, 'tfidf': 9, 'stands': 6, 'term': 8, 'frequency': 2, 'inverse': 4, 'document': 1, 'tells': 7, 'important': 3, 'word': 11, 'corpus': 0}
# Print out the tf-idf vectors of the sentences
print(tfidf_matrix.toarray())
[[0. 0. 0. 0. 0. 0.65249088
0. 0. 0. 0.38537163 0.65249088 0. ]
[0. 0.27011786 0.71034504 0. 0.35517252 0.
0.35517252 0. 0.35517252 0.20977061 0. 0. ]
[0.45050407 0.34261996 0. 0.45050407 0. 0.
0. 0.45050407 0. 0.26607496 0. 0.45050407]]
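# The fitted vectorizer also exposes the learned idf weights through its idf_
# attribute (scikit-learn uses the smoothed form ln((1 + n) / (1 + df)) + 1 by
# default); a minimal sketch for inspecting them:
for word, idx in sorted(Tfidf_vectorizer.vocabulary_.items(), key=lambda kv: kv[1]):
    print(word, Tfidf_vectorizer.idf_[idx])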
# Initialize numpy vectors
A = np.array([10,40])
B = np.array([-4, 4])
# Calculate the dot product
dot_prod = np.dot(A, B)
# Print dot product
print(dot_prod)
120
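# The dot product is the core of cosine similarity:
# cos(A, B) = (A . B) / (||A|| * ||B||); a quick check by hand:
print(dot_prod / (np.linalg.norm(A) * np.linalg.norm(B)))
# expected: roughly 0.514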
from sklearn.metrics.pairwise import cosine_similarity
# Initialize a TfidfVectorizer instance (default settings, so no stop-word removal this time)
tfidf_vectorizer = TfidfVectorizer()
# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
print(cosine_sim)
[[1. 0.05048042 0.04787043]
[0.05048042 1. 0.0984949 ]
[0.04787043 0.0984949 1. ]]
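# A common use of these vectors: score a new query (made up here for
# illustration) against the corpus with the already-fitted vectorizer; the
# highest-scoring entry points to the most similar document.
query_vec = tfidf_vectorizer.transform(["what does tfidf stand for?"])
print(cosine_similarity(query_vec, tfidf_matrix))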
corpus=["the movie was great and not bad.","the movie was not great and bad."]
# Generate bigrams (n-grams with n=2)
bi_vectorizer = CountVectorizer(ngram_range=(2, 2), lowercase=True)
bi_matrix = bi_vectorizer.fit_transform(corpus)
# Print the shape of the bigram count matrix
print(bi_matrix.shape)
# Extract the unique bigram tokens that occur in these documents
print(bi_vectorizer.get_feature_names_out())
# Print out the list of words used, and their index in the vectors
print(bi_vectorizer.vocabulary_)
(2, 9)
['and bad' 'and not' 'great and' 'movie was' 'not bad' 'not great'
'the movie' 'was great' 'was not']
{'the movie': 6, 'movie was': 3, 'was great': 7, 'great and': 2, 'and not': 1, 'not bad': 4, 'was not': 8, 'not great': 5, 'and bad': 0}
# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(bi_matrix,bi_matrix)
print(cosine_sim)
[[1. 0.5]
[0.5 1. ]]
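# Why bigrams matter here: both sentences contain exactly the same unigrams,
# so a plain bag-of-words model cannot tell them apart at all.
uni_vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=True)
uni_matrix = uni_vectorizer.fit_transform(corpus)
print(cosine_similarity(uni_matrix, uni_matrix))
# expected: [[1. 1.]
#            [1. 1.]] -- identical word counts despite opposite meanings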