!pip install nltk
Requirement already satisfied: nltk in /shared-libs/python3.7/py/lib/python3.7/site-packages (3.5)
Requirement already satisfied: joblib in /shared-libs/python3.7/py/lib/python3.7/site-packages (from nltk) (1.0.0)
Requirement already satisfied: regex in /shared-libs/python3.7/py/lib/python3.7/site-packages (from nltk) (2020.11.13)
Requirement already satisfied: tqdm in /shared-libs/python3.7/py/lib/python3.7/site-packages (from nltk) (4.56.0)
Requirement already satisfied: click in /shared-libs/python3.7/py/lib/python3.7/site-packages (from nltk) (7.1.2)
# use natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer
# word stemmer
stemmer = LancasterStemmer()
# 3 classes of training data
training_data = []
training_data.append({"class":"greeting", "sentence":"how are you?"})
training_data.append({"class":"greeting", "sentence":"how is your day?"})
training_data.append({"class":"greeting", "sentence":"good day"})
training_data.append({"class":"greeting", "sentence":"how is it going today?"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"see you later"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"talk to you soon"})
training_data.append({"class":"sandwich", "sentence":"make me a sandwich"})
training_data.append({"class":"sandwich", "sentence":"can you make a sandwich?"})
training_data.append({"class":"sandwich", "sentence":"having a sandwich today?"})
training_data.append({"class":"sandwich", "sentence":"what's for lunch?"})
print ("%s sentences of training data" % len(training_data))
12 sentences of training data
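Before building the corpus, it helps to see what tokenizing and stemming do to a single sentence. A minimal preview, reusing the stemmer defined above; the stems it prints should match the greeting entries that show up in the class_words output further down.

# preview: tokenize and stem one training sentence (the same steps the next cell applies to all of them)
nltk.download('punkt')  # word_tokenize needs the 'punkt' models; the next cell downloads them as well
tokens = nltk.word_tokenize("how is your day?")
print([stemmer.stem(t.lower()) for t in tokens if t not in ["?", "'s"]])
# Lancaster stems aggressively, e.g. 'your' -> 'yo', so this should print ['how', 'is', 'yo', 'day']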
# download the 'punkt' tokenizer models used by nltk.word_tokenize
nltk.download('punkt')
# capture unique stemmed words in the training corpus
corpus_words = {}
class_words = {}
# turn a list into a set (of unique items) and then a list again (this removes duplicates)
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    # prepare a list of words within each class
    class_words[c] = []

# loop through each sentence in our training data
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore some things (punctuation and possessives)
        if word not in ["?", "'s"]:
            # stem and lowercase each word
            stemmed_word = stemmer.stem(word.lower())
            # have we not seen this word already?
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1
            # add the word to our words-in-class list
            class_words[data['class']].extend([stemmed_word])
# we now have each stemmed word and the number of occurrences of the word in our training corpus (the word's commonality)
print ("Corpus words and counts: %s \n" % corpus_words)
# also we have all words in each class
print ("Class words: %s" % class_words)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
Corpus words and counts: {'how': 3, 'ar': 1, 'you': 4, 'is': 2, 'yo': 1, 'day': 4, 'good': 1, 'it': 1, 'going': 1, 'today': 2, 'hav': 3, 'a': 5, 'nic': 2, 'see': 1, 'lat': 1, 'talk': 1, 'to': 1, 'soon': 1, 'mak': 2, 'me': 1, 'sandwich': 3, 'can': 1, 'what': 1, 'for': 1, 'lunch': 1}
Class words: {'greeting': ['how', 'ar', 'you', 'how', 'is', 'yo', 'day', 'good', 'day', 'how', 'is', 'it', 'going', 'today'], 'goodbye': ['hav', 'a', 'nic', 'day', 'see', 'you', 'lat', 'hav', 'a', 'nic', 'day', 'talk', 'to', 'you', 'soon'], 'sandwich': ['mak', 'me', 'a', 'sandwich', 'can', 'you', 'mak', 'a', 'sandwich', 'hav', 'a', 'sandwich', 'today', 'what', 'for', 'lunch']}
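The aggressive Lancaster stems ('ar' for "are", 'yo' for "your", 'hav' for "have"/"having") are what let different surface forms of a word count as the same token. As a quick sanity check on the two structures just built: every token occurrence that gets counted is also appended to its sentence's class list, so the two totals should match. A small sketch:

# sanity check: the corpus-wide counts and the per-class word lists cover the same tokens
total_counts = sum(corpus_words.values())
total_class_words = sum(len(words) for words in class_words.values())
print(total_counts, total_class_words)  # the two totals should be equal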
# calculate a score for a given class
def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with same weight
            score += 1
            if show_details:
                print (" match: %s" % stemmer.stem(word.lower()))
    return score
# we can now calculate a score for a new sentence
sentence = "good day for us to have lunch?"
# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s Score: %s \n" % (c, calculate_class_score(sentence, c)))
match: good
match: day
Class: greeting Score: 2
match: day
match: to
match: hav
Class: goodbye Score: 3
match: for
match: hav
match: lunch
Class: sandwich Score: 3
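With equal weights, 'goodbye' and 'sandwich' tie at 3, even though "lunch" is a much stronger clue than a filler word like "to". The next cell breaks such ties by weighting each match by 1 / corpus frequency, so common words count for less; a rough feel for those weights, read straight off the corpus_words counts above:

# relative weight of a common word vs. a rare one (counts come from corpus_words above)
print(1 / corpus_words['day'])    # 'day' appears 4 times in the corpus -> weight 0.25
print(1 / corpus_words['lunch'])  # 'lunch' appears once -> weight 1.0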
# calculate a score for a given class taking into account word commonality
def calculate_class_score_commonality(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with relative weight
            score += (1 / corpus_words[stemmer.stem(word.lower())])
            if show_details:
                print (" match: %s (%s)" % (stemmer.stem(word.lower()), 1 / corpus_words[stemmer.stem(word.lower())]))
    return score
# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s Score: %s \n" % (c, calculate_class_score_commonality(sentence, c)))
match: good (1.0)
match: day (0.25)
Class: greeting Score: 1.25
match: day (0.25)
match: to (1.0)
match: hav (0.3333333333333333)
Class: goodbye Score: 1.5833333333333333
match: for (1.0)
match: hav (0.3333333333333333)
match: lunch (1.0)
Class: sandwich Score: 2.333333333333333
# return the class with highest score for sentence
def classify(sentence):
    high_class = None
    high_score = 0
    # loop through our classes
    for c in class_words.keys():
        # calculate score of sentence for each class
        score = calculate_class_score_commonality(sentence, c, show_details=False)
        # keep track of highest score
        if score > high_score:
            high_class = c
            high_score = score
    return high_class, high_score
classify("make me some lunch?")
classify("sudo make me a sandwich")
classify("how are you doing today?")
classify("talk to you tommorow")
classify("who are you?")