#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pickle
import heapq
#Loading the data and converting it to lower case
link = '1661-0.txt'
with open(link) as f:
    text = f.read().lower()
print('corpus length:', len(text))
#Importing the RegexpTokenizer module from the NLTK library
from nltk.tokenize import RegexpTokenizer
#Tokenizing the data into words (the text is already lower case)
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)
#Printing the first few tokens
print(words[:10])
#Extracting the unique words using the np.unique function
unique_words = np.unique(words)
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))
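#A quick optional sanity check: the vocabulary size determines the width of
#the one-hot vectors built below, so it is worth inspecting before moving on
print("unique words:", len(unique_words))
print(list(unique_word_index.items())[:5])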
#Building sliding windows of LENGTH previous words and the word that follows
LENGTH = 5
prev_words = []
next_words = []
for i in range(len(words) - LENGTH):
    prev_words.append(words[i:i + LENGTH])
    next_words.append(words[i + LENGTH])
#Printing the first window and its target word
print("Previous:", prev_words[0])
print("Next:", next_words[0])
#Creating one-hot feature and label arrays filled with zeros
X = np.zeros((len(prev_words), LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)
#Setting the index of each word to 1 in the corresponding one-hot vectors
for i, each_words in enumerate(prev_words):
    for j, word in enumerate(each_words):
        X[i, j, unique_word_index[word]] = 1
    Y[i, unique_word_index[next_words[i]]] = 1
print(X[0][0])
print(Y[0])
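#Optional shape check: X should be (number of sequences, LENGTH, vocabulary
#size) and Y should be (number of sequences, vocabulary size)
print(X.shape, Y.shape)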
#Importing the Keras library for models and model operations
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import RMSprop
#Selecting the Sequential model
model = Sequential()
model.add(LSTM(128, input_shape=(LENGTH, len(unique_words))))
model.add(Dense(len(unique_words)))
#setting Softmax Activation function
model.add(Activation('softmax'))
#setting the RMSprop Optimizer
optimizer = RMSprop(learning_rate=0.01)
#Compiling the model
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
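#Optionally print a summary of the layers and parameter counts before training
model.summary()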
#Training the model.
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history
#Saving the model as an HDF5 file and the training history with pickle
model.save('next_word_model.h5')
pickle.dump(history, open("history.p", "wb"))
#Loading the model
model = load_model('next_word_model.h5')
history = pickle.load(open("history.p", "rb"))
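#Visualizing training with the matplotlib import from above; this is a minimal
#sketch assuming the metric keys 'accuracy' and 'val_accuracy' (older Keras
#versions record 'acc' and 'val_acc' instead)
plt.plot(history['accuracy'], label='train accuracy')
plt.plot(history['val_accuracy'], label='validation accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()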
#Defining a function to one-hot encode a seed text for the model; every word
#in the seed must appear in the training vocabulary
def prep(text):
    x = np.zeros((1, LENGTH, len(unique_words)))
    for a, word in enumerate(text.split()):
        x[0, a, unique_word_index[word]] = 1
    return x
#Transforming a sample seed with the prep function
prep("It is not a lack".lower())
#Defining a helper that normalizes the predictions and returns the indices of
#the top_n most probable words
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)
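#A minimal sketch of how sample behaves, using a made-up probability vector:
#it returns the indices of the top_n largest probabilities
demo_preds = np.array([0.1, 0.5, 0.3, 0.1])
print(sample(demo_preds, top_n=2))  # [1, 2]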
#Defining a function predict_completion to predict the most likely next word
def predict_completion(text):
    x = prep(text)
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, top_n=1)[0]
    return unique_words[next_index]
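#Example call (assuming the seed words all appear in the training vocabulary):
#returns the single most probable next word for the five-word seed
print(predict_completion("it is not a lack"))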
#Defining predict_completions to return the top n candidate next words
def predict_completions(text, n=3):
    if text == "":
        return []
    x = prep(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [unique_words[idx] for idx in next_indices]
q = "Your life will never be the same again"
print("correct sentence: ",q)
seq = " ".join(tokenizer.tokenize(q.lower())[0:5])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))