#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pickle
import heapq
#Loading the data and converting it to lower case
link = '1661-0.txt'
with open(link) as f:
    text = f.read().lower()
print('corpus length:', len(text))
#Importing the RegexpTokenizer module from the NLTK library
from nltk.tokenize import RegexpTokenizer
#Tokenizing the data into words (the text is already lower case)
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)
#Printing the first few tokens
print(words[:10])
#Extracting the unique words using the np.unique function
unique_words = np.unique(words)
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))
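#A quick optional sanity check: the vocabulary size determines the width of
#the one-hot vectors built below, so it is worth inspecting before moving on
print("unique words:", len(unique_words))
print(list(unique_word_index.items())[:5])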
#Building sliding windows of LENGTH previous words and the word that follows
LENGTH = 5
prev_words = []
next_words = []
for i in range(len(words) - LENGTH):
    prev_words.append(words[i:i + LENGTH])
    next_words.append(words[i + LENGTH])
#Printing the first window and its target word
print("Previous:", prev_words[0])
print("Next:", next_words[0])
#Creating one-hot feature and label arrays filled with zeros
X = np.zeros((len(prev_words), LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)
#Setting the index of each word to 1 in the corresponding one-hot vectors
for i, each_words in enumerate(prev_words):
    for j, word in enumerate(each_words):
        X[i, j, unique_word_index[word]] = 1
    Y[i, unique_word_index[next_words[i]]] = 1
print(X[0][0])
print(Y[0])
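#Optional shape check: X should be (number of sequences, LENGTH, vocabulary
#size) and Y should be (number of sequences, vocabulary size)
print(X.shape, Y.shape)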
#Importing the Keras library for models and model operations
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import RMSprop
#Selecting the Sequential model
model = Sequential()
model.add(LSTM(128, input_shape=(LENGTH, len(unique_words))))
model.add(Dense(len(unique_words)))
#setting Softmax Activation function
model.add(Activation('softmax'))
#setting the RMSprop Optimizer
optimizer = RMSprop(learning_rate=0.01)
#Compiling the model
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
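#Optionally print a summary of the layers and parameter counts before training
model.summary()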
#Training the model.
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history
#Saving the model as an HDF5 file and the training history with pickle
model.save('next_word_model.h5')
pickle.dump(history, open("history.p", "wb"))
#Loading the model
model = load_model('next_word_model.h5')
history = pickle.load(open("history.p", "rb"))
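#Visualizing training with the matplotlib import from above; this is a minimal
#sketch assuming the metric keys 'accuracy' and 'val_accuracy' (older Keras
#versions record 'acc' and 'val_acc' instead)
plt.plot(history['accuracy'], label='train accuracy')
plt.plot(history['val_accuracy'], label='validation accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()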
#Defining a function to one-hot encode a seed text for the model; every word
#in the seed must appear in the training vocabulary
def prep(text):
    x = np.zeros((1, LENGTH, len(unique_words)))
    for a, word in enumerate(text.split()):
        x[0, a, unique_word_index[word]] = 1
    return x
#Transforming a sample seed with the prep function
prep("It is not a lack".lower())
#Defining a helper that normalizes the predictions and returns the indices of
#the top_n most probable words
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)
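#A minimal sketch of how sample behaves, using a made-up probability vector:
#it returns the indices of the top_n largest probabilities
demo_preds = np.array([0.1, 0.5, 0.3, 0.1])
print(sample(demo_preds, top_n=2))  # [1, 2]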
#Defining a function predict_completion to predict the most likely next word
def predict_completion(text):
    x = prep(text)
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, top_n=1)[0]
    return unique_words[next_index]
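#Example call (assuming the seed words all appear in the training vocabulary):
#returns the single most probable next word for the five-word seed
print(predict_completion("it is not a lack"))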
#Defining predict_completions to return the top n candidate next words
def predict_completions(text, n=3):
    if text == "":
        return []
    x = prep(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [unique_words[idx] for idx in next_indices]
q = "Your life will never be the same again"
print("correct sentence: ",q)
seq = " ".join(tokenizer.tokenize(q.lower())[0:5])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))