# Loading the dataset into the variable `data` using the pandas library
import pandas as pd

data = pd.read_csv("ner_dataset.csv", encoding='unicode_escape')
data.head(5)
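# Optional sanity check (an added sketch, not part of the original pipeline):
# the steps below assume the standard Kaggle NER dataset columns
# 'Sentence #', 'Word', 'POS', and 'Tag', where 'Sentence #' is filled only on
# the first row of each sentence (hence the NaN counts).
print(data.columns.tolist())
print(data.isnull().sum())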
# Function to build the token/tag lookup dictionaries
def make_dict_map(data, tokentag):
    # Select the vocabulary column according to tokentag
    if tokentag == 'token':
        voc = list(set(data['Word'].to_list()))
    else:
        voc = list(set(data['Tag'].to_list()))
    # Build both directions of the mapping: idx -> token and token -> idx
    idx_to_token = {idx: tok for idx, tok in enumerate(voc)}
    token_to_idx = {tok: idx for idx, tok in enumerate(voc)}
    return token_to_idx, idx_to_token
# Building the token and tag dictionaries with make_dict_map
token_to_idx, idx_to_token = make_dict_map(data, 'token')
tag_to_idx, idx_to_tag = make_dict_map(data, 'tag')
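# Illustrative round-trip lookup (added here as a quick check): the exact
# indices depend on Python's set ordering, so only the round-trip property
# itself is guaranteed.
sample_tag = idx_to_tag[0]
assert tag_to_idx[sample_tag] == 0
print('tag 0 is:', sample_tag)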
# Mapping each word and tag to its index
data['Word_idx'] = data['Word'].map(token_to_idx)
data['Tag_idx'] = data['Tag'].map(tag_to_idx)
# Forward-filling the NaN values in 'Sentence #' so every row carries its sentence id
data_fillna = data.ffill(axis=0)
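# Quick verification (an added sketch): after the forward fill and the
# dictionary mapping above, no NaN values should remain in the frame.
assert data_fillna.isnull().sum().sum() == 0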
# Grouping by sentence and collecting each column into a list per sentence
data_group = data_fillna.groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))
data_group.head(5)
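# Sentence-length distribution (an added diagnostic using pandas only): this
# motivates the maxlen used for padding in the function below.
sent_lengths = data_group['Word'].apply(len)
print(sent_lengths.describe())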
# Importing train_test_split to split the data into train/test/validation sets
from sklearn.model_selection import train_test_split

# Importing sequence padding and one-hot utilities from Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# Function to build train/test/validation splits for tokens (X) and tags (y)
def get_train_test_val(data_group, data):
    # Pad the token sequences (X) to the length of the longest sentence
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    # Vocabulary size, used to pick the padding value below
    ntoken = len(list(set(data['Word'].to_list())))
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=ntoken - 1)
    # Pad the tag sequences (y) and convert them to one-hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag_to_idx["O"])
    ntags = len(tag_to_idx)
    pad_tags = [to_categorical(i, num_classes=ntags) for i in pad_tags]
    # Split off 10% as the test set, then 25% of the remainder as the validation set
    tokens, test_tokens, tags, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens, tags, test_size=0.25, train_size=0.75, random_state=2020)
    print(
        'length of train tokens :', len(train_tokens),
        '\nlength of train tags :', len(train_tags),
        '\nlength of test tokens :', len(test_tokens),
        '\nlength of test tags :', len(test_tags),
        '\nlength of val tokens :', len(val_tokens),
        '\nlength of val tags :', len(val_tags),
    )
    return train_tokens, test_tokens, val_tokens, train_tags, test_tags, val_tags
# Building the splits; get_train_test_val also prints the split lengths
train_tokens, test_tokens, val_tokens, train_tags, test_tags, val_tags = get_train_test_val(data_group, data)
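# Shape check (an added sketch): pad_sequences returns an array of shape
# (num_sentences, maxlen), and each one-hot tag entry has shape (maxlen, ntags).
print('train tokens shape:', train_tokens.shape)
print('one one-hot tag example shape:', train_tags[0].shape)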
# Importing numpy and the tensorflow.keras pieces needed for model building
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed

# Fixing the random seeds for reproducibility
seed(1)
tensorflow.random.set_seed(2)

# Input/output dimensions for the embedding layer
input_dim = len(list(set(data['Word'].to_list()))) + 1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])

# Number of distinct tags, used as the size of the output layer
ntags = len(tag_to_idx)
ntags
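# Inspecting the tag inventory (added for clarity): the actual tag names come
# from the dataset, typically BIO-style labels such as 'O', 'B-geo', 'I-per'.
print(sorted(tag_to_idx, key=tag_to_idx.get))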
# Function defining the BiLSTM-LSTM model architecture
def get_bilstm_lstm():
    # Using the Sequential API
    model = Sequential()
    # Embedding layer: maps word indices to dense vectors
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    # Bidirectional LSTM, concatenating the forward and backward outputs
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
    # Second, unidirectional LSTM layer
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    # TimeDistributed Dense output: softmax gives a probability distribution over
    # the ntags labels at every timestep, as categorical_crossentropy requires
    model.add(TimeDistributed(Dense(ntags, activation="softmax")))
    # Compile with the Adam optimizer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
# Function to train the model and record the training loss per epoch
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # Fit the model for one epoch and store that epoch's training loss
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss
# The final loss curve is obtained after 25 epochs, since the loop above runs 25 times
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)
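# Plotting the recorded training loss (an added sketch; assumes matplotlib is
# installed, which the original script does not import).
import matplotlib.pyplot as plt

results.plot(y='with_add_lstm', title='Training loss per epoch')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()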