# Load the NER dataset into `data` using pandas; 'unicode_escape' decodes
# the file's escaped non-ASCII characters (the raw CSV is not valid UTF-8).
import pandas as pd
data=pd.read_csv("ner_dataset.csv",encoding= 'unicode_escape')
# Preview the first five rows.
data.head(5)
#Creating a function to make filter the token and tag data
#importing itertools library
from itertools import chain
def make_dict_map(data, tokentag):
    """Build forward/reverse index lookups for the words or the tags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'Word' and 'Tag' columns.
    tokentag : str
        'token' to build the vocabulary from the 'Word' column;
        any other value uses the 'Tag' column.

    Returns
    -------
    (dict, dict)
        ``token_to_idx`` mapping value -> integer index, and
        ``idx_to_token`` mapping integer index -> value.
    """
    # Select which column supplies the (de-duplicated) vocabulary.
    if tokentag == 'token':
        voc = list(set(data['Word'].to_list()))
    else:
        voc = list(set(data['Tag'].to_list()))
    # Forward and reverse lookup tables over the vocabulary.
    idx_to_token = {idx: tok for idx, tok in enumerate(voc)}
    token_to_idx = {tok: idx for idx, tok in enumerate(voc)}
    return token_to_idx, idx_to_token
# Build the word/tag vocabularies and map every row to integer indices
# so the model works on numbers rather than strings.
token_to_idx, idx_to_token = make_dict_map(data, 'token')
tag_to_idx, idx_to_tag = make_dict_map(data, 'tag')
data['Word_idx'] = data['Word'].map(token_to_idx)
data['Tag_idx'] = data['Tag'].map(tag_to_idx)
# 'Sentence #' is only present on each sentence's first row; forward-fill
# so every row carries its sentence id. (`.ffill()` replaces the
# deprecated `fillna(method='ffill')`.)
data_fillna = data.ffill(axis=0)
# Collect each sentence's columns into lists, one row per sentence.
# FIX: index the groupby with a list of columns — the original
# `[...]['Word', 'POS', ...]` tuple form raises a pandas FutureWarning
# (visible in the captured output) and is removed in pandas 2.x.
data_group = data_fillna.groupby(['Sentence #'], as_index=False)[
    ['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']
].agg(lambda x: list(x))
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
data_group.head(5)
#Importing train_test_split to split the training and testing data
from sklearn.model_selection import train_test_split
#Importing libraries from keras
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#Function to extract train_tokens, test_tokens, val_tokens, train_tags,test_tags,val_tags
def get_train_test_val(data_group, datas):
    """Pad the token/tag sequences and split them into train/test/val sets.

    Parameters
    ----------
    data_group : pandas.DataFrame
        One row per sentence, with list-valued 'Word_idx' and 'Tag_idx'
        columns (output of the groupby above).
    datas : pandas.DataFrame
        The flat token-level frame, used only to size the vocabularies.

    Returns
    -------
    tuple
        (traintokens, testtokens, valtokens, traintags, testtags, valtags):
        a 90/10 (train+val)/test split, then 75/25 train/val split.
    """
    # Pad every sentence's token ids to the longest sentence (the X data).
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    ntoken = len(list(set(datas['Word'].to_list())))
    ntag = len(list(set(datas['Tag'].to_list())))
    # NOTE(review): ntoken - 1 is also a real token's index, so padding is
    # indistinguishable from that token — a dedicated pad id would be safer.
    padtokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32',
                              padding='post', value=ntoken - 1)
    # Pad the tag sequences with the 'O' (outside) tag and one-hot encode
    # them (the y data).
    tags = data_group['Tag_idx'].tolist()
    padtags = pad_sequences(tags, maxlen=maxlen, dtype='int32',
                            padding='post', value=tag_to_idx["O"])
    ntags = len(tag_to_idx)
    padtags = [to_categorical(i, num_classes=ntags) for i in padtags]
    # Split off 10% as the test set, then 25% of the rest as validation.
    rest_tokens, testtokens, rest_tags, testtags = train_test_split(
        padtokens, padtags, test_size=0.1, train_size=0.9, random_state=2020)
    traintokens, valtokens, traintags, valtags = train_test_split(
        rest_tokens, rest_tags, test_size=0.25, train_size=0.75,
        random_state=2020)
    # FIX: the original print referenced undefined names (train_tokens,
    # train_tags, ...) — the locals are traintokens/traintags/etc.
    print(
        'length of train tokens :', len(traintokens),
        '\nlength of train tags :', len(traintags),
        '\nlength of test tokens :', len(testtokens),
        '\nlength of test tags :', len(testtags),
        '\nlength of val tokens :', len(valtokens),
        '\nlength of val tags :', len(valtags),
    )
    return traintokens, testtokens, valtokens, traintags, testtags, valtags
# Split the padded data into train/test/val token and tag sets
# (also prints each set's length as a sanity check).
traintokens, testtokens, valtokens, traintags,testtags,valtags= get_train_test_val(data_group, data)
length of train tokens : 32372
length of train tags : 32372
length of test tokens : 4796
length of test tags : 4796
length of val tokens : 10791
length of val tags : 10791
#Importing numpy library and tensorflow.keras library for model building.
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
# Fix the NumPy and TensorFlow RNG seeds for reproducible runs.
seed(1)
tensorflow.random.set_seed(2)
# Embedding hyperparameters: vocabulary size (+1 to leave room for the
# padding index), embedding width, and the longest sentence length.
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
# Number of distinct NER tags — the size of the model's output layer.
ntags = len(tag_to_idx)
ntags
#Function for the architecture of model.
def get_bilstmlstm():
    """Build and compile the BiLSTM + LSTM sequence-tagging model.

    Reads the module-level ``input_dim``, ``output_dim``, ``input_length``
    and ``ntags`` hyperparameters.

    Returns
    -------
    tensorflow.keras.Sequential
        Compiled model, ready for training.
    """
    model = Sequential()
    # Embedding layer: map each token id to a dense vector.
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim,
                        input_length=input_length))
    # Bidirectional LSTM returning the full sequence for per-token tagging.
    model.add(Bidirectional(
        LSTM(units=output_dim, return_sequences=True,
             dropout=0.2, recurrent_dropout=0.2),
        merge_mode='concat'))
    # Second (unidirectional) LSTM layer.
    model.add(LSTM(units=output_dim, return_sequences=True,
                   dropout=0.5, recurrent_dropout=0.5))
    # Per-timestep classification head.
    # FIX: was `n_tags` (undefined name -> NameError; the module-level
    # variable is `ntags`) with activation="relu". categorical_crossentropy
    # requires a probability distribution — the relu output produced the
    # `loss: nan` seen in the training logs; softmax fixes it.
    model.add(TimeDistributed(Dense(ntags, activation="softmax")))
    # Compile with Adam and per-token accuracy.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model
#Function to find the loss of the model
def train_model(X, y, model):
    """Fit ``model`` for 25 single-epoch passes, recording training loss.

    Parameters
    ----------
    X, y : array-like
        Padded token ids and one-hot encoded tags.
    model : keras model
        Compiled model; trained in place.

    Returns
    -------
    list
        The training loss recorded after each of the 25 epochs.
    """
    loss = list()
    for i in range(25):
        # One epoch per fit() call so the loss can be sampled every epoch.
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1,
                         validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss
# Build the model, render its architecture, and train it for 25 epochs,
# keeping the per-epoch training loss in a results frame.
results = pd.DataFrame()
model_bilstm_lstm = get_bilstmlstm()
plot_model(model_bilstm_lstm)
# FIX: the split variables assigned above are named traintokens/traintags;
# the original `train_tokens` / `train_tags` were undefined (NameError).
results['with_add_lstm'] = train_model(traintokens, np.array(traintags), model_bilstm_lstm)
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 104, 64) 2251456
_________________________________________________________________
bidirectional_1 (Bidirection (None, 104, 128) 66048
_________________________________________________________________
lstm_3 (LSTM) (None, 104, 64) 49408
_________________________________________________________________
time_distributed_1 (TimeDist (None, 104, 17) 1105
=================================================================
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________
26/26 [==============================] - 151s 6s/step - loss: nan - accuracy: 0.0083 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 139s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 138s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 136s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 144s 6s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 141s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 144s 6s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 145s 6s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 142s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 142s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 139s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 139s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 143s 6s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 141s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 139s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 139s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 139s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074
26/26 [==============================] - 140s 5s/step - loss: nan - accuracy: 0.0076 - val_loss: nan - val_accuracy: 0.0074