Numan SAHNOU x Matthieu ECCHER
Question Answering on the SQuAD Dataset
Stanford Question Answering Dataset (SQuAD) is a new reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. With 100,000+ question-answer pairs on 500+ articles, SQuAD is significantly larger than previous reading comprehension datasets.
import numpy as np
import pandas as pd
import json
Transform the JSON format into a pandas DataFrame
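Before parsing, here is a minimal sketch of the nested structure we assume for the SQuAD v2.0 JSON files (field names follow the comments in the function below; the real files contain a few additional fields):
# Assumed SQuAD v2.0 layout (a sketch, not the full schema):
# {"data": [
#     {"title": "...",
#      "paragraphs": [
#          {"context": "...",
#           "qas": [
#               {"question": "...", "id": "...", "is_impossible": false,
#                "answers": [{"text": "...", "answer_start": 123}]}]}]}]}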
def json_to_dataframe(file):
    f = open(file, "r")
    data = json.loads(f.read())  # Load the JSON file.
    f.close()
    # Create empty lists to store values.
    con = []
    Que = []
    Txt = []
    for i in range(len(data['data'])):  # Root tag of the JSON file contains a 'title' tag & a 'paragraphs' list.
        title = data['data'][i]['title']
        for p in range(len(data['data'][i]['paragraphs'])):  # 'paragraphs' list contains a 'context' tag & a 'qas' list.
            context = data['data'][i]['paragraphs'][p]['context']
            for q in range(len(data['data'][i]['paragraphs'][p]['qas'])):  # 'qas' list contains 'question', 'id' tags & an 'answers' list.
                question = data['data'][i]['paragraphs'][p]['qas'][q]['question']
                Id = data['data'][i]['paragraphs'][p]['qas'][q]['id']
                is_impossible = data['data'][i]['paragraphs'][p]['qas'][q]['is_impossible']
                for a in range(len(data['data'][i]['paragraphs'][p]['qas'][q]['answers'])):  # 'answers' list contains 'answer_start', 'text' tags.
                    text = data['data'][i]['paragraphs'][p]['qas'][q]['answers'][a]['text']
                    con.append(context)
                    Que.append(question)
                    Txt.append(text)
    new_df = pd.DataFrame(columns=['Context', 'Question', 'Answer'])  # Create an empty DataFrame.
    new_df.Context = con
    new_df.Question = Que
    new_df.Answer = Txt
    print('Done')
    final_df = new_df.drop_duplicates(keep='first')  # Drop duplicate rows from the created DataFrame.
    return final_df
Load the train and test datasets
train = json_to_dataframe('train-v2.0.json')
test = json_to_dataframe('dev-v2.0.json')
Visualize the data
train
test = test.reset_index(drop=True, inplace=False)
test
Get the start and end word indices of the answers
from tensorflow.keras.preprocessing.text import text_to_word_sequence
def get_start_end_words(row):
    answer = text_to_word_sequence(row['Answer'])
    context = text_to_word_sequence(row['Context'])
    start_word = end_word = -1
    match = False
    if not answer:
        row['Start_Word'] = start_word
        row['End_Word'] = end_word
        return row
    # Slide over the context and look for the answer word sequence.
    for j in range(len(context) - len(answer) + 1):
        if context[j] == answer[0]:
            match = True
            k = 0
            for k in range(1, len(answer)):
                if context[j+k] != answer[k]:
                    match = False
                    break
            if match == True:
                start_word = j
                end_word = j + k
                break
    row['Start_Word'] = start_word
    row['End_Word'] = end_word
    return row
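A quick sanity check on a toy row (an illustrative example of ours, not taken from the dataset):
toy = pd.Series({'Context': 'The Eiffel Tower is located in Paris.', 'Answer': 'Eiffel Tower'})
toy = get_start_end_words(toy)
# text_to_word_sequence lowercases and strips punctuation, so the context becomes
# ['the', 'eiffel', 'tower', 'is', 'located', 'in', 'paris'] and we expect Start_Word=1, End_Word=2.
print(toy['Start_Word'], toy['End_Word'])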
train = train.apply(get_start_end_words, axis=1)
test = test.apply(get_start_end_words, axis=1)
train
We remove the rows where we could not find the answer indices
train = train[train['Start_Word']!=-1]
test = test[test['Start_Word']!=-1]
train = train.reset_index(drop=True, inplace=False)
test = test.reset_index(drop=True, inplace=False)
We transform the Context and the Question into integer sequences
Keras Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.Context)
tokenizer.fit_on_texts(train.Question)
tokenizer.fit_on_texts(test.Context)
tokenizer.fit_on_texts(test.Question)
Vocab size
vocab_size = len(tokenizer.word_index) + 1
vocab_size
Transform the text into word sequences based on the vocabulary
#train
context_sequences = tokenizer.texts_to_sequences(train.Context)
question_sequences = tokenizer.texts_to_sequences(train.Question)
#test
test_context_sequences = tokenizer.texts_to_sequences(test.Context)
test_question_sequences = tokenizer.texts_to_sequences(test.Question)
#train
train["Context_Sequences"] = context_sequences
train["Question_Sequences"] = question_sequences
#test
test["Context_Sequences"] = test_context_sequences
test["Question_Sequences"] = test_question_sequences
We pad the sequences so that every Context and every Question has the same length
#train
context_padded = pad_sequences(context_sequences, maxlen = 700, padding="post")
question_padded = pad_sequences(question_sequences, maxlen = 50, padding="post")
#test
test_context_padded = pad_sequences(test_context_sequences, maxlen = 700, padding="post")
test_question_padded = pad_sequences(test_question_sequences, maxlen = 50, padding="post")
#train
train["Context_Sequences"] = context_padded.tolist()
train["Question_Sequences"] = question_padded.tolist()
#test
test["Context_Sequences"] = test_context_padded.tolist()
test["Question_Sequences"] = test_question_padded.tolist()
train
max_length_context = len(train.Context_Sequences[0])
print(max_length_context)
max_length_question = len(train.Question_Sequences[0])
print(max_length_question)
We define our features and labels for the train and test sets
X_train = train[['Context_Sequences','Question_Sequences']]
y_train = train[['Start_Word','End_Word']]
X_test = test[['Context_Sequences','Question_Sequences']]
y_test = test[['Start_Word','End_Word']]
X_train
y_train
Load the GloVe word embeddings
GloVe is an unsupervised learning algorithm for obtaining vector representations for words (https://nlp.stanford.edu/projects/glove/)
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))
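As a quick check (our own example): each line of glove.6B.100d.txt is a word followed by 100 space-separated floats, so each entry of embeddings_index maps a word to a 100-dimensional vector.
print(embeddings_index['the'].shape)  # expected: (100,)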
Now we create the embedding matrix: each word of our vocabulary is matched with its GloVe vector (words without a GloVe entry keep a row of zeros)
count = 0
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        count += 1
        embedding_matrix[i] = embedding_vector
print("Percentage of words covered by GloVe vectors:", count / len(tokenizer.word_index) * 100)
Definition of the model (RNN Bi-LSTM)
Hyperparameters
vocab_size = len(tokenizer.word_index) + 1
embedding_vector_length = 100
max_span_begin = np.amax(y_train.Start_Word)
max_span_end = np.amax(y_train.End_Word)
batch = 32
# slice of the data to train on, since one epoch over the full dataset is expensive
slce = 10000
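slce is meant to restrict training to a subset of the data; the fit call further below actually uses the full set, but a sketch of how it could be used is:
# Hypothetical use of `slce` (model is defined further below):
# model.fit([context_padded[:slce], question_padded[:slce]],
#           [y_train.Start_Word[:slce], y_train.End_Word[:slce]],
#           batch_size=batch, epochs=1)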
Context input
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, MultiHeadAttention, concatenate, Flatten, Activation, dot
context_input = Input(shape=(max_length_context, ), dtype='int32', name='context_input')
x = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix],
              input_length=max_length_context, trainable=False)(context_input)
context_LSTM = Bidirectional(LSTM(100, return_sequences=True, dropout=0.2), merge_mode='concat')(x)
Question input
question_input = Input(shape=(max_length_question, ), dtype='int32', name='ques_input')
x = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix],
              input_length=max_length_question, trainable=False)(question_input)
question_LSTM, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(100, return_sequences=True, return_state=True, dropout=0.2), merge_mode='concat')(x)
# /!\ We are not sure whether concatenating the forward/backward hidden states is the right approach or whether they should be added instead.
question_hidden_states = concatenate([forward_h, backward_h])
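For reference, the 'add' alternative mentioned in the comment would look like the sketch below; concatenation keeps a 200-dimensional summary, while element-wise addition keeps 100 dimensions:
# Hypothetical alternative: element-wise sum of the forward/backward states, shape (batch, 100)
# from tensorflow.keras.layers import add
# question_hidden_states = add([forward_h, backward_h])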
Attention layer
#ATTENTION LAYER
import tensorflow as tf
attention_scores = dot([context_LSTM, question_LSTM], axes=2)
attention_distribution = tf.nn.softmax(attention_scores, axis=2)
# Attention output: for each context position, a weighted sum of the question states.
attention_output = dot([attention_distribution, question_LSTM],axes=[2,1])
question_to_context = concatenate([context_LSTM, attention_output])
# We use tanh because in our experiments it converged faster than sigmoid and gave better accuracy.
attention_vector = Dense(200, use_bias=False, activation='tanh')(question_to_context)
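To keep track of what the attention block produces, here is a shape walkthrough (batch dimension omitted; it assumes the padded lengths 700/50 and the 100-unit Bi-LSTMs defined above):
# context_LSTM           : (700, 200)  Bi-LSTM outputs, 2 x 100 units concatenated
# question_LSTM          : (50, 200)
# attention_scores       : (700, 50)   dot product of every context step with every question step
# attention_distribution : (700, 50)   softmax over the question axis
# attention_output       : (700, 200)  per-context-word weighted sum of the question states
# question_to_context    : (700, 400)  context states concatenated with the attention output
# attention_vector       : (700, 200)  tanh projection back to 200 dimensions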
Output layer (Start and End word predictions)
#START WORD PREDICTION
start = Dense(200)(question_hidden_states)
# expand_dims turns the start vector into a (batch, 200, 1) matrix so it can be multiplied with the attention vector.
start_matrix = tf.expand_dims(start, 2)
# squeeze removes the size-1 dimension we added above.
start_word = tf.squeeze(tf.matmul(attention_vector, start_matrix), 2)
start_word = tf.nn.softmax(start_word, axis=1,name = "Start_Word_Prediction")
#END WORD PREDICTION
end = Dense(200)(question_hidden_states)
end_matrix = tf.expand_dims(end, 2)
end_word = tf.squeeze(tf.matmul(attention_vector, end_matrix), 2)
end_word = tf.nn.softmax(end_word, axis=1,name = "End_Word_Prediction")
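Shape sketch for the prediction heads (batch dimension omitted): the question summary is projected and compared with every context position to score candidate start/end words.
# question_hidden_states  : (200,)     concatenated final forward/backward states
# start = Dense(200)(...) : (200,)
# start_matrix            : (200, 1)   after expand_dims
# matmul((700, 200), (200, 1)) -> (700, 1), squeeze -> (700,)
# softmax over the 700 context positions; the end-word branch is identical with its own Dense layer.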
We define the model and the metrics
# ATTEMPT (failed): custom loss we did not end up using.
def compute_loss(start, end):
    J = tf.math.reduce_sum(tf.cast(start, tf.float64)) - tf.math.log(tf.cast(start, tf.float64)) + tf.math.reduce_sum(tf.cast(end, tf.float64)) - tf.math.log(tf.cast(end, tf.float64))
    return J
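What we actually want is the negative log-likelihood of the true start and end indices, which is exactly what the sparse_categorical_crossentropy used below computes; a hand-written sketch of ours would be:
def span_nll(y_true, y_pred):
    # y_true: true index per example; y_pred: softmax distribution over the 700 context positions.
    idx = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    true_probs = tf.gather(y_pred, idx, batch_dims=1)  # probability assigned to the true index
    return -tf.math.log(true_probs + 1e-8)             # negative log-likelihood per example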
#DEFINE THE MODEL
model = Model(inputs=[context_input, question_input], outputs=[start_word, end_word])
model.compile(optimizer='Adam', loss="sparse_categorical_crossentropy", metrics=['acc'])
model.summary()
Fitting of the model
model_history = model.fit([context_padded, question_padded], [y_train.Start_Word, y_train.End_Word], verbose=2, batch_size=batch, epochs=20)
Predictions
y_pred = model.predict([test_context_padded, test_question_padded])
y_pred_start= y_pred[0].argmax(axis=-1)
y_pred_end= y_pred[1].argmax(axis=-1)
y_pred_start = pd.DataFrame(y_pred_start, columns=["Start"])
y_pred_end = pd.DataFrame(y_pred_end, columns=["End"])
predictions = pd.DataFrame(y_pred_start)
predictions["End"] = y_pred_end
predictions
Evaluation of the model on the test set
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
d = {'Start_Word': [precision_score(y_test['Start_Word'], predictions['Start'], average="macro"), recall_score(y_test['Start_Word'], predictions['Start'], average="macro"), f1_score(y_test['Start_Word'], predictions['Start'], average="macro")],
'End_Word': [precision_score(y_test['End_Word'], predictions['End'], average="macro"), recall_score(y_test['End_Word'], predictions['End'], average="macro"), f1_score(y_test['End_Word'], predictions['End'], average="macro")]}
scores = pd.DataFrame(data=d)
scores.insert(0, "Metric", ['Precision', 'Recall', 'F1'])
scores = scores.set_index('Metric')
scores
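These macro-averaged index-level scores are quite strict; a simpler complementary number (our own addition, not the official SQuAD metric) is the fraction of test questions where both predicted indices are exactly right:
exact_span = ((predictions['Start'].values == y_test['Start_Word'].values) &
              (predictions['End'].values == y_test['End_Word'].values)).mean()
print("Exact span match on the test set: {:.2%}".format(exact_span))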
Qualitative evaluation: compare predicted and actual answers
def index_to_word(context_seq, context, question, answer, word_start, word_end):
    sentence = []
    sentence.append(context_seq[word_start:word_end+1])
    text = tokenizer.sequences_to_texts(sentence)
    print("Context : ", context, "\nQuestion:", question, "\nActual Answer:", answer, "\nPredicted Answer:", text[0], "\n\n\n")
for i in range(5, 25):
    index_to_word(X_test.Context_Sequences[i], test.Context[i], test.Question[i], test['Answer'][i], predictions["Start"][i], predictions["End"][i])