import numpy as np
###########################################
### Just run this cell to load the data ###
###########################################
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence
def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and split every review into word tokens.

    Args:
        percentage_of_sentences: Optional percentage, in the range (0, 100],
            of the train and test sets to keep. The leading rows of each
            split are kept. ``None`` keeps both splits in full.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` where ``X_*`` are lists
        of word lists (one per review, produced by ``text_to_word_sequence``)
        and ``y_*`` are the label arrays returned by ``tfds.as_numpy``.

    Raises:
        ValueError: If ``percentage_of_sentences`` is outside (0, 100].
    """
    # Validate up front with a real exception: `assert` is stripped under
    # `python -O`, and failing before the dataset is loaded saves the wait.
    if percentage_of_sentences is not None and not 0 < percentage_of_sentences <= 100:
        raise ValueError(
            "percentage_of_sentences must be in the range (0, 100], "
            f"got {percentage_of_sentences!r}"
        )

    train_data, test_data = tfds.load(
        name="imdb_reviews",
        split=["train", "test"],
        batch_size=-1,
        as_supervised=True,
    )
    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)

    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        len_train = int(percentage_of_sentences / 100 * len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
        len_test = int(percentage_of_sentences / 100 * len(test_sentences))
        test_sentences, y_test = test_sentences[:len_test], y_test[:len_test]

    # The raw sentences are bytes objects: decode them before tokenizing.
    X_train = [text_to_word_sequence(raw.decode("utf-8")) for raw in train_sentences]
    X_test = [text_to_word_sequence(raw.decode("utf-8")) for raw in test_sentences]
    return X_train, y_train, X_test, y_test
# Load 10% of the IMDB train and test reviews as lists of word tokens
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)
from gensim.models import Word2Vec
# Train 10-dimensional word vectors on the training sentences only
word2vec = Word2Vec(sentences=X_train, vector_size=10)
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of word vectors.

    Words that are not in ``word2vec.wv`` are skipped.

    Args:
        word2vec: Trained model exposing its keyed vectors as ``.wv``
            (supports ``in``, ``[]`` and a ``vector_size`` attribute).
        sentence: Iterable of word strings.

    Returns:
        A 2-D array with one row per in-vocabulary word, in sentence order.
        When no word is in the vocabulary the result has shape
        ``(0, vector_size)``.
    """
    keyed_vectors = word2vec.wv
    vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D so
        # callers can always rely on shape[-1] == vector_size.
        return np.empty((0, keyed_vectors.vector_size), dtype=np.float32)
    return np.array(vectors)
# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed each sentence of ``sentences`` with ``embed_sentence``.

    Returns a list holding one embedded matrix per sentence, in input order.
    """
    return [embed_sentence(word2vec, words) for words in sentences]
# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)

# Pad the training and test embedded sentences so that every sentence
# spans exactly 500 timesteps (padding is appended after the words)
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=500)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=500)

# TEST ME
# Sanity checks: padded data are arrays whose last axis is the embedding
# size, with one entry per input sentence.
for X in [X_train_pad, X_test_pad]:
    assert isinstance(X, np.ndarray)
    assert X.shape[-1] == word2vec.wv.vector_size
assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

# NOTE(review): bare expression with no effect when run as a script;
# presumably a notebook cell stating the baseline accuracy - confirm.
0.5
from tensorflow.keras import layers, Sequential
from tensorflow.keras import callbacks

# Binary sentiment classifier: masking -> LSTM -> dense head.
model = Sequential()
# The mask value must equal the value used for padding. pad_sequences was
# called without `value`, so padded timesteps hold its default 0.0; the
# previous mask_value=-1000 never matched and the padding was fed to the LSTM.
model.add(layers.Masking(mask_value=0.0))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="tanh"))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Stop training once the monitored validation metric has not improved
# for 5 consecutive epochs.
es = callbacks.EarlyStopping(patience=5)
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.3,
    callbacks=[es],
)

# Notebook-style expressions: only displayed when run interactively.
np.array(history.history['accuracy']).mean()
model.evaluate(X_test_pad, y_test)
import gensim.downloader as api

# Show the names of the pretrained models available for download
model_catalog = api.info()['models']
print(list(model_catalog))

# Fetch the pretrained 'glove-wiki-gigaword-50' vectors
word2vec_transfer = api.load('glove-wiki-gigaword-50')
# Notebook-style expression: only displayed when run interactively
len(word2vec_transfer)
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of pretrained vectors.

    Unlike ``embed_sentence``, ``word2vec`` is indexed directly (no ``.wv``).
    Words that are not in ``word2vec`` are skipped.

    Args:
        word2vec: Word-to-vector mapping supporting ``in`` and ``[]``.
        sentence: Iterable of word strings.

    Returns:
        An array with one row per known word, in sentence order. When no word
        is known and ``word2vec`` exposes ``vector_size``, the result has
        shape ``(0, vector_size)``; without that attribute it is the empty
        1-D array the original implementation returned.
    """
    vectors = [word2vec[word] for word in sentence if word in word2vec]
    vector_size = getattr(word2vec, "vector_size", None)
    if vectors or vector_size is None:
        return np.array(vectors)
    # Keep the empty result 2-D so callers can rely on shape[-1].
    return np.empty((0, vector_size), dtype=np.float32)
# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed each sentence of ``sentences`` with ``embed_sentence_with_TF``.

    NOTE: this definition replaces the earlier ``embedding`` function of the
    same name defined above in this file.

    Returns a list holding one embedded matrix per sentence, in input order.
    """
    return [embed_sentence_with_TF(word2vec, words) for words in sentences]
# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, X_train)
X_test_embed_2 = embedding(word2vec_transfer, X_test)

# Single source of truth for the padding value: the Masking layer only skips
# timesteps equal to its mask_value, so both must agree. The previous
# mask_value=-1000 never matched the 0.0 that pad_sequences uses by default.
PAD_VALUE = 0.0
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=500, value=PAD_VALUE)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=500, value=PAD_VALUE)

# Same architecture as the first model, trained on the pretrained embeddings
model = Sequential()
model.add(layers.Masking(mask_value=PAD_VALUE))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="tanh"))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

history = model.fit(
    X_train_pad_2,
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.3,
    callbacks=[es],
)

# Notebook-style expression: only displayed when run interactively.
np.array(history.history['accuracy']).mean()

# evaluate() returns [loss, accuracy] given the metrics compiled above
res = model.evaluate(X_test_pad_2, y_test, verbose=0)
print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')