import numpy as np
###########################################
### Just run this cell to load the data ###
###########################################
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence
def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and tokenize each review into words.

    Parameters
    ----------
    percentage_of_sentences : int or float, optional
        If given, keep only the first `percentage_of_sentences`% of each
        split. Must lie in (0, 100].

    Returns
    -------
    (X_train, y_train, X_test, y_test) where X_* are lists of word lists
    (lower-cased, punctuation stripped by `text_to_word_sequence`) and
    y_* are numpy arrays of 0/1 sentiment labels.

    Raises
    ------
    ValueError
        If `percentage_of_sentences` is outside (0, 100].
    """
    # batch_size=-1 loads each split as a single batch of numpy-compatible tensors.
    train_data, test_data = tfds.load(
        name="imdb_reviews", split=["train", "test"],
        batch_size=-1, as_supervised=True,
    )
    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)

    # Optionally keep only a prefix of each split to shorten training time.
    if percentage_of_sentences is not None:
        # Explicit raise instead of `assert`: asserts are stripped under -O.
        if not 0 < percentage_of_sentences <= 100:
            raise ValueError("percentage_of_sentences must be in (0, 100]")
        len_train = int(percentage_of_sentences / 100 * len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
        len_test = int(percentage_of_sentences / 100 * len(test_sentences))
        test_sentences, y_test = test_sentences[:len_test], y_test[:len_test]

    # Reviews come back as bytes; decode then tokenize into word lists.
    X_train = [text_to_word_sequence(s.decode("utf-8")) for s in train_sentences]
    X_test = [text_to_word_sequence(s.decode("utf-8")) for s in test_sentences]

    return X_train, y_train, X_test, y_test
# Load only 10% of the dataset to keep training times reasonable.
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)
/shared-libs/python3.9/py/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
2023-01-01 18:39:14.535873: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-01 18:39:14.690301: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-01 18:39:14.690342: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-01 18:39:14.729110: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-01 18:39:15.714255: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-01 18:39:15.714334: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-01-01 18:39:15.714343: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2023-01-01 18:39:17.643045: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-01-01 18:39:17.643083: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-01 18:39:17.643100: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (p-1fa92a8c-dd92-471b-8c6c-bf2adef19748): /proc/driver/nvidia/version does not exist
2023-01-01 18:39:17.643343: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
from gensim.models import Word2Vec
# Train task-specific word embeddings (10-dimensional) on the tokenized training corpus.
word2vec = Word2Vec(sentences=X_train, vector_size=10)
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    """Embed a tokenized sentence with a trained Word2Vec model.

    Words absent from the model's vocabulary are silently skipped, so the
    result has shape (n_known_words, vector_size); when no word is known
    (including an empty sentence) the result is an empty array of shape (0,).
    """
    # Comprehension instead of a manual append loop; membership is checked
    # against the model's KeyedVectors (`.wv`).
    return np.array([word2vec.wv[word] for word in sentence if word in word2vec.wv])
# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed every tokenized sentence with `embed_sentence`.

    Returns a list of 2-D arrays (one per sentence); arrays may have
    different first dimensions since OOV words are dropped.
    """
    return [embed_sentence(word2vec, sentence) for sentence in sentences]
# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)
# Pad the training and test embedded sentences
# Sequences are truncated/padded to 500 timesteps, padding after the data.
# NOTE(review): pad_sequences pads with 0.0 by default (`value` not set) —
# make sure any downstream Masking layer uses a mask_value that matches it.
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=500)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=500)
# TEST ME
# Sanity checks: padded outputs are numpy arrays whose last axis matches the
# embedding size, and the number of samples is preserved.
for X in [X_train_pad, X_test_pad]:
    assert type(X) == np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size
assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)
0.5
from tensorflow.keras import layers, Sequential

# Baseline RNN classifier on the task-specific Word2Vec embeddings.
model = Sequential()
# BUG FIX: pad_sequences above pads with 0.0 (its default `value`), but the
# original mask_value=-1000 matched nothing, so the LSTM processed every
# padded timestep. Mask the actual padding value instead. (Masking skips a
# timestep only when ALL features equal mask_value, so genuine word vectors
# are effectively never masked by accident.)
model.add(layers.Masking(mask_value=0.))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="tanh"))
# Single sigmoid unit for binary sentiment classification.
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
from tensorflow.keras import callbacks
# Stop training once val_loss has not improved for 5 consecutive epochs.
# NOTE(review): restore_best_weights is left at its default (False), so the
# final model keeps the last epoch's weights, not the best ones — confirm
# that is intended.
es = callbacks.EarlyStopping(patience=5)
history = model.fit(X_train_pad, y_train,
                    batch_size=64,
                    epochs=20,
                    validation_split=0.3,
                    callbacks=[es])
Epoch 1/20
28/28 [==============================] - 28s 818ms/step - loss: 0.6942 - accuracy: 0.4943 - val_loss: 0.6935 - val_accuracy: 0.4733
Epoch 2/20
28/28 [==============================] - 21s 770ms/step - loss: 0.6932 - accuracy: 0.5023 - val_loss: 0.6944 - val_accuracy: 0.4707
Epoch 3/20
28/28 [==============================] - 21s 762ms/step - loss: 0.6929 - accuracy: 0.5126 - val_loss: 0.6940 - val_accuracy: 0.4720
Epoch 4/20
28/28 [==============================] - 20s 723ms/step - loss: 0.6926 - accuracy: 0.5137 - val_loss: 0.6932 - val_accuracy: 0.5253
Epoch 5/20
28/28 [==============================] - 20s 734ms/step - loss: 0.6920 - accuracy: 0.4994 - val_loss: 0.6938 - val_accuracy: 0.4693
Epoch 6/20
28/28 [==============================] - 21s 748ms/step - loss: 0.6918 - accuracy: 0.5086 - val_loss: 0.6944 - val_accuracy: 0.4747
Epoch 7/20
28/28 [==============================] - 22s 777ms/step - loss: 0.6914 - accuracy: 0.5126 - val_loss: 0.6952 - val_accuracy: 0.4733
Epoch 8/20
28/28 [==============================] - 21s 756ms/step - loss: 0.6911 - accuracy: 0.5149 - val_loss: 0.6943 - val_accuracy: 0.4747
Epoch 9/20
28/28 [==============================] - 21s 752ms/step - loss: 0.6908 - accuracy: 0.5149 - val_loss: 0.6943 - val_accuracy: 0.4707
# Mean training accuracy across the epochs actually run (per the logs above,
# roughly chance level ~0.51 — the 10-dim task-trained embeddings are weak).
np.array(history.history['accuracy']).mean()
# Evaluate on the held-out test set; returns [loss, accuracy].
model.evaluate(X_test_pad, y_test)
79/79 [==============================] - 11s 137ms/step - loss: 0.6924 - accuracy: 0.5096
import gensim.downloader as api
# List the pretrained embedding models available via the gensim downloader.
print(list(api.info()['models'].keys()))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
# Download pretrained 50-dimensional GloVe vectors (Wikipedia + Gigaword corpus).
word2vec_transfer = api.load('glove-wiki-gigaword-50')
[==================================================] 100.0% 66.0/66.0MB downloaded
# Vocabulary size of the pretrained embedding.
len(word2vec_transfer)
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    """Embed a tokenized sentence with pretrained (transfer) word vectors.

    Here `word2vec` is a KeyedVectors-like mapping (supports `in` and
    indexing directly) rather than a full Word2Vec model — hence no `.wv`.
    OOV words are skipped; the result has shape (n_known_words, vector_size),
    or shape (0,) when no word is known.
    """
    return np.array([word2vec[word] for word in sentence if word in word2vec])
# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed every tokenized sentence with the pretrained vectors.

    NOTE: this redefines the earlier `embedding` (which used
    `embed_sentence` on a trained Word2Vec model); from this point on the
    transfer-learning variant is in effect.
    """
    return [embed_sentence_with_TF(word2vec, sentence) for sentence in sentences]
# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, X_train)
X_test_embed_2 = embedding(word2vec_transfer, X_test)
# Pad to 500 timesteps as before; padding value defaults to 0.0.
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=500)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=500)
# Same architecture, retrained on the pretrained-GloVe embeddings.
model = Sequential()
# BUG FIX: pad_sequences pads with 0.0 (its default `value`), so the mask
# value must be 0. The original -1000 matched nothing, and the LSTM processed
# every padded timestep. (Masking skips a timestep only when ALL features
# equal mask_value, so real word vectors are effectively never masked.)
model.add(layers.Masking(mask_value=0.))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="tanh"))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# NOTE(review): `es` is reused from the earlier fit — Keras callbacks
# presumably reset their state at the start of each fit; confirm.
history = model.fit(X_train_pad_2, y_train,
                    batch_size=64,
                    epochs=20,
                    validation_split=0.3,
                    callbacks=[es])
Epoch 1/20
28/28 [==============================] - 28s 842ms/step - loss: 0.6944 - accuracy: 0.4823 - val_loss: 0.6944 - val_accuracy: 0.4693
Epoch 2/20
28/28 [==============================] - 22s 793ms/step - loss: 0.6920 - accuracy: 0.5103 - val_loss: 0.6931 - val_accuracy: 0.4693
Epoch 3/20
28/28 [==============================] - 22s 778ms/step - loss: 0.6908 - accuracy: 0.5171 - val_loss: 0.6934 - val_accuracy: 0.4733
Epoch 4/20
28/28 [==============================] - 22s 774ms/step - loss: 0.6896 - accuracy: 0.5206 - val_loss: 0.6925 - val_accuracy: 0.5320
Epoch 5/20
28/28 [==============================] - 22s 782ms/step - loss: 0.6882 - accuracy: 0.5131 - val_loss: 0.6907 - val_accuracy: 0.4800
Epoch 6/20
28/28 [==============================] - 22s 784ms/step - loss: 0.6853 - accuracy: 0.4966 - val_loss: 0.6890 - val_accuracy: 0.4853
Epoch 7/20
28/28 [==============================] - 22s 796ms/step - loss: 0.6839 - accuracy: 0.5029 - val_loss: 0.6886 - val_accuracy: 0.4853
Epoch 8/20
28/28 [==============================] - 23s 814ms/step - loss: 0.6819 - accuracy: 0.5194 - val_loss: 0.6929 - val_accuracy: 0.4813
Epoch 9/20
28/28 [==============================] - 25s 889ms/step - loss: 0.6795 - accuracy: 0.5269 - val_loss: 0.6946 - val_accuracy: 0.4787
Epoch 10/20
28/28 [==============================] - 25s 885ms/step - loss: 0.6758 - accuracy: 0.5223 - val_loss: 0.6888 - val_accuracy: 0.5400
Epoch 11/20
28/28 [==============================] - 24s 852ms/step - loss: 0.6771 - accuracy: 0.5274 - val_loss: 0.6892 - val_accuracy: 0.5400
Epoch 12/20
28/28 [==============================] - 23s 845ms/step - loss: 0.6723 - accuracy: 0.5331 - val_loss: 0.7078 - val_accuracy: 0.5373
# Mean training accuracy across epochs for the transfer-learning run.
np.array(history.history['accuracy']).mean()
# Evaluate on the test set; res is [loss, accuracy].
res = model.evaluate(X_test_pad_2, y_test, verbose=0)
print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')
The accuracy evaluated on the test set is of 49.840%