# data-sentiment-analysis-with-word2vec

import numpy as np

###########################################
### Just run this cell to load the data ###
###########################################
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence


def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and tokenise every review into words.

    Args:
        percentage_of_sentences: Optional percentage, in (0, 100], of each
            split to keep. The leading slice of each split is retained.
            ``None`` keeps everything.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` values are
        lists of word lists produced by ``text_to_word_sequence`` and the
        ``y_*`` values are the label arrays returned by ``tfds.as_numpy``.

    Raises:
        AssertionError: If ``percentage_of_sentences`` is given but is not
            in (0, 100].
    """

    def tokenise(raw_reviews):
        # Reviews are decoded from UTF-8 bytes before being split into words.
        return [text_to_word_sequence(raw.decode("utf-8")) for raw in raw_reviews]

    train_data, test_data = tfds.load(
        name="imdb_reviews",
        split=["train", "test"],
        batch_size=-1,
        as_supervised=True,
    )

    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)

    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert 0 < percentage_of_sentences <= 100
        fraction = percentage_of_sentences / 100

        n_train = int(fraction * len(train_sentences))
        train_sentences = train_sentences[:n_train]
        y_train = y_train[:n_train]

        n_test = int(fraction * len(test_sentences))
        test_sentences = test_sentences[:n_test]
        y_test = y_test[:n_test]

    return tokenise(train_sentences), y_train, tokenise(test_sentences), y_test


X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

from gensim.models import Word2Vec

# Train a gensim Word2Vec model on the tokenised training reviews only,
# with 10-dimensional word vectors (vector_size=10).
word2vec = Word2Vec(sentences=X_train, vector_size=10)

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


def embed_sentence(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of word vectors.

    Args:
        word2vec: Model exposing its vectors through ``word2vec.wv``, which
            must support ``word in wv`` and ``wv[word]``.
        sentence: Iterable of words.

    Returns:
        ``np.ndarray`` with one row per word found in ``word2vec.wv``, in
        sentence order. Unknown words are skipped; if no word is known the
        result is an empty 1-D array.
    """
    vectors = word2vec.wv
    return np.array([vectors[word] for word in sentence if word in vectors])


def embedding(word2vec, sentences):
    """Convert a list of sentences into a list of word-vector matrices.

    Args:
        word2vec: Model passed through to ``embed_sentence``.
        sentences: Iterable of sentences, each an iterable of words.

    Returns:
        List with one ``embed_sentence`` result per input sentence.
    """
    return [embed_sentence(word2vec, sentence) for sentence in sentences]


# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)

# Pad the training and test embedded sentences to a fixed length of 500
# timesteps. No `value` is passed, so the padding uses pad_sequences' default
# of 0, appended after the real vectors (padding='post').
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=500)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=500)

# TEST ME
# Both padded datasets must be plain NumPy arrays whose last axis matches the
# dimensionality of the trained word vectors.
for X in (X_train_pad, X_test_pad):
    assert type(X) is np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size

# Padding must keep exactly one entry per original sentence.
assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

# NOTE(review): bare literal left over from a notebook cell; presumably the
# chance-level baseline accuracy for this binary task — confirm.
0.5

from tensorflow.keras import layers, Sequential

# Recurrent binary classifier over the padded word-vector sequences.
model = Sequential()
# The padded inputs in this file are built by pad_sequences without a `value`
# argument, i.e. padded with its default of 0. The mask value must therefore
# be 0: the previous mask_value of -1000 did not match the padding, so no
# timestep was ever masked and the LSTM consumed the padding as real input.
model.add(layers.Masking(mask_value=0.0))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="tanh"))
# Single sigmoid unit: the output is used with a binary cross-entropy loss.
model.add(layers.Dense(1, activation="sigmoid"))

# Binary-classification setup: binary cross-entropy loss, RMSprop optimizer,
# and accuracy reported as the tracked metric.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

from tensorflow.keras import callbacks

# Early-stopping callback with a patience of 5 epochs; every other setting is
# left at its Keras default.
es = callbacks.EarlyStopping(patience=5)

# Train for at most 20 epochs in batches of 64, holding out 30% of the
# training data for validation.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.3,
    callbacks=[es],
)

# Mean of the per-epoch training accuracy recorded during the fit.
np.mean(history.history['accuracy'])

# Evaluate the trained model on the padded test set (loss plus the accuracy
# metric configured at compile time).
model.evaluate(X_test_pad, y_test)

import gensim.downloader as api

# Show the names of the models listed under 'models' by gensim's downloader.
print(list(api.info()['models']))

# Load the 'glove-wiki-gigaword-50' embeddings through gensim's downloader;
# they are used below in place of the Word2Vec model trained on the reviews.
word2vec_transfer = api.load('glove-wiki-gigaword-50')

# Size of the loaded embeddings object.
# NOTE(review): presumably the number of words in its vocabulary — confirm.
len(word2vec_transfer)

def embed_sentence_with_TF(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of word vectors.

    Unlike ``embed_sentence``, the embeddings object is queried directly
    (``word in word2vec`` / ``word2vec[word]``) rather than through a ``.wv``
    attribute.

    Args:
        word2vec: Mapping-like embeddings supporting ``in`` and ``[]``.
        sentence: Iterable of words.

    Returns:
        ``np.ndarray`` with one row per word found in ``word2vec``, in
        sentence order. Unknown words are skipped; if no word is known the
        result is an empty 1-D array.
    """
    return np.array([word2vec[word] for word in sentence if word in word2vec])


def embedding(word2vec, sentences):
    """Convert a list of sentences into a list of word-vector matrices.

    This definition replaces the earlier ``embedding`` of the same name in
    this file and delegates to ``embed_sentence_with_TF``.

    Args:
        word2vec: Embeddings passed through to ``embed_sentence_with_TF``.
        sentences: Iterable of sentences, each an iterable of words.

    Returns:
        List with one ``embed_sentence_with_TF`` result per input sentence.
    """
    return [embed_sentence_with_TF(word2vec, sentence) for sentence in sentences]

# Embed the training and test sentences with the pretrained vectors.
# Training split:
X_train_embed_2 = embedding(word2vec_transfer, X_train)
# Test split:
X_test_embed_2 = embedding(word2vec_transfer, X_test)

# Pad the pretrained-embedding sequences to 500 timesteps. No `value` is
# passed, so pad_sequences pads with its default of 0, after the real vectors.
X_train_pad_2 = pad_sequences(
    X_train_embed_2,
    dtype='float32',
    padding='post',
    maxlen=500,
)
X_test_pad_2 = pad_sequences(
    X_test_embed_2,
    dtype='float32',
    padding='post',
    maxlen=500,
)

# Fresh classifier, same architecture as the first model, to be trained on the
# sequences built from the pretrained embeddings.
model = Sequential()
# The padded inputs in this file are built by pad_sequences without a `value`
# argument, i.e. padded with its default of 0. The mask value must therefore
# be 0: the previous mask_value of -1000 did not match the padding, so no
# timestep was ever masked and the LSTM consumed the padding as real input.
model.add(layers.Masking(mask_value=0.0))
model.add(layers.LSTM(20))
model.add(layers.Dense(10, activation="tanh"))
# Single sigmoid unit: the output is used with a binary cross-entropy loss.
model.add(layers.Dense(1, activation="sigmoid"))

# Binary-classification setup: binary cross-entropy loss, RMSprop optimizer,
# and accuracy reported as the tracked metric.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train on the sequences padded from the pretrained embeddings: at most 20
# epochs, batches of 64, 30% of the training data held out for validation.
# `es` is the EarlyStopping callback created earlier in this file.
history = model.fit(X_train_pad_2, y_train, batch_size=64, epochs=20, validation_split=0.3, callbacks=[es])

# Mean of the per-epoch training accuracy recorded during the latest fit.
np.mean(history.history['accuracy'])

# Evaluate silently on the test set built from the pretrained embeddings.
# The model was compiled with a single metric (accuracy), so res[1] is the
# accuracy and res[0] the loss.
res = model.evaluate(
    X_test_pad_2,
    y_test,
    verbose=0,
)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')