# Installing Dependencies
!pip install tensorflow transformers[torch] -q
!pip install datasets evaluate gensim -q
# Import libraries and modules
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from gensim.models import Word2Vec
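# Optional (not in the original notebook): fix random seeds so repeated runs of the
# cells below are roughly reproducible; exact reproducibility still depends on
# hardware and library versions.
import random
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)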
# Load dataset
dataset = load_dataset("climatebert/climate_sentiment")
train_data = dataset['train']
test_data = dataset['test']
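# Optional sanity check (not in the original notebook): inspect the split sizes,
# the column types (including the label encoding), and one raw example before preprocessing.
print(dataset)                      # split names and row counts
print(train_data.features)          # column types, including how labels are encoded
print(train_data[0]['text'][:200])  # first 200 characters of the first training example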
# Preprocess data by tokenizing the text
# Initialize a Tokenizer restricted to the 10,000 most frequent words
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
# Fit the tokenizer on the combined train and test texts
tokenizer.fit_on_texts(train_data['text'] + test_data['text'])
# Convert the text data into sequences
X_train = tokenizer.texts_to_sequences(train_data['text'])
X_test = tokenizer.texts_to_sequences(test_data['text'])
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])
# Pad (or truncate) every sequence to a fixed length of 100 tokens
seq_len = 100
X_train = pad_sequences(X_train, maxlen=seq_len)
X_test = pad_sequences(X_test, maxlen=seq_len)
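# Optional sanity check (not in the original notebook): confirm the padded shapes and map
# one sequence back to words using the tokenizer's word index.
print(X_train.shape, X_test.shape)  # expected: (n_train, 100) and (n_test, 100)
index_to_word = {i: w for w, i in tokenizer.word_index.items()}
print([index_to_word.get(i, '<pad>') for i in X_train[0][-20:]])  # last 20 tokens of the first example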
# Create RNN Models
# Model 1: LSTM model with a trainable embedding layer
model1 = Sequential([  # Stack layers sequentially
    # Embedding layer to convert integer tokens into dense 128-dimensional vectors
    Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len),
    # Two stacked bidirectional LSTM layers: 64 units (returning sequences), then 32 units
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.25),  # Dropout for regularization
    Dense(64),      # Fully connected layers (linear activation by default)
    Dropout(0.25),
    Dense(16),
    # Output layer: 3 units with softmax activation for the three sentiment classes
    Dense(3, activation='softmax')
])
# Compile model using sparse categorical crossentropy, Adam optimizer, and accuracy as metrics
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model, tracking performance on the held-out test set after each epoch
model1.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
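# Optional sketch (not in the original notebook): with the test set doubling as the validation
# set, an EarlyStopping callback is one common way to stop training once validation loss stops
# improving; the fit call is left commented out so the training run above is not repeated.
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# model1.fit(X_train, y_train, epochs=10, batch_size=32,
#            validation_data=(X_test, y_test), callbacks=[early_stop])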
# Model 2: LSTM with pretrained Word2Vec embeddings
# Word2Vec expects tokenized sentences (lists of words), so split each text into words first
from tensorflow.keras.preprocessing.text import text_to_word_sequence
sentences = [text_to_word_sequence(text) for text in train_data['text'] + test_data['text']]
word2vec_model = Word2Vec(sentences=sentences, vector_size=128, window=5, min_count=1, sg=0)
word2vec_model.save("word2vec.model")
# Build an embedding matrix that maps each tokenizer index to its Word2Vec vector
embedding_matrix = np.zeros((vocab_size, 128))
for word, i in tokenizer.word_index.items():
    if i < vocab_size and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
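# Optional sanity check (not in the original notebook): how many of the top-10,000 tokenizer
# entries received a pretrained vector, plus a quick nearest-neighbour probe on one word.
covered = int((np.abs(embedding_matrix).sum(axis=1) > 0).sum())
print(f"{covered}/{vocab_size} vocabulary entries have a pretrained vector")
if 'climate' in word2vec_model.wv:
    print(word2vec_model.wv.most_similar('climate', topn=5))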
model2 = Sequential([  # Stack layers sequentially
    # Embedding layer initialized with the Word2Vec matrix; kept trainable so it can be fine-tuned
    Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len,
              weights=[embedding_matrix], trainable=True),
    # Two stacked bidirectional LSTM layers: 64 units (returning sequences), then 32 units
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.25),  # Dropout for regularization
    Dense(64),      # Fully connected layers (linear activation by default)
    Dropout(0.25),
    Dense(16),
    # Output layer: 3 units with softmax activation for the three sentiment classes
    Dense(3, activation='softmax')
])
# Compile model using sparse categorical crossentropy, Adam optimizer, and accuracy as metrics
model2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model, tracking performance on the held-out test set after each epoch
model2.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
# Libraries and modules for text classification using BERT
import tensorflow as tf
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict
# Load dataset (the same dataset used for the previous RNN models)
dataset = load_dataset("climatebert/climate_sentiment")
dataset
# Tokenize the text data using the BERT tokenizer
# Initialize the tokenizer from the pre-trained "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Tokenization function applied to every example in the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",  # pad so all tokenized sequences have the same length
        truncation=True)       # truncate sequences that exceed the model's maximum length
# Apply the tokenization function to the dataset with map (batched for speed)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
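# Optional sanity check (not in the original notebook): the mapped dataset now carries
# 'input_ids', 'attention_mask' (and 'token_type_ids' for BERT) alongside the original columns.
print(tokenized_datasets['train'].column_names)
print(len(tokenized_datasets['train'][0]['input_ids']))  # should be 512, the tokenizer's padding length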
# Initialize the pre-trained BERT model with a 3-class classification head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
import numpy as np
import evaluate
# Load accuracy metric
metric = evaluate.load("accuracy")
# Custom function to compute evaluation metrics from model outputs
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
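# Optional illustration (not in the original notebook): compute_metrics expects a
# (logits, labels) pair; with these toy logits the first two predictions are correct.
dummy_logits = np.array([[0.1, 0.2, 0.9], [1.5, 0.3, 0.1], [0.2, 0.8, 0.4]])
dummy_labels = np.array([2, 0, 2])
print(compute_metrics((dummy_logits, dummy_labels)))  # {'accuracy': 0.666...}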
from transformers import TrainingArguments, Trainer
# Define training arguments for the trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
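# Optional sketch (not in the original notebook): the defaults above train for 3 epochs with a
# per-device batch size of 8 and a learning rate of 5e-5; these can be set explicitly if needed.
# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     evaluation_strategy="epoch",
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     learning_rate=5e-5,
# )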
# Create a Trainer object for model training and evaluation
trainer = Trainer(
    model=model,                                # model to be fine-tuned
    args=training_args,                         # training arguments defined above
    train_dataset=tokenized_datasets['train'],  # tokenized training dataset
    eval_dataset=tokenized_datasets['test'],    # tokenized test dataset
    compute_metrics=compute_metrics             # custom metric computation
)
# start training
trainer.train()
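# Optional (not in the original notebook): persist the fine-tuned model and tokenizer so they
# can be reloaded later without retraining; the directory name is arbitrary.
trainer.save_model("bert_climate_sentiment")
tokenizer.save_pretrained("bert_climate_sentiment")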
# Evaluate the two RNN models
loss1, acc1 = model1.evaluate(X_test, y_test)
loss2, acc2 = model2.evaluate(X_test, y_test)
# Evaluate the Transformer-based (BERT) model
result = trainer.evaluate()
print("Model 1: LSTM")
print(f"Accuracy of LSTM: {acc1}")
print(f"Loss of LSTM: {loss1}\n")
print("Model 2: LSTM with Word2Vec embeddings")
print(f"Accuracy of Word2Vec model: {acc2}")
print(f"Loss of Word2Vec model: {loss2}\n")
print("Model 3: BERT")
print(f"Accuracy of BERT: {result['eval_accuracy']}")
print(f"Loss of BERT: {result['eval_loss']}\n")
# Make predictions with the three models
# Predictions from the RNN models (class probabilities)
predictions1 = model1.predict(X_test)
predictions2 = model2.predict(X_test)
# Predictions from the Transformer-based (BERT) model; trainer.predict returns logits, labels, and metrics
result = trainer.predict(tokenized_datasets['test'])
# Loop through a subset of examples to inspect the inference results
labels = ['risk', 'neutral', 'opportunity']
for i in range(10):
    print(f"Text: {tokenized_datasets['test']['text'][i]}")
    print(f"LSTM Prediction: {labels[np.argmax(predictions1[i])]}")
    print(f"Word2Vec Prediction: {labels[np.argmax(predictions2[i])]}")
    print(f"BERT Prediction: {labels[np.argmax(result.predictions[i])]}")
    print(f"Ground truth: {labels[tokenized_datasets['test']['label'][i]]}\n")
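# Optional follow-up (not in the original notebook): a per-class breakdown of the BERT
# predictions with scikit-learn, assuming scikit-learn is available in the environment.
from sklearn.metrics import classification_report
bert_preds = np.argmax(result.predictions, axis=-1)
print(classification_report(tokenized_datasets['test']['label'], bert_preds, target_names=labels))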