# Installing Dependencies
!pip install tensorflow transformers[torch] -q
!pip install datasets evaluate gensim -q
# Import libraries and modules
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from gensim.models import Word2Vec
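# Optional (not in the original notebook): fix random seeds so repeated runs of the
# cells below are roughly reproducible; exact reproducibility still depends on
# hardware and library versions.
import random
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)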
# Load dataset
dataset = load_dataset("climatebert/climate_sentiment")
train_data = dataset['train']
test_data = dataset['test']
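# Optional sanity check (not in the original notebook): inspect the split sizes,
# the column types (including the label encoding), and one raw example before preprocessing.
print(dataset)                      # split names and row counts
print(train_data.features)          # column types, including how labels are encoded
print(train_data[0]['text'][:200])  # first 200 characters of the first training example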
# Preprocess data by tokenizing the text
# Initialize a Tokenizer restricted to the 10,000 most frequent words
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
# Fit the tokenizer on the combined train and test texts
tokenizer.fit_on_texts(train_data['text'] + test_data['text'])
# Convert the text data into sequences
X_train = tokenizer.texts_to_sequences(train_data['text'])
X_test = tokenizer.texts_to_sequences(test_data['text'])
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])
# Pad (or truncate) every sequence to a fixed length of 100 tokens
seq_len = 100
X_train = pad_sequences(X_train, maxlen=seq_len)
X_test = pad_sequences(X_test, maxlen=seq_len)
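# Optional sanity check (not in the original notebook): confirm the padded shapes and map
# one sequence back to words using the tokenizer's word index.
print(X_train.shape, X_test.shape)  # expected: (n_train, 100) and (n_test, 100)
index_to_word = {i: w for w, i in tokenizer.word_index.items()}
print([index_to_word.get(i, '<pad>') for i in X_train[0][-20:]])  # last 20 tokens of the first example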
# Create RNN Models
# Model 1: LSTM model with a trainable embedding layer
model1 = Sequential([  # Stack layers sequentially
    # Embedding layer to convert integer tokens into dense 128-dimensional vectors
    Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len),
    # Two stacked bidirectional LSTM layers: 64 units (returning sequences), then 32 units
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.25),  # Dropout for regularization
    Dense(64),      # Fully connected layers (linear activation by default)
    Dropout(0.25),
    Dense(16),
    # Output layer: 3 units with softmax activation for the three sentiment classes
    Dense(3, activation='softmax')
])
# Compile model using sparse categorical crossentropy, Adam optimizer, and accuracy as metrics
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model, tracking performance on the held-out test set after each epoch
model1.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
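# Optional sketch (not in the original notebook): with the test set doubling as the validation
# set, an EarlyStopping callback is one common way to stop training once validation loss stops
# improving; the fit call is left commented out so the training run above is not repeated.
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# model1.fit(X_train, y_train, epochs=10, batch_size=32,
#            validation_data=(X_test, y_test), callbacks=[early_stop])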
# Model 2: LSTM with pretrained Word2Vec embeddings
# Word2Vec expects tokenized sentences (lists of words), so split each text into words first
from tensorflow.keras.preprocessing.text import text_to_word_sequence
sentences = [text_to_word_sequence(text) for text in train_data['text'] + test_data['text']]
word2vec_model = Word2Vec(sentences=sentences, vector_size=128, window=5, min_count=1, sg=0)
word2vec_model.save("word2vec.model")
# Build an embedding matrix that maps each tokenizer index to its Word2Vec vector
embedding_matrix = np.zeros((vocab_size, 128))
for word, i in tokenizer.word_index.items():
    if i < vocab_size and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
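# Optional sanity check (not in the original notebook): how many of the top-10,000 tokenizer
# entries received a pretrained vector, plus a quick nearest-neighbour probe on one word.
covered = int((np.abs(embedding_matrix).sum(axis=1) > 0).sum())
print(f"{covered}/{vocab_size} vocabulary entries have a pretrained vector")
if 'climate' in word2vec_model.wv:
    print(word2vec_model.wv.most_similar('climate', topn=5))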
model2 = Sequential([  # Stack layers sequentially
    # Embedding layer initialized with the Word2Vec matrix; kept trainable so it can be fine-tuned
    Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len,
              weights=[embedding_matrix], trainable=True),
    # Two stacked bidirectional LSTM layers: 64 units (returning sequences), then 32 units
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.25),  # Dropout for regularization
    Dense(64),      # Fully connected layers (linear activation by default)
    Dropout(0.25),
    Dense(16),
    # Output layer: 3 units with softmax activation for the three sentiment classes
    Dense(3, activation='softmax')
])
# Compile model using sparse categorical crossentropy, Adam optimizer, and accuracy as metrics
model2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model, tracking performance on the held-out test set after each epoch
model2.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
# Libraries and modules for text classification using BERT
import tensorflow as tf
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict
# Load dataset (the same dataset used for the previous RNN models)
dataset = load_dataset("climatebert/climate_sentiment")
dataset
# Tokenize the text data using the BERT tokenizer
# Initialize the tokenizer from the pre-trained "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Tokenization function applied to every example in the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",  # pad so all tokenized sequences have the same length
        truncation=True)       # truncate sequences that exceed the model's maximum length
# Apply the tokenization function to the dataset with map (batched for speed)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
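# Optional sanity check (not in the original notebook): the mapped dataset now carries
# 'input_ids', 'attention_mask' (and 'token_type_ids' for BERT) alongside the original columns.
print(tokenized_datasets['train'].column_names)
print(len(tokenized_datasets['train'][0]['input_ids']))  # should be 512, the tokenizer's padding length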
# Initialize the pre-trained BERT model with a 3-class classification head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
import numpy as np
import evaluate
# Load accuracy metric
metric = evaluate.load("accuracy")
# Custom function to compute evaluation metrics from model outputs
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
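# Optional illustration (not in the original notebook): compute_metrics expects a
# (logits, labels) pair; with these toy logits the first two predictions are correct.
dummy_logits = np.array([[0.1, 0.2, 0.9], [1.5, 0.3, 0.1], [0.2, 0.8, 0.4]])
dummy_labels = np.array([2, 0, 2])
print(compute_metrics((dummy_logits, dummy_labels)))  # {'accuracy': 0.666...}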
from transformers import TrainingArguments, Trainer
# Define training arguments for the trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
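# Optional sketch (not in the original notebook): the defaults above train for 3 epochs with a
# per-device batch size of 8 and a learning rate of 5e-5; these can be set explicitly if needed.
# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     evaluation_strategy="epoch",
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     learning_rate=5e-5,
# )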
# Create a Trainer object for model training and evaluation
trainer = Trainer(
    model=model,                                # model to be fine-tuned
    args=training_args,                         # training arguments defined above
    train_dataset=tokenized_datasets['train'],  # tokenized training dataset
    eval_dataset=tokenized_datasets['test'],    # tokenized test dataset
    compute_metrics=compute_metrics             # custom metric computation
)
# start training
trainer.train()
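# Optional (not in the original notebook): persist the fine-tuned model and tokenizer so they
# can be reloaded later without retraining; the directory name is arbitrary.
trainer.save_model("bert_climate_sentiment")
tokenizer.save_pretrained("bert_climate_sentiment")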
# Evaluate the two RNN models
loss1, acc1 = model1.evaluate(X_test, y_test)
loss2, acc2 = model2.evaluate(X_test, y_test)
# Evaluate the Transformer-based (BERT) model
result = trainer.evaluate()
print("Model 1: LSTM")
print(f"Accuracy of LSTM: {acc1}")
print(f"Loss of LSTM: {loss1}\n")
print("Model 2: LSTM with Word2Vec embeddings")
print(f"Accuracy of Word2Vec model: {acc2}")
print(f"Loss of Word2Vec model: {loss2}\n")
print("Model 3: BERT")
print(f"Accuracy of BERT: {result['eval_accuracy']}")
print(f"Loss of BERT: {result['eval_loss']}\n")
# Make predictions with the three models
# Predictions from the RNN models (class probabilities)
predictions1 = model1.predict(X_test)
predictions2 = model2.predict(X_test)
# Predictions from the Transformer-based (BERT) model; trainer.predict returns logits, labels, and metrics
result = trainer.predict(tokenized_datasets['test'])
# Loop through a subset of examples to inspect the inference results
labels = ['risk', 'neutral', 'opportunity']
for i in range(10):
    print(f"Text: {tokenized_datasets['test']['text'][i]}")
    print(f"LSTM Prediction: {labels[np.argmax(predictions1[i])]}")
    print(f"Word2Vec Prediction: {labels[np.argmax(predictions2[i])]}")
    print(f"BERT Prediction: {labels[np.argmax(result.predictions[i])]}")
    print(f"Ground truth: {labels[tokenized_datasets['test']['label'][i]]}\n")
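# Optional follow-up (not in the original notebook): a per-class breakdown of the BERT
# predictions with scikit-learn, assuming scikit-learn is available in the environment.
from sklearn.metrics import classification_report
bert_preds = np.argmax(result.predictions, axis=-1)
print(classification_report(tokenized_datasets['test']['label'], bert_preds, target_names=labels))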