import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class SegmentRecurrentAttentionLayer(nn.Module):
    """Pre-norm multi-head attention with a segment-level memory
    (Transformer-XL style recurrence).

    Each forward pass attends from the current segment ``x`` to a cached,
    detached copy of the previous segment's output, so gradients never
    flow across segment boundaries.

    The original version of this layer could not execute: it referenced
    the bare name ``embed_dim`` inside ``forward`` (NameError), multiplied
    attention scores by sqrt(d) instead of dividing, and produced matmul /
    concat shape mismatches in the output projection and memory update.
    This rewrite keeps the constructor signature, parameter names and
    parameter shapes, and the ``forward(x, padding_mask)`` ->
    ``(output, norm_z, attn_weights)`` interface.

    Args:
        embed_dim: dimensionality of the token representations.
        num_heads: number of attention heads.
        dropout: dropout probability for attention weights and outputs.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Cached hidden states of the previous segment, shape
        # [seq_len, batch_size, embed_dim]. Created lazily on the first
        # forward pass and always detached from the autograd graph.
        self.memory = None
        # Kept for state-dict compatibility with the original layer; it was
        # never used in the original forward either. NOTE(review): consider
        # removing once checkpoints no longer depend on it.
        self.w = nn.Parameter(torch.randn(num_heads, embed_dim, embed_dim))
        # Per-head query/key projections packed into one tensor:
        # v[:, :embed_dim] projects the current segment (queries),
        # v[:, embed_dim:] projects the memory (keys).
        self.v = nn.Parameter(torch.randn(num_heads, 2 * embed_dim, embed_dim))
        # Per-head output gate used to merge the heads back to embed_dim.
        self.g = nn.Parameter(torch.randn(num_heads, embed_dim))
        self.dropout = nn.Dropout(dropout)
        # Layer norms for the input, the attention output, the residual
        # output and the memory read path (eps kept at the original 1e-6).
        self.layer_norm_x = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layer_norm_y = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layer_norm_z = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layer_norm_m = nn.LayerNorm(embed_dim, eps=1e-6)

    def forward(self, x, padding_mask):
        """Attend from segment ``x`` to the cached previous segment.

        Args:
            x: float tensor of shape [seq_len, batch_size, embed_dim].
            padding_mask: accepted for interface compatibility; unused,
                exactly as in the original implementation.

        Returns:
            (output, norm_z, attn_weights) where output and norm_z are the
            [seq_len, batch_size, embed_dim] layer output and attn_weights
            is [num_heads, batch_size, seq_len, seq_len].
        """
        seq_len = x.size(0)
        # (Re)create the memory when absent or when the segment/batch shape
        # or device changed; zeros mean "no history" on the first segment.
        if (self.memory is None or self.memory.shape != x.shape
                or self.memory.device != x.device):
            self.memory = torch.zeros_like(x).detach()
        norm_x = self.layer_norm_x(x)
        norm_m = self.layer_norm_m(self.memory)
        # Per-head projections: [num_heads, seq_len, batch, embed_dim].
        q = torch.einsum('sbe,hef->hsbf', norm_x, self.v[:, :self.embed_dim, :])
        k = torch.einsum('sbe,hef->hsbf', norm_m, self.v[:, self.embed_dim:, :])
        # Scaled dot-product scores [num_heads, batch, seq_q, seq_k];
        # divide (not multiply) by sqrt(d), per Vaswani et al.
        scores = torch.einsum('hsbe,htbe->hbst', q, k) / math.sqrt(self.embed_dim)
        # Causal mask built on x's device; -1e4 (as in the original) keeps
        # fp16 safety rather than -inf.
        causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device))
        scores = scores.masked_fill(~causal, -1e4)
        attn_weights = self.dropout(torch.softmax(scores, dim=-1))
        # Context vectors over the memory values, then merge heads with the
        # learned per-head gate g: [seq_len, batch, embed_dim].
        context = torch.einsum('hbst,tbe->hsbe', attn_weights, norm_m)
        merged = torch.einsum('hsbe,he->sbe', context, self.g)
        norm_y = self.layer_norm_y(self.dropout(merged))
        # Residual connection followed by the final layer norm.
        norm_z = self.layer_norm_z(x + norm_y)
        # The next segment attends to a detached copy of this output.
        self.memory = norm_z.detach()
        output = norm_z
        return output, norm_z, attn_weights

```
/shared-libs/python3.7/py/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
```

import torch
import torch.nn as nn
import torch.nn.functional as F
class RelativePositionalEncoding(nn.Module):
    """Sinusoidal positional encoding added to the input.

    Despite the class name, the table built here is the standard *absolute*
    sinusoidal encoding of Vaswani et al.; it is stored as a frozen
    parameter so it is never trained but still moves with the module
    across devices and appears in the state dict under the original key.

    Args:
        embed_dim: dimensionality of the encoding vectors.
        max_seq_len: number of positions precomputed in the table.
        dropout: dropout probability applied to the encoding at use time.
    """

    def __init__(self, embed_dim, max_seq_len=1024, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        # pe[pos, 2i]   = sin(pos / 10000^(2i/d))
        # pe[pos, 2i+1] = cos(pos / 10000^(2i/d))
        pe = torch.zeros(max_seq_len, embed_dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.dropout = nn.Dropout(dropout)
        # Leading broadcast dim: final shape [1, max_seq_len, embed_dim].
        # Kept as a frozen Parameter (not register_buffer) to preserve the
        # original state-dict layout.
        self.pe = nn.Parameter(pe[None, :, :], requires_grad=False)

    def forward(self, x, seq_len):
        """Add the sqrt(d)-scaled, dropout-regularized encoding for the
        first ``seq_len`` positions to ``x``.

        Assumes ``x`` is [batch, seq_len, embed_dim] so the [1, seq_len,
        embed_dim] encoding broadcasts over dim 0 — TODO confirm against
        callers; this class is not invoked anywhere in the visible file.

        The original built a position mask here, but every element of
        ``range(seq_len)`` is < seq_len, so the mask was all-ones — and
        ``seq_len.unsqueeze(0)`` raised AttributeError for a plain int
        ``seq_len``. The dead mask has been removed.
        """
        pe = self.pe[:, :seq_len]
        pe = self.dropout(pe) * math.sqrt(self.embed_dim)
        return x + pe

class TransformerXL(nn.Module):
    """Stack of segment-recurrent attention and feed-forward layers.

    ``forward`` takes a LongTensor of token ids shaped
    [seq_len, batch_size] and returns [seq_len, batch_size, embed_dim].

    Args:
        vocab_size: size of the token-embedding table.
        embed_dim: model width.
        num_heads: attention heads per layer.
        num_layers: number of attention + feed-forward layer pairs.
        max_seq_len: longest sequence the positional tables must cover.
        dropout: dropout probability passed to the attention layers.
        segment_size: segment width used to size the segment-id embedding
            table. The original read an undefined global here, which raised
            NameError at construction; it is now a parameter with a default
            of 1 (the safe maximum table size) so existing call sites that
            did not pass it keep working.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, max_seq_len, dropout=0.1, segment_size=1):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # forward() uses 1-based position ids (1..seq_len), so the table
        # needs max_seq_len + 1 rows; the original max_seq_len - 1 raised
        # IndexError for sequences near the maximum length.
        self.position = nn.Embedding(max_seq_len + 1, embed_dim)
        # Segment ids are arange(seq_len) // s + 1 with s >= segment_size,
        # so the largest possible id is (max_seq_len - 1) // segment_size + 1
        # and the table needs one more row than that.
        self.segment = nn.Embedding((max_seq_len - 1) // segment_size + 2, embed_dim)
        self.attention = nn.ModuleList([SegmentRecurrentAttentionLayer(embed_dim, num_heads, dropout) for _ in range(num_layers)])
        self.feed_forward = nn.ModuleList([nn.Sequential(nn.Linear(embed_dim, 4 * embed_dim), nn.ReLU(), nn.Linear(4 * embed_dim, embed_dim)) for _ in range(num_layers)])
        self.layer_norm = nn.LayerNorm(embed_dim, eps=1e-6)

    def forward(self, x, segment_size):
        """Embed token ids, add positional/segment embeddings, and run the
        attention / feed-forward stack.

        Args:
            x: LongTensor of token ids, shape [seq_len, batch_size].
            segment_size: width of a recurrence segment; must be >= the
                ``segment_size`` the module was constructed with so the
                segment-id table is large enough.
        """
        seq_len = x.size(0)
        # Ids shaped [seq_len, 1] so their embeddings ([seq_len, 1, E])
        # broadcast across the batch; the original shaped them [1, seq_len],
        # which only broadcast against [seq_len, batch, E] when
        # batch_size == seq_len.
        idx = torch.arange(seq_len, dtype=torch.long, device=x.device)[:, None]
        pos = self.position(idx + 1)
        seg = self.segment(idx // segment_size + 1)
        h = self.embedding(x) + pos + seg
        for attn, ff in zip(self.attention, self.feed_forward):
            # Attention layers return (output, norm_z, attn_weights); only
            # the output propagates through the stack.
            h, _, _ = attn(h, None)
            h = ff(h)
        return self.layer_norm(h)

from datasets import list_datasets, load_dataset, list_metrics, load_metric
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import random
import sys

# Download/load the SQuAD v1.1 question-answering dataset from the Hugging
# Face hub (cached locally after the first call). Returns a DatasetDict
# with 'train' and 'validation' splits.
squad_dataset = load_dataset('squad')

```
Downloading builder script: 5.27kB [00:00, 1.64MB/s]
Downloading metadata: 2.36kB [00:00, 975kB/s]
Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...
Downloading data files: 0%| | 0/2 [00:00<?, ?it/s]
Downloading data: 0%| | 0.00/8.12M [00:00<?, ?B/s]
Downloading data: 99%|█████████▊| 8.00M/8.12M [00:00<00:00, 80.0MB/s]
Downloading data: 16.0MB [00:00, 77.9MB/s]
Downloading data: 30.3MB [00:00, 75.8MB/s]
Downloading data files: 50%|█████ | 1/2 [00:00<00:00, 2.06it/s]
Downloading data: 4.85MB [00:00, 87.5MB/s]
Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 3.25it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 975.42it/s]
Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.
100%|██████████| 2/2 [00:00<00:00, 399.42it/s]
```

# BERT's WordPiece tokenizer; its vocabulary size defines the model's
# token-embedding table below.
squad_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): bert-base-uncased already defines [CLS] and [SEP], so this
# add_special_tokens call is a no-op — confirm it was not meant to register
# different markers.
squad_tokenizer.add_special_tokens({'cls_token': '[CLS]', 'sep_token': '[SEP]'})
squad_vocab_size = len(squad_tokenizer)

```
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 14.5kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 748kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 36.7MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 44.2MB/s]
```

def train(model, optimizer, criterion, epochs, batch_size, segment_size, clip):
    """Training loop over SQuAD examples, one example per step.

    NOTE(review): this function has several defects, flagged inline; the
    code itself is left unchanged here. The ``batch_size`` parameter is
    never used.
    """
    for epoch in range(epochs):
        # NOTE(review): iterating a DatasetDict yields split-name strings
        # ('train', 'validation'), so this 3-way unpack raises ValueError.
        # It presumably should iterate squad_dataset['train'] and read the
        # 'context' / 'question' / 'answers' fields — confirm intent.
        for context, question, answer in squad_dataset:
            context = squad_tokenizer.encode(context)
            question = squad_tokenizer.encode(question)
            answer = squad_tokenizer.encode(answer)
            # A leading batch dim of 1 is added, giving [1, seq_len];
            # NOTE(review): the model reads seq_len from dim 0, so these
            # look transposed — confirm whether [seq_len, 1] was intended.
            context = torch.tensor(context)[None, :].to('cuda')
            question = torch.tensor(question)[None, :].to('cuda')
            answer = torch.tensor(answer)[None, :].to('cuda')
            optimizer.zero_grad()
            context_embedding = model(context, segment_size)
            question_embedding = model(question, 1)
            # Dot-product similarity of each context token against the
            # first question token, used as logits over context positions.
            logits = torch.matmul(context_embedding[0], question_embedding[0].transpose(-1, -2))[:, 0]
            # NOTE(review): answer[0] holds token ids, not a class index
            # over context positions — this CrossEntropyLoss target looks
            # wrong; verify what the loss is meant to supervise.
            loss = criterion(logits, answer[0])
            loss.backward()
            # Clip gradient norm to stabilize training.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss))

def evaluate(model, context, question, answer):
    """Compute the loss for one (context, question, answer) triple on CPU.

    NOTE(review): this reads the globals ``segment_size`` and
    ``criterion``. ``segment_size`` is never defined at module scope, so
    calling this raises NameError — both should probably be parameters.
    """
    context = squad_tokenizer.encode(context)
    question = squad_tokenizer.encode(question)
    answer = squad_tokenizer.encode(answer)
    # Shape [1, seq_len] on CPU; same transposition concern as in train().
    context = torch.tensor(context)[None, :].to('cpu')
    question = torch.tensor(question)[None, :].to('cpu')
    answer = torch.tensor(answer)[None, :].to('cpu')
    context_embedding = model(context, segment_size)
    question_embedding = model(question, 1)
    # Similarity of each context token to the first question token.
    logits = torch.matmul(context_embedding[0], question_embedding[0].transpose(-1, -2))[:, 0]
    loss = criterion(logits, answer[0])
    return loss.item()

# Hyperparameters. The original referenced all of these names without ever
# defining them (the NameError traceback below is the result), so concrete
# values are supplied here.
embedding_dim = 128
heads = 4
layers = 2
segment_size = 64
dropout = 0.1
lr = 1e-4
# Fall back to CPU when CUDA is unavailable instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = TransformerXL(squad_vocab_size, embedding_dim, heads, layers, segment_size + 1, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

NameError: name 'embedding_dim' is not defined