import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

def init_parameters(input_size, hidden_size, output_size):
    # For a character-level model the input and output vocabularies coincide,
    # so the one-hot input width must match the output width.
    assert input_size == output_size
    # Small random weights break symmetry; biases start at zero.
    weights = {
        "input_to_hidden": torch.randn(size=(input_size, hidden_size)) * 0.01,
        "hidden_to_hidden": torch.randn(size=(hidden_size, hidden_size)) * 0.01,
        "hidden_to_output": torch.randn(size=(hidden_size, output_size)) * 0.01,
    }
    biases = {
        "hidden": torch.zeros(hidden_size),
        "output": torch.zeros(output_size),
    }
    # Track gradients on every parameter tensor.
    for group in (weights, biases):
        for param in group.values():
            param.requires_grad_(True)
    return weights, biases

def init_state(batch_size, hidden_size):
    # The hidden state starts as zeros, one row per sequence in the batch.
    hidden_state = torch.zeros((batch_size, hidden_size))
    return hidden_state

def recurrent_layer(inputs, state, parameters):
    # inputs: (num_steps, batch_size, input_size); state: (batch_size, hidden_size).
    weights, biases = parameters
    hidden_state = state
    outputs = []
    for x in inputs:
        # h_t = tanh(x_t W_xh + h_{t-1} W_hh + b_h)
        hidden_state = torch.tanh(torch.mm(x, weights["input_to_hidden"])
                                  + torch.mm(hidden_state, weights["hidden_to_hidden"])
                                  + biases["hidden"])
        # o_t = h_t W_ho + b_o  (the bias is added after the matrix product)
        outputs.append(torch.mm(hidden_state, weights["hidden_to_output"]) + biases["output"])
    # Stack the per-step outputs into shape (num_steps * batch_size, output_size).
    return torch.cat(outputs, dim=0), hidden_state

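# Illustrative sketch (the sizes below are assumptions, not values used elsewhere
# in this code): the layer consumes inputs of shape (num_steps, batch_size,
# input_size) and returns the stacked per-step outputs together with the final
# hidden state. The helper is never called; it only documents the shape contract.
def _example_recurrent_layer():
    weights, biases = init_parameters(input_size=28, hidden_size=32, output_size=28)
    state = init_state(batch_size=2, hidden_size=32)
    inputs = torch.zeros((5, 2, 28))  # 5 time steps, batch of 2, one-hot width 28
    outputs, state = recurrent_layer(inputs, state, (weights, biases))
    assert outputs.shape == (5 * 2, 28) and state.shape == (2, 32)
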
class RecurrentNetwork:
    """A minimal RNN wrapper around explicit parameters and a step function."""

    def __init__(self, input_size, hidden_size, parameters, init_state, step_function):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.parameters = parameters
        self.init_state = init_state
        self.step_function = step_function

    def __call__(self, input_sequence, hidden_state):
        # input_sequence holds token indices of shape (batch_size, num_steps);
        # transpose so that time is the leading axis, then one-hot encode.
        input_sequence = F.one_hot(input_sequence.T, self.input_size).type(torch.float32)
        output_sequence = self.step_function(input_sequence, hidden_state, self.parameters)
        return output_sequence

    def init_random(self, batch_size):
        # Return a fresh (zero) initial hidden state for a batch of the given size.
        return self.init_state(batch_size, self.hidden_size)

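# Illustrative sketch (assumed vocabulary and batch sizes): wiring the pieces
# above into a model and running one forward pass on a batch of token indices.
# The helper is not called anywhere; it only shows the intended usage.
def _example_forward_pass():
    vocab_size, hidden_size, batch_size, num_steps = 28, 32, 2, 5
    parameters = init_parameters(vocab_size, hidden_size, vocab_size)
    model = RecurrentNetwork(vocab_size, hidden_size, parameters, init_state, recurrent_layer)
    state = model.init_random(batch_size)
    tokens = torch.randint(0, vocab_size, (batch_size, num_steps))
    # Logits have shape (num_steps * batch_size, vocab_size).
    logits, state = model(tokens, state)
    return logits, state
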
def clip(x, a, b):
    # Element-wise clip of a NumPy array to the interval [a, b].
    return np.clip(x, a, b)

def clip_gradients(parameters, c):
    # Clamp every gradient entry to [-c, c] to limit exploding gradients.
    # Expects a flat iterable of parameter tensors (CPU).
    for p in parameters:
        if p.grad is not None:
            p.grad.data = torch.from_numpy(clip(p.grad.data.numpy(), -c, c))
    return parameters

def normalize(x):
    # Rescale a NumPy array to unit L2 norm.
    return x / np.linalg.norm(x)

def normalize_gradient(parameters):
    # Replace each gradient by its unit-norm version.
    for p in parameters:
        if p.grad is not None:
            p.grad = torch.from_numpy(normalize(p.grad.numpy()))
    return parameters

def scale_gradient(parameters, k):
    # Rescale each gradient to have L2 norm k.
    for p in parameters:
        if p.grad is not None:
            p.grad = torch.from_numpy(k * normalize(p.grad.numpy()))
    return parameters

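# Illustrative end-to-end sketch (the training loop itself is not part of this
# code, and the sizes and random data are assumptions): one forward pass, a
# backward pass, then gradient clipping. The utilities above expect a flat
# iterable of tensors, so the (weights, biases) dictionaries are flattened first.
if __name__ == "__main__":
    vocab_size, hidden_size, batch_size, num_steps = 28, 32, 2, 5
    weights, biases = init_parameters(vocab_size, hidden_size, vocab_size)
    model = RecurrentNetwork(vocab_size, hidden_size, (weights, biases), init_state, recurrent_layer)
    state = model.init_random(batch_size)
    tokens = torch.randint(0, vocab_size, (batch_size, num_steps))
    targets = torch.randint(0, vocab_size, (batch_size * num_steps,))  # dummy labels
    logits, state = model(tokens, state)
    loss = F.cross_entropy(logits, targets)
    loss.backward()
    flat_params = list(weights.values()) + list(biases.values())
    clip_gradients(flat_params, 1.0)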