Implementing one-to-many LSTM/RNN in PyTorch

I have a matrix of size m x n and want to predict, from a 1 x n vector (x in the picture of the network structure), the whole next (m-1) x n matrix (the y^{i} in the picture), using an RNN or LSTM. I don't understand how to implement feeding each 1 x n vector into the next hidden state, how to get all the (m-1) x n vectors simultaneously, and how to compute the error over all y^{i}.
I have this vanilla RNN model and don't know how to modify it:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNNModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # (batch_dim, seq_dim, feature_dim)
        self.RNN = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='tanh')
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        out, h_t = self.RNN(x, h0)
        # out = self.fc(h_t[:, -1, :])
        out = self.fc(out[:, -1, :])
        return out

Stainley, try this: initialize the hidden state only if no other hidden state is passed; then return the hidden state and pass it back into forward() at the next iteration.
def forward(self, x, h=None):
    if h is None:  # if no hidden state is passed
        # initialize hidden state with zeros
        h = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
    out, h_t = self.RNN(x, h)
    out = self.fc(out[:, -1, :])
    return out, h_t
In the training code you run the loop like this:
x = seed
h = None
for i in range(...):
    optimizer.zero_grad()
    ...
    x, h = model(x, h)
    ...
    loss = ...
    loss.backward()
    optimizer.step()
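To compute the error over all y^{i}, one option is to accumulate the loss across all unrolled steps and call backward() once, which also avoids backpropagating twice through the same graph when h is carried across steps. A minimal sketch of this with the modified forward() above; seed (a 1 x n tensor), targets (an (m-1) x n tensor), criterion, and optimizer are placeholders, not objects from the original post, and it assumes output_dim == input_dim == n so predictions can be fed back in:
criterion = nn.MSELoss()

optimizer.zero_grad()
x = seed.view(1, 1, -1)              # (batch=1, seq_len=1, n)
h = None
loss = 0.0
for i in range(targets.size(0)):     # one step per target row y^{i}
    y_pred, h = model(x, h)          # y_pred: (1, n)
    loss = loss + criterion(y_pred, targets[i].view(1, -1))
    x = y_pred.view(1, 1, -1)        # feed the prediction in as the next input
loss.backward()                      # error accumulated over all y^{i}
optimizer.step()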

Related

Normalizing multivariate time-series data with different sequence length

I have a multivariate time-series dataset with different sequence lengths. I filled the missing values in the sequences with zeros. I am trying to use a recurrent neural network model for forecasting with time series. I noticed that the results of the model degrade when the range of the data is outside -1 and 1. I wrote the following normalization class using MinMaxScaler, but I don't know how to exclude the missing values in the sequences during the computation of MinMaxScaler. Here is my code:
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_mask_from_sequence_lengths(
        sequence_lengths: torch.Tensor, max_length: int
) -> torch.BoolTensor:
    # (batch_size, max_length)
    ones = sequence_lengths.new_ones(sequence_lengths.size(0), max_length)
    range_tensor = ones.cumsum(dim=1)
    return sequence_lengths.unsqueeze(1) >= range_tensor

class Normalizer1D(nn.Module):
    # Data size of (batch_size, seq_len, input_size)
    def __init__(self, input_dim, inputs):
        super(Normalizer1D, self).__init__()
        self.input_dim = input_dim
        self.to(device)
        self._norm = self.build_normalizers(inputs)
        max_len = inputs.shape[-1]
        data = torch.from_numpy(inputs)
        length = torch.LongTensor([torch.max((data[i, 0, :] != 0).nonzero()).item() + 1
                                   for i in range(data.shape[0])])
        # mask of valid (non-padded) steps; currently unused
        mask = get_mask_from_sequence_lengths(length, max_len)

    def build_normalizers(self, x):
        normalizers = OrderedDict()
        for i in range(self.input_dim):
            if np.min(x[:, i, :]) < 0:
                scaler = MinMaxScaler(feature_range=(-1, 1))
            else:
                scaler = MinMaxScaler(feature_range=(0, 1))
            scaler = scaler.fit(x[:, i, :])
            normalizers[str(i)] = scaler
        return normalizers

    def normalize(self, x):
        # (B, D, T)
        d = x.cpu().detach().numpy()
        n_x = []
        for i in range(x.shape[1]):
            # transform, not fit_transform: the scalers were already fitted in build_normalizers
            n_x.append(self._norm[str(i)].transform(d[:, i, :]))
        x = np.stack(n_x, axis=1)
        return torch.from_numpy(x).to(device)

    def unnormalize(self, x):
        # (T, B, D) ==> (B, T, D)
        d = x.cpu().detach().numpy()
        n_x = []
        for i in range(x.shape[1]):
            n_x.append(self._norm[str(i)].inverse_transform(d[:, i, :]))
        x = np.stack(n_x, axis=1)
        return torch.from_numpy(x).to(device)

    @property
    def min_(self):
        # (T, B, D)
        min_ = []
        for i in range(len(self._norm)):
            min_.append(self._norm[str(i)].min_)
        return torch.from_numpy(np.stack(min_, axis=1))

    @property
    def scale_(self):
        # (T, B, D)
        scale_ = []
        for i in range(len(self._norm)):
            scale_.append(self._norm[str(i)].scale_)
        return torch.from_numpy(np.stack(scale_, axis=1))

    def unnormalize_mean(self, x_mu):
        Xscale = self.scale_
        Xmin = self.min_
        normX = x_mu.mul_(Xscale)
        return normX.add_(Xmin)

    def unnormalize_sigma(self, x_sigma):
        Xscale = self.scale_
        return x_sigma.mul_(Xscale)

# compute the normalizers
def compute_normalizer(loader_train):
    # batch_size, input_dim, seq_len
    for i, (u, y) in enumerate(loader_train):
        if i == 0:
            # input u: torch.Size([B, D, T])
            inputs = u
            outputs = y
        else:
            inputs = torch.cat([inputs, u], dim=0)
            outputs = torch.cat([outputs, y], dim=0)
    inputs = inputs.cpu().detach().numpy()
    outputs = outputs.cpu().detach().numpy()
    # initialization
    u_normalizer = Normalizer1D(inputs.shape[1], inputs)
    y_normalizer = Normalizer1D(outputs.shape[1], outputs)
    return u_normalizer, y_normalizer
I would appreciate it if someone could suggest a way to exclude the missing values from the normalization process.
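One possible direction (a sketch of my own, not from the original post): compute the per-feature min and max over the valid time steps only, using the boolean mask already produced by get_mask_from_sequence_lengths (converted to NumPy with mask.numpy()), and apply the min-max affine map manually instead of fitting MinMaxScaler on the padded arrays. The helper names below are hypothetical:
import numpy as np

def masked_minmax_params(x, mask, feature_range=(0, 1)):
    # x: (B, D, T) padded data; mask: (B, T) boolean, True where the step is real.
    # Returns per-feature offsets and scales computed over valid entries only.
    lo, hi = feature_range
    D = x.shape[1]
    x_min = np.empty(D)
    scale = np.empty(D)
    for i in range(D):
        valid = x[:, i, :][mask]                   # 1-D array of the real values
        x_min[i] = valid.min()
        span = max(valid.max() - x_min[i], 1e-12)  # guard against constant features
        scale[i] = (hi - lo) / span
    return x_min, scale

def masked_normalize(x, mask, x_min, scale, lo=0.0):
    out = (x - x_min[None, :, None]) * scale[None, :, None] + lo
    return np.where(mask[:, None, :], out, 0.0)    # keep padded positions at zero
Because the padded positions are excluded from the statistics and zeroed again afterwards, the zero-filled gaps never influence the scaling.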

tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [100,200] vs. [100,10,200]

The tensor input to my model has shape (None, 10, 256). After processing by the attention layer, the shape becomes (None, 256). How should I modify the layer's compute_output_shape(self, input_shape) so that the shape of the model does not change?
The attention layer:
class Attention_layer(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        super(Attention_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint
                                 )
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        super(Attention_layer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = K.dot(x, self.W)
        if self.bias:
            uit += self.b
        uit = K.tanh(uit)
        a = K.exp(uit)
        # apply mask after the exp; will be re-normalized next
        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
        # in some cases, especially early in training, the sum may be almost zero
        # and this results in NaNs. A workaround is to add a very small positive number to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # a = K.expand_dims(a)
        weighted_input = x * a
        print(weighted_input)
        return K.sum(weighted_input, axis=1)  # output.shape = (batch_size, embedding_size)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
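If the goal is for the output shape to stay (None, 10, 256), one option (a sketch of my own reading, not confirmed by the asker) is to return the re-weighted sequence instead of summing over the time axis, and to report the unchanged shape:
def call(self, x, mask=None):
    uit = K.dot(x, self.W)
    if self.bias:
        uit += self.b
    uit = K.tanh(uit)
    a = K.exp(uit)
    if mask is not None:
        # expand the (batch, steps) mask so it broadcasts over the feature axis
        a *= K.cast(K.expand_dims(mask, axis=-1), K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    return x * a  # (batch_size, steps, features): the time axis is kept

def compute_output_shape(self, input_shape):
    return input_shape  # (None, 10, 256) in, (None, 10, 256) out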

PyTorch: GRU, one-to-many / many-to-one

I would like to implement a GRU that encodes a sequence of vectors into one vector (many-to-one), and then another GRU that decodes a vector into a sequence of vectors (one-to-many). The size of the vectors would not change. I would like an opinion on what I implemented.
Here is the code:
class AEGRU(nn.Module):
    def __init__(self, opt):
        super(AEGRU, self).__init__()
        self.length = 256
        self.latent_space = 256
        self.num_layers = 1
        self.GRU_enc = nn.GRU(input_size=3, hidden_size=self.latent_space, num_layers=self.num_layers, batch_first=True)
        self.fc_enc = nn.Linear(self.latent_space, self.latent_space)
        self.GRU_dec = nn.GRU(input_size=self.latent_space, hidden_size=3, num_layers=self.num_layers, batch_first=True)
        self.fc_dec = nn.Linear(3, 3)

    def enc(self, x):
        # x has shape: Batch_size x self.length x 3
        h0 = torch.zeros(self.num_layers, x.shape[0], self.latent_space).cuda()
        out, _ = self.GRU_enc(x, h0)
        out = out[:, -1, :]
        out = self.fc_enc(out)
        return out

    def dec(self, x):
        # x has shape: Batch_size x self.latent_space
        x = x[:, None, :]
        h = torch.zeros(self.num_layers, x.shape[0], 3).cuda()
        # method 1 ??
        '''outputs = torch.zeros(x.shape[0], self.length, 3).cuda()
        for i in range(self.length):
            out, h = self.GRU_dec(x, h)
            outputs[:, i, :] = out[:, 0, :]'''
        # method 2 ??
        x = x.repeat(1, self.length, 1)
        outputs, _ = self.GRU_dec(x, h)
        # linear layer
        outputs = self.fc_dec(outputs)
        return outputs

    def forward(self, x):
        self.indices = []
        latent = self.enc(x)
        output = self.dec(latent)
        return output
I am not sure whether this is a good way to do a one-to-many GRU. Could I have some opinions about it?
Thanks for reading!
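For comparison, here is a sketch of a third decoding variant (my addition, not from the original post): feed each step's output back in as the next input, so the decoder is autoregressive rather than seeing the same latent vector at every step. It assumes the decoder is reconfigured as nn.GRU(input_size=3, hidden_size=self.latent_space, batch_first=True) with self.fc_dec = nn.Linear(self.latent_space, 3), and uses the latent code as the initial hidden state:
def dec(self, z):
    # z: (batch, latent_space), used as the decoder's initial hidden state
    h = z.unsqueeze(0).contiguous()                         # (num_layers=1, batch, latent_space)
    step = torch.zeros(z.shape[0], 1, 3, device=z.device)   # start token of zeros
    outputs = []
    for _ in range(self.length):
        out, h = self.GRU_dec(step, h)                      # out: (batch, 1, latent_space)
        step = self.fc_dec(out)                             # (batch, 1, 3), fed back next step
        outputs.append(step)
    return torch.cat(outputs, dim=1)                        # (batch, length, 3)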

GPU memory increasing at each batch (PyTorch)

I am trying to build a convolutional network using a ConvLSTM layer (an LSTM cell, but with convolutions instead of matrix multiplications), but the problem is that my GPU memory increases at each batch, even though I delete variables and extract the scalar value of the loss (not the graph) at each iteration. I may be doing something wrong, but the exact same script ran without issues with another model (with more parameters, also using a ConvLSTM layer).
Each batch is composed of num_batch x 3 grayscale images, and I am trying to predict the difference |Im(t+1) - Im(t)| from the input Im(t).
def main():
    config = Config()
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size,
                                                   num_workers=0, shuffle=True, drop_last=True)
    nb_img = len(train_dataset)
    util.clear_progress_dir()
    step_tensorboard = 0

    ###################################
    #          Model Setup            #
    ###################################
    model = fully_convLSTM()
    if torch.cuda.is_available():
        model = model.float().cuda()
    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    util.enumerate_params([model])

    ###################################
    #          Training Loop          #
    ###################################
    model.train()  # put model in training mode
    train_loss_recon = []
    train_loss_recon2 = []

    for epoch in tqdm(range(config.num_epochs)):
        running_loss1 = 0.0
        running_loss2 = 0.0

        for i, (inputs, outputs) in enumerate(train_dataloader, 0):
            print(i)
            torch.cuda.empty_cache()
            gc.collect()

            # if torch.cuda.is_available():
            inputs = autograd.Variable(inputs.float()).cuda()
            outputs = autograd.Variable(outputs.float()).cuda()

            im1 = inputs[:, 0, :, :, :]
            im2 = inputs[:, 1, :, :, :]
            im3 = inputs[:, 2, :, :, :]

            diff1 = torch.abs(im2 - im1).cuda().float()
            diff2 = torch.abs(im3 - im2).cuda().float()

            model.initialize_hidden()

            optimizer.zero_grad()
            pred1 = model.forward(im1)
            loss = reconstruction_loss(diff1, pred1)
            loss.backward()
            # optimizer.step()
            model.update_hidden()

            optimizer.zero_grad()
            pred2 = model.forward(im2)
            loss2 = reconstruction_loss(diff2, pred2)
            loss2.backward()
            optimizer.step()
            model.update_hidden()

            ## print statistics
            running_loss1 += loss.detach().data
            running_loss2 += loss2.detach().data

            if i == 0:
                with torch.no_grad():
                    img_grid_diff_true = (diff2).cpu()
                    img_grid_diff_pred = (pred2).cpu()

                    f, axes = plt.subplots(2, 4, figsize=(48, 48))
                    for l in range(4):
                        axes[0, l].imshow(img_grid_diff_true[l].squeeze(0).squeeze(0), cmap='gray')
                        axes[1, l].imshow(img_grid_diff_pred[l].squeeze(0).squeeze(0), cmap='gray')
                    plt.show()
                    plt.close()

                    writer_recon_loss.add_scalar('Reconstruction loss', running_loss1, step_tensorboard)
                    writer_recon_loss2.add_scalar('Reconstruction loss2', running_loss2, step_tensorboard)
                    step_tensorboard += 1

            del pred1
            del pred2
            del im1
            del im2
            del im3
            del diff1
            del diff2  # , im1_noised, im2_noised
            del inputs
            del outputs
            del loss
            del loss2

            for obj in gc.get_objects():
                if torch.is_tensor(obj):
                    del obj

            torch.cuda.empty_cache()
            gc.collect()

        epoch_loss = running_loss1 / len(train_dataloader.dataset)
        epoch_loss2 = running_loss2 / len(train_dataloader.dataset)
        print(f"Epoch {epoch} loss reconstruction1: {epoch_loss:.6f}")
        print(f"Epoch {epoch} loss reconstruction2: {epoch_loss2:.6f}")
        train_loss_recon.append(epoch_loss)
        train_loss_recon2.append(epoch_loss2)
        del running_loss1, running_loss2, epoch_loss, epoch_loss2
Here is the model used:
class ConvLSTMCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size):
        super(ConvLSTMCell, self).__init__()
        # assert hidden_channels % 2 == 0
        self.input_channels = input_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        # self.num_features = 4
        self.padding = 1
        self.Wxi = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whi = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxf = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whf = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxc = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whc = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxo = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Who = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wci = None
        self.Wcf = None
        self.Wco = None

    def forward(self, x, h, c):  # Equation (3) in "Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting"
        ci = torch.sigmoid(self.Wxi(x) + self.Whi(h) + c * self.Wci)
        cf = torch.sigmoid(self.Wxf(x) + self.Whf(h) + c * self.Wcf)
        cc = cf * c + ci * torch.tanh(self.Wxc(x) + self.Whc(h))  # gt = tanh(cc)
        co = torch.sigmoid(self.Wxo(x) + self.Who(h) + cc * self.Wco)  # channel out = hidden channel
        ch = co * torch.tanh(cc)
        return ch, cc  # short memory, long memory

    def init_hidden(self, batch_size, hidden, shape):
        if self.Wci is None:
            self.Wci = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wcf = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wco = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
        else:
            assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
            assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
        return (autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
                autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())

class fully_convLSTM(nn.Module):
    def __init__(self):
        super(fully_convLSTM, self).__init__()
        layers = []
        self.hidden_list = [1, 32, 32, 1]  # ,32,64,32,
        for k in range(len(self.hidden_list) - 1):  # define blocks of [ConvLSTM, BatchNorm, ReLU]
            name_conv = "self.convLSTM" + str(k)
            cell_conv = ConvLSTMCell(self.hidden_list[k], self.hidden_list[k + 1], 3)
            setattr(self, name_conv, cell_conv)
            name_batchnorm = "self.batchnorm" + str(k)
            batchnorm = nn.BatchNorm2d(self.hidden_list[k + 1])
            setattr(self, name_batchnorm, batchnorm)
            name_relu = "self.relu" + str(k)
            relu = nn.ReLU()
            setattr(self, name_relu, relu)
        self.sigmoid = nn.Sigmoid()
        self.internal_state = []

    def initialize_hidden(self):
        for k in range(len(self.hidden_list) - 1):
            name_conv = "self.convLSTM" + str(k)
            (h, c) = getattr(self, name_conv).init_hidden(config.batch_size, self.hidden_list[k + 1], (256, 256))
            self.internal_state.append((h, c))
        self.internal_state_new = []

    def update_hidden(self):
        for i, hidden in enumerate(self.internal_state_new):
            self.internal_state[i] = (hidden[0].detach(), hidden[1].detach())
        self.internal_state_new = []

    def forward(self, input):
        x = input
        for k in range(len(self.hidden_list) - 1):
            name_conv = "self.convLSTM" + str(k)
            name_batchnorm = "self.batchnorm" + str(k)
            name_relu = "self.relu" + str(k)
            x, c = getattr(self, name_conv)(x, self.internal_state[k][1], self.internal_state[k][0])
            self.internal_state_new.append((x.detach(), c.detach()))
            x = getattr(self, name_batchnorm)(x)
            if k != len(self.hidden_list) - 2:
                x = getattr(self, name_relu)(x)
            else:
                x = self.sigmoid(x)
        return x
So my question is: what in my code is causing memory to accumulate during the training phase?
A few quick notes about the training code:
- torch.autograd.Variable has been deprecated for at least 8 minor versions (see here); don't use it (a replacement snippet follows this list).
- gc.collect() has no point; PyTorch handles garbage collection on its own.
- Don't call torch.cuda.empty_cache() for each batch. PyTorch reserves some GPU memory (it doesn't give it back to the OS) precisely so it doesn't have to allocate it again for every batch. This call only makes your code slow; honestly, don't use the function at all, PyTorch handles this.
- Don't scatter random memory cleaning around; that's most probably not where the error is.
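For instance, since plain tensors have tracked gradients themselves since PyTorch 0.4, the two Variable lines in the loop can simply become:
inputs = inputs.float().cuda()
outputs = outputs.float().cuda()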
Model
Yes, the model is probably the culprit (although its code is hard to read). Take notice of the self.internal_state and self.internal_state_new lists. Each time you call model.initialize_hidden(), a new set of tensors is appended to self.internal_state (and never cleared, as far as I can tell). self.internal_state_new is cleared in update_hidden(); maybe self.internal_state should be as well. In essence, check the self.internal_state attribute of your model: from what I can see, the list grows indefinitely. Also, initializing with zeros everywhere is quite strange; there is probably no need for it (PyTorch's RNN, for example, initializes its hidden state with zeros by default, and this is probably similar).
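A minimal sketch of the corresponding fix (my reading of the intent, not code verified by the answerer): rebuild self.internal_state from scratch on every call instead of appending to it:
def initialize_hidden(self):
    self.internal_state = []   # reset, so the list no longer grows across batches
    for k in range(len(self.hidden_list) - 1):
        name_conv = "self.convLSTM" + str(k)
        (h, c) = getattr(self, name_conv).init_hidden(
            config.batch_size, self.hidden_list[k + 1], (256, 256))
        self.internal_state.append((h, c))
    self.internal_state_new = []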

TensorFlow: losses after training the model differ from the losses printed during the last epoch of stochastic gradient descent

I'm trying to do binary classification on two spirals. For testing, I feed my neural network the exact spiral data with no noise, and the model seems to work, as the losses approach 0 during SGD. However, when I use my model to infer the exact same data points after SGD has completed, I get completely different losses from what was printed during the last epoch of SGD.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(threshold=np.nan)

# get the spiral points
t_p = np.linspace(0, 4, 1000)

x1_p = t_p * np.cos(t_p * 2 * np.pi)
y1_p = t_p * np.sin(t_p * 2 * np.pi)

x2_p = t_p * np.cos(t_p * 2 * np.pi + np.pi)
y2_p = t_p * np.sin(t_p * 2 * np.pi + np.pi)

plt.plot(x1_p, y1_p, x2_p, y2_p)

# generate data points
x1_dat = x1_p
y1_dat = y1_p
x2_dat = x2_p
y2_dat = y2_p

def model_variable(shape, name, initializer):
    variable = tf.get_variable(name=name,
                               dtype=tf.float32,
                               shape=shape,
                               initializer=initializer
                               )
    tf.add_to_collection('model_variables', variable)
    return variable

class Model():
    # layer specifications include bias nodes
    def __init__(self, sess, data, nEpochs, learning_rate, layer_specifications):
        self.sess = sess
        self.data = data
        self.nEpochs = nEpochs
        self.learning_rate = learning_rate
        if layer_specifications[0] != 2 or layer_specifications[-1] != 1:
            raise ValueError('First layer only two nodes, last layer only 1 node')
        else:
            self.layer_specifications = layer_specifications
        self.build_model()

    def build_model(self):
        # x holds the two nodes of layer one: an (x, y) coordinate to classify
        # as lying on either the non-phase-shifted or the phase-shifted spiral.
        # y is the output of the model.
        self.x = tf.placeholder(tf.float32, shape=[2, 1])
        self.y = tf.placeholder(tf.float32, shape=[])
        self.thetas = []
        self.biases = []
        for i in range(1, len(self.layer_specifications)):
            self.thetas.append(model_variable([self.layer_specifications[i], self.layer_specifications[i - 1]], 'theta' + str(i), tf.random_normal_initializer(stddev=0.1)))
            self.biases.append(model_variable([self.layer_specifications[i], 1], 'bias' + str(i), tf.constant_initializer()))

        # forward propagation
        intermediate = self.x
        for i in range(0, len(self.layer_specifications) - 1):
            if i != (len(self.layer_specifications) - 2):
                intermediate = tf.nn.elu(tf.add(tf.matmul(self.thetas[i], intermediate), self.biases[i]))
            else:
                intermediate = tf.add(tf.matmul(self.thetas[i], intermediate), self.biases[i])
        self.yhat = tf.squeeze(intermediate)
        self.loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.yhat, labels=self.y)

    def train_init(self):
        model_variables = tf.get_collection('model_variables')
        self.optim = (
            tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
            .minimize(self.loss, var_list=model_variables)
        )
        self.check = tf.add_check_numerics_ops()
        self.sess.run(tf.initialize_all_variables())

    # here is where x and y combine into just x in tf with shape [2, 1], and label becomes y in tf
    def train_iter(self, x, y):
        loss, _, _ = sess.run([self.loss, self.optim, self.check],
                              feed_dict={self.x: x, self.y: y})
        print('loss: {0} on: {1}'.format(loss, x))

    # here x and y are still x and y coordinates; label is separate
    def train(self):
        for _ in range(self.nEpochs):
            for x, y, label in self.data():
                print(label)
                self.train_iter([[x], [y]], label)
            print("NEW ONE:\n")

    # here x and y are still x and y coordinates; label is separate
    def infer(self, x, y, label):
        return self.sess.run((tf.sigmoid(self.yhat), self.loss), feed_dict={self.x: [[x], [y]], self.y: label})

def data():
    # the first spiral is label 0, the second is label 1
    for _ in range(len(x1_dat) - 1, -1, -1):
        for dat in range(2):
            if dat == 0:
                yield x1_dat[_], y1_dat[_], 0
            else:
                yield x2_dat[_], y2_dat[_], 1

layer_specifications = [2, 100, 100, 100, 1]
sess = tf.Session()
model = Model(sess, data, nEpochs=10, learning_rate=1.1e-2, layer_specifications=layer_specifications)
model.train_init()
model.train()

inferences_1 = []
inferences_2 = []
losses = 0
for i in range(len(t_p) - 1, -1, -1):
    infer, loss = model.infer(x1_p[i], y1_p[i], 0)
    if infer >= 0.5:
        print('loss: {0} on point {1}, {2}'.format(loss, x1_p[i], y1_p[i]))
        losses = losses + 1
        inferences_1.append('r')
    else:
        inferences_1.append('g')

for i in range(len(t_p) - 1, -1, -1):
    infer, loss = model.infer(x2_p[i], y2_p[i], 1)
    if infer >= 0.5:
        inferences_2.append('r')
    else:
        print('loss: {0} on point {1}, {2}'.format(loss, x2_p[i], y2_p[i]))
        losses = losses + 1
        inferences_2.append('g')

print('total losses: {}'.format(losses))

plt.scatter(x1_p, y1_p, c=inferences_1)
plt.scatter(x2_p, y2_p, c=inferences_2)
plt.show()
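One detail worth checking (my own observation, not from the original post): infer() calls tf.sigmoid(self.yhat) on every invocation, which adds a new node to the graph each time. Building the op once keeps the graph fixed and guarantees the exact same tensor is evaluated on every call:
# in build_model, right after self.yhat is defined:
self.prob = tf.sigmoid(self.yhat)

# infer then reuses the same graph nodes on every call:
def infer(self, x, y, label):
    return self.sess.run((self.prob, self.loss),
                         feed_dict={self.x: [[x], [y]], self.y: label})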
