Try to concatenate tensors of inconsistent batch size in tensorflow2 - tensorflow2.x

I tried to concatenate 2 tensors of different batch sizes within a function decorated with @tf.function. I tried 2 methods; the first one is listed below:
import tensorflow as tf


@tf.function  # indispensable
def fun1(tensors, indices):
    """Gather the rows of `tensors` for each batch id and concatenate them.

    First attempt: results are accumulated in a Python list inside a
    `tf.range` loop. Under `@tf.function` this fails with the TypeError
    quoted below, because the list `results` changes structure between
    loop iterations.
    """
    results = []
    for i in tf.range(2):  # batch size = 2
        pos = tf.where(indices == i)
        emb = tf.gather_nd(tensors, pos)
        # do something to emb, but do nothing here for simplicity.
        results += [emb]
    results = tf.concat(results, axis=0)
    return results


tensors = tf.random.uniform((5, 2))
fun1(tensors, indices=[0, 0, 1, 1, 1])
But it raises errors as following:
TypeError: 'results' does not have the same nested structure after one iteration.
The two structures don't have the same nested structure.
First structure: type=list str=[]
Second structure: type=list str=[<tf.Tensor 'while/GatherNd:0' shape=(None, 2) dtype=float32>]
More specifically: The two structures don't have the same number of elements. First structure: type=list str=[]. Second structure: type=list str=[<tf.Tensor 'while/GatherNd:0' shape=(None, 2) dtype=float32>]
Entire first structure:
[]
Entire second structure:
[.]
So I tried the second method:
import tensorflow as tf


@tf.function  # indispensable
def fun2(tensors, indices):
    """Gather the rows of `tensors` for each batch id and concatenate them.

    Second attempt: results are accumulated in a tensor that starts with
    shape (0, 2). Under `@tf.function` this fails with the ValueError
    quoted below, because the shape of `results` changes to (None, 2)
    after one iteration of the loop.
    """
    results = tf.reshape(tf.constant([], dtype=tf.float32), (0, 2))  # make empty tensors
    for i in tf.range(2):  # batch size = 2
        pos = tf.where(indices == i)
        emb = tf.gather_nd(tensors, pos)
        # do something to emb, but do nothing here for simplicity
        results = tf.concat([results, emb], axis=0)
    return results


tensors = tf.random.uniform((5, 2))
fun2(tensors, indices=[0, 0, 1, 1, 1])
But it raises errors:
ValueError: 'results' has shape (0, 2) before the loop, but shape (None, 2) after one iteration. Use tf.autograph.experimental.set_loop_options to set shape invariants.
How should I resolve the problems? Thanks

I found I could achieve it by adding one line of code to the second method, as follows:
@tf.function
def fun2(tensors, indices):
    """Gather the rows of `tensors` for each batch id and concatenate them.

    Same as the second attempt, but the loop declares a shape invariant
    of (None, 2) for `results`, so its leading dimension is allowed to
    change from one iteration to the next.
    """
    results = tf.reshape(tf.constant([], dtype=tf.float32), (0, 2))  # make empty tensors
    for i in tf.range(2):  # batch size = 2
        # Relax the loop-carried shape of `results` to (None, 2).
        tf.autograph.experimental.set_loop_options(
            shape_invariants=[(results, tf.TensorShape([None, 2]))]
        )
        pos = tf.where(indices == i)
        emb = tf.gather_nd(tensors, pos)
        # do something to emb, but do nothing here for simplicity
        results = tf.concat([results, emb], axis=0)
    return results

Related

Pytorch datatype/dimension confusion TypeError: 'Tensor' object is not callable

This piece of code was originally written in numpy and I'm trying to utilise GPU computation by rewriting it in PyTorch, but as I'm new to PyTorch I have run into a lot of problems. Firstly, I'm confused by the dimensions of the tensors. Sometimes, after operating on the tensors, only transposing the tensor fixes the problem; is there any way I can avoid calling .t()? The major problem here is that in the line ar = torch.stack ... the error "TypeError: 'Tensor' object is not callable" occurs. Any suggestion/correction would be appreciated. Thanks.
def vec_datastr(vector):
    """Build a dict of index / value / prob / CumProb columns from `vector`.

    This is the code as posted in the question: the `torch.stack` line
    still calls the tensors with `(...)` and therefore raises the
    reported "TypeError: 'Tensor' object is not callable".
    """
    vector = vector.float()
    # Find the indices corresponding to non-zero entries
    index = torch.nonzero(vector)
    index = index.t()
    # Compute probability
    prob = vector ** 2
    if torch.sum(prob) == 0:
        prob = 0
    else:
        prob = prob / torch.sum(prob)
    d = depth(vector)
    CumProb = torch.ones((2**d - len(prob.t()), 1), device='cuda')
    cp = torch.cumsum(prob, dim=0)
    cp = cp.reshape((len(cp.t()), 1))
    CumProb = torch.cat((cp, CumProb), 0)
    vector = vector.t()
    prob = prob.t()
    # Problems occur here: `vector(...)`, `prob(...)` and `CumProb(...)`
    # call the tensors instead of indexing them with `[...]`.
    ar = torch.stack((index, vector([index, 1]), prob([index, 1]), CumProb([index, 1])))
    ar = ar.reshape((len(index), 4))
    # Store the data as a 4-dimensional array
    output = dict()
    output = {'index': ar[:, 0], 'value': ar[:, 1], 'prob': ar[:, 2], 'CumProb': ar[:, 3]}
    return output
ar = torch.stack(
(index, vector([index, 1]), prob([index, 1]), CumProb([index, 1]))
) # Problems occur here
vector is of type torch.Tensor. It has no __call__ defined. You are going for vector(...) (vector([index,1])) while you should slice the data directly like this: vector[index, 1]. Same goes for prob and CumProb.
Somehow, you do it correctly for ar with ar[:,0] so it might be a typo

What would be the equivalent of keras.layers.Masking in pytorch?

I have time-series sequences which I needed to keep the length of sequences fixed to a number by padding zeroes into matrix and using keras.layers.Masking in keras I could neglect those padded zeros for further computations, I am wondering how could it be done in Pytorch?
Either I need to do the padding in PyTorch because PyTorch can't handle sequences with varying lengths (in which case, what is the equivalent of the Keras Masking layer in PyTorch?), or, if PyTorch does handle sequences with varying lengths, how can it be done?
You can use PackedSequence class as equivalent to keras masking. you can find more features at torch.nn.utils.rnn
Here putting example from packing for variable-length sequence inputs for rnn
import torch
import torch.nn as nn
from torch.autograd import Variable
batch_size = 3
max_length = 3
hidden_size = 2
n_layers =1
# container
batch_in = torch.zeros((batch_size, 1, max_length))
#data
vec_1 = torch.FloatTensor([[1, 2, 3]])
vec_2 = torch.FloatTensor([[1, 2, 0]])
vec_3 = torch.FloatTensor([[1, 0, 0]])
batch_in[0] = vec_1
batch_in[1] = vec_2
batch_in[2] = vec_3
batch_in = Variable(batch_in)
seq_lengths = [3,2,1] # list of integers holding information about the batch size at each sequence step
# pack it
pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, seq_lengths, batch_first=True)
>>> pack
PackedSequence(data=Variable containing:
1 2 3
1 2 0
1 0 0
[torch.FloatTensor of size 3x3]
, batch_sizes=[3])
# initialize
rnn = nn.RNN(max_length, hidden_size, n_layers, batch_first=True)
h0 = Variable(torch.randn(n_layers, batch_size, hidden_size))
#forward
out, _ = rnn(pack, h0)
# unpack
unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(out)
>>> unpacked
Variable containing:
(0 ,.,.) =
-0.7883 -0.7972
0.3367 -0.6102
0.1502 -0.4654
[torch.FloatTensor of size 1x3x2]
Moreover, you may find this article useful. [Jump to the section titled "How the PackedSequence object works"] - link
You can use a packed sequence to mask a timestep in the sequence dimension:
batch_mask = ...  # boolean mask e.g. (seq x batch)
# move `padding` at right place then it will be cut when packing
compact_seq = torch.zeros_like(x)
for i, seq_len in enumerate(batch_mask.sum(0)):
    # Pack the unmasked timesteps of column i to the front of that column.
    compact_seq[:seq_len, i] = x[batch_mask[:, i], i]
# pack in sequence dimension (the number of agents)
packed_x = pack_padded_sequence(compact_seq, batch_mask.sum(0).cpu().numpy(), enforce_sorted=False)
# NOTE(review): `nn.GRU` is the module class; presumably an already
# constructed GRU instance is meant to be called here - confirm.
packed_scores, rnn_hxs = nn.GRU(packed_x, rnn_hxs)
# restore sequence dimension
scores, _ = pad_packed_sequence(packed_scores)
# restore order, moving padding in its place
scores = torch.zeros((*batch_mask.shape, scores.size(-1))).to(scores.device).masked_scatter(batch_mask.unsqueeze(-1), scores)
instead use a mask select/scatter to mask in the batch dimension:
batch_mask = torch.any(x, -1).unsqueeze(-1) # boolean mask (batch,1)
batch_x = torch.masked_select(x, batch_mask).reshape(-1, x.size(-1))
batch_rnn_hxs = torch.masked_select(rnn_hxs, batch_mask).reshape(-1, rnn_hxs.size(-1))
batch_rnn_hxs = nn.GRUCell(batch_x, batch_rnn_hxs)
rnn_hxs = rnn_hxs.masked_scatter(batch_mask, batch_rnn_hxs) # restore batch
Note that using scatter function is safe for gradient backpropagation

Loop over tensor dimension 0 (NoneType) with second tensor values

I have a tensor a, I'd like to loop over the rows and index values based on another tensor l. i.e. l suggests the length of the vector I need.
sess = tf.InteractiveSession()
a = tf.constant(np.random.rand(3,4)) # shape=(3,4)
a.eval()
Out:
array([[0.35879311, 0.35347166, 0.31525201, 0.24089784],
[0.47296348, 0.96773956, 0.61336239, 0.6093023 ],
[0.42492552, 0.2556728 , 0.86135674, 0.86679779]])
l = tf.constant(np.array([3,2,4])) # shape=(3,)
l.eval()
Out:
array([3, 2, 4])
Expected output:
[array([0.35879311, 0.35347166, 0.31525201]),
array([0.47296348, 0.96773956]),
array([0.42492552, 0.2556728 , 0.86135674, 0.86679779])]
The tricky part is the fact that a could have None as first dimension since it's what is usually defined as batch size through placeholder.
I can not just use mask and condition as below since I need to compute the variance of each row individually.
condition = tf.sequence_mask(l, tf.reduce_max(l))
a_true = tf.boolean_mask(a, condition)
a_true
Out:
array([0.35879311, 0.35347166, 0.31525201, 0.47296348, 0.96773956,
0.42492552, 0.2556728 , 0.86135674, 0.86679779])
I also tried to use tf.map_fn but can't get it to work.
elems = (a, l)
tf.map_fn(lambda x: x[0][:x[1]], elems)
Any help will be highly appreciated!
TensorArray object can store tensors of different shapes. However, it is still not that simple. Take a look at this example that does what you want using tf.while_loop() with tf.TensorArray and tf.slice() function:
import tensorflow as tf
import numpy as np
batch_data = np.array([[0.35879311, 0.35347166, 0.31525201, 0.24089784],
[0.47296348, 0.96773956, 0.61336239, 0.6093023 ],
[0.42492552, 0.2556728 , 0.86135674, 0.86679779]])
batch_idx = np.array([3, 2, 4]).reshape(-1, 1)
x = tf.placeholder(tf.float32, shape=(None, 4))
idx = tf.placeholder(tf.int32, shape=(None, 1))
n_items = tf.shape(x)[0]
init_ary = tf.TensorArray(dtype=tf.float32,
size=n_items,
infer_shape=False)
def _first_n(i, ta):
    """Loop body: write the first `idx[i]` elements of row `x[i]` to slot `i` of `ta`.

    Returns the incremented counter and the updated TensorArray, matching
    the loop variables passed to `tf.while_loop`.
    """
    ta = ta.write(i, tf.slice(input_=x[i],
                              begin=tf.convert_to_tensor([0], tf.int32),
                              size=idx[i]))
    return i + 1, ta
_, first_n = tf.while_loop(lambda i, ta: i < n_items,
_first_n,
[0, init_ary])
first_n = [first_n.read(i) # <-- extracts the tensors
for i in range(batch_data.shape[0])] # that you're looking for
# Evaluate the per-row slices, feeding the batch data and slice lengths.
with tf.Session() as sess:
    res = sess.run(first_n, feed_dict={x: batch_data, idx: batch_idx})
    print(res)
    # [array([0.3587931 , 0.35347167, 0.315252  ], dtype=float32),
    #  array([0.47296348, 0.9677396 ], dtype=float32),
    #  array([0.4249255 , 0.2556728 , 0.86135674, 0.8667978 ], dtype=float32)]
Note
We still had to use batch_size to extract elements one by one from first_n TensorArray using read() method. We can't use any other method that returns Tensor because we have rows of different sizes (except TensorArray.concat method but it will return all elements stacked in one dimension).
If TensorArray will have less elements than index you pass to TensorArray.read(index) you will get InvalidArgumentError.
You can't use tf.map_fn because it returns a tensor that must have all elements of the same shape.
The task is simpler if you only need to compute variances of the first n elements of each row (without actually gather elements of different sizes together). In this case we could directly compute variance of sliced tensor, put it to TensorArray and then stack it to tensor:
n_items = tf.shape(x)[0]
init_ary = tf.TensorArray(dtype=tf.float32,
size=n_items,
infer_shape=False)
def _variances(i, ta, begin=tf.convert_to_tensor([0], tf.int32)):
    """Loop body: write the variance of the first `idx[i]` elements of `x[i]` to `ta`.

    Returns the incremented counter and the updated TensorArray, matching
    the loop variables passed to `tf.while_loop`.
    """
    mean, varian = tf.nn.moments(
        tf.slice(input_=x[i], begin=begin, size=idx[i]),
        axes=[0])  # <-- compute variance
    ta = ta.write(i, varian)  # <-- write variance of each row to `TensorArray`
    return i + 1, ta
_, variances = tf.while_loop(lambda i, ta: i < n_items,
_variances,
[ 0, init_ary])
variances = variances.stack() # <-- read from `TensorArray` to `Tensor`
# Evaluate the stacked per-row variances, feeding the batch data and slice lengths.
with tf.Session() as sess:
    res = sess.run(variances, feed_dict={x: batch_data, idx: batch_idx})
    print(res)  # [0.0003761 0.06120085 0.07217039]

Implementing word dropout in pytorch

I want to add word dropout to my network so that I can have sufficient training examples for training the embedding of the "unk" token. As far as I'm aware, this is standard practice. Let's assume the index of the unk token is 0, and the index for padding is 1 (we can switch them if that's more convenient).
This is a simple CNN network which implements word dropout the way I would have expected it to work:
class Classifier(nn.Module):
    """Simple CNN text classifier over token ids, with word dropout.

    `params` is a dict read for the keys "word_dropout", "window_sizes",
    "vocab_size", "word_dim", "feature_num", "dropout" and "num_classes".
    Token id 1 is used both as the padding value and as the embedding's
    `padding_idx`.
    """

    def __init__(self, params):
        super(Classifier, self).__init__()
        self.params = params
        self.word_dropout = nn.Dropout(params["word_dropout"])
        # Pad both ends with token id 1 so the widest window fits everywhere.
        self.pad = torch.nn.ConstantPad1d(max(params["window_sizes"]) - 1, 1)
        self.embedding = nn.Embedding(params["vocab_size"], params["word_dim"], padding_idx=1)
        # One conv per window size, applied to the flattened embeddings:
        # the kernel spans `window_size` words and strides one word at a time.
        self.convs = nn.ModuleList([
            nn.Conv1d(1, params["feature_num"], params["word_dim"] * window_size,
                      stride=params["word_dim"], bias=False)
            for window_size in params["window_sizes"]
        ])
        self.dropout = nn.Dropout(params["dropout"])
        self.fc = nn.Linear(params["feature_num"] * len(params["window_sizes"]), params["num_classes"])

    def forward(self, x, l):
        """Return class logits for the token-id batch `x`; `l` is not used here."""
        # NOTE: nn.Dropout is applied directly to the token ids; this is the
        # step the surrounding question reports as not working for LongTensors.
        x = self.word_dropout(x)
        x = self.pad(x)
        embedded_x = self.embedding(x)
        embedded_x = embedded_x.view(-1, 1, x.size()[1] * self.params["word_dim"])  # [batch_size, 1, seq_len * word_dim]
        features = [F.relu(conv(embedded_x)) for conv in self.convs]
        # Read the feature count from self.params instead of a global `params`.
        pooled = [F.max_pool1d(feat, feat.size()[2]).view(-1, self.params["feature_num"]) for feat in features]
        pooled = torch.cat(pooled, 1)
        pooled = self.dropout(pooled)
        logit = self.fc(pooled)
        return logit
Don't mind the padding - pytorch doesn't have an easy way of using non zero padding in CNNs, much less trainable non-zero padding, so I'm doing it manually. Dropout also doesn't allow me to use non zero dropout, and I want to separate the padding token from the unk token. I'm keeping it in my example because it's the reason for this question's existence.
This doesn't work because dropout wants Float Tensors so that it can scale them properly, while my input is Long Tensors that don't need to be scaled.
Is there an easy way of doing this in pytorch? I essentially want to use LongTensor-friendly dropout (bonus: better if it will let me specify a dropout constant that isn't 0, so that I could use zero padding).
Actually I would do it outside of your model, before converting your input into a LongTensor.
This would look like this:
import random
def add_unk(input_token_id, p):
    """Return 0 (the unk id) with probability `p`, else `input_token_id`.

    Ids that are not greater than 1 are always returned unchanged, so the
    padding id (1) is never switched to 0.
    """
    # random.random() gives you a value between 0 and 1
    # to avoid switching your padding to 0 we add 'input_token_id > 1'
    if random.random() < p and input_token_id > 1:
        return 0
    else:
        return input_token_id
#than you have your input token_id
#for this example I take just a random number, lets say 127
input_token_id = 127
#let p be your probability for UNK
p = 0.01
your_input_tensor = torch.LongTensor([add_unk(input_token_id, p)])
Edit:
So there are two options which come to my mind which are actually GPU-friendly. In general both solutions should be much more efficient.
Option one - Doing computation directly in forward():
If you're not using torch.utils and don't have plans using it later this is probably the way to go.
Instead of doing the computation before we just do it in the forward() method of main PyTorch class. However I see no (simple) way doing this in torch 0.3.1., so you would need to upgrade to version 0.4.0:
So imagine x is your input vector:
>>> x = torch.tensor(range(10))
>>> x
tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
probs is a vector of uniform random values, which we will compare against our dropout probability below:
>>> probs = torch.empty(10).uniform_(0, 1)
>>> probs
tensor([ 0.9793, 0.1742, 0.0904, 0.8735, 0.4774, 0.2329, 0.0074,
0.5398, 0.4681, 0.5314])
Now we apply the dropout probabilities probs on our input x:
>>> torch.where(probs > 0.2, x, torch.zeros(10, dtype=torch.int64))
tensor([ 0, 0, 0, 3, 4, 5, 0, 7, 8, 9])
Note: To see some effect I chose a dropout probability of 0.2 here. In reality you probably want it to be smaller.
You can pick for this any token / id you like, here is an example with 42 as unknown token id:
>>> unk_token = 42
>>> torch.where(probs > 0.2, x, torch.empty(10, dtype=torch.int64).fill_(unk_token))
tensor([ 0, 42, 42, 3, 4, 5, 42, 7, 8, 9])
torch.where comes with PyTorch 0.4.0:
https://pytorch.org/docs/master/torch.html#torch.where
I don't know about the shapes of your network, but your forward() should look something like this then (when using mini-batching you need to flatten the input before applying dropout):
def forward_train(self, x, l):
    """Training-time forward pass with word dropout applied to the token ids.

    Each token of `x` is independently replaced by 0 (the unk id) when its
    uniform draw is not above 0.02. `l` is not used here.
    """
    # probabilities: one uniform draw per token, with the same shape and on
    # the same device as x, so this works for batched input and on GPU
    probs = torch.empty(x.shape, device=x.device).uniform_(0, 1)
    # applying word dropout
    x = torch.where(probs > 0.02, x, torch.zeros_like(x))
    # continue like before ...
    x = self.pad(x)
    embedded_x = self.embedding(x)
    embedded_x = embedded_x.view(-1, 1, x.size()[1] * self.params["word_dim"])  # [batch_size, 1, seq_len * word_dim]
    features = [F.relu(conv(embedded_x)) for conv in self.convs]
    # Read the feature count from self.params instead of a global `params`.
    pooled = [F.max_pool1d(feat, feat.size()[2]).view(-1, self.params["feature_num"]) for feat in features]
    pooled = torch.cat(pooled, 1)
    pooled = self.dropout(pooled)
    logit = self.fc(pooled)
    return logit
Note: I named the function forward_train() so you should use another forward() without dropout for evaluation / predicting. But you could also use some if conditions with train().
Option two: using torch.utils.data.Dataset:
If you're using the Dataset class provided by torch.utils, it is very easy to do this kind of pre-processing efficiently. Dataset uses strong multi-processing acceleration by default, so the code sample above just has to be executed in the __getitem__ method of your Dataset class.
This could look like this:
def __getitem__(self, index):
    """Generate one sample of data: (token tensor, target)."""
    # Select sample
    # NOTE(review): assumes self.input_tokens[index] is a single integer
    # token id, as add_unk compares it against 1 - confirm.
    ID = self.input_tokens[index]
    # Load data and get label
    # using the add_unk function from the code above; the id is wrapped in a
    # list because torch.LongTensor(int) would allocate an uninitialised
    # tensor of that length instead of a tensor holding the id
    X = torch.LongTensor([add_unk(ID, p=0.01)])
    y = self.targets[index]
    return X, y
This is a bit out of context and doesn't look very elegant but I think you get the idea. According to this blog post of Shervine Amidi at Stanford it should be no problem to do more complex pre-processing steps in this function:
Since our code [Dataset is meant] is designed to be multicore-friendly, note that you
can do more complex operations instead (e.g. computations from source
files) without worrying that data generation becomes a bottleneck in
the training process.
The linked blog post - "A detailed example of how to generate your data in parallel with PyTorch" - provides also a good guide for implementing the data generation with Dataset and DataLoader.
I guess you'll prefer option one - only two lines and it should be very efficient. :)
Good luck!

LSTMLayer produces NaN values even before training it

I'm currently trying to construct a LSTM network with Lasagne to predict the next step of noisy sequences. I first trained a stack of 2 LSTM layers for a while, but had to use an abysmally small learning rate (1e-6) because of divergence issues (that ultimately produced NaN values). The results were kind of disappointing, as the network produced smooth, out-of-phase versions of the input.
I then came to the conclusion I should use better parameter initialization than what is given by default. The goal was to start from a network that just mimics identity, since for strongly auto-correlated signal it should be a good first estimation of the next step (x(t) ~ x(t+1)), and to sprinkle a bit of noise on top of it.
import theano, numpy, lasagne
from theano import tensor as T
from lasagne.layers.recurrent import LSTMLayer, InputLayer, Gate
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import sigmoid, tanh, leaky_rectify
from lasagne.layers import get_output
from lasagne.init import GlorotNormal, Normal, Constant
floatX = 'float32'
# function to create a lstm that ~ propagate the input from start to finish off the bat
# should be a good start for a predictive lstm with high one-step autocorrelation
def create_identity_lstm(input, shape, orig_inp=None, noiselvl=0.01, G=10., mask_input=None):
    """Create an LSTMLayer initialised to roughly propagate its input.

    All gate weights are drawn with scale `noiselvl`; the gate biases and
    the input-to-cell matrix are then shifted so that the first `orig_inp`
    units approximately copy the input.

    Parameters:
        input: incoming layer.
        shape: `(inp, out)` pair; `out` is the number of LSTM units.
        orig_inp: number of units used to pass the input through; defaults
            to `inp`. The rest of the units should produce ~0 activation.
        noiselvl: scale passed to the weight initialisers.
        G: magnitude of the bias shift applied to the gates.
        mask_input: forwarded to LSTMLayer.

    Returns:
        The configured LSTMLayer.
    """
    inp, out = shape
    # orig_inp is used to limit the number of units that are actually used
    # to pass the input information from one layer to the other - the rest
    # of the units should produce ~ 0 activation.
    if orig_inp is None:
        orig_inp = inp
    # input gate
    inputgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    # forget gate
    forgetgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    # cell gate
    cell = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=None,
        b=Constant(0.),
        nonlinearity=leaky_rectify
    )
    # output gate
    outputgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    lstm = LSTMLayer(input, out, ingate=inputgate, forgetgate=forgetgate, cell=cell,
                     outgate=outputgate, nonlinearity=leaky_rectify, mask_input=mask_input)
    # change matrices and biases
    # ingate - should return ~1 (matrices = 0, big bias)
    b_i = lstm.b_ingate.get_value()
    b_i[:orig_inp] += G
    lstm.b_ingate.set_value(b_i)
    # forgetgate - should return 0 (matrices = 0, big negative bias)
    b_f = lstm.b_forgetgate.get_value()
    b_f[:orig_inp] -= G
    # to help learning future features, I preserve a large bias on "unused"
    # units to help it remember stuff
    b_f[orig_inp:] += G
    lstm.b_forgetgate.set_value(b_f)
    # cell - should return x(t) (W_xc = identity, rest is 0)
    W_xc = lstm.W_in_to_cell.get_value()
    # Add 1 to the first orig_inp diagonal entries in one indexed update.
    diag = numpy.arange(orig_inp)
    W_xc[diag, diag] += 1.
    lstm.W_in_to_cell.set_value(W_xc)
    # outgate - should return 1 (same as ingate)
    b_o = lstm.b_outgate.get_value()
    b_o[:orig_inp] += G
    lstm.b_outgate.set_value(b_o)
    # done
    return lstm
I then use this lstm generation code to generate the following network:
# layers
#input + dropout
input = InputLayer((None, None, 7), name='input')
mask = InputLayer((None, None), name='mask')
drop1 = DropoutLayer(input, p=0.33)
#lstm1 + dropout
lstm1 = create_identity_lstm(drop1, (7, 1024), mask_input=mask)
drop2 = DropoutLayer(lstm1, p=0.33)
#lstm2 + dropout
lstm2 = create_identity_lstm(drop2, (1024, 128), orig_inp=7, mask_input=mask)
drop3 = DropoutLayer(lstm2, p=0.33)
#lstm3
lstm3 = create_identity_lstm(drop3, (128, 7), orig_inp=7, mask_input=mask)
# symbolic variables and prediction
x = input.input_var
ma = mask.input_var
ma_reshape = ma.dimshuffle((0,1,'x'))
yhat = get_output(lstm3, deterministic=False)
yhat_det = get_output(lstm3, deterministic=True)
y = T.ftensor3('y')
predict = theano.function([x, ma], yhat_det)
Problem is, even without any training, this network produces garbage values and sometimes even a bunch of NaNs, right from the very first LSTM layer:
# Random input batch and an all-ones mask covering its first two dimensions.
X = numpy.random.random((5, 10000, 7)).astype('float32')
Masks = numpy.ones(X.shape[:2], dtype='float32')
# NOTE(review): the keyword below is spelled `determistic`, unlike the
# `deterministic=True` passed when building `yhat_det`; presumably the
# misspelled name is not recognised, so the dropout layer in front of lstm1
# stays active in this graph - confirm.
hid1 = get_output(lstm1, determistic=True)
# Compile and evaluate the first LSTM layer's output on the random batch.
get_hid1 = theano.function([x, ma], hid1)
h1 = get_hid1(X, Masks)
print numpy.isnan(h1).sum(axis=1).sum(axis=1)
array([6379520, 6367232, 6377472, 6376448, 6378496])
# even the first output value is garbage!
print h1[:,0,0] - X[:,0,0]
array([-0.03898358, -0.10118812, 0.34877831, -0.02509735, 0.36689138], dtype=float32)
I don't get why. I checked each matrix and their values are fine, exactly as I wanted them to be. I even tried to recreate each gate activation and the resulting hidden activations using the actual numpy arrays, and they reproduce the input just fine. What did I do wrong there?

Resources