Assertion error in DataParallel when applying to RNN - pytorch

I'm trying to apply DataParallel to an RNN model. This is the relevant part of my code:
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    encoder = nn.DataParallel(encoder, dim=0)
    decoder = nn.DataParallel(decoder, dim=0)
class EncoderRNN(nn.Module):
    def __init__(self, vocal_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocal_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input_batch, input_batch_length, hidden):
        print(input_batch)
        print(input_batch_length)
        print(hidden)
        embedded = self.embedding(input_batch)
        packed_input = nn.utils.rnn.pack_padded_sequence(embedded, input_batch_length.cpu().numpy(), batch_first=True)
        output, hidden = self.gru(packed_input, hidden)
        return output, hidden

    def init_hidden(self, batch_size):
        result = torch.autograd.Variable(torch.zeros(1, batch_size, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result
I can guarantee that all of my inputs are on CUDA, but I received the error below, which seems to say that my inputs are not all on CUDA.
Traceback (most recent call last):
File "train.py", line 156, in <module>
train_iteration(encoder, decoder, fileDataSet)
File "train.py", line 122, in train_iteration
target_indices, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
File "train.py", line 49, in train
encoder_output, encoder_hidden = encoder(input_batch, input_batch_length, encoder_hidden)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 74, in forward
return self.gather(outputs, self.output_device)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 86, in gather
return gather(outputs, output_device, dim=self.dim)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 65, in gather
return gather_map(outputs)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
return type(out)(map(gather_map, zip(*outputs)))
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
return type(out)(map(gather_map, zip(*outputs)))
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/utils/rnn.py", line 39, in __new__
return super(PackedSequence, cls).__new__(cls, *args[0])
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 57, in gather_map
return Gather.apply(target_device, dim, *outputs)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 58, in forward
assert all(map(lambda i: i.is_cuda, inputs))
AssertionError
This code works on a single GPU, but I couldn't find any similar problem on Google.
Content of input_batch:
2.0000e+00 6.2900e+02 5.4000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00
2.0000e+00 1.6759e+04 6.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00
2.0000e+00 7.2000e+01 3.3500e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00
2.0000e+00 5.4000e+01 1.2900e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00
[torch.cuda.LongTensor of size (4,2687) (GPU 0)]
input_batch_length:
1844
1507
1219
1021
[torch.cuda.LongTensor of size (4,) (GPU 0)]
hidden:
( 0 ,.,.) =
0 0 0 ... 0 0 0
0 0 0 ... 0 0 0
0 0 0 ... 0 0 0
0 0 0 ... 0 0 0
[torch.cuda.FloatTensor of size (1,4,256) (GPU 0)]
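Judging from the traceback, the assertion fires while DataParallel gathers the forward outputs, not the inputs: gather_map is recursing into the PackedSequence returned by the GRU (note the PackedSequence.__new__ frames), and a PackedSequence carries metadata that is not a CUDA tensor, which trips the is_cuda assertion. A commonly suggested workaround, sketched below under that assumption (it is not an answer given in this thread), is to unpack the sequence inside forward so that only plain CUDA tensors are returned:

def forward(self, input_batch, input_batch_length, hidden):
    embedded = self.embedding(input_batch)
    packed_input = nn.utils.rnn.pack_padded_sequence(
        embedded, input_batch_length.cpu().numpy(), batch_first=True)
    packed_output, hidden = self.gru(packed_input, hidden)
    # Unpack before returning: DataParallel's gather cannot handle a
    # PackedSequence, so return the padded tensor instead.
    output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
    return output, hidden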

Related

ValueError: bytes must be in range(0, 256) while decoding input tensor using transformers AutoTokenizer (MT5ForConditionalGeneration model)

Relevant code:
from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)

tokenizer = AutoTokenizer.from_pretrained('google/byt5-small', use_fast=True)
model = MT5ForConditionalGeneration.from_pretrained("working/result/",
                                                    return_dict=True)

def generate(text):
    model.eval()
    # print(model)
    # input_ids = tokenizer.encode("WebNLG:{} </s>".format(text),
    #                              return_tensors="pt")
    input_ids = tokenizer.batch_encode_plus(
        [text], max_length=512, pad_to_max_length=True, return_tensors="pt"
    ).to(device)
    source_ids = input_ids["input_ids"].squeeze()
    print(tokenizer.decode(source_ids))
    print(type(input_ids.input_ids))
    input_ids.input_ids.to(device)
    print(input)
    outputs = model.generate(input_ids.input_ids)
    print(outputs)
    print(outputs[0])
    return tokenizer.decode(outputs[0])
Calling the above function:
input_str = "Title: %s Category: %s" % ("10 Min Quick Food Recipe","Food")
input_str = "Title: %s Category: %s" % ("I am marathon runner and going to run 21km on 4th dec in Thane","Fitness")
print(input_str)
print(generate(input_str))
Output:
Title: I am marathon runner and going to run 21km on 4th dec in Thane Category: Fitness
Title: I am marathon runner and going to run 21km on 4th dec in Thane Category: Fitness</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<class 'torch.Tensor'>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7ff645eed970>>
tensor([[ 0, 259, 266, 259, 3659, 390, 259, 262, 48580, 288,
259, 262, 38226, 5401, 259, 1]], device='cuda:0')
tensor([ 0, 259, 266, 259, 3659, 390, 259, 262, 48580, 288,
259, 262, 38226, 5401, 259, 1], device='cuda:0')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [30], line 5
2 input_str = "Title: %s Category: %s" % ("I am marathon runner and going to run 21km on 4th dec in Thane","Fitness")
4 print(input_str)
----> 5 print(generate(input_str))
Cell In [29], line 18, in generate(text)
16 print(outputs)
17 print(outputs[0])
---> 18 return tokenizer.decode(outputs[0])
File ~/T5/t5_venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3436, in PreTrainedTokenizerBase.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
3433 # Convert inputs to python lists
3434 token_ids = to_py_obj(token_ids)
-> 3436 return self._decode(
3437 token_ids=token_ids,
3438 skip_special_tokens=skip_special_tokens,
3439 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3440 **kwargs,
3441 )
File ~/T5/t5_venv/lib/python3.8/site-packages/transformers/tokenization_utils.py:949, in PreTrainedTokenizer._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs)
947 current_sub_text.append(token)
948 if current_sub_text:
--> 949 sub_texts.append(self.convert_tokens_to_string(current_sub_text))
951 if spaces_between_special_tokens:
952 text = " ".join(sub_texts)
File ~/T5/t5_venv/lib/python3.8/site-packages/transformers/models/byt5/tokenization_byt5.py:243, in ByT5Tokenizer.convert_tokens_to_string(self, tokens)
241 tok_string = token.encode("utf-8")
242 else:
--> 243 tok_string = bytes([ord(token)])
244 bstring += tok_string
245 string = bstring.decode("utf-8", errors="ignore")
ValueError: bytes must be in range(0, 256)
I tried changing the max_length param to 256 but couldn't get it to work.
Any leads are highly appreciated. Thanks in advance.
Got it. I was making a silly mistake: I had been trying different pre-trained tokenizers and T5 models.
During training I had used google/mt5-base, but during inference I used google/byt5-small, which caused this issue. Changing back to google/mt5-base fixed it. Inference now works fine.
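In other words, the tokenizer used at inference must match the one the model was fine-tuned with; mixing mt5 (subword) with byt5 (byte-level) produces token ids that the other tokenizer cannot decode. A minimal sketch of the fix (the checkpoint path working/result/ is taken from the question):

from transformers import AutoTokenizer, MT5ForConditionalGeneration

# Load the tokenizer from the same family the model was trained with.
tokenizer = AutoTokenizer.from_pretrained('google/mt5-base', use_fast=True)
model = MT5ForConditionalGeneration.from_pretrained("working/result/",
                                                    return_dict=True)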

Result type Float can’t be cast to the desired output type Byte

I am trying to build a model to classify knee MRIs. I am using the torchio library, and I have successfully trained a classifier on the raw data. Now I am adding masks to the MRI object and want to pass them to a model. I use the same code, but now I am getting this error:
RuntimeError Traceback (most recent call last)
<ipython-input-56-2a7d5ddf84fc> in <module>()
26 for epoch in range(1, num_epochs+1):
27
---> 28 train_loss, train_accuracy = training(epoch,model,train_loader,optimizer,criterion)
29 valid_loss, valid_accuracy = validation(epoch,model,valid_loader,criterion)
30
6 frames
<ipython-input-55-59a55a26f2b7> in training(epoch, model, train_loader, optimizer, criterion)
8 model.train()
9
---> 10 for i, batch in tqdm(enumerate(train_loader,0)):
11 images = batch['t1'][tio.DATA].cuda()
12 labels = batch['label']
/usr/local/lib/python3.7/dist-packages/tqdm/notebook.py in __iter__(self)
255 def __iter__(self):
256 try:
--> 257 for obj in super(tqdm_notebook, self).__iter__():
258 # return super(tqdm...) will not catch exception
259 yield obj
/usr/local/lib/python3.7/dist-packages/tqdm/std.py in __iter__(self)
1193
1194 try:
-> 1195 for obj in iterable:
1196 yield obj
1197 # Update and possibly print the progressbar.
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
1201 else:
1202 del self._task_info[idx]
-> 1203 return self._process_data(data)
1204
1205 def _try_put_index(self):
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _process_data(self, data)
1227 self._try_put_index()
1228 if isinstance(data, ExceptionWrapper):
-> 1229 data.reraise()
1230 return data
1231
/usr/local/lib/python3.7/dist-packages/torch/_utils.py in reraise(self)
432 # instantiate since we don't know how to
433 raise RuntimeError(msg) from None
--> 434 raise exception
435
436
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: result type Float can't be cast to the desired output type Byte
The training() function is this:
def training(epoch, model, train_loader, optimizer, criterion):
    "Training over an epoch"
    metric_monitor = MetricMonitor()
    model.train()
    for i, batch in tqdm(enumerate(train_loader, 0)):
        images = batch['t1'][tio.DATA].cuda()
        labels = batch['label'].cuda()
        if images.sum() != 0:
            output = F.softmax(model(images), dim=1)
            loss = criterion(output, labels)
            output = output.data.max(dim=1, keepdim=True)[1]
            output = output.view(-1)
            acc = calculate_acc(output, labels)
            metric_monitor.update("Loss", loss.item())
            metric_monitor.update("Accuracy", acc)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    print("[Epoch: {epoch:03d}] Train | {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor))
    return metric_monitor.metrics['Loss']['avg'], metric_monitor.metrics['Accuracy']['avg']
When I run the code, some batches pass, but it stops on certain batches and produces this error. I have checked my data, and all of the images are torch.FloatTensor.
The labels are integers. Does anybody know how to fix this?
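The traceback shows default_collate failing while stacking the samples for a single dictionary key, which means some field mixes Float and Byte tensors across subjects; since the error appeared only after adding masks, the mask field is the natural suspect. A hedged debugging sketch to locate it (train_loader comes from the question; the rest assumes the dataset yields torchio Subjects):

import torch
import torchio as tio

# Walk the dataset once and record the dtype of every image field, so the
# offending Byte/Float mixture can be found before collation fails.
seen = {}
for subject in train_loader.dataset:
    for key, image in subject.get_images_dict(intensity_only=False).items():
        seen.setdefault(key, set()).add(image.data.dtype)
print(seen)  # any key with more than one dtype breaks default_collate

If the masks turn out to be torch.uint8 while the rest are float, casting them to float before collation (for example with mask.set_data(mask.data.float())) should let torch.stack succeed.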

Error while training with Sequential in Keras: Shapes are incompatible

I am training a neural network (2 conv layers and 1 dense hidden layer) to classify hand-sign images for 24 alphabets (J and Z have no images). I am using ImageDataGenerator's flow() function to create the training and testing data generators, and Keras Sequential to create the neural network model. While training, I am getting the following error:
model = create_model()

# Train your model
history = model.fit(train_generator,
                    epochs=15,
                    validation_data=validation_generator)
Epoch 1/15
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-46-746fb7255d3f> in <module>()
6 history = model.fit(train_generator,
7 epochs=15,
----> 8 validation_data=validation_generator)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
1145 except Exception as e: # pylint:disable=broad-except
1146 if hasattr(e, "ag_error_metadata"):
-> 1147 raise e.ag_error_metadata.to_exception(e)
1148 else:
1149 raise
ValueError: in user code:
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1021, in train_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1010, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step **
outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 860, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 919, in compute_loss
y, y_pred, sample_weight, regularization_losses=self.losses)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/compile_utils.py", line 201, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "/usr/local/lib/python3.7/dist-packages/keras/losses.py", line 141, in __call__
losses = call_fn(y_true, y_pred)
File "/usr/local/lib/python3.7/dist-packages/keras/losses.py", line 245, in call **
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/losses.py", line 1790, in categorical_crossentropy
y_true, y_pred, from_logits=from_logits, axis=axis)
File "/usr/local/lib/python3.7/dist-packages/keras/backend.py", line 5083, in categorical_crossentropy
target.shape.assert_is_compatible_with(output.shape)
ValueError: Shapes (None, 1) and (None, 24) are incompatible
Here is the link to the colab code: 1
You need to use one-hot encoding for the y parameter where you define the training and validation generators. So, under the train_val_generators function, change:
y=training_labels
into
y=tf.keras.utils.to_categorical(training_labels, x)
and do the same for validation, where x is the number of output neurons (24 here).
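For illustration, a minimal sketch of what that function might look like after the change (the function name and label variables come from the question; the generator settings are assumptions):

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def train_val_generators(training_images, training_labels,
                         validation_images, validation_labels):
    # One-hot encode the integer labels (shape (N,)) into shape (N, 24)
    # so they match the 24-way softmax and categorical_crossentropy loss.
    training_labels = tf.keras.utils.to_categorical(training_labels, 24)
    validation_labels = tf.keras.utils.to_categorical(validation_labels, 24)

    train_datagen = ImageDataGenerator(rescale=1.0 / 255)
    validation_datagen = ImageDataGenerator(rescale=1.0 / 255)

    train_generator = train_datagen.flow(x=training_images,
                                         y=training_labels,
                                         batch_size=32)
    validation_generator = validation_datagen.flow(x=validation_images,
                                                   y=validation_labels,
                                                   batch_size=32)
    return train_generator, validation_generator

Alternatively, keeping the integer labels and compiling the model with sparse_categorical_crossentropy avoids the encoding step.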

Using albumentation with Tensorflow Sequence API

I am trying to use a tf.keras.utils.Sequence object as input to my Keras model so that I can apply augmentations that are not available in TensorFlow, using the albumentations library. But I am getting an error while doing so. (The image pre-processing operations mentioned here are just for clarity.)
import albumentations as A
from tensorflow.keras.utils import Sequence
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout
from tensorflow.keras.models import Sequential

TRAIN_DIR = os.path.join('..', 'Data', 'PetImages')

def load_data():
    list_of_fpaths = glob.glob('../Data/PetImages/Cat/*')
    labels = [1] * len(list_of_fpaths)
    temp = glob.glob('../Data/PetImages/Dog/*')
    list_of_fpaths.extend(temp)
    labels.extend([0] * len(temp))
    return list_of_fpaths, labels

# Now list_of_fpaths contains the list of file paths and labels contains the
# corresponding labels

class DataSequence(Sequence):
    def __init__(self, x_set, y_set, batch_size, augmentations):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.augment = augmentations

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        a = np.array([
            self.augment(image=plt.imread(file_name))["image"]
            for file_name in batch_x
        ])
        b = np.array(batch_y)
        return a, b

def get_model(input_shape):
    model = Sequential([
        Conv2D(8, 3, activation='relu', input_shape=input_shape),
        MaxPool2D(2),
        Conv2D(16, 3, activation='relu'),
        MaxPool2D(2),
        Conv2D(32, 3, activation='relu'),
        MaxPool2D(2),
        Conv2D(32, 3, activation='relu'),
        MaxPool2D(2),
        Conv2D(32, 3, activation='relu'),
        MaxPool2D(2),
        Flatten(),
        Dense(1024, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

ALBUMENTATIONS_TRAIN = A.Compose([
    A.Resize(256, 256),
    # A.Resize(512, 512),
    A.ToFloat(),
    # A.RandomCrop(384, 384, p=0.5),
])

ALBUMENTATIONS_TEST = A.Compose([
    A.ToFloat(),
    A.Resize(256, 256)
])

X, Y = load_data()
train_gen = DataSequence(X, Y, 16, ALBUMENTATIONS_TRAIN)
model = get_model(input_shape=(256, 256, 3))
model.fit(train_gen, epochs=100)
The error that I am getting is:
17/748 [..............................] - ETA: 1:06 - loss: 0.4304 - accuracy: 0.92282020-07-08 13:25:47.751964: W tensorflow/core/framework/op_kernel.cc:1741] Invalid argument: ValueError: could not broadcast input array from shape (256,256,3) into shape (256,256)
Traceback (most recent call last):
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\ops\script_ops.py", line 243, in __call__
ret = func(*args)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 309, in wrapper
return func(*args, **kwargs)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 785, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 801, in wrapped_generator
for data in generator_fn():
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 932, in generator_fn
yield x[i]
File "D:/ACAD/TENSORFLOW/Rough/data_aug_pipeline.py", line 40, in __getitem__
a = np.array([
ValueError: could not broadcast input array from shape (256,256,3) into shape (256,256)
Traceback (most recent call last):
File "D:/ACAD/TENSORFLOW/Rough/data_aug_pipeline.py", line 89, in <module>
model.fit(train_gen,epochs=100)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
tmp_logs = train_function(iterator)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\function.py", line 2420, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\function.py", line 1661, in _filtered_call
return self._call_flat(
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\function.py", line 1745, in _call_flat
return self._build_call_outputs(self._inference_function.call(
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\function.py", line 593, in call
outputs = execute.execute(
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\eager\execute.py", line 59, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: ValueError: could not broadcast input array from shape (256,256,3) into shape (256,256)
Traceback (most recent call last):
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\ops\script_ops.py", line 243, in __call__
ret = func(*args)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 309, in wrapper
return func(*args, **kwargs)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 785, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 801, in wrapped_generator
for data in generator_fn():
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 932, in generator_fn
yield x[i]
File "D:/ACAD/TENSORFLOW/Rough/data_aug_pipeline.py", line 40, in __getitem__
a = np.array([
ValueError: could not broadcast input array from shape (256,256,3) into shape (256,256)
[[{{node PyFunc}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_4]]
(1) Invalid argument: ValueError: could not broadcast input array from shape (256,256,3) into shape (256,256)
Traceback (most recent call last):
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\ops\script_ops.py", line 243, in __call__
ret = func(*args)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 309, in wrapper
return func(*args, **kwargs)
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 785, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 801, in wrapped_generator
for data in generator_fn():
File "C:\Users\aksha\Anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 932, in generator_fn
yield x[i]
File "D:/ACAD/TENSORFLOW/Rough/data_aug_pipeline.py", line 40, in __getitem__
a = np.array([
ValueError: could not broadcast input array from shape (256,256,3) into shape (256,256)
[[{{node PyFunc}}]]
[[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1195]
Function call stack:
train_function -> train_function
Process finished with exit code 1
Please help me to understand what mistake I am making.
Based on the error messages, there is at least one grayscale image in your dataset; after resizing, it has shape (256, 256) instead of (256, 256, 3), so the batch cannot be stacked into a single array and fed to your network.
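One way to guard against this, sketched below (this helper is hypothetical, not part of the answer), is to promote grayscale images to three channels before augmenting:

import numpy as np
import matplotlib.pyplot as plt

def load_rgb(file_name):
    # Read the image and promote grayscale (H, W) arrays to (H, W, 3)
    # so every element of the batch has the same shape.
    image = plt.imread(file_name)
    if image.ndim == 2:
        image = np.stack([image] * 3, axis=-1)
    return image

Then call load_rgb(file_name) instead of plt.imread(file_name) inside DataSequence.__getitem__.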

Pytorch: Error in DataParallel for RNN model

I'm trying to use torch.nn.DataParallel for an RNN model. My model looks like this:
class EncoderRNN(nn.Module):
    def __init__(self, vocal_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocal_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input_batch, input_batch_length, hidden):
        embedded = self.embedding(input_batch)
        packed_input = nn.utils.rnn.pack_padded_sequence(embedded, input_batch_length.cpu().numpy(), batch_first=True)
        output, hidden = self.gru(packed_input, hidden)
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, target_batch, target_batch_length, hidden, train=False):
        embedded = self.embedding(target_batch)
        output = F.relu(embedded)
        if train:
            # minus 1 to eliminate <EOS>
            packed_target = nn.utils.rnn.pack_padded_sequence(output, (target_batch_length - 1).cpu().numpy(),
                                                              batch_first=True)
            output, hidden = self.gru(packed_target, hidden)
            output = self.softmax(self.out(output[0]))
        return output, hidden
And I implemented DataParallel like this when declaring the model:
encoder = nn.DataParallel(encoder)
decoder = nn.DataParallel(decoder)
The code runs on a server with 4 GPUs, and I received the following error message:
/home/cjunjie/NLP/DocSummarization/model.py:18: UserWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
output, hidden = self.gru(packed_input, hidden)
Traceback (most recent call last):
File "train.py", line 144, in <module>
train_iteration(encoder, decoder, fileDataSet)
File "train.py", line 110, in train_iteration
target_indices, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
File "train.py", line 41, in train
encoder_output, encoder_hidden = encoder(input_batch, input_batch_length, encoder_hidden)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 74, in forward
return self.gather(outputs, self.output_device)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 86, in gather
return gather(outputs, output_device, dim=self.dim)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 65, in gather
return gather_map(outputs)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
return type(out)(map(gather_map, zip(*outputs)))
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
return type(out)(map(gather_map, zip(*outputs)))
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/utils/rnn.py", line 39, in __new__
return super(PackedSequence, cls).__new__(cls, *args[0])
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 57, in gather_map
return Gather.apply(target_device, dim, *outputs)
File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 58, in forward
assert all(map(lambda i: i.is_cuda, inputs))
AssertionError
I searched for the same problem, but none of the results had a solution. Can anyone help?
In order to run the code on GPUs, you need to copy both the variables and the model weights to CUDA. I suspect you did not copy the model weights. To do that, run:
encoder.cuda()
decoder.cuda()
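Putting it together, a minimal setup sketch (the constructor arguments are illustrative), moving the weights to the GPU before wrapping in DataParallel:

import torch
import torch.nn as nn

encoder = EncoderRNN(vocal_size=20000, hidden_size=256)
decoder = DecoderRNN(hidden_size=256, vocab_size=20000)

if torch.cuda.is_available():
    # Copy the model weights to the GPU first; DataParallel then
    # replicates the CUDA module across the available devices.
    encoder = nn.DataParallel(encoder.cuda())
    decoder = nn.DataParallel(decoder.cuda())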
