Error about torch tensor precision on gpu - pytorch

I tried to fine-tune a BERT model on the GPU with PyTorch Lightning's Trainer class, using the following code:
from pytorch_lightning import Trainer
from models import LitAdModel, AdModel
from dataloaders import train_dataloader, test_dataloader
model = AdModel()
litmodel = LitAdModel(model=model)
trainer = Trainer(accelerator='gpu', devices=1)
trainer.fit(model=litmodel, train_dataloaders=train_dataloader,
val_dataloaders=test_dataloader)
Here, train_dataloader, test_dataloader, and the AdModel and LitAdModel classes are defined elsewhere. When I run this without the GPU it works (slowly), but with the GPU it gives the following error:
File "/Users/sanjinjuricfot/developer/copy_models/test_pl.py", line
24, in
main() File "/Users/sanjinjuricfot/developer/copy_models/test_pl.py", line 18, in
main
littrain(train=train, test=test) File "/Users/sanjinjuricfot/developer/copy_models/src/_torch/littrain.py",
line 39, in littrain
trainer.fit(model=litmodel, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader) File
"/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 582, in fit
call._call_and_handle_interrupt( File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py",
line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 624, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 1061, in _run
results = self._run_stage() File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 1140, in _run_stage
self._run_train() File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 1153, in _run_train
self._run_sanity_check() File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 1225, in _run_sanity_check
val_loop.run() File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py",
line 199, in run
self.advance(*args, **kwargs) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py",
line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs) File
"/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py",
line 199, in run
self.advance(*args, **kwargs) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py",
line 121, in advance
batch = next(data_fetcher) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py",
line 184, in next
return self.fetching_function() File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py",
line 275, in fetching_function
return self.move_to_device(batch) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py",
line 294, in move_to_device
batch = self.batch_to_device(batch) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py",
line 142, in batch_to_device
batch = self.trainer._call_strategy_hook("batch_to_device", batch, dataloader_idx=dataloader_idx) File
"/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 1443, in _call_strategy_hook
output = fn(*args, **kwargs) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py",
line 273, in batch_to_device
return model._apply_batch_transfer_handler(batch, device=device, dataloader_idx=dataloader_idx) File
"/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/core/module.py",
line 295, in _apply_batch_transfer_handler
batch = self._call_batch_hook("transfer_batch_to_device", batch, device, dataloader_idx) File
"/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/core/module.py",
line 283, in _call_batch_hook
return trainer_method(hook_name, *args) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py",
line 1305, in _call_lightning_module_hook
output = fn(*args, **kwargs) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/pytorch_lightning/core/hooks.py",
line 632, in transfer_batch_to_device
return move_data_to_device(batch, device) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/lightning_lite/utilities/apply_func.py",
line 101, in move_data_to_device
return apply_to_collection(batch, dtype=_TransferableDataType, function=batch_to) File
"/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/lightning_utilities/core/apply_func.py",
line 55, in apply_to_collection
v = apply_to_collection( File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/lightning_utilities/core/apply_func.py",
line 47, in apply_to_collection
return function(data, *args, **kwargs) File "/Users/sanjinjuricfot/developer/copy_models/.venv/lib/python3.10/site-packages/lightning_lite/utilities/apply_func.py",
line 95, in batch_to
data_output = data.to(device, **kwargs) TypeError: Cannot convert a MPS Tensor to float64 dtype as the MPS framework doesn't support
float64. Please use float32 instead.
I tried calling
torch.set_default_dtype(torch.float32)
in all the relevant files and appending
.to(torch.float32)
to all the tensors, but it didn't work.
I am using MacBook Pro with M2 processor. Thanks in advance for any help!

Related

Is it possible to iterate through Tensor in graph mode?

I am trying to integrate Aleju's imgaug into the TFOD API. I noticed that you cannot iterate through Tensors in graph mode. I looked for a solution and tried many suggestions, but none of them worked in my case. Do you know of any workaround?
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from tensorflow.python.framework.ops import EagerTensor
import tensorflow.compat.v1 as tf
import numpy as np
augseq = iaa.Sequential([# augmentation options], random_order=True)
#tf.function
def augment(image, boxes):
image_np = image.numpy().astype(np.uint8) if type(image) == EagerTensor else image
boxes_np = boxes.numpy() if type(boxes) == EagerTensor else boxes
width, height, _ = image_np.shape
bbs = []
for i in range(len(boxes_np)):
box = boxes_np[i]
ymin, xmin, ymax, xmax = box.numpy()
bbs.append(BoundingBox(
x1=xmin*width, y1=ymin*height,
x2=xmax*width, y2=ymax*height,))
bbs = BoundingBoxesOnImage(bbs, shape=image_np.shape)
image_aug, bbs_aug = augseq(image=image_np, bounding_boxes=bbs) # float np.ndarray
bbs_aug = bbs_aug.remove_out_of_image().clip_out_of_image()
boxes_aug = []
for bb in bbs_aug:
boxes_aug.append([bb.y1/height, bb.x1/width, bb.y2/height, bb.x2/width])
boxes_aug = np.array(boxes_aug)
return image_aug, boxes_aug
Stack Trace:
Traceback (most recent call last):
File "/content/models/research/object_detection/model_main_tf2.py", line 115, in <module>
tf.compat.v1.app.run()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.7/dist-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/usr/local/lib/python3.7/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "/content/models/research/object_detection/model_main_tf2.py", line 112, in main
record_summaries=FLAGS.record_summaries)
File "/usr/local/lib/python3.7/dist-packages/object_detection/model_lib_v2.py", line 558, in train_loop
train_dataset_fn)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/deprecation.py", line 348, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 1199, in experimental_distribute_datasets_from_function
return self.distribute_datasets_from_function(dataset_fn, options)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 1191, in distribute_datasets_from_function
dataset_fn, options)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py", line 979, in _distribute_datasets_from_function
options=options)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/input_lib.py", line 181, in get_distributed_datasets_from_function
build=build,
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/input_lib.py", line 1618, in __init__
self.build()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/input_lib.py", line 1639, in build
self._input_contexts, self._input_workers, self._dataset_fn))
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/input_lib.py", line 2350, in _create_datasets_from_function_with_input_context
dataset = dataset_fn(ctx)
File "/usr/local/lib/python3.7/dist-packages/object_detection/model_lib_v2.py", line 553, in train_dataset_fn
input_context=input_context)
File "/usr/local/lib/python3.7/dist-packages/object_detection/inputs.py", line 906, in train_input
reduce_to_frame_fn=reduce_to_frame_fn)
File "/usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py", line 258, in build
batch_size, input_reader_config)
File "/usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py", line 237, in dataset_map_fn
fn_to_map, num_parallel_calls=num_parallel_calls)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/deprecation.py", line 348, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 3886, in map_with_legacy_function
use_legacy_function=True))
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 5505, in __init__
use_legacy_function=use_legacy_function)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 4540, in __init__
self._function.add_to_graph(ops.get_default_graph())
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/function.py", line 544, in add_to_graph
self._create_definition_if_needed()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/function.py", line 380, in _create_definition_if_needed
self._create_definition_if_needed_impl()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/function.py", line 407, in _create_definition_if_needed_impl
capture_resource_var_by_value=self._capture_resource_var_by_value)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/function.py", line 970, in func_graph_from_py_func
outputs = func(*func_graph.inputs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 4458, in wrapped_fn
ret = wrapper_helper(*args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 4440, in wrapper_helper
ret = autograph.tf_convert(self._func, ag_ctx)(*nested_args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 699, in wrapper
raise e.ag_error_metadata.to_exception(e)
AttributeError: in user code:
File "/usr/local/lib/python3.7/dist-packages/object_detection/inputs.py", line 886, in transform_and_pad_input_data_fn *
tensor_dict = pad_input_data_to_static_shapes(
File "/usr/local/lib/python3.7/dist-packages/object_detection/inputs.py", line 272, in transform_input_data *
out_tensor_dict = data_augmentation_fn(out_tensor_dict)
File "/usr/local/lib/python3.7/dist-packages/object_detection/inputs.py", line 623, in augment_input_data *
tensor_dict = preprocessor.preprocess(
File "/usr/local/lib/python3.7/dist-packages/object_detection/core/preprocessor.py", line 4812, in preprocess *
results = func(*args, **params)
File "/usr/local/lib/python3.7/dist-packages/object_detection/core/preprocessor.py", line 4422, in _adjust_imgaug *
adjusted_image, adjusted_boxes = tf.cast(imgaug_utils.augment(image,boxes), tf.float32)
File "/usr/local/lib/python3.7/dist-packages/object_detection/core/imgaug_utils.py", line 24, in augment *
ymin, xmin, ymax, xmax = box.numpy()
AttributeError: 'Tensor' object has no attribute 'numpy'
Here is what I tried that did not work:
Enabling eager execution (it is the default in TF 2.x).
Decorating / not decorating the function with #tf.function.
Creating a TF session and trying eval() or run():
InvalidArgumentError: You must feed a value for placeholder tensor 'while/Placeholder' with dtype int32
Tried on both TPU and CPU

RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED in pytorch

I am running a CNN algorithm using PyTorch on my new machine with 3 Nvidia GPUs and getting the error below:
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
File "code.py", line 342, in <module>
trainer.fit(model)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 514, in fit
self.dispatch()
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 554, in dispatch
self.accelerator.start_training(self)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 111, in start_training
self._results = trainer.run_train()
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 615, in run_train
self.run_sanity_check(self.lightning_module)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 864, in run_sanity_check
_, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 733, in run_evaluation
output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/evaluation_loop.py", line 164, in evaluation_step
output = self.trainer.accelerator.validation_step(args)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 178, in validation_step
return self.training_type_plugin.validation_step(*args)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 290, in validation_step
return self.model(*args, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 705, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 63, in forward
output = self.module.validation_step(*inputs, **kwargs)
File "code.py", line 314, in validation_step
pred = self.forward(x)
File "code.py", line 259, in forward
x = self.conv0(x) #([12, 600, 600])
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/container.py", line 119, in forward
input = module(input)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 399, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 395, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
NVIDIA-SMI:
The code runs without any issue on another machine with driver version 450.51.06 and CUDA version 11. The nvidia-smi output of the new machine was shown above (not reproduced here). I checked the answers to other questions about this same issue, and none of them resolved my problem.

attribute lookup s3.ServiceResource on boto3.resources.factory failed

I want to try out my model. The data is stored in AWS (S3), and I access it with boto3 simply like this:
self.s3_img = S3Images(boto3.resource('s3'))
self.s3_obj = S3GetObjects()
I got this error when I fed the data and model into the PyTorch training pipeline.
The code looks like this:
import pytorch_lightning as pl
from pytorch_lightning import Trainer
trainer = Trainer(
checkpoint_callback=checkpoint_callback,
callbacks=get_callbacks(chkpt_path),
fast_dev_run=False,
max_epochs=100,
resume_from_checkpoint=checkpoint_path
)
trainer.fit(model)
The error is
File "main.py", line 191, in <module>
train()
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/hydra/main.py", line 20, in decorated_main
run_hydra(
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/hydra/_internal/utils.py", line 171, in run_hydra
hydra.run(
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/hydra/_internal/hydra.py", line 82, in run
return run_job(
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/hydra/plugins/common/utils.py", line 109, in run_job
ret.return_value = task_function(task_cfg)
File "main.py", line 176, in train
trainer.fit(model)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/pytorch_lightning/trainer/states.py", line 48, in wrapped_fn
result = fn(self, *args, **kwargs)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1084, in fit
results = self.accelerator_backend.train(model)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/pytorch_lightning/accelerators/cpu_backend.py", line 39, in train
results = self.trainer.run_pretrain_routine(model)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1224, in run_pretrain_routine
self._run_sanity_check(ref_model, model)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1257, in _run_sanity_check
eval_results = self._evaluate(model, self.val_dataloaders, max_batches, False)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/pytorch_lightning/trainer/evaluation_loop.py", line 305, in _evaluate
for batch_idx, batch in enumerate(dataloader):
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 352, in __iter__
return self._get_iterator()
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 294, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 801, in __init__
w.start()
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/context.py", line 284, in _Popen
return Popen(process_obj)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/popen_spawn_posix.py", line 32, in __init__
super().__init__(process_obj)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/popen_spawn_posix.py", line 47, in _launch
reduction.dump(process_obj, fp)
File "/Users/admin/opt/anaconda3/envs/kk/lib/python3.8/multiprocessing/reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <class 'boto3.resources.factory.s3.ServiceResource'>: attribute lookup s3.ServiceResource on boto3.resources.factory failed
Can anyone tell me what's the meaning of this error and how to solve it? Thanks for any suggestions and help!

How to create a custom keras generator to fit multiple outputs and use workers

I have one input, and multiple outputs, like a multilabel classification, but I chose to try another approach to see if I have any improvements.
I have these generators, I'm using flow_from_dataframe because I have a huge dataset (200k):
self.train_generator = datagen.flow_from_dataframe(
dataframe=train,
directory='dataset',
x_col='Filename',
y_col=columns,
batch_size=BATCH_SIZE,
color_mode='rgb',
class_mode='raw',
shuffle=True,
target_size=(HEIGHT,WIDTH))
self.test_generator = datatest.flow_from_dataframe(
dataframe=test,
directory='dataset',
x_col='Filename',
y_col=columns,
batch_size=BATCH_SIZE,
color_mode='rgb',
class_mode='raw',
target_size=(HEIGHT,WIDTH))
I'm passing to fit using this function:
def generator(self, generator):
while True:
X, y = generator.next()
y = [y[:,x] for x in range(len(columns))]
yield X,[y]
If I fit like this:
self.h = self.model.fit_generator(self.generator(self.train_generator),
steps_per_epoch=self.STEP_SIZE_TRAIN,
validation_data=self.generator(self.test_generator),
validation_steps=self.STEP_SIZE_TEST,
epochs=50,
verbose = 1,
workers = 2,
)
I get :
RuntimeError: Your generator is NOT thread-safe. Keras requires a thread-safe generator when `use_multiprocessing=False, workers > 1`.
Using use_multiprocessing=True:
self.h = self.model.fit_generator(self.generator(self.train_generator),
steps_per_epoch=self.STEP_SIZE_TRAIN,
validation_data=self.generator(self.test_generator),
validation_steps=self.STEP_SIZE_TEST,
epochs=50,
verbose = 1,
workers = 2,
use_multiprocessing=True,
)
Results in:
File "C:\ProgramData\Anaconda3\lib\threading.py", line 932, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\utils\data_utils.py", line 877, in _run
with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor:
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\utils\data_utils.py", line 867, in pool_fn
pool = get_pool_class(True)(
File "C:\ProgramData\Anaconda3\lib\multiprocessing\context.py", line 119, in Pool
return Pool(processes, initializer, initargs, maxtasksperchild,
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 212, in __init__
self._repopulate_pool()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 303, in _repopulate_pool
return self._repopulate_pool_static(self._ctx, self.Process,
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 326, in _repopulate_pool_static
w.start()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\context.py", line 327, in _Popen
return Popen(process_obj)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\popen_spawn_win32.py", line 93, in __init__
reduction.dump(process_obj, to_child)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle 'generator' object
File "C:\ProgramData\Anaconda3\lib\threading.py", line 932, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\utils\data_utils.py", line 877, in _run
with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor:
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\utils\data_utils.py", line 867, in pool_fn
pool = get_pool_class(True)(
File "C:\ProgramData\Anaconda3\lib\multiprocessing\context.py", line 119, in Pool
return Pool(processes, initializer, initargs, maxtasksperchild,
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 212, in __init__
self._repopulate_pool()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 303, in _repopulate_pool
return self._repopulate_pool_static(self._ctx, self.Process,
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 326, in _repopulate_pool_static
w.start()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\context.py", line 327, in _Popen
return Popen(process_obj)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\popen_spawn_win32.py", line 93, in __init__
reduction.dump(process_obj, to_child)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle 'generator' object
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\ProgramData\Anaconda3\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
EOFError: Ran out of input
Now I'm stuck, how to solve this?
According to the documentation (https://keras.io/api/preprocessing/image/),
the argument class_mode can be set to "multi_output", so you don't need to create a custom generator:
class_mode: one of "binary", "categorical", "input", "multi_output", "raw", "sparse" or None. Default: "categorical". Mode for yielding the targets:
- "binary": 1D numpy array of binary labels,
- "categorical": 2D numpy array of one-hot encoded labels. Supports multi-label output.
- "input": images identical to input images (mainly used to work with autoencoders),
- "multi_output": list with the values of the different columns,
- "raw": numpy array of values in y_col column(s),
- "sparse": 1D numpy array of integer labels,
- None, no targets are returned (the generator will only yield batches of image data, which is useful to use in model.predict()).
I am now able to use workers > 1, but I am not seeing any performance improvement.

Pytorch to ONNX export function fails and causes legacy function error

I am trying to convert the PyTorch model at this link to an ONNX model using the code below:
device=t.device('cuda:0' if t.cuda.is_available() else 'cpu')
print(device)
faster_rcnn = FasterRCNNVGG16()
trainer = FasterRCNNTrainer(faster_rcnn).cuda()
#trainer = FasterRCNNTrainer(faster_rcnn).to(device)
trainer.load('./checkpoints/model.pth')
dummy_input = t.randn(1, 3, 300, 300, device = 'cuda')
#dummy_input = dummy_input.to(device)
t.onnx.export(faster_rcnn, dummy_input, "model.onnx", verbose = True)
But I get the following error (sorry for the block quote below; Stack Overflow wouldn't let the whole trace be in code format and wouldn't let the question be posted otherwise):
Traceback (most recent call last):
small_object_detection_master_samirsen\onnxtest.py", line 44, in <module>
t.onnx.export(faster_rcnn, dummy_input, "fasterrcnn_10120119_06025842847785781.onnx", verbose = True)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\onnx\__init__.py",
line 132, in export
strip_doc_string, dynamic_axes)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\onnx\utils.py",
line 64, in export
example_outputs=example_outputs, strip_doc_string=strip_doc_string, dynamic_axes=dynamic_axes)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\onnx\utils.py",
line 329, in _export
_retain_param_name, do_constant_folding)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\onnx\utils.py",
line 213, in _model_to_graph
graph, torch_out = _trace_and_get_graph_from_model(model, args, training)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\onnx\utils.py",
line 171, in _trace_and_get_graph_from_model
trace, torch_out = torch.jit.get_trace_graph(model, args, _force_outplace=True)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\jit__init__.py",
line 256, in get_trace_graph
return LegacyTracedModule(f, _force_outplace, return_inputs)(*args, **kwargs)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 547, in call
result = self.forward(*input, **kwargs)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\jit__init__.py",
line 323, in forward
out = self.inner(*trace_inputs)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 545, in call
result = self._slow_forward(*input, **kwargs)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 531, in _slow_forward
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 531, in _slow_forward
result = self.forward(*input, **kwargs)
File "D:\smallobject2\export test s\small_object_detection_master_samirsen\model\faster_rcnn.py", line
133, in forward
h, rois, roi_indices)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 545, in call
result = self._slow_forward(*input, **kwargs)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 531, in _slow_forward
result = self.forward(*input, **kwargs)
File "D:\smallobject2\export test s\small_object_detection_master_samirsen\model\faster_rcnn_vgg16.py",
line 142, in forward
pool = self.roi(x, indices_and_rois)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 545, in call
result = self._slow_forward(*input, **kwargs)
File "C:\Users\HP\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\nn\modules\module.py",
line 531, in _slow_forward
result = self.forward(*input, **kwargs)
File "D:\smallobject2\export test s\small_object_detection_master_samirsen\model\roi_module.py", line
85, in forward
return self.RoI(x, rois)
RuntimeError: Attempted to trace RoI, but tracing of legacy functions is not supported
This is because the ONNX exporter cannot trace this torch.autograd.Function: the RoI class is a legacy-style autograd function (see the linked reference).
To overcome the issue, you have to implement the forward and backward functions as separate function definitions rather than as members of the RoI class.
The call to RoI in FasterRCNNVGG16 then has to be changed to call those forward and backward functions explicitly.

Resources