YoloV4 Keras to TensorRT

I am custom-training YoloV4 based on Keras from this repo: https://github.com/taipingeric/yolo-v4-tf.keras

model = Yolov4(weight_path=weights,
               class_name_path=class_name_path)
model.load_model('saved_model/with es')
# model.predict('input.jpg')

I then convert the saved model with TF-TRT:

conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,
    max_workspace_size_bytes=4000000000,
    max_batch_size=4)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model, conversion_params=conversion_params)
converter.convert()
converter.save(output_saved_model_dir)
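Not stated in the original post, but the "Engine retrieval for input shapes ... failed. Running native segment" warning in the error below suggests that no TRT engine was pre-built for the inference shape, so TF falls back to the native segment. TF-TRT can build engines ahead of saving via converter.build(); a minimal sketch, assuming a single float32 input of shape [N, 416, 416, 3]:

import numpy as np

# Assumption: the model takes one float32 input of shape [N, 416, 416, 3].
def my_input_fn():
    yield (np.zeros((1, 416, 416, 3), dtype=np.float32),)

# Call this between converter.convert() and converter.save() so the engines
# for this shape are baked into the saved model.
converter.build(input_fn=my_input_fn)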
I then load the converted model following NVIDIA's TF2 TF-TRT guide:
saved_model_loaded = tf.saved_model.load(input_saved_model, tags=[tag_constants.SERVING])
signature_keys = list(saved_model_loaded.signatures.keys())
print(signature_keys)
infer = saved_model_loaded.signatures['serving_default']
print(infer.structured_outputs)
But when I try to run inference from it using
labeling = infer(x)
I get the following error:
2022-03-02 10:52:18.993365: W tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc:628] TF-TRT Warning: Engine retrieval for input shapes: [[1,416,416,3]] failed. Running native segment for PartitionedCall/TRTEngineOp_0_1
2022-03-02 10:52:19.035613: E tensorflow/stream_executor/cuda/cuda_blas.cc:226] failed to create cublas handle: CUBLAS_STATUS_ALLOC_FAILED
2022-03-02 10:52:19.035702: W tensorflow/core/framework/op_kernel.cc:1763] OP_REQUIRES failed at conv_ops.cc:1106 : Not found: No algorithm worked!
2022-03-02 10:52:19.035770: W tensorflow/core/framework/op_kernel.cc:1763] OP_REQUIRES failed at trt_engine_op.cc:400 : Not found: No algorithm worked!
[[{{node StatefulPartitionedCall/model_1/conv2d/Conv2D}}]]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1669, in __call__
return self._call_impl(args, kwargs)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1679, in _call_impl
cancellation_manager)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1762, in _call_with_structured_signature
cancellation_manager=cancellation_manager)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/saved_model/load.py", line 116, in _call_flat
cancellation_manager)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1919, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 560, in call
ctx=ctx)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.NotFoundError: No algorithm worked!
[[{{node StatefulPartitionedCall/model_1/conv2d/Conv2D}}]]
[[PartitionedCall/TRTEngineOp_0_1]] [Op:__inference_signature_wrapper_130807]
Function call stack:
signature_wrapper
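The "failed to create cublas handle: CUBLAS_STATUS_ALLOC_FAILED" line usually means the GPU ran out of memory when the native segment ran, which can happen when TensorFlow pre-allocates nearly all GPU memory next to the 4 GB TRT workspace requested above. A minimal sketch of the usual mitigation, enabling memory growth before loading the model (an assumption about the cause, not a confirmed fix):

import tensorflow as tf

# Allocate GPU memory on demand instead of grabbing it all upfront, leaving
# room for the cuBLAS handle and the TRT workspace.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)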

Related

Pytorch loader throws error after some iterations

The code runs for several iterations and then throws the following error.
My Dataset:

class Dataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, input_feature_paths, target_feature_folder) -> None:
        self.input_feature_paths = input_feature_paths
        self.target_feature_folder = target_feature_folder

    def __len__(self):
        # return sum(1 for _ in self.input_feature_paths)
        return len(self.input_feature_paths)

    def __getitem__(self, index):
        input_feature_path = self.input_feature_paths[index]
        input_feature = load(input_feature_path, map_location='cpu')
        target_feature_path = self.target_feature_folder / input_feature_path.parts[-1]
        target_feature = load(target_feature_path, map_location='cpu')
        return input_feature.to(dtype=torch.float64), target_feature.to(dtype=torch.float64)
I set the dtype to torch.float64 because it throws the same error when writing to the TensorBoard SummaryWriter.
Error Stack
Traceback (most recent call last):
File "student_audio_feature_extractor.py", line 178, in <module>
train(dt, input_frame)
File "student_audio_feature_extractor.py", line 164, in train
model, train_loss = train_step(model, train_loader, optimizer, criterion)
File "student_audio_feature_extractor.py", line 80, in train_step
for input_feature, target_feature in train_loader:
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1313, in _next_data
return self._process_data(data)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 4.
Original Traceback (most recent call last):
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
return self.collate_fn(data)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in collate
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] # Backwards compatibility.
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] # Backwards compatibility.
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 120, in collate
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable
I had a tensor of shape [], which is why it threw this error; I changed it and it works now.
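A minimal sketch of that kind of fix, assuming the offending value is a 0-d tensor returned from __getitem__ (default_collate has to stack every sample into one batch tensor, so each sample needs at least one dimension and a consistent shape):

import torch

# Hypothetical guard: give scalar (0-d) tensors an explicit dimension so
# default_collate can stack them into a batch.
def ensure_at_least_1d(t: torch.Tensor) -> torch.Tensor:
    return t.unsqueeze(0) if t.dim() == 0 else t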

Multi-GPU inference with Luke NER not working

I'm trying to run Luke for inference on multiple GPUs using DataParallel, but I'm encountering an error that I can't seem to resolve. Can you help?
Here is my code:
import pickle

import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import LukeTokenizer, LukeForEntitySpanClassification

luke_model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

# Getting inputs (type: transformers.tokenization_utils_base.BatchEncoding)
inputs = []
for i in tqdm(range(10)):
    input_filepath = df["input_filepath"].iloc[i]
    handle = open(input_filepath, 'rb')
    input_tensor = pickle.load(handle)
    inputs.append(input_tensor)

device_ids = [0, 1, 2, 3]
model = torch.nn.DataParallel(luke_model)
model.to("cuda")
replicas = nn.parallel.replicate(model, device_ids)
inputs_dp = nn.parallel.scatter(inputs[:4], device_ids)
outputs = nn.parallel.parallel_apply(replicas, inputs_dp)
The error I get is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<command-1863732679336681> in <module>
21
22 inputs_dp = nn.parallel.scatter(inputs[:4], device_ids)
---> 23 outputs = nn.parallel.parallel_apply(replicas, inputs_dp)
/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
84 output = results[i]
85 if isinstance(output, ExceptionWrapper):
---> 86 output.reraise()
87 outputs.append(output)
88 return outputs
/databricks/python/lib/python3.8/site-packages/torch/_utils.py in reraise(self)
432 # instantiate since we don't know how to
433 raise RuntimeError(msg) from None
--> 434 raise exception
435
436
AttributeError: Caught AttributeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/databricks/python/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AttributeError: Caught AttributeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 250, in __getattr__
return self.data[item]
KeyError: 'size'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/transformers/models/luke/modeling_luke.py", line 1583, in forward
outputs = self.luke(
File "/databricks/python/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/transformers/models/luke/modeling_luke.py", line 977, in forward
input_shape = input_ids.size()
File "/databricks/python/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 252, in __getattr__
raise AttributeError
AttributeError
I tried adding

class MyDataParallel(nn.DataParallel):
    def __getattr__(self, name):
        return getattr(self.module, name)

but I get: RecursionError: maximum recursion depth exceeded while calling a Python object.
Thanks in advance!
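The recursion most likely happens because nn.Module resolves registered attributes (including self.module itself) through __getattr__, so the override above re-enters itself forever. A common workaround, shown here as a hedged sketch rather than a confirmed fix for this model, is to defer to the parent class first:

class MyDataParallel(nn.DataParallel):
    def __getattr__(self, name):
        try:
            # Let nn.Module resolve registered names such as 'module' first.
            return super().__getattr__(name)
        except AttributeError:
            # Only fall through to the wrapped model for everything else.
            return getattr(self.module, name)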

tensorflow.python.framework.errors_impl.InternalError: Unsupported object type int

An error is raised when training is executed. The error points to the class_weight argument in helper.py; the relevant lines are posted below the traceback.
Epoch 1/150
Traceback (most recent call last):
File "main.py", line 34, in <module>
helper.exec()
File "/content/drive/MyDrive/Tata Elxsi/Function-level-Vulnerability-Detection/src/helper.py", line 284, in exec
class_weight = class_weights)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1239, in fit
validation_freq=validation_freq)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training_arrays.py", line 196, in fit_loop
outs = fit_function(ins_batch)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/backend.py", line 3476, in __call__
run_metadata=self.run_metadata)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/client/session.py", line 1472, in __call__
run_metadata_ptr)
tensorflow.python.framework.errors_impl.InternalError: Unsupported object type int
Error line from helper.py:

train_history = model_func.fit(train_set_x, train_set_y,
                               epochs=self.config['training_settings']['network_config']['epochs'],
                               batch_size=self.config['training_settings']['network_config']['batch_size'],
                               shuffle=False,
                               validation_data=(validation_set_x, validation_set_y),
                               callbacks=callbacks_list,
                               verbose=self.paras.verbose,
                               class_weight=class_weights)

class_weights from helper.py:

class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(train_set_y),
                                                  y=train_set_y)
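A plausible cause, offered as an assumption rather than a confirmed fix: compute_class_weight returns a NumPy array, while Keras' fit() expects class_weight to be a dict mapping each class label to its weight. A minimal sketch of that conversion:

# compute_class_weight returns an array like [0.7, 1.6]; fit() wants a dict
# such as {0: 0.7, 1: 1.6}, keyed by the class labels.
weights_arr = class_weight.compute_class_weight(class_weight='balanced',
                                                classes=np.unique(train_set_y),
                                                y=train_set_y)
class_weights = dict(zip(np.unique(train_set_y), weights_arr))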

Value error while using roboflow object detection Yolov4 pytorch model on custom dataset

We are using Roboflow for object detection with the YOLOv4 PyTorch model on our custom dataset. During training, we get the following error.
Traceback (most recent call last):
File "./pytorch-YOLOv4/train.py", line 447, in <module>
device=device, )
File "./pytorch-YOLOv4/train.py", line 310, in train
for i, batch in enumerate(train_loader):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 345, in __next__
data = self._next_data()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 856, in _next_data
return self._process_data(data)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 881, in _process_data
data.reraise()
File "/usr/local/lib/python3.7/dist-packages/torch/_utils.py", line 394, in reraise
raise self.exc_type(msg)
ValueError: Caught ValueError in DataLoader worker process 7.
Original Traceback (most recent call last):
File "/content/pytorch-YOLOv4/dataset.py", line 382, in __getitem__
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
AttributeError: 'list' object has no attribute 'shape'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/content/pytorch-YOLOv4/dataset.py", line 385, in __getitem__
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
ValueError: could not broadcast input array from shape (0) into shape (0,5)
I don't know the details of your parameters, but the log points at this line of your code:

out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] \
    = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]

The first error says your parameter out_bboxes has no attribute 'shape' because it is a list object, so consider converting it to the datatype you need.
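A minimal sketch of that conversion, assuming each box carries five values (the "(0,5)" in the second error suggests as much); the variable names are taken from the traceback:

import numpy as np

# Hypothetical fix: make out_bboxes a real array before slicing, and give the
# empty case an explicit (0, 5) shape so broadcasting into out_bboxes1 works.
out_bboxes = np.array(out_bboxes, dtype=np.float32).reshape(-1, 5)
n = min(out_bboxes.shape[0], self.cfg.boxes)
out_bboxes1[:n] = out_bboxes[:n]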

Pytorch model weight type conversion

I'm trying to do inference with the FlowNet2-C model loaded from a file.
However, I ran into a data type problem. How can I resolve it?
Source code
FlowNet2-C pre-trained model
$ python main.py
Initializing Datasets
[0.000s] Loading checkpoint '/notebooks/data/model/FlowNet2-C_checkpoint.pth.tar'
[1.293s] Loaded checkpoint '/notebooks/data/model/FlowNet2-C_checkpoint.pth.tar' (at epoch 0)
(1L, 6L, 384L, 512L)
<class 'torch.autograd.variable.Variable'>
[1.642s] Operation failed
Traceback (most recent call last):
File "main.py", line 102, in <module>
main()
File "main.py", line 98, in main
summary(input_size, model)
File "main.py", line 61, in summary
model(x)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/notebooks/data/vinet/FlowNetC.py", line 75, in forward
out_conv1a = self.conv1(x1)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/container.py", line 67, in forward
input = module(input)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/conv.py", line 282, in forward
self.padding, self.dilation, self.groups)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/functional.py", line 90, in conv2d
return f(input, weight, bias)
RuntimeError: Input type (CUDAFloatTensor) and weight type (CPUFloatTensor) should be the same
Maybe that is because your model and the input x have different placements: the error shows the input is a CUDAFloatTensor while the weights are CPUFloatTensors, i.e. x has been moved to the GPU but the model parameters have not.
You can try calling model.cuda() after line 94, which will put the model on the GPU. Then the error should disappear.
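More generally, a minimal sketch of keeping model and input on the same device (the torch.device style is an assumption; the traceback suggests an older PyTorch where model.cuda() / x.cuda() is the equivalent):

import torch

# Pick one device and move both the model and the input onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # same effect as model.cuda() when CUDA is available
x = x.to(device)          # keep the input where the weights are
out = model(x)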
