I am custom-training YOLOv4 (Keras implementation) from this repo: https://github.com/taipingeric/yolo-v4-tf.keras, and then converting the saved model with TF-TRT:
model = Yolov4(weight_path=weights,
               class_name_path=class_name_path)
model.load_model('saved_model/with es')
# model.predict('input.jpg')

conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,
    max_workspace_size_bytes=4000000000,
    max_batch_size=4)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=saved_model, conversion_params=conversion_params)
converter.convert()
converter.save(output_saved_model_dir)
I then load the converted model following NVIDIA's TF2.0 TF-TRT guide:
saved_model_loaded = tf.saved_model.load(input_saved_model, tags=[tag_constants.SERVING])
signature_keys = list(saved_model_loaded.signatures.keys())
print(signature_keys)
infer = saved_model_loaded.signatures['serving_default']
print(infer.structured_outputs)
But when I try to infer from it using
labeling = infer(x)
I get the following error:
2022-03-02 10:52:18.993365: W tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc:628] TF-TRT Warning: Engine retrieval for input shapes: [[1,416,416,3]] failed. Running native segment for PartitionedCall/TRTEngineOp_0_1
2022-03-02 10:52:19.035613: E tensorflow/stream_executor/cuda/cuda_blas.cc:226] failed to create cublas handle: CUBLAS_STATUS_ALLOC_FAILED
2022-03-02 10:52:19.035702: W tensorflow/core/framework/op_kernel.cc:1763] OP_REQUIRES failed at conv_ops.cc:1106 : Not found: No algorithm worked!
2022-03-02 10:52:19.035770: W tensorflow/core/framework/op_kernel.cc:1763] OP_REQUIRES failed at trt_engine_op.cc:400 : Not found: No algorithm worked!
[[{{node StatefulPartitionedCall/model_1/conv2d/Conv2D}}]]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1669, in __call__
return self._call_impl(args, kwargs)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1679, in _call_impl
cancellation_manager)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1762, in _call_with_structured_signature
cancellation_manager=cancellation_manager)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/saved_model/load.py", line 116, in _call_flat
cancellation_manager)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1919, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 560, in call
ctx=ctx)
File "/mnt/d/Testing/research/yolo-v4/yolo-v4/lib/python3.6/site-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.NotFoundError: No algorithm worked!
[[{{node StatefulPartitionedCall/model_1/conv2d/Conv2D}}]]
[[PartitionedCall/TRTEngineOp_0_1]] [Op:__inference_signature_wrapper_130807]
Function call stack:
signature_wrapper
Related
The code runs for several iterations and throws the following error.
My Dataset
import torch
from torch import load
from torch.utils.data import Dataset


class Dataset(Dataset):
    'Characterizes a dataset for PyTorch'

    def __init__(self, input_feature_paths, target_feature_folder) -> None:
        self.input_feature_paths = input_feature_paths
        self.target_feature_folder = target_feature_folder

    def __len__(self):
        # return sum(1 for _ in self.input_feature_paths)
        return len(self.input_feature_paths)

    def __getitem__(self, index):
        input_feature_path = self.input_feature_paths[index]
        input_feature = load(input_feature_path, map_location='cpu')
        target_feature_path = self.target_feature_folder / input_feature_path.parts[-1]
        target_feature = load(target_feature_path, map_location='cpu')
        return input_feature.to(dtype=torch.float64), target_feature.to(dtype=torch.float64)
I set the dtype to torch.float64 because it throws the same error while writing to the TensorBoard summary writer.
Error Stack
Traceback (most recent call last):
File "student_audio_feature_extractor.py", line 178, in <module>
train(dt, input_frame)
File "student_audio_feature_extractor.py", line 164, in train
model, train_loss = train_step(model, train_loader, optimizer, criterion)
File "student_audio_feature_extractor.py", line 80, in train_step
for input_feature, target_feature in train_loader:
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1313, in _next_data
return self._process_data(data)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 4.
Original Traceback (most recent call last):
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
return self.collate_fn(data)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in collate
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] # Backwards compatibility.
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] # Backwards compatibility.
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 120, in collate
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable
I had a tensor of shape [] (a zero-dimensional tensor); that is why it throws this error. I changed it and it works now.
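In case it helps, a minimal sketch of the kind of check I mean (my own helper name and folder path, assuming each feature is saved as one tensor per file): scan the saved feature files and report any tensor whose shape is [], so the offending sample can be fixed or dropped before default_collate tries to stack the batch.

import torch
from pathlib import Path

def find_zero_dim_features(folder):
    """Return paths of saved feature tensors that have shape [] (0-dimensional)."""
    bad = []
    for path in sorted(Path(folder).glob('*')):
        tensor = torch.load(path, map_location='cpu')
        if tensor.dim() == 0:  # shape [] cannot be stacked into a batch
            bad.append(path)
    return bad

print(find_zero_dim_features('features/input'))  # hypothetical folder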
I'm trying to run LUKE for inference on multiple GPUs using DataParallel, but I'm encountering an error that I can't seem to resolve. Can you help?
Here is my code:
from transformers import LukeTokenizer, LukeForEntitySpanClassification
from tqdm import tqdm
import pickle
import torch
import torch.nn as nn

luke_model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

# Getting inputs (type: transformers.tokenization_utils_base.BatchEncoding);
# df holds the paths to the pickled encodings and is defined elsewhere.
inputs = []
for i in tqdm(range(10)):
    input_filepath = df["input_filepath"].iloc[i]
    handle = open(input_filepath, 'rb')
    input_tensor = pickle.load(handle)
    inputs.append(input_tensor)

device_ids = [0, 1, 2, 3]
model = torch.nn.DataParallel(luke_model)
model.to("cuda")
replicas = nn.parallel.replicate(model, device_ids)
inputs_dp = nn.parallel.scatter(inputs[:4], device_ids)
outputs = nn.parallel.parallel_apply(replicas, inputs_dp)
The error I get is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<command-1863732679336681> in <module>
21
22 inputs_dp = nn.parallel.scatter(inputs[:4], device_ids)
---> 23 outputs = nn.parallel.parallel_apply(replicas, inputs_dp)
/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
84 output = results[i]
85 if isinstance(output, ExceptionWrapper):
---> 86 output.reraise()
87 outputs.append(output)
88 return outputs
/databricks/python/lib/python3.8/site-packages/torch/_utils.py in reraise(self)
432 # instantiate since we don't know how to
433 raise RuntimeError(msg) from None
--> 434 raise exception
435
436
AttributeError: Caught AttributeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/databricks/python/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AttributeError: Caught AttributeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 250, in __getattr__
return self.data[item]
KeyError: 'size'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/python/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/transformers/models/luke/modeling_luke.py", line 1583, in forward
outputs = self.luke(
File "/databricks/python/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/databricks/python/lib/python3.8/site-packages/transformers/models/luke/modeling_luke.py", line 977, in forward
input_shape = input_ids.size()
File "/databricks/python/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 252, in __getattr__
raise AttributeError
AttributeError
I tried adding:

class MyDataParallel(nn.DataParallel):
    def __getattr__(self, name):
        return getattr(self.module, name)

But I get: RecursionError: maximum recursion depth exceeded while calling a Python object.
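I suspect the recursion happens because nn.Module resolves parameters, buffers and submodules (including self.module) through its own __getattr__, so reading self.module inside my override re-enters the override. A sketch of the fallback pattern I have seen suggested (not verified on my setup):

import torch.nn as nn

class MyDataParallel(nn.DataParallel):
    def __getattr__(self, name):
        try:
            # Let nn.Module resolve parameters, buffers and submodules
            # (including self.module) first, to avoid re-entering this method.
            return super().__getattr__(name)
        except AttributeError:
            # Only then fall back to attributes of the wrapped model itself.
            return getattr(self.module, name)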
Thanks in advance!
An error is thrown when training is executed. The error points to the class_weight argument passed in helper.py, which is posted below the traceback.
Epoch 1/150
Traceback (most recent call last):
File "main.py", line 34, in <module>
helper.exec()
File "/content/drive/MyDrive/Tata Elxsi/Function-level-Vulnerability-Detection/src/helper.py", line 284, in exec
class_weight = class_weights)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1239, in fit
validation_freq=validation_freq)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training_arrays.py", line 196, in fit_loop
outs = fit_function(ins_batch)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/backend.py", line 3476, in __call__
run_metadata=self.run_metadata)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/client/session.py", line 1472, in __call__
run_metadata_ptr)
tensorflow.python.framework.errors_impl.InternalError: Unsupported object type int
Error line from helper.py:
train_history = model_func.fit(train_set_x, train_set_y,
                               epochs=self.config['training_settings']['network_config']['epochs'],
                               batch_size=self.config['training_settings']['network_config']['batch_size'],
                               shuffle=False,
                               validation_data=(validation_set_x, validation_set_y),
                               callbacks=callbacks_list,
                               verbose=self.paras.verbose,
                               class_weight=class_weights)
class_weights from helper.py:
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(train_set_y),
                                                  y=train_set_y)
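For reference, compute_class_weight returns a NumPy array, while Keras documents class_weight in fit as a dict mapping class indices to weights; a small toy sketch of that conversion (not from the original helper.py):

import numpy as np
from sklearn.utils import class_weight

# Toy labels standing in for train_set_y.
train_set_y = np.array([0, 0, 0, 1, 1, 2])

weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(train_set_y),
                                            y=train_set_y)
# Keras expects {class_index: weight} rather than a bare array.
class_weights = dict(enumerate(weights))
print(class_weights)  # approximately {0: 0.67, 1: 1.0, 2: 2.0}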
We are using Roboflow for object detection with the YOLOv4 PyTorch model on our custom dataset. During training, we get the following error.
Traceback (most recent call last):
File "./pytorch-YOLOv4/train.py", line 447, in <module>
device=device, )
File "./pytorch-YOLOv4/train.py", line 310, in train
for i, batch in enumerate(train_loader):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 345, in __next__
data = self._next_data()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 856, in _next_data
return self._process_data(data)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 881, in _process_data
data.reraise()
File "/usr/local/lib/python3.7/dist-packages/torch/_utils.py", line 394, in reraise
raise self.exc_type(msg)
ValueError: Caught ValueError in DataLoader worker process 7.
Original Traceback (most recent call last):
File "/content/pytorch-YOLOv4/dataset.py", line 382, in __getitem__
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
AttributeError: 'list' object has no attribute 'shape'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/content/pytorch-YOLOv4/dataset.py", line 385, in __getitem__
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
ValueError: could not broadcast input array from shape (0) into shape (0,5)
I don't know the details of your params, but the log points to something wrong in this part of your code:
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] \
= out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
The first error says your variable out_bboxes has no attribute 'shape' because it is a list object, so consider converting it to the datatype you need (for example, a NumPy array).
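For example, one way to normalise it (a sketch using a hypothetical max_boxes in place of self.cfg.boxes, since I don't know your config):

import numpy as np

max_boxes = 60        # hypothetical stand-in for self.cfg.boxes
out_bboxes = []       # e.g. an image with no labels produces an empty list

# A plain Python list has no .shape, and an empty one cannot be broadcast into
# an (N, 5) slice, so normalise it to a float32 array of shape (num_boxes, 5).
out_bboxes = np.asarray(out_bboxes, dtype=np.float32).reshape(-1, 5)

out_bboxes1 = np.zeros((max_boxes, 5), dtype=np.float32)
n = min(out_bboxes.shape[0], max_boxes)
out_bboxes1[:n] = out_bboxes[:n]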
I'm trying to run inference with a FlowNet2-C model loaded from a file. However, I ran into a data type problem. How can I resolve it?
Source code
FlowNet2-C pre-trained model
$ python main.py
Initializing Datasets
[0.000s] Loading checkpoint '/notebooks/data/model/FlowNet2-C_checkpoint.pth.tar'
[1.293s] Loaded checkpoint '/notebooks/data/model/FlowNet2-C_checkpoint.pth.tar' (at epoch 0)
(1L, 6L, 384L, 512L)
<class 'torch.autograd.variable.Variable'>
[1.642s] Operation failed
Traceback (most recent call last):
File "main.py", line 102, in <module>
main()
File "main.py", line 98, in main
summary(input_size, model)
File "main.py", line 61, in summary
model(x)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/notebooks/data/vinet/FlowNetC.py", line 75, in forward
out_conv1a = self.conv1(x1)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/container.py", line 67, in forward
input = module(input)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/conv.py", line 282, in forward
self.padding, self.dilation, self.groups)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/functional.py", line 90, in conv2d
return f(input, weight, bias)
RuntimeError: Input type (CUDAFloatTensor) and weight type (CPUFloatTensor) should be the same
Maybe that is because your model and the input x you pass to it are on different devices. From the error, the input x is a CUDA tensor while the model's weights are still CPU tensors.
You can try calling model.cuda() after line 94, which will put the model on the GPU. Then the error should disappear.
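A minimal sketch of what I mean, with a toy convolution standing in for FlowNet2-C and an input of the same shape as the (1, 6, 384, 512) in your log:

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model and input must live on the same device before the forward pass.
model = nn.Conv2d(6, 64, kernel_size=7, stride=2, padding=3).to(device)
x = torch.randn(1, 6, 384, 512, device=device)

out = model(x)
print(out.shape)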