PyTorch DataLoader throws an error after some iterations

The code runs for several iterations and throws the following error.
My Dataset
import torch
from torch import load
from torch.utils.data import Dataset


class Dataset(Dataset):
    'Characterizes a dataset for PyTorch'

    def __init__(self, input_feature_paths, target_feature_folder) -> None:
        self.input_feature_paths = input_feature_paths
        self.target_feature_folder = target_feature_folder

    def __len__(self):
        return len(self.input_feature_paths)

    def __getitem__(self, index):
        input_feature_path = self.input_feature_paths[index]
        input_feature = load(input_feature_path, map_location='cpu')
        target_feature_path = self.target_feature_folder / input_feature_path.parts[-1]
        target_feature = load(target_feature_path, map_location='cpu')
        return input_feature.to(dtype=torch.float64), target_feature.to(dtype=torch.float64)
I set the dtype to torch.float64 because it throws the same error when writing to the TensorBoard summary writer.
Error Stack
Traceback (most recent call last):
File "student_audio_feature_extractor.py", line 178, in <module>
train(dt, input_frame)
File "student_audio_feature_extractor.py", line 164, in train
model, train_loss = train_step(model, train_loader, optimizer, criterion)
File "student_audio_feature_extractor.py", line 80, in train_step
for input_feature, target_feature in train_loader:
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1313, in _next_data
return self._process_data(data)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 4.
Original Traceback (most recent call last):
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
return self.collate_fn(data)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
return collate(batch, collate_fn_map=default_collate_fn_map)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in collate
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] # Backwards compatibility.
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed] # Backwards compatibility.
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 120, in collate
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable

I had a tensor of shape [] (a zero-dimensional tensor); that is why it throws this error. I changed it and it works now.
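For anyone hitting the same thing: default_collate stacks the per-sample tensors into one batch tensor, which only works when every sample has the same shape, so a stray shape-[] (or otherwise mismatched) tensor fails inside the worker with the "Trying to resize storage that is not resizable" error above. A minimal sketch, with illustrative names and shapes, of the failure and a guard at the source:

import torch
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    def __init__(self):
        self.items = [torch.ones(10) for _ in range(7)]
        self.items.append(torch.tensor(0.0))  # shape [] -- the bad sample

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        x = self.items[index]
        # Validate shapes at the source: fixing (or raising on) a
        # malformed sample here surfaces the problem in __getitem__
        # instead of deep inside a DataLoader worker.
        if x.dim() == 0:
            x = x.reshape(1).expand(10).clone()
        return x

loader = DataLoader(ToyDataset(), batch_size=4, num_workers=2)
for batch in loader:
    assert batch.shape == (4, 10)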

Related

pytorch: Merge three datasets with predefined and custom datasets

I am training an AI model to recognize handwritten Hangul characters along with English characters and numbers, so I need three datasets: a custom Korean character dataset plus two predefined ones.
I have the three datasets and am merging them, but when I print the train_set path it shows MJSynth only, which is wrong:
긴장_1227682.jpg is in my custom Korean dataset, not in MJSynth.
Code
custom_train_set = RecognitionDataset(
    parts[0].joinpath("images"),
    parts[0].joinpath("labels.json"),
    img_transforms=Compose(
        [
            T.Resize((args.input_size, 4 * args.input_size), preserve_aspect_ratio=True),
            # Augmentations
            T.RandomApply(T.ColorInversion(), 0.1),
            ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.02),
        ]
    ),
)
if len(parts) > 1:
    for subfolder in parts[1:]:
        custom_train_set.merge_dataset(
            RecognitionDataset(subfolder.joinpath("images"), subfolder.joinpath("labels.json"))
        )

train_set = MJSynth(
    train=True,
    img_folder='/media/cvpr/CM_22/mjsynth/mnt/ramdisk/max/90kDICT32px',
    label_path='/media/cvpr/CM_22/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt',
    img_transforms=T.Resize((args.input_size, 4 * args.input_size), preserve_aspect_ratio=True),
)
_train_set = SynthText(
    train=True,
    recognition_task=True,
    download=True,  # NOTE: download can take really long depending on your bandwidth
    img_transforms=T.Resize((args.input_size, 4 * args.input_size), preserve_aspect_ratio=True),
)
train_set.data.extend([(np_img, target) for np_img, target in _train_set.data])
train_set.data.extend([(np_img, target) for np_img, target in custom_train_set.data])
Traceback
Traceback (most recent call last):
File "/media/cvpr/CM_22/doctr/references/recognition/train_pytorch.py", line 485, in <module>
main(args)
File "/media/cvpr/CM_22/doctr/references/recognition/train_pytorch.py", line 396, in main
fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, mb, amp=args.amp)
File "/media/cvpr/CM_22/doctr/references/recognition/train_pytorch.py", line 118, in fit_one_epoch
for images, targets in progress_bar(train_loader, parent=mb):
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/fastprogress/fastprogress.py", line 50, in __iter__
raise e
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/fastprogress/fastprogress.py", line 41, in __iter__
for i,o in enumerate(self.gen):
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1333, in _next_data
return self._process_data(data)
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/_utils.py", line 543, in reraise
raise exception
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/media/cvpr/CM_22/doctr/doctr/datasets/datasets/base.py", line 48, in __getitem__
img, target = self._read_sample(index)
File "/media/cvpr/CM_22/doctr/doctr/datasets/datasets/pytorch.py", line 37, in _read_sample
else read_img_as_tensor(os.path.join(self.root, img_name), dtype=torch.float32)
File "/media/cvpr/CM_22/doctr/doctr/io/image/pytorch.py", line 52, in read_img_as_tensor
pil_img = Image.open(img_path, mode="r").convert("RGB")
File "/home/cvpr/anaconda3/envs/pytesseract/lib/python3.9/site-packages/PIL/Image.py", line 2912, in open
fp = builtins.open(filename, "rb")
FileNotFoundError: [Errno 2] No such file or directory: '/media/cvpr/CM_22/mjsynth/mnt/ramdisk/max/90kDICT32px/긴장_1227682.jpg'
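The traceback hints at the cause: each entry in train_set.data stores only a file name, which MJSynth resolves against its own img_folder, so samples copied in from the custom dataset are looked up under the MJSynth root. A minimal sketch of a root-preserving merge, assuming each dataset already returns ready-to-train samples; this keeps every sample bound to the dataset it came from instead of extending train_set.data:

from torch.utils.data import ConcatDataset

# ConcatDataset delegates __getitem__ to the originating dataset, so
# each image name is resolved against the correct root folder.
merged_train_set = ConcatDataset([train_set, _train_set, custom_train_set])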

tensorflow.python.framework.errors_impl.InternalError: Unsupported object type int

The error appears when training is executed. It points to the class_weight argument passed in helper.py, which is posted below the traceback.
Epoch 1/150
Traceback (most recent call last):
File "main.py", line 34, in <module>
helper.exec()
File "/content/drive/MyDrive/Tata Elxsi/Function-level-Vulnerability-Detection/src/helper.py", line 284, in exec
class_weight = class_weights)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1239, in fit
validation_freq=validation_freq)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training_arrays.py", line 196, in fit_loop
outs = fit_function(ins_batch)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/backend.py", line 3476, in __call__
run_metadata=self.run_metadata)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/client/session.py", line 1472, in __call__
run_metadata_ptr)
tensorflow.python.framework.errors_impl.InternalError: Unsupported object type int
Error line from helper.py:
train_history = model_func.fit(train_set_x, train_set_y,
                               epochs=self.config['training_settings']['network_config']['epochs'],
                               batch_size=self.config['training_settings']['network_config']['batch_size'],
                               shuffle=False,
                               validation_data=(validation_set_x, validation_set_y),
                               callbacks=callbacks_list,
                               verbose=self.paras.verbose,
                               class_weight=class_weights)
'class_weights' from helper.py:
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(train_set_y),
                                                  y=train_set_y)
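A likely cause, stated as an assumption since the rest of helper.py is not shown: Keras expects class_weight to be a dict mapping class index to weight, while compute_class_weight returns a bare NumPy array, and "Unsupported object type int" typically means the data passed to fit is an object-dtype array of Python ints rather than a numeric array. A sketch of both fixes:

import numpy as np
from sklearn.utils import class_weight

# Make the targets a plain numeric array; object-dtype arrays of Python
# ints are a common source of "Unsupported object type int" in TF 1.x.
train_set_y = np.asarray(train_set_y).astype(np.float32)

weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(train_set_y),
                                            y=train_set_y)
# Keras expects a dict {class_index: weight}; enumerate works when the
# labels are 0..n-1.
class_weights = dict(enumerate(weights))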

Value error while using roboflow object detection Yolov4 pytorch model on custom dataset

We are using Roboflow for object detection with the YOLOv4 PyTorch model on our custom dataset. During training, we get the following error.
Traceback (most recent call last):
File "./pytorch-YOLOv4/train.py", line 447, in <module>
device=device, )
File "./pytorch-YOLOv4/train.py", line 310, in train
for i, batch in enumerate(train_loader):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 345, in __next__
data = self._next_data()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 856, in _next_data
return self._process_data(data)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 881, in _process_data
data.reraise()
File "/usr/local/lib/python3.7/dist-packages/torch/_utils.py", line 394, in reraise
raise self.exc_type(msg)
ValueError: Caught ValueError in DataLoader worker process 7.
Original Traceback (most recent call last):
File "/content/pytorch-YOLOv4/dataset.py", line 382, in __getitem__
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
AttributeError: 'list' object has no attribute 'shape'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/content/pytorch-YOLOv4/dataset.py", line 385, in __getitem__
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
ValueError: could not broadcast input array from shape (0) into shape (0,5)
I don't know the values of your params, but the log points to this line in your code:
out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] \
= out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)]
The first error says your out_bboxes parameter has no shape attribute because it is a plain Python list, not an array. Convert it to the datatype you need before slicing.
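A minimal sketch of that conversion, assuming out_bboxes is a list of [x1, y1, x2, y2, class] rows. Reshaping to (-1, 5) also keeps an empty list at shape (0, 5), which avoids the follow-up ValueError about broadcasting shape (0) into shape (0, 5):

import numpy as np

# Convert the list to an array; an empty list becomes shape (0, 5)
# instead of shape (0,), so the slice assignment still broadcasts.
out_bboxes = np.array(out_bboxes, dtype=np.float32).reshape(-1, 5)
n = min(out_bboxes.shape[0], self.cfg.boxes)
out_bboxes1[:n] = out_bboxes[:n]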

Getting this while using PyTorch transforms ---> TypeError: integer argument expected, got float

I cloned the transfer-learning-library repo and am working on maximum classifier discrepancy. I am trying to change the augmentation but get the following error.
Traceback (most recent call last):
File "mcd.py", line 378, in <module>
main(args)
File "mcd.py", line 145, in main
results = validate(val_loader, G, F1, F2, args)
File "mcd.py", line 290, in validate
for i, (images, target) in enumerate(val_loader):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1203, in _next_data
return self._process_data(data)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1229, in _process_data
data.reraise()
File "/usr/local/lib/python3.7/dist-packages/torch/_utils.py", line 425, in reraise
raise self.exc_type(msg)
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "../../../common/vision/datasets/imagelist.py", line 48, in __getitem__
img = self.transform(img)
File "/usr/local/lib/python3.7/dist-packages/torchvision/transforms/transforms.py", line 60, in __call__
img = t(img)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torchvision/transforms/transforms.py", line 750, in forward
return F.perspective(img, startpoints, endpoints, self.interpolation, fill)
File "/usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional.py", line 647, in perspective
return F_pil.perspective(img, coeffs, interpolation=pil_interpolation, fill=fill)
File "/usr/local/lib/python3.7/dist-packages/torchvision/transforms/functional_pil.py", line 289, in perspective
return img.transform(img.size, Image.PERSPECTIVE, perspective_coeffs, interpolation, **opts)
File "/usr/local/lib/python3.7/dist-packages/PIL/Image.py", line 2371, in transform
im = new(self.mode, size, fillcolor)
File "/usr/local/lib/python3.7/dist-packages/PIL/Image.py", line 2578, in new
return im._new(core.fill(mode, size, color))
TypeError: integer argument expected, got float
The previous code was:
# Data loading code
normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
if args.center_crop:
    train_transform = T.Compose([
        ResizeImage(256),
        T.CenterCrop(224),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalize
    ])
else:
    train_transform = T.Compose([
        ResizeImage(256),
        T.RandomResizedCrop(224),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalize
    ])
val_transform = T.Compose([
    ResizeImage(256),
    T.CenterCrop(224),
    T.ToTensor(),
    normalize
])
I just added T.RandomPerspective(distortion_scale=0.8, p=0.5, fill=0.6) to val_transform.
Before this I also added a few other transforms to train_transform, but still got the same error.
What could be the problem?
The fill argument needs to be an integer: the traceback ends in PIL's Image.new, which rejects a float fill color. This transform does not support the fill parameter for Tensor inputs, so if you want to use fill, apply the transform before ToTensor, while the data is still an integer-valued PIL image, and pass an integer fill value.
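A minimal sketch of the corrected pipeline, reusing ResizeImage and normalize from the code above; fill=153 is an assumed integer stand-in for the intended 0.6 (roughly 0.6 * 255):

val_transform = T.Compose([
    ResizeImage(256),
    T.CenterCrop(224),
    # The input is still a PIL image here, so fill must be an int pixel value.
    T.RandomPerspective(distortion_scale=0.8, p=0.5, fill=153),
    T.ToTensor(),
    normalize
])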

WSQ files not opening with Pillow/wsq when using joblib.Parallel

I am trying to preprocess large amounts of WSQ images for model training using the Pillow and wsq libraries. To speed up my code, I am trying to use Parallel, but this causes an UnidentifiedImageError.
I verified that the files are where they should be and that the function runs without errors in a regular for-loop. Other files (e.g. CSV files) can be opened inside the function without errors, so I presume the error lies in the combination of Parallel and Pillow/wsq. All libraries are up to date. As I am just starting out with Pillow and multiprocessing, I have no idea how to fix this; any help would be highly appreciated.
Code:
from joblib import Parallel, delayed
from PIL import Image
import multiprocessing
import wsq
import numpy as np

def process_image(i):
    path = "/home/user/project/wsq/image_" + str(i) + ".wsq"
    img = np.array(Image.open(path))
    # some preprocessing, saving as npz
    output_path = "/home/user/project/npz/image_" + str(i) + ".npz"
    np.savez_compressed(output_path, img)
    return None

inputs = range(100000)
num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(delayed(process_image)(i) for i in inputs)
Output:
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/user/.local/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
r = call_item()
File "/home/user/.local/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/user/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "preprocess_images.py", line 9, in process_image
img = np.array(Image.open(path))
File "/home/user/.local/lib/python3.8/site-packages/PIL/Image.py", line 2967, in open
raise UnidentifiedImageError(
PIL.UnidentifiedImageError: cannot identify image file '/home/user/project/wsq/image_1.wsq'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "preprocess_images.py", line 18, in <module>
Parallel(n_jobs=num_cores)(delayed(process_image)(i) for i in inputs)
File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 1054, in __call__
self.retrieve()
File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 933, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "/home/user/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 542, in wrap_future_result
return future.result(timeout=timeout)
File "/usr/lib/python3.8/concurrent/futures/_base.py", line 439, in result
return self.__get_result()
File "/usr/lib/python3.8/concurrent/futures/_base.py", line 388, in __get_result
raise self._exception
PIL.UnidentifiedImageError: cannot identify image file '/home/user/project/wsq/image_1.wsq'
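A plausible explanation, offered as an assumption: importing wsq registers the WSQ decoder with Pillow as a side effect of the import, but the loky worker processes that actually run process_image never re-execute the top level of the parent script, so Pillow inside each worker has no WSQ plugin and raises UnidentifiedImageError. Moving the import into the function forces the registration in every worker:

def process_image(i):
    # Imported for its side effect: registering the WSQ plugin with
    # Pillow inside this worker process, not just in the parent.
    import wsq  # noqa: F401
    path = "/home/user/project/wsq/image_" + str(i) + ".wsq"
    img = np.array(Image.open(path))
    # some preprocessing, saving as npz
    output_path = "/home/user/project/npz/image_" + str(i) + ".npz"
    np.savez_compressed(output_path, img)
    return None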