PyTorch Dataloader freezes with num_workers > 0 - pytorch

The following dataset class / dataloader combination only works with num_workers = 0, and I'm not sure why. Other notebooks in the same environment do work with num_workers > 0. This has been bothering me for months!
Here is the class that does not work. There is no error message; the call to next(iter(train_dl)) just runs indefinitely, whereas with num_workers = 0 it takes about a second.
class SegmentationDataSet(data.Dataset):
    def __init__(self, fnames, rle_df=None, path=train_val_dir):
        self.fnames = fnames
        self.rle_df = rle_df
        self.path = path

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, index: int):
        img_id = self.fnames[index]
        mask = None
        im = torchvision.io.read_image(self.path + img_id).float()
        if self.rle_df is not None:
            rle = self.rle_df.loc[self.rle_df['id'] == img_id]['rle']
            if not pd.isnull(rle).values[0]:
                rle = rle.values[0]
                mask = rle2mask(rle, [1024, 1024])
                mask = torch.from_numpy(np.expand_dims(mask, 0))
            else:
                mask = torch.zeros([1, 1024, 1024])
        return self.transform(im, mask)

    def transform(self, im, mask):
        im = im / 255
        im = torchvision.transforms.Resize((512, 512))(im)
        if mask is not None:
            mask = torchvision.transforms.Resize((512, 512))(mask)
            return im, mask
        else:
            return im
In contrast, other notebooks using torchvision.datasets.ImageFolder(folder, transform) do work with num_workers > 0.
Any advice on how to make this compatible with async data loading, or other code feedback, would be appreciated.
Python version 3.9.7
PyTorch version 1.10.1+cu113
Windows 11

After running into a similar problem, it looks like the issue has to do with how Jupyter notebooks handle multiprocessing on Windows (source). To work around it, try running the code in a Python script, or, if you need it in a notebook, run the notebook from WSL.
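For reference, a minimal sketch of the script-based workaround. On Windows the DataLoader workers are spawned rather than forked, so the entry point must be guarded with if __name__ == '__main__': and the dataset class must live in an importable module; the file and module names below are illustrative, and the data placeholders need filling in:
# train_check.py -- run with `python train_check.py` instead of inside the notebook
import torch
from torch.utils import data

from segmentation_data import SegmentationDataSet  # the class above, saved in a .py module

if __name__ == '__main__':  # required for worker processes on Windows (spawn start method)
    fnames = [...]   # your list of image file names
    rle_df = ...     # your RLE dataframe
    train_ds = SegmentationDataSet(fnames, rle_df=rle_df)
    train_dl = data.DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=4)
    im, mask = next(iter(train_dl))  # should now return promptly with num_workers > 0
    print(im.shape, mask.shape)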

Related

Use of pytorch dataset for model inference - GPU

I am running T5-base-grammar-correction for grammar correction on the text column of my dataframe:
from happytransformer import HappyTextToText
from happytransformer import TTSettings
from tqdm.notebook import tqdm

tqdm.pandas()

happy_tt = HappyTextToText("T5", "./t5-base-grammar-correction")
beam_settings = TTSettings(num_beams=5, min_length=1, max_length=30)

def grammar_pipeline(text):
    text = "gec: " + text
    result = happy_tt.generate_text(text, args=beam_settings)
    return result.text

df['new_text'] = df['original_text'].progress_apply(grammar_pipeline)
The pandas apply call runs and produces the required results, but it is quite slow.
I also get the warning below while executing the code:
/home/.local/lib/python3.6/site-packages/transformers/pipelines/base.py:908: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
I have access to a GPU. Can somebody provide some pointers to speed up the execution and make full use of the GPU?
--------------------------------EDIT---------------------------------
I tried using a pytorch Dataset in the way below, but the processing is still slow:
class CustomD(Dataset):
    def __init__(self, text):
        self.text = text
        self.len = text.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        text = "gec: " + self.text[idx]
        result = happy_tt.generate_text(text, args=beam_settings)
        return result.text

TD = CustomD(df.original_text)
final_data = DataLoader(dataset=TD, batch_size=10, shuffle=False)

list_modified = []
for (idx, batch) in enumerate(final_data):
    list_modified.append(batch)

flat_list = [item for sublist in list_modified for item in sublist]
df["new_text"] = flat_list

Pytorch: GPU not fully utilized while training custom model on custom dataset

I'm trying to train my own model, but for some reason my GPU is not fully utilized. Is there any way to solve this?
Here is the snippet of my Dataset.
I have already tried pin_memory=True, but it is still not working.
Environment: torch 1.8 + CUDA 11.1 + RTX 3090
class ImageDataset(Dataset):
    def __init__(self, data_path, transform=None, image_size=512):
        self.train_data_path = data_path['train']
        self.train_label_path = data_path['label']
        self.train_lables = os.listdir(self.train_label_path)
        self.train_data = os.listdir(self.train_data_path)
        self.transform = transform
        self.image_size = to_2tuple(image_size)

    def __len__(self):
        return len(self.train_data)

    def __getitem__(self, indx):
        if indx >= len(self.train_data):
            raise Exception("Index should be less than {}".format(len(self.train_data)))
        image = Image.open(os.path.join(self.train_data_path, self.train_data[indx]))
        final_label = Image.open(os.path.join(self.train_label_path, self.train_lables[indx]))
        image = self.transform(image)
        final_label = self.transform(final_label)
        return image, final_label
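Low GPU utilisation during training usually means the GPU is being starved by the data pipeline (image decoding and transforms run on the CPU) rather than by the model. A hedged sketch of the usual DataLoader knobs to try, reusing the dataset above; the values are illustrative starting points, not tuned for this setup:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    ImageDataset(data_path, transform=transform),
    batch_size=16,
    shuffle=True,
    num_workers=8,            # decode and augment images in parallel worker processes
    pin_memory=True,          # page-locked host memory speeds up host-to-GPU copies
    persistent_workers=True,  # keep workers alive between epochs (torch >= 1.7)
    prefetch_factor=4,        # batches pre-loaded per worker
)

# Then overlap the copy with compute in the training loop:
# images = images.to(device, non_blocking=True)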

Pytorch network with variable number of hidden layers

I want to create a class that builds a simple network with X fully connected layers, where X is an input given by the user. I tried this using setattr/getattr, but for some reason it is not working.
class MLP(nn.Module):
    def __init__(self, in_size, out_size, n_layers, hidden_size):
        super(MLP, self).__init__()
        self.n_layers = n_layers
        for i in range(n_layers):
            if i == 0:
                layer_in_size = in_size
            else:
                layer_in_size = hidden_size
            if i == (n_layers - 1):
                layer_out_size = out_size
            else:
                layer_out_size = hidden_size
            setattr(self, 'dense_{}'.format(i), nn.Linear(layer_in_size, layer_out_size))

    def forward(self, x):
        out = x
        for i in range(self.n_layers):
            if i == (self.n_layers - 1):
                out = getattr(self, 'dense_{}'.format(i), out)
            else:
                out = F.relu(getattr(self, 'dense_{}'.format(i), out))
        return out
This is the error I'm getting when trying a forward pass with the net (the traceback was posted as a screenshot and is not reproduced here). Any insight into what the issue is would be helpful.
This seems like a problem with the forward implementation around the mod2 function. Try the PyTorch functions (torch.fmod and torch.remainder), or, if you don't need backprop capabilities, call .detach() before the mod2 function.
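Separately, note that the posted forward pass never actually calls the layers: getattr(self, name, out) treats out as the default value and returns the layer object itself rather than applying it. A minimal corrected sketch of the loop, keeping the setattr-based registration from the question:
def forward(self, x):
    out = x
    for i in range(self.n_layers):
        layer = getattr(self, 'dense_{}'.format(i))
        out = layer(out)          # actually apply the layer
        if i != (self.n_layers - 1):
            out = F.relu(out)     # ReLU on all but the last layer
    return out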

Pytorch Dataset for video

Hi, I made a video-frame loader Dataset to be fed into a PyTorch model. I want to sample frames from a video, and the frames should be uniformly spaced across each video. This is the class I came up with. I was wondering whether there is a better method to speed up the sampling process.
Do you have any suggestions, especially for the read_video method?
Thanks
import torch
import torchvision as tv
import cv2
import numpy as np
from pathlib import Path

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class VideoLoader(torch.utils.data.Dataset):
    def __init__(self, data_path, classes, transforms=None, max_frames=None, frames_ratio=None):
        super(VideoLoader, self).__init__()
        self.data_path = data_path
        self.classes = classes
        self.frames_ratio = frames_ratio
        self.transforms = transforms
        self.max_frames = max_frames

    def read_video(self, path):
        frames = []
        vc = cv2.VideoCapture(path)
        total_frames = int(vc.get(cv2.CAP_PROP_FRAME_COUNT))
        if self.frames_ratio:
            if type(self.frames_ratio) is float:
                frames_to_pick = int(total_frames * self.frames_ratio)
            else:
                frames_to_pick = self.frames_ratio
        else:
            frames_to_pick = total_frames
        idxs = np.linspace(0, total_frames, frames_to_pick, endpoint=False)
        for i in idxs:
            ok, f = vc.read()
            if ok:
                f = tv.transforms.ToTensor()(f)
                f = self.transforms(f) if self.transforms else f
                frames.append(f)
                vc.set(cv2.CAP_PROP_POS_FRAMES, i)
                if self.max_frames and len(frames) == self.max_frames:
                    break
            else:
                break
        vc.release()
        return torch.stack(frames)

    def __getitem__(self, index):
        v_path, label = self.data_path[index]
        return self.read_video(v_path), self.classes[label]

    def __len__(self):
        return len(self.data_path)
Because you can't really seek through a video in parallel, there's not much you can do to speed up the sampling process locally. I personally had trouble with this problem, which is why I started building a simple API for it called Sieve. You can upload data directly to Sieve (either from a cloud bucket or from local storage) and it will quickly cut up all the frames for you and even mark them with things like motion, people, objects, and more. It parallelizes using serverless functions in the cloud, which makes it fast even for hours or days of footage.
You can then quickly export from Sieve using the dashboard, which gives you a curl command to download the exact samples you want.
Here's a helpful repo: https://github.com/Sieve-Data/automatic-video-processing
If you are happy with extracting the frames of each video to disk beforehand, this library is exactly what you're looking for:
Video-Dataset-Loading-PyTorch on GitHub: https://github.com/RaivoKoot/Video-Dataset-Loading-Pytorch
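If you do go the pre-extraction route, here is a minimal sketch of the extraction step with OpenCV; the directory layout and file-naming scheme are illustrative, and the uniform-sampling logic mirrors the question's np.linspace approach (note the seek happens before the read, so the saved frames match the sampled indices):
import cv2
import numpy as np
from pathlib import Path

def extract_frames(video_path, out_dir, frames_to_pick=32):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    vc = cv2.VideoCapture(str(video_path))
    total = int(vc.get(cv2.CAP_PROP_FRAME_COUNT))
    # uniformly spaced frame indices, as in the question
    for n, i in enumerate(np.linspace(0, total, frames_to_pick, endpoint=False)):
        vc.set(cv2.CAP_PROP_POS_FRAMES, int(i))  # seek first, then read
        ok, frame = vc.read()
        if not ok:
            break
        cv2.imwrite(str(out_dir / "frame_{:05d}.jpg".format(n)), frame)
    vc.release()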

Pytorch dynamic number of layers?

I am trying to specify a dynamic number of layers, which I seem to be doing wrong.
My issue is that when I define the 100 layers as shown here, I get an error in the forward step.
But when I define a layer directly as an attribute, it works.
Below is a simplified example:
class PredictFromEmbeddParaSmall(LightningModule):
    def __init__(self, hyperparams={'lr': 0.0001}):
        super(PredictFromEmbeddParaSmall, self).__init__()
        # Input is something like tensor.size=[768*100]
        self.TO_ILLUSTRATE = nn.Linear(768, 5)
        self.enc_red = []
        for i in range(100):
            self.enc_red.append(nn.Linear(768, 5))
        # gather the layers, output sth
        self.dense_simple1 = nn.Linear(5 * 100, 2)
        self.output = nn.Sigmoid()

    def forward(self, x):
        # first input to enc_red
        x_vecs = []
        for i in range(self.para_count):
            layer = self.enc_red[i]
            # The first dim is the batch size here, output is correct
            processed_slice = x[:, i * 768:(i + 1) * 768]
            # This works and gives an output of size 5
            rand = self.TO_ILLUSTRATE(processed_slice)
            # This will fail? Error below
            ret = layer(processed_slice)
            # more things happen here which we can ignore, since we fail earlier
I get this error when executing ret = layer(processed_slice):
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_addmm
Is there a smarter way to program this, or a way to solve the error?
You should use a ModuleList from PyTorch instead of a plain Python list: https://pytorch.org/docs/master/generated/torch.nn.ModuleList.html. That is because PyTorch has to keep track of all the modules in your model; if you just store them in a plain list, they are not registered as submodules, so calls like .cuda() and .parameters() skip them, which is what produces the device error you faced.
Your code should be something like this:
class PredictFromEmbeddParaSmall(LightningModule):
    def __init__(self, hyperparams={'lr': 0.0001}):
        super(PredictFromEmbeddParaSmall, self).__init__()
        # Input is something like tensor.size=[768*100]
        self.TO_ILLUSTRATE = nn.Linear(768, 5)
        self.enc_red = nn.ModuleList()  # << MODIFIED LINE <<
        for i in range(100):
            self.enc_red.append(nn.Linear(768, 5))
        # gather the layers, output sth
        self.dense_simple1 = nn.Linear(5 * 100, 2)
        self.output = nn.Sigmoid()

    def forward(self, x):
        # first input to enc_red
        x_vecs = []
        for i in range(self.para_count):
            layer = self.enc_red[i]
            # The first dim is the batch size here, output is correct
            processed_slice = x[:, i * 768:(i + 1) * 768]
            # This works and gives an output of size 5
            rand = self.TO_ILLUSTRATE(processed_slice)
            # This now works as well
            ret = layer(processed_slice)
            # more things happen here which we can ignore
Then it should work all right!
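The underlying difference is easy to see in isolation. A minimal self-contained sketch (the class names here are illustrative) showing that modules held in a plain Python list are invisible to .parameters(), while a ModuleList registers them so .to(device) moves them with the model:
import torch
from torch import nn

class WithList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(4, 4)]  # plain list: layer is NOT registered

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4)])  # registered submodule

print(len(list(WithList().parameters())))        # 0 -- .to(device) would skip the layer
print(len(list(WithModuleList().parameters())))  # 2 -- weight and bias move with the model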
Edit: an alternative way.
Instead of ModuleList you can also use nn.Sequential, which lets you avoid the for loop in the forward pass. It also means you will not have access to intermediate activations, so this is not the solution for you if you need them.
class PredictFromEmbeddParaSmall(LightningModule):
    def __init__(self, hyperparams={'lr': 0.0001}):
        super(PredictFromEmbeddParaSmall, self).__init__()
        # Input is something like tensor.size=[768*100]
        self.TO_ILLUSTRATE = nn.Linear(768, 5)
        enc_red = []
        for i in range(100):
            enc_red.append(nn.Linear(768, 5))
        self.enc_red = nn.Sequential(*enc_red)  # << MODIFIED LINE <<
        # gather the layers, output sth
        self.dense_simple1 = nn.Linear(5 * 100, 2)
        self.output = nn.Sigmoid()

    def forward(self, x):
        # first input to enc_red
        x_vecs = []
        out = self.enc_red(x)  # << MODIFIED LINE <<
A slightly more adjustable solution, which comes down to a matter of taste or to the complexity of your exact situation, was posted here.
For reference, I post an adjusted version of the code:
import torch
from torch import nn, optim
from torch.nn.modules import Module

class Model(nn.Module):
    def __init__(self, input_size, layers_data: list, learning_rate=0.01, optimizer=optim.Adam):
        super().__init__()
        self.layers = nn.ModuleList()
        self.input_size = input_size  # Can be useful later ...
        for size, activation in layers_data:
            self.layers.append(nn.Linear(input_size, size))
            input_size = size  # For the next layer
            if activation is not None:
                assert isinstance(activation, Module), \
                    "Each tuple should contain a size (int) and a torch.nn.modules.Module."
                self.layers.append(activation)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.learning_rate = learning_rate
        self.optimizer = optimizer(params=self.parameters(), lr=learning_rate)

    def forward(self, input_data):
        for layer in self.layers:
            input_data = layer(input_data)
        return input_data

# test that the net is working properly
if __name__ == "__main__":
    data_size = 5
    layer1, layer2 = 10, 10
    output_size = 2
    data = torch.randn(data_size)
    mlp = Model(data_size, [(layer1, nn.ReLU()), (layer2, nn.ReLU()), (output_size, nn.Sigmoid())])
    output = mlp(data)
    print("done")
