Pytorch customized dataloader - pytorch

I am trying to train a classifier with MNIST dataset using pytorch-lightening.
import pytorch_lightning as pl
from torchvision import transforms
from torchvision.datasets import MNIST, SVHN
from torch.utils.data import DataLoader, random_split
class MNISTData(pl.LightningDataModule):
def __init__(self, data_dir='./', batch_size=256):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.transform = transforms.ToTensor()
def download(self):
MNIST(self.data_dir, train=True, download=True)
MNIST(self.data_dir, train=False, download=True)
def setup(self, stage=None):
if stage == 'fit' or stage is None:
mnist_train = MNIST(self.data_dir, train=True, transform=self.transform)
self.mnist_train, self.mnist_val = random_split(mnist_train, [55000, 5000])
if stage == 'test' or stage is None:
self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)
def train_dataloader(self):
mnist_train = DataLoader(self.mnist_train, batch_size=self.batch_size)
return mnist_train
def val_dataloader(self):
mnist_val = DataLoader(self.mnist_val, batch_size=self.batch_size)
return mnist_val
def test_dataloader(self):
mnist_test = DataLoader(self.mnist_test, batch_size=self.batch_size)
After using MNISTData().setup(), I gained MNISTData().mnist_train, MNISTData().mnist_val, MNISTData().mnist_test whose length are 55000, 5000, 10000 with type of torch.utils.data.dataset.Subset.
But when i call dataloader w.r.t MNISTData().train_dataloader, MNISTData().val_dataloader, MNISTData().test_dataloader I only get DataLoader with 215, 20, None datas in them.
Can someone know the reason or could fix the problem?

As I told in the comments, and Ivan posted in his answer, there was missing return statement:
def test_dataloader(self):
mnist_test = DataLoader(self.mnist_test, batch_size=self.batch_size)
return mnist_test # <<< missing return
As per your comment, if we try:
a = MNISTData()
# skip download, assuming you already have it
a.setup()
b, c, d = a.train_dataloader(), a.val_dataloader(), a.test_dataloader()
# len(b)=215, len(c)=20, len(d)=40
I think your question is why the length of b, c, d are different from the length of the datasets. The answer is that the len() of a DataLoader is equal to the number of batches, not the number of samples, therefore:
import math
batch_size = 256
len(b) = math.ceil(55000 / batch_size) = 215
len(c) = math.ceil(5000 / batch_size) = 20
len(d) = math.ceil(10000 / batch_size) = 40
BTW, we're using math.ceil because DataLoader has drop_last=False by default, otherwise it would be math.floor.

Your test_dataloader function is missing a return statement!
def test_dataloader(self):
mnist_test = DataLoader(self.mnist_test, batch_size=self.batch_size)
return mnist_test
>>> ds = MNISTData()
>>> ds.download()
>>> ds.setup()
Then:
>>> [len(subset) for subset in \
(ds.mnist_train, ds.mnist_val, ds.mnist_test)]
[55000, 5000, 10000]
>>> [len(loader) for loader in \
(ds.train_dataloader(), ds.val_dataloader(), ds.test_dataloader())]
[215, 20, 40]

Others pointing out the fact that you are missing a return is the test_dataloader() is certainly correct.
Judging by how the question is framed, it seems you are confused about the length of a Dataset and a DataLoader.
len(Dataset(..)) returns the number of data samples in your dataset.
whereas, len(DataLoader(ds, ...)) returns the number of batches; and that depends of how much batch_size=... you requested, whether you want to drop_last batch etc. The exact calculations are provided correctly by #Berriel

Related

Optimizing Pytorch implementation of mix-up augmentation

So I have this code here for implementing mix-up augmentation. It's incredibly slow and I'm not sure how to make it faster. It seems like there are some operations that are unavoidable and just by nature slow like scaling images by the weight which is 0.5 then summing up each cell seems like a very slow and unavoidable operation. I'm applying this to Reinforcement Learning so I could be augmenting 64 million images, which is why I need it to be a lot faster.
Note: Here's the original author's implementation but I would assume it's equally as slow as it's essentially the same.
import torch
import utils
import os
import torch.nn.functional as F
import torchvision.transforms as TF
import torchvision.datasets as datasets
dataloader = None
data_iter = None
def _load_data(
sub_path: str, batch_size: int = 256, image_size: int = 84, num_workers: int = 16
):
global data_iter, dataloader
for data_dir in utils.load_config("datasets"):
if os.path.exists(data_dir):
fp = os.path.join(data_dir, sub_path)
if not os.path.exists(fp):
print(f"Warning: path {fp} does not exist, falling back to {data_dir}")
dataloader = torch.utils.data.DataLoader(
datasets.ImageFolder(
fp,
TF.Compose(
[
TF.RandomResizedCrop(image_size),
TF.RandomHorizontalFlip(),
TF.ToTensor(),
]
),
),
batch_size=batch_size,
shuffle=True,
num_workers=num_workers,
pin_memory=True,
)
data_iter = iter(dataloader)
break
if data_iter is None:
raise FileNotFoundError(
"failed to find image data at any of the specified paths"
)
print("Loaded dataset from", data_dir)
def _load_places(batch_size=256, image_size=84, num_workers=16, use_val=False):
partition = "val" if use_val else "train"
sub_path = os.path.join("places365_standard", partition)
print(f"Loading {partition} partition of places365_standard...")
_load_data(
sub_path=sub_path,
batch_size=batch_size,
image_size=image_size,
num_workers=num_workers,
)
def _load_coco(batch_size=256, image_size=84, num_workers=16, use_val=False):
sub_path = "COCO"
print(f"Loading COCO 2017 Val...")
_load_data(
sub_path=sub_path,
batch_size=batch_size,
image_size=image_size,
num_workers=num_workers,
)
def _get_data_batch(batch_size):
global data_iter
try:
imgs, _ = next(data_iter)
if imgs.size(0) < batch_size:
data_iter = iter(dataloader)
imgs, _ = next(data_iter)
except StopIteration:
data_iter = iter(dataloader)
imgs, _ = next(data_iter)
return imgs.cuda()
def load_dataloader(batch_size, image_size, dataset="coco"):
if dataset == "places365_standard":
if dataloader is None:
_load_places(batch_size=batch_size, image_size=image_size)
elif dataset == "coco":
if dataloader is None:
_load_coco(batch_size=batch_size, image_size=image_size)
else:
raise NotImplementedError(
f'overlay has not been implemented for dataset "{dataset}"'
)
def random_mixup(x, dataset="coco"):
"""Randomly overlay an image from Places or COCO"""
global data_iter
alpha = 0.5
load_dataloader(batch_size=x.size(0), image_size=x.size(-1), dataset=dataset)
imgs = _get_data_batch(batch_size=x.size(0)).repeat(1, x.size(1) // 3, 1, 1)
return ((1 - alpha) * (x / 255.0) + (alpha) * imgs) * 255.0
To optimize you need to use the GPU. You need to use PyTorch tensors and operations.
Example of how to do this with PyTorch:
import torch
import torch.nn.functional as F
import torchvision.transforms as TF
import torchvision.datasets as datasets
# Load the data
dataloader = torch.utils.data.DataLoader(
datasets.ImageFolder(
'path/to/data',
TF.Compose(
[
TF.RandomResizedCrop(84),
TF.RandomHorizontalFlip(),
TF.ToTensor(),
]
),
),
batch_size=256,
shuffle=True,
num_workers=16,
pin_memory=True,
)
# Get a batch of data
imgs, _ = next(iter(dataloader))
# Create a tensor of random weights
alpha = torch.rand(imgs.size(0), 1, 1, 1)
# Create a tensor of random indices
indices = torch.randint(0, imgs.size(0), (imgs.size(0),))
# Create a tensor of random images
imgs2 = imgs[indices]
# Mix the images
imgs = (1 - alpha) * imgs + alpha * imgs2
# You can also do this with a single line of code:
imgs = (1 - alpha) * imgs + alpha * imgs[torch.randint(0, imgs.size(0), (imgs.size(0),))]

torch - subsample each dataset differently and concatenate them

I have two datasets, but one is larger than the other and I want to subsample it (resample in each epoch).
I probably cannot use dataloader argument sampler, as I would pass to Dataloader the already concatenated dataset.
How do I achieve this simply?
I think one solution would be to write a class SubsampledDataset(IterableDataset) which would resample every time __iter__ is called (each epoch).
(Or better use a map-style dataset, but is there a hook that gets called every epoch, like __iter__ gets?)
This is what I have so far (untested). Usage:
dataset1: Any = ...
# subsample original_dataset2, so that it is equally large in each epoch
dataset2 = RandomSampledDataset(original_dataset2, num_samples=len(dataset1))
concat_dataset = ConcatDataset([dataset1, dataset2])
data_loader = torch.utils.data.DataLoader(
concat_dataset,
sampler=RandomSamplerWithNewEpochHook(dataset2.new_epoch_hook, concat_dataset)
)
The result is that the concat_dataset will be shuffled each epoch (RandomSampler), in addition, the dataset2 component is a new sample of the (possibly larger) original_dataset2, different in each epoch.
You can add more datasets to be subsampled by doing instead of:
sampler=RandomSamplerWithNewEpochHook(dataset2.new_epoch_hook
this:
sampler=RandomSamplerWithNewEpochHook(lambda: dataset2.new_epoch_hook and dataset3.new_epoch_hook and dataset4.new_epoch_hook, ...
Code:
class RandomSamplerWithNewEpochHook(RandomSampler):
""" Wraps torch.RandomSampler and calls supplied new_epoch_hook before each epoch. """
def __init__(self, new_epoch_hook: Callable, data_source: Sized, replacement: bool = False,
num_samples: Optional[int] = None, generator=None):
super().__init__(data_source, replacement, num_samples, generator)
self.new_epoch_hook = new_epoch_hook
def __iter__(self):
self.new_epoch_hook()
return super().__iter__()
class RandomSampledDataset(Dataset):
""" Subsamples a dataset. The sample is different in each epoch.
This helps when concatenating datasets, as the subsampling rate can be different for each dataset.
Call new_epoch_hook before each epoch. (This can be done using e.g. RandomSamplerWithNewEpochHook.)
This would be arguably harder to achieve with a concatenated dataset and a sampler argument to Dataloader. The
sampler would have to be aware of the indices of subdatasets' items in the concatenated dataset, of the subsampling
for each subdataset."""
def __init__(self, dataset, num_samples, transform=lambda im: im):
self.dataset = dataset
self.transform = transform
self.num_samples = num_samples
self.sampler = RandomSampler(dataset, num_samples=num_samples)
self.current_epoch_samples = None
def new_epoch_hook(self):
self.current_epoch_samples = torch.tensor(iter(self.sampler), dtype=torch.int)
def __len__(self):
return self.num_samples
def __getitem__(self, item):
if item < 0 or item >= len(self):
raise IndexError
img = self.dataset[self.current_epoch_samples[item].item()]
return self.transform(img)
You can stop to iterate by raising StopIteration. This error is caught by Dataloader and simply stop the iteration. So you can do something like that:
class SubDataset(Dataset):
"""SubDataset class."""
def __init__(self, dataset, length):
self.dataset = dataset
self.elem = 0
self.length = length
def __getitem__(self, index):
self.elem += 1
if self.elem > self.length:
self.elem = 0
raise StopIteration # caught by DataLoader
return self.dataset[index]
def __len__(self):
return len(self.dataset)
if __name__ == '__main__':
torch.manual_seed(0)
dataloader = DataLoader(SubDataset(torch.arange(10), 5), shuffle=True)
for _ in range(3):
for x in dataloader:
print(x)
print(len(dataloader)) # 10!!
Output:
Note that setting __len__ to self.length will cause a problem because dataloader will use only indices between 0 and length-1 (that is not what you want). Unfortunately I found nothing to set the actually length without having this behaviour (due to Dataloader restriction). Thus be careful: len(dataset) is the original length and dataset.length is the new length.

how to get the real shape of batch_size which is none in keras

When implementing a custom layer in Keras, I need to know the real size of batch_size. my shape is (?,20).
questions:
1. What is the best way to change (?,20) to (batch_size,20).
I have looked into this but it can not adjust to my problem.
I can pass the batch_size to this layer. In that case, I need to reshape (?,20) to (batch_size,20), how can I do that?
2. Is it the best way to that, or is there any builtin function that can get the real batch_size while building and running the model?
This is my layer:
from scipy.stats import entropy
from keras.engine import Layer
import keras.backend as K
import numpy as np
class measure(Layer):
def __init__(self, beta, **kwargs):
self.beta = beta
self.uses_learning_phase = True
self.supports_masking = True
super(measure, self).__init__(**kwargs)
def call(self, x):
return K.in_train_phase(self.rev_entropy(x, self.beta), x)
def get_config(self):
config = {'beta': self.beta}
base_config = super(measure, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def rev_entropy(self, x, beta):
entropy_p_t_w = np.apply_along_axis(entropy, 1, x)
con = (beta / (1 + entropy_p_t_w)) ** 1.5
new_f_w_t = x * (con.reshape(con.shape[0], 1))
norm_const = 1e-30 + np.sum(new_f_w_t, axis=0)
for t in range(norm_const.shape[0]):
new_f_w_t[:, t] /= norm_const[t]
return new_f_w_t
And here is where I call this layer:
encoded = measure(beta=0.08)(encoded)
I am also using fit_generator if it can help at all:
autoencoder.fit_generator(train_gen, steps_per_epoch=num_train_steps, epochs=NUM_EPOCHS,
validation_data=test_gen, validation_steps=num_test_steps, callbacks=[checkpoint])
The dimension of the x passed to the layer is (?,20) and that's why I can not do my calculation.
Thanks:)

Batchsize in input shape of chainer CNN

I have a training set of 9957 images. The training set has shape (9957, 3, 60, 80).
Is batchsize required when putting training set to model?
If required can the original shape be considered correct for fitting to conv2D layer or do I need to add batchsize to input_shape?
X_train.shape
(9957, 60,80,3)
from chainer.datasets import split_dataset_random
from chainer.dataset import DatasetMixin
import numpy as np
class MyDataset(DatasetMixin):
def __init__(self, X, labels):
super(MyDataset, self).__init__()
self.X_ = X
self.labels_ = labels
self.size_ = X.shape[0]
def __len__(self):
return self.size_
def get_example(self, i):
return np.transpose(self.X_[i, ...], (2, 0, 1)), self.labels_[i]
batch_size = 3
label_train = y_trainHot1
dataset = MyDataset(X_train1, label_train)
dataset_train, valid = split_dataset_random(dataset, 8000, seed=0)
train_iter = iterators.SerialIterator(dataset_train, batch_size)
valid_iter = iterators.SerialIterator(valid, batch_size, repeat=False,
shuffle=False)
The code below tells you that you do not have to care the batch-size by yourself. You just use DatsetMixin and SerialIterator as is instructed in the tutorial of chainer.
from chainer.dataset import DatasetMixin
from chainer.iterators import SerialIterator
import numpy as np
NUM_IMAGES = 9957
NUM_CHANNELS = 3 # RGB
IMAGE_WIDTH = 60
IMAGE_HEIGHT = 80
NUM_CLASSES = 10
BATCH_SIZE = 32
TRAIN_SIZE = min(8000, int(NUM_IMAGES * 0.9))
images = np.random.rand(NUM_IMAGES, NUM_CHANNELS, IMAGE_WIDTH, IMAGE_HEIGHT)
labels = np.random.randint(0, NUM_CLASSES, (NUM_IMAGES,))
class MyDataset(DatasetMixin):
def __init__(self, images_, labels_):
# note: input arg.'s tailing underscore is just to avoid shadowing
super(MyDataset, self).__init__()
self.images_ = images_
self.labels_ = labels_
self.size_ = len(labels_)
def __len__(self):
return self.size_
def get_example(self, i):
return self.images_[i, ...], self.labels_[i]
dataset_train = MyDataset(images[:TRAIN_SIZE, ...], labels[:TRAIN_SIZE])
dataset_valid = MyDataset(images[TRAIN_SIZE:, ...], labels[TRAIN_SIZE:])
train_iter = SerialIterator(dataset_train, BATCH_SIZE)
valid_iter = SerialIterator(dataset_valid, BATCH_SIZE, repeat=False, shuffle=False)
###############################################################################
"""This block is just for the confirmation.
.. note: NOT recommended to call :func:`concat_examples` in your code.
Use :class:`chainer.updaters.StandardUpdater` instead.
"""
from chainer.dataset import concat_examples
batch_image, batch_label = concat_examples(next(train_iter))
print("batch_image.shape\n{}".format(batch_image.shape))
print("batch_label.shape\n{}".format(batch_label.shape))
Output
batch_image.shape
(32, 3, 60, 80)
batch_label.shape
(32,)
It should be noted that chainer.dataset.concat_example is a little bit tricky part. Usually, the users do not pay attention to this function, if you use StandardUpdater which conceals the native function chainer.dataset.concat_example.
Since chainer is designed on the scheme of Trainer, (Standard)Updater, some Optimizer, (Serial)Iterator and Dataset(Mixin), if you do not follow this scheme, you have to dive into the sea of chainer source code.

PyTorch: unable to use the Gloo backend

I am trying to use PyTorch to train a model in a (multi-CPU) distributed manner. However, when I ran the code, I got the following error message
RuntimeError: the Gloo backend is not available; try to recompile the THD package with Gloo support at /opt/conda/conda-bld/pytorch-cpu_1544218188686/work/torch/lib/THD/process_group/General.cpp:20
I tried this both on my own Macbook and the university's slurm cluster (Red Hat 4.8.5-16). The Python version is 3.6. I installed PyTorch using the command conda install pytorch-cpu torchvision-cpu -c pytorch (the version without CUDA support).
I was wondering if I have to re-install PyTorch from the source or install Gloo manually. I was a little confused since according to PyTorch's documentation,
Since version 0.2.0, the Gloo backend is automatically included with the pre-compiled binaries of PyTorch
The code is exactly the example code available at https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py
import os
import torch
import torch.distributed.deprecated as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from math import ceil
from random import Random
from torch.multiprocessing import Process
from torch.autograd import Variable
from torchvision import datasets, transforms
class Partition(object):
""" Dataset-like object, but only access a subset of it. """
def __init__(self, data, index):
self.data = data
self.index = index
def __len__(self):
return len(self.index)
def __getitem__(self, index):
data_idx = self.index[index]
return self.data[data_idx]
class DataPartitioner(object):
""" Partitions a dataset into different chuncks. """
def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234):
self.data = data
self.partitions = []
rng = Random()
rng.seed(seed)
data_len = len(data)
indexes = [x for x in range(0, data_len)]
rng.shuffle(indexes)
for frac in sizes:
part_len = int(frac * data_len)
self.partitions.append(indexes[0:part_len])
indexes = indexes[part_len:]
def use(self, partition):
return Partition(self.data, self.partitions[partition])
class Net(nn.Module):
""" Network architecture. """
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
def partition_dataset():
""" Partitioning MNIST """
dataset = datasets.MNIST(
'./data',
train=True,
download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307, ), (0.3081, ))
]))
size = dist.get_world_size()
bsz = int(128 / float(size))
partition_sizes = [1.0 / size for _ in range(size)]
partition = DataPartitioner(dataset, partition_sizes)
partition = partition.use(dist.get_rank())
train_set = torch.utils.data.DataLoader(
partition, batch_size=bsz, shuffle=True)
return train_set, bsz
def average_gradients(model):
""" Gradient averaging. """
size = float(dist.get_world_size())
for param in model.parameters():
dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
param.grad.data /= size
def run(rank, size):
""" Distributed Synchronous SGD Example """
torch.manual_seed(1234)
train_set, bsz = partition_dataset()
model = Net()
model = model
# model = model.cuda(rank)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
num_batches = ceil(len(train_set.dataset) / float(bsz))
for epoch in range(10):
epoch_loss = 0.0
for data, target in train_set:
data, target = Variable(data), Variable(target)
# data, target = Variable(data.cuda(rank)), Variable(target.cuda(rank))
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
epoch_loss += loss.data.item()
loss.backward()
average_gradients(model)
optimizer.step()
print('Rank ',
dist.get_rank(), ', epoch ', epoch, ': ',
epoch_loss / num_batches)
def init_processes(rank, size, fn, backend='gloo'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend, rank=rank, world_size=size)
fn(rank, size)
if __name__ == "__main__":
size = 2
processes = []
for rank in range(size):
p = Process(target=init_processes, args=(rank, size, run))
p.start()
processes.append(p)
for p in processes:
p.join()

Resources