BiLSTM PyTorch model.cuda() gives error on GPU

import torch
import torchwordemb
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(1000, 100)
        self.lstm = nn.LSTM(100, 30, num_layers=1, bidirectional=True)
        self.out_layer = nn.Linear(60, 2)

    def forward(self, x):
        x = x.t()
        batch_size = x.size(1)
        emb = self.embedding(x)
        print emb.size()
        hidden = self.init_hidden(batch_size)
        lstm_out, hidden = self.lstm(emb.view(len(emb), batch_size, -1), hidden)
        print 'LSTM Out: ', lstm_out[-1].size()
        out_layer = self.out_layer(lstm_out[-1])
        return out_layer

    def init_hidden(self, batch_size):
        return (create_variable(torch.zeros(2, batch_size, 30)),
                create_variable(torch.zeros(2, batch_size, 30)))
x = [[2, 30, 40, 1, 0], [20, 3, 5, 10, 3], [5, 2, 4, 80, 1]]
def create_variable(tensor):
    # Do cuda() before wrapping with Variable
    if torch.cuda.is_available():
        return Variable(tensor.cuda())
    else:
        return Variable(tensor)
x = create_variable(torch.LongTensor(x))
y = create_variable(torch.LongTensor([0,1,1]))
model = Net()
loss_function = nn.NLLLoss()
#print 'Model parameter ', model.parameters()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

if torch.cuda.is_available():
    model.cuda()

for epoch in range(10):
    output = model(x)
    print 'output view: ', output.size()
    print 'y: ', y.size()
    loss = loss_function(output, y.view(-1))
    # total_loss += loss.data[0]
    print 'Loss function: ', loss.data[0]
    model.zero_grad()
    loss.backward()
    optimizer.step()
Traceback (most recent call last):
  File "cuda_test.py", line 52, in <module>
    model.cuda()
  File "/home1/dhanachandra/anaconda3/envs/my_env27/lib/python2.7/site-packages/torch/nn/modules/module.py", line 216, in cuda
    return self._apply(lambda t: t.cuda(device))
  File "/home1/dhanachandra/anaconda3/envs/my_env27/lib/python2.7/site-packages/torch/nn/modules/module.py", line 146, in _apply
    module._apply(fn)
  File "/home1/dhanachandra/anaconda3/envs/my_env27/lib/python2.7/site-packages/torch/nn/modules/module.py", line 146, in _apply
    module._apply(fn)
  File "/home1/dhanachandra/anaconda3/envs/my_env27/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 123, in _apply
    self.flatten_parameters()
  File "/home1/dhanachandra/anaconda3/envs/my_env27/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 111, in flatten_parameters
    params = rnn.get_parameters(fn, handle, fn.weight_buf)
  File "/home1/dhanachandra/anaconda3/envs/my_env27/lib/python2.7/site-packages/torch/backends/cudnn/rnn.py", line 165, in get_parameters
    assert filter_dim_a.prod() == filter_dim_a[0]
AssertionError
The code runs on CPU without any error, but fails on GPU.
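Since the assertion fires inside cuDNN's flatten_parameters during model.cuda(), a useful first debugging step (a minimal, hedged sketch using the same layer sizes as the model above, not a fix) is to move each submodule to the GPU on its own and see which one reproduces the error:

import torch
import torch.nn as nn

if torch.cuda.is_available():
    # If this alone raises the same AssertionError in flatten_parameters,
    # the problem is isolated to the LSTM / cuDNN combination.
    nn.LSTM(100, 30, num_layers=1, bidirectional=True).cuda()

    # These layers do not go through cuDNN's RNN path and should move cleanly.
    nn.Embedding(1000, 100).cuda()
    nn.Linear(60, 2).cuda()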

Related

AttributeError: 'tuple' object has no attribute 'train_dataloader'

I have three files. In the datamodule file, I create the data and use the basic PyTorch Lightning format. In linear_model, I build a linear regression model based on this page. Finally, I have a train file in which I call the model and try to fit the data. But I am getting this error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
AttributeError: 'tuple' object has no attribute 'train_dataloader'
Sample datamodule file
class DataModuleClass(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.sigma = 5
        self.batch_size = 10
        self.prepare_data()

    def prepare_data(self):
        x = np.random.uniform(0, 10, 10)
        e = np.random.normal(0, self.sigma, len(x))
        y = x + e
        X = np.transpose(np.array([x, e]))
        self.x_train_tensor = torch.from_numpy(X).float().to(device)
        self.y_train_tensor = torch.from_numpy(y).float().to(device)
        training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
        self.training_dataset = training_dataset

    def setup(self):
        data = self.training_dataset
        self.train_data, self.val_data = random_split(data, [8, 2])
        return self.train_data, self.val_data

    def train_dataloader(self):
        return DataLoader(self.train_data)

    def val_dataloader(self):
        return DataLoader(self.val_data)
Sample training file
from . import datamodule, linear_model

model = linear_model.LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()
trainer.fit(model,
            train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
            val_dataloaders=datamodule.DataModuleClass().setup().val_dataloaders())
Let me know if you need more code or explanation.
Update (Based on the comment)
Now I am getting the following error after removing self.prepare_data() from the __init__() of DataModuleClass, removing return self.train_data, self.val_data from setup(), and changing the test file to:
data_module = datamodule.DataModuleClass()
trainer = pl.Trainer()
trainer.fit(model,data_module)
Error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().train_dataloader(),
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/datamodule.py", line 54, in train_dataloader
return DataLoader(self.train_data)
AttributeError: 'DataModuleClass' object has no attribute 'train_data'
Most of the things were correct, except for a few details.
def prepare_data(self):
This function was right, except that it should not return anything.
Another thing was
def setup(self, stage=None):
setup() requires a stage argument, which can default to None when we don't need to switch between separate train and test stages.
Putting everything together, here is the code:
from argparse import ArgumentParser

import numpy as np
import pytorch_lightning as pl
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils.data import random_split, DataLoader, TensorDataset
from torchvision import transforms
class LinearRegression(pl.LightningModule):
    def __init__(
        self,
        input_dim: int = 2,
        output_dim: int = 1,
        bias: bool = True,
        learning_rate: float = 1e-4,
        optimizer: Optimizer = Adam,
        l1_strength: float = 0.0,
        l2_strength: float = 0.0
    ):
        super().__init__()
        self.save_hyperparameters()
        self.optimizer = optimizer
        self.linear = nn.Linear(in_features=self.hparams.input_dim, out_features=self.hparams.output_dim, bias=bias)

    def forward(self, x):
        y_hat = self.linear(x)
        return y_hat

    def training_step(self, batch, batch_idx):
        x, y = batch

        # flatten any input
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='sum')

        # L1 regularizer
        if self.hparams.l1_strength > 0:
            l1_reg = sum(param.abs().sum() for param in self.parameters())
            loss += self.hparams.l1_strength * l1_reg

        # L2 regularizer
        if self.hparams.l2_strength > 0:
            l2_reg = sum(param.pow(2).sum() for param in self.parameters())
            loss += self.hparams.l2_strength * l2_reg

        loss /= x.size(0)

        tensorboard_logs = {'train_mse_loss': loss}
        progress_bar_metrics = tensorboard_logs
        return {'loss': loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)
        return {'val_loss': F.mse_loss(y_hat, y)}

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_mse_loss': val_loss}
        progress_bar_metrics = tensorboard_logs
        return {'val_loss': val_loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def configure_optimizers(self):
        return self.optimizer(self.parameters(), lr=self.hparams.learning_rate)
np.random.seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class DataModuleClass(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.sigma = 5
        self.batch_size = 10

    def prepare_data(self):
        x = np.random.uniform(0, 10, 10)
        e = np.random.normal(0, self.sigma, len(x))
        y = x + e
        X = np.transpose(np.array([x, e]))
        self.x_train_tensor = torch.from_numpy(X).float().to(device)
        self.y_train_tensor = torch.from_numpy(y).float().to(device)
        training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
        self.training_dataset = training_dataset

    def setup(self, stage=None):
        data = self.training_dataset
        self.train_data, self.val_data = random_split(data, [8, 2])

    def train_dataloader(self):
        return DataLoader(self.train_data)

    def val_dataloader(self):
        return DataLoader(self.val_data)

model = LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()
dummy = DataModuleClass()
trainer.fit(model, dummy)
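As a quick sanity check outside the Trainer, the same hooks can also be called by hand in the order Lightning calls them (a minimal sketch reusing the DataModuleClass defined above):

dm = DataModuleClass()
dm.prepare_data()                 # builds dm.training_dataset
dm.setup()                        # splits it into dm.train_data / dm.val_data
batch = next(iter(dm.train_dataloader()))
print(batch)                      # one [features, target] pair (DataLoader batch size defaults to 1)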

Pytorch GAN model doesn't train: matrix multiplication error

I'm trying to build a basic GAN to familiarise myself with Pytorch. I have some (limited) experience with Keras, but since I'm bound to do a larger project in Pytorch, I wanted to explore first using 'basic' networks.
I'm using Pytorch Lightning. I think I've added all necessary components. I tried passing some noise through the generator and the discriminator separately, and I think the output has the expected shape. Nonetheless, I get a runtime error when I try to train the GAN (full traceback below):
RuntimeError: mat1 and mat2 shapes cannot be multiplied (7x9 and 25x1)
I noticed that 7 is the size of the batch (by printing out the batch dimensions), even though I specified batch_size to be 64. Other than that, quite honestly, I don't know where to begin: the error traceback doesn't help me.
Chances are, I made multiple mistakes. However, I'm hoping some of you will be able to spot the current error from the code, since the multiplication error seems to point towards a dimensionality problem somewhere. Here's the code.
import os
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from skimage import io
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.utils import make_grid
from torchvision.transforms import Resize, ToTensor, ToPILImage, Normalize
class DoppelDataset(Dataset):
    """
    Dataset class for face data
    """
    def __init__(self, face_dir: str, transform=None):
        self.face_dir = face_dir
        self.face_paths = os.listdir(face_dir)
        self.transform = transform

    def __len__(self):
        return len(self.face_paths)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        face_path = os.path.join(self.face_dir, self.face_paths[idx])
        face = io.imread(face_path)
        sample = {'image': face}
        if self.transform:
            sample = self.transform(sample['image'])
        return sample
class DoppelDataModule(pl.LightningDataModule):
    def __init__(self, data_dir='../data/faces', batch_size: int = 64, num_workers: int = 0):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.transforms = transforms.Compose([
            ToTensor(),
            Resize(100),
            Normalize(mean=(123.26290927634774, 95.90498110733365, 86.03763122875182),
                      std=(63.20679012922922, 54.86211954409834, 52.31266645797249))
        ])

    def setup(self, stage=None):
        # Initialize dataset
        doppel_data = DoppelDataset(face_dir=self.data_dir, transform=self.transforms)

        # Train/val/test split
        n = len(doppel_data)
        train_size = int(.8 * n)
        val_size = int(.1 * n)
        test_size = n - (train_size + val_size)
        self.train_data, self.val_data, self.test_data = random_split(dataset=doppel_data,
                                                                      lengths=[train_size, val_size, test_size])

    def train_dataloader(self) -> DataLoader:
        return DataLoader(dataset=self.test_data, batch_size=self.batch_size, num_workers=self.num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(dataset=self.val_data, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(dataset=self.test_data, batch_size=self.batch_size, num_workers=self.num_workers)
class DoppelGenerator(nn.Sequential):
    """
    Generator network that produces images based on latent vector
    """
    def __init__(self, latent_dim: int):
        super().__init__()

        def block(in_channels: int, out_channels: int, padding: int = 1, stride: int = 2, bias=False):
            return nn.Sequential(
                nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, kernel_size=4, stride=stride,
                                   padding=padding, bias=bias),
                nn.BatchNorm2d(num_features=out_channels),
                nn.ReLU(True)
            )

        self.model = nn.Sequential(
            block(latent_dim, 512, padding=0, stride=1),
            block(512, 256),
            block(256, 128),
            block(128, 64),
            block(64, 32),
            nn.ConvTranspose2d(32, 3, kernel_size=4, stride=2, padding=1, bias=False),
            nn.Tanh()
        )

    def forward(self, input):
        return self.model(input)
class DoppelDiscriminator(nn.Sequential):
    """
    Discriminator network that classifies images in two categories
    """
    def __init__(self):
        super().__init__()

        def block(in_channels: int, out_channels: int):
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=4, stride=2, padding=1,
                          bias=False),
                nn.BatchNorm2d(num_features=out_channels),
                nn.LeakyReLU(0.2, inplace=True),
            )

        self.model = nn.Sequential(
            block(3, 64),
            block(64, 128),
            block(128, 256),
            block(256, 512),
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0, bias=False),
            nn.Flatten(),
            nn.Linear(25, 1),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.model(input)
class DoppelGAN(pl.LightningModule):
    def __init__(self,
                 channels: int,
                 width: int,
                 height: int,
                 lr: float = 0.0002,
                 b1: float = 0.5,
                 b2: float = 0.999,
                 batch_size: int = 64,
                 **kwargs):
        super().__init__()

        # Save all keyword arguments as hyperparameters, accessible through self.hparams.X
        self.save_hyperparameters()

        # Initialize networks
        # data_shape = (channels, width, height)
        self.generator = DoppelGenerator(latent_dim=self.hparams.latent_dim)
        self.discriminator = DoppelDiscriminator()

        self.validation_z = torch.randn(8, self.hparams.latent_dim, 1, 1)

    def forward(self, input):
        return self.generator(input)

    def adversarial_loss(self, y_hat, y):
        return F.binary_cross_entropy(y_hat, y)

    def training_step(self, batch, batch_idx, optimizer_idx):
        images = batch

        # Sample noise (batch_size, latent_dim, 1, 1)
        z = torch.randn(images.size(0), self.hparams.latent_dim, 1, 1)

        # Train generator
        if optimizer_idx == 0:
            # Generate images (call generator -- see forward -- on latent vector)
            self.generated_images = self(z)

            # Log sampled images (visualize what the generator comes up with)
            sample_images = self.generated_images[:6]
            grid = make_grid(sample_images)
            self.logger.experiment.add_image('generated_images', grid, 0)

            # Ground truth result (ie: all fake)
            valid = torch.ones(images.size(0), 1)

            # Adversarial loss is binary cross-entropy
            generator_loss = self.adversarial_loss(self.discriminator(self(z)), valid)

            tqdm_dict = {'gen_loss': generator_loss}
            output = {
                'loss': generator_loss,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            }
            return output

        # Train discriminator: classify real from generated samples
        if optimizer_idx == 1:
            # How well can it label as real?
            valid = torch.ones(images.size(0), 1)
            real_loss = self.adversarial_loss(self.discriminator(images), valid)

            # How well can it label as fake?
            fake = torch.zeros(images.size(0), 1)
            fake_loss = self.adversarial_loss(
                self.discriminator(self(z).detach()), fake)

            # Discriminator loss is the average of these
            discriminator_loss = (real_loss + fake_loss) / 2

            tqdm_dict = {'d_loss': discriminator_loss}
            output = {
                'loss': discriminator_loss,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            }
            return output

    def configure_optimizers(self):
        lr = self.hparams.lr
        b1 = self.hparams.b1
        b2 = self.hparams.b2

        # Optimizers
        opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))
        opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))

        # Return optimizers/schedulers (currently no scheduler)
        return [opt_g, opt_d], []

    def on_epoch_end(self):
        # Log sampled images
        sample_images = self(self.validation_z)
        grid = make_grid(sample_images)
        self.logger.experiment.add_image('generated_images', grid, self.current_epoch)
if __name__ == '__main__':
    # Global parameters
    image_dim = 128
    latent_dim = 100
    batch_size = 64

    # Initialize dataset
    tfs = transforms.Compose([
        ToPILImage(),
        Resize(image_dim),
        ToTensor()
    ])
    doppel_dataset = DoppelDataset(face_dir='../data/faces', transform=tfs)

    # Initialize data module
    doppel_data_module = DoppelDataModule(batch_size=batch_size)

    # Build models
    generator = DoppelGenerator(latent_dim=latent_dim)
    discriminator = DoppelDiscriminator()

    # Test generator
    x = torch.rand(batch_size, latent_dim, 1, 1)
    y = generator(x)
    print(f'Generator: x {x.size()} --> y {y.size()}')

    # Test discriminator
    x = torch.rand(batch_size, 3, 128, 128)
    y = discriminator(x)
    print(f'Discriminator: x {x.size()} --> y {y.size()}')

    # Build GAN
    doppelgan = DoppelGAN(batch_size=batch_size, channels=3, width=image_dim, height=image_dim, latent_dim=latent_dim)

    # Fit GAN
    trainer = pl.Trainer(gpus=0, max_epochs=5, progress_bar_refresh_rate=1)
    trainer.fit(model=doppelgan, datamodule=doppel_data_module)
Full traceback:
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-28805d67d74b>", line 1, in <module>
runfile('/Users/wouter/Documents/OneDrive/Hardnose/Projects/Coding/0002_DoppelGANger/doppelganger/gan.py', wdir='/Users/wouter/Documents/OneDrive/Hardnose/Projects/Coding/0002_DoppelGANger/doppelganger')
File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_bundle/pydev_umd.py", line 197, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/wouter/Documents/OneDrive/Hardnose/Projects/Coding/0002_DoppelGANger/doppelganger/gan.py", line 298, in <module>
trainer.fit(model=doppelgan, datamodule=doppel_data_module)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
results = self.accelerator_backend.train()
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 57, in train
return self.train_or_test()
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
self.train_loop.run_training_epoch()
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 550, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 718, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 485, in optimizer_step
model_ref.optimizer_step(
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1298, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 286, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 144, in __optimizer_step
optimizer.step(closure=closure, *args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 26, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/torch/optim/adam.py", line 66, in step
loss = closure()
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 708, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 806, in training_step_and_backward
result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 319, in training_step
training_step_output = self.trainer.accelerator_backend.training_step(args)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/accelerators/cpu_accelerator.py", line 62, in training_step
return self._step(self.trainer.model.training_step, args)
File "/usr/local/lib/python3.9/site-packages/pytorch_lightning/accelerators/cpu_accelerator.py", line 58, in _step
output = model_step(*args)
File "/Users/wouter/Documents/OneDrive/Hardnose/Projects/Coding/0002_DoppelGANger/doppelganger/gan.py", line 223, in training_step
real_loss = self.adversarial_loss(self.discriminator(images), valid)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/Users/wouter/Documents/OneDrive/Hardnose/Projects/Coding/0002_DoppelGANger/doppelganger/gan.py", line 154, in forward
return self.model(input)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 93, in forward
return F.linear(input, self.weight, self.bias)
File "/usr/local/lib/python3.9/site-packages/torch/nn/functional.py", line 1690, in linear
ret = torch.addmm(bias, input, weight.t())
RuntimeError: mat1 and mat2 shapes cannot be multiplied (7x9 and 25x1)
This multiplication problem comes from the DoppelDiscriminator. There is a linear layer
nn.Linear(25, 1),
that should be
nn.Linear(9, 1),
based on the error message.
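One way to confirm where the 9 comes from is to push a dummy batch through just the convolutional part of the discriminator. The sketch below (hedged; it copies the layer sizes from the code above) shows that a 100x100 input, which is what the DoppelDataModule's Resize(100) produces, flattens to 9 features, while the 128x128 tensor used in the standalone test flattens to 25, which is where Linear(25, 1) originally came from:

import torch
import torch.nn as nn

# Convolutional part of DoppelDiscriminator, without the final Linear/Sigmoid.
conv = nn.Sequential(
    nn.Conv2d(3, 64, 4, 2, 1, bias=False), nn.BatchNorm2d(64), nn.LeakyReLU(0.2, True),
    nn.Conv2d(64, 128, 4, 2, 1, bias=False), nn.BatchNorm2d(128), nn.LeakyReLU(0.2, True),
    nn.Conv2d(128, 256, 4, 2, 1, bias=False), nn.BatchNorm2d(256), nn.LeakyReLU(0.2, True),
    nn.Conv2d(256, 512, 4, 2, 1, bias=False), nn.BatchNorm2d(512), nn.LeakyReLU(0.2, True),
    nn.Conv2d(512, 1, 4, 1, 0, bias=False),
    nn.Flatten(),
)

print(conv(torch.rand(2, 3, 100, 100)).shape)  # torch.Size([2, 9])  -> needs Linear(9, 1)
print(conv(torch.rand(2, 3, 128, 128)).shape)  # torch.Size([2, 25]) -> matches Linear(25, 1)

So either change the linear layer to match the 100x100 pipeline, as suggested, or make the Resize in the data module consistent with the 128 used in the standalone test.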

RuntimeError: module must have its parameters and buffers on device cuda:1 (device_ids[0]) but found one of them on device: cpu

I am using the nn.DataParallel class in my code and wrapping all my models in it. It gives an ambiguous error.
Class Code
import time
import os
import argparse
import numpy as np
import torch
import torch.optim as optim
import torch.optim.lr_scheduler as LS
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.utils.data as data
from torchvision import transforms
parser = argparse.ArgumentParser()
parser.add_argument(
    '--batch-size', '-N', type=int, default=16, help='batch size')
parser.add_argument(
    '--train', '-f', required=True, type=str, help='folder of training images')
parser.add_argument(
    '--max-epochs', '-e', type=int, default=4, help='max epochs')
parser.add_argument('--lr', type=float, default=0.0005, help='learning rate')
parser.add_argument('--cuda', '-g', action='store_true', help='enables cuda')
parser.add_argument(
    '--iterations', type=int, default=16, help='unroll iterations')
parser.add_argument('--checkpoint', type=int, help='unroll iterations')
args = parser.parse_args()
## load 32x32 patches from images
import dataset
train_transform = transforms.Compose([
    transforms.ToTensor(),
])
train_set = dataset.ImageFolder(root=args.train, transform=train_transform)
train_loader = data.DataLoader(
    dataset=train_set, batch_size=args.batch_size, shuffle=True, num_workers=1)
print('total images: {}; total batches: {}'.format(
    len(train_set), len(train_loader)))
## load networks on GPU
import network
print("Devices are ", torch.cuda.device_count())
torch.cuda.set_device(1)
encoder = nn.DataParallel(network.EncoderCell(), device_ids = [1, 3])
binarizer = nn.DataParallel(network.Binarizer(), device_ids = [1, 3])
decoder = nn.DataParallel(network.DecoderCell(), device_ids = [1, 3])
solver = optim.Adam(
    [
        {'params': encoder.parameters()},
        {'params': binarizer.parameters()},
        {'params': decoder.parameters()},
    ],
    lr=args.lr)
def resume(epoch=None):
    if epoch is None:
        s = 'iter'
        epoch = 0
    else:
        s = 'epoch'

    encoder.load_state_dict(
        torch.load('checkpoint/encoder_{}_{:08d}.pth'.format(s, epoch)))
    binarizer.load_state_dict(
        torch.load('checkpoint/binarizer_{}_{:08d}.pth'.format(s, epoch)))
    decoder.load_state_dict(
        torch.load('checkpoint/decoder_{}_{:08d}.pth'.format(s, epoch)))

def save(index, epoch=True):
    if not os.path.exists('checkpoint'):
        os.mkdir('checkpoint')

    if epoch:
        s = 'epoch'
    else:
        s = 'iter'

    torch.save(encoder.state_dict(), 'checkpoint/encoder_{}_{:08d}.pth'.format(
        s, index))
    torch.save(binarizer.state_dict(),
               'checkpoint/binarizer_{}_{:08d}.pth'.format(s, index))
    torch.save(decoder.state_dict(), 'checkpoint/decoder_{}_{:08d}.pth'.format(
        s, index))
# resume()

scheduler = LS.MultiStepLR(solver, milestones=[3, 10, 20, 50, 100], gamma=0.5)

last_epoch = 0
if args.checkpoint:
    resume(args.checkpoint)
    last_epoch = args.checkpoint
    scheduler.last_epoch = last_epoch - 1
for epoch in range(last_epoch + 1, args.max_epochs + 1):
    scheduler.step()

    for batch, data in enumerate(train_loader):
        batch_t0 = time.time()

        ## init lstm state
        encoder_h_1 = (Variable(torch.zeros(data.size(0), 256, 8, 8).cuda()),
                       Variable(torch.zeros(data.size(0), 256, 8, 8).cuda()))
        # print(encoder_h_1)
        encoder_h_2 = (Variable(torch.zeros(data.size(0), 512, 4, 4).cuda()),
                       Variable(torch.zeros(data.size(0), 512, 4, 4).cuda()))
        encoder_h_3 = (Variable(torch.zeros(data.size(0), 512, 2, 2).cuda()),
                       Variable(torch.zeros(data.size(0), 512, 2, 2).cuda()))

        decoder_h_1 = (Variable(torch.zeros(data.size(0), 512, 2, 2).cuda()),
                       Variable(torch.zeros(data.size(0), 512, 2, 2).cuda()))
        decoder_h_2 = (Variable(torch.zeros(data.size(0), 512, 4, 4).cuda()),
                       Variable(torch.zeros(data.size(0), 512, 4, 4).cuda()))
        decoder_h_3 = (Variable(torch.zeros(data.size(0), 256, 8, 8).cuda()),
                       Variable(torch.zeros(data.size(0), 256, 8, 8).cuda()))
        decoder_h_4 = (Variable(torch.zeros(data.size(0), 128, 16, 16).cuda()),
                       Variable(torch.zeros(data.size(0), 128, 16, 16).cuda()))

        patches = Variable(data.cuda())
        solver.zero_grad()

        losses = []
        res = patches - 0.5

        bp_t0 = time.time()
        for _ in range(args.iterations):
            encoded, encoder_h_1, encoder_h_2, encoder_h_3 = encoder(
                res, encoder_h_1, encoder_h_2, encoder_h_3)

            codes = binarizer(encoded)

            output, decoder_h_1, decoder_h_2, decoder_h_3, decoder_h_4 = decoder(
                codes, decoder_h_1, decoder_h_2, decoder_h_3, decoder_h_4)

            res = res - output
            losses.append(res.abs().mean())
        bp_t1 = time.time()

        loss = sum(losses) / args.iterations
        loss.backward()
        solver.step()
        batch_t1 = time.time()

        print(
            '[TRAIN] Epoch[{}]({}/{}); Loss: {:.6f}; Backpropagation: {:.4f} sec; Batch: {:.4f} sec'.
            format(epoch, batch + 1,
                   len(train_loader), loss.data, bp_t1 - bp_t0, batch_t1 - batch_t0))
        print(('{:.4f} ' * args.iterations +
               '\n').format(*[l.data for l in losses]))

        index = (epoch - 1) * len(train_loader) + batch

        ## save checkpoint every 500 training steps
        if index % 500 == 0:
            save(0, False)

    save(epoch)
Traceback:
total images: 9271670; total batches: 579480
Devices are 4
/data1/khawar/khawar/Conference/CVPR/lib/python3.5/site-packages/torch/optim/lr_scheduler.py:82: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule.See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
Traceback (most recent call last):
File "train.py", line 152, in <module>
res, encoder_h_1, encoder_h_2, encoder_h_3)
File "/data1/khawar/khawar/Conference/CVPR/lib/python3.5/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/data1/khawar/khawar/Conference/CVPR/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 146, in forward
"them on device: {}".format(self.src_device_obj, t.device))
RuntimeError: module must have its parameters and buffers on device cuda:1 (device_ids[0]) but found one of them on device: cpu
Move DataParallel modules to CUDA memory:
encoder = nn.DataParallel(network.EncoderCell(), device_ids = [1, 3]).cuda()
binarizer = nn.DataParallel(network.Binarizer(), device_ids = [1, 3]).cuda()
decoder = nn.DataParallel(network.DecoderCell(), device_ids = [1, 3]).cuda()
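The general pattern, as a minimal hedged sketch (assuming GPUs 1 and 3 are visible, as in the question): DataParallel expects the wrapped module's parameters to already live on device_ids[0] when forward runs, and since torch.cuda.set_device(1) makes cuda:1 the current device, a plain .cuda() puts them in the right place.

import torch
import torch.nn as nn

torch.cuda.set_device(1)                  # cuda:1 becomes the current device
module = nn.Linear(10, 10)                # stand-in for EncoderCell / Binarizer / DecoderCell
model = nn.DataParallel(module, device_ids=[1, 3]).cuda()   # parameters now on cuda:1

x = torch.zeros(8, 10).cuda()             # inputs on cuda:1 as well
y = model(x)                              # scattered across GPUs 1 and 3, gathered back on cuda:1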

AttributeError: 'list' object has no attribute 'dim' when predicting in pytorch

I'm currently loading a model and 11 input values. Then I'm putting those 11 values into a tensor and attempting to predict outputs.
Here is my code:
# coding: utf-8
# In[5]:
import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils
import numpy as np
data_np = np.loadtxt('input_preds.csv', delimiter=',')
train_ds = utils.TensorDataset(torch.tensor(data_np, dtype=torch.float32).view(-1, 11))
trainset = torch.utils.data.DataLoader(train_ds, batch_size=1, shuffle=True)

# setting device on GPU if available, else CPU, replace .cuda() with .to(device)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        #self.bn = nn.BatchNorm2d(11)
        self.fc1 = nn.Linear(11, 22)
        self.fc2 = nn.Linear(22, 44)
        self.fc3 = nn.Linear(44, 22)
        self.fc4 = nn.Linear(22, 11)

    def forward(self, x):
        #x = x.view(-1, 11)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        #return F.log_softmax(x, dim=1)
        return x

model1 = torch.load('./1e-2')
model2 = torch.load('./1e-3')

for data in trainset:
    X = data
    X = X
    output = model1(X).to(device)
    print(output)
However, I get this error
Traceback (most recent call last):
File "inference.py", line 53, in <module>
output = model1(X).to(device)
File "C:\Users\Happy\Miniconda3\envs\torch\lib\site-packages\torch\nn\modules\module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "inference.py", line 40, in forward
x = F.relu(self.fc1(x))
File "C:\Users\Happy\Miniconda3\envs\torch\lib\site-packages\torch\nn\modules\module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\Happy\Miniconda3\envs\torch\lib\site-packages\torch\nn\modules\linear.py", line 55, in forward
return F.linear(input, self.weight, self.bias)
File "C:\Users\Happy\Miniconda3\envs\torch\lib\site-packages\torch\nn\functional.py", line 1022, in linear
if input.dim() == 2 and bias is not None:
AttributeError: 'list' object has no attribute 'dim'
I've tried to convert the batch to a numpy array but that didn't help. How do I resolve this error? Thank you for your help.
It looks like your X (data) is a list of tensors, while a PyTorch tensor is expected.
Try X = torch.stack(X).to(device) before sending it to the model.
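In context, the prediction loop might then look like the sketch below (hedged; it assumes model1 was saved as a full Net module so it can be called directly). Since the TensorDataset holds a single tensor, the collated batch is a list with one element, so X = data[0] is an equivalent alternative here.

model1 = model1.to(device)             # model and inputs must live on the same device
for data in trainset:
    X = torch.stack(data).to(device)   # collate the list of tensors into one tensor
    output = model1(X)
    print(output)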

AttributeError: 'builtin_function_or_method' object has no attribute 'requires_grad'

I'm getting this error when training on the MNIST data; the CSV files are from Kaggle. Can someone show me where I went wrong? Here is my code. The PyTorch version is 0.4.0.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as data
import torchvision
import matplotlib.pyplot as plt
torch.manual_seed(1)
# Training Parameters
EPOCH = 20
BATCH_size = 15
LR = 0.001
img_row, img_col = 28, 28
# Networks structure
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1, out_channels=32,
                kernel_size=5, stride=1, padding=2
            ),
            nn.ReLU(),
            nn.Conv2d(32, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.25)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25)
        )
        self.out = nn.Sequential(
            nn.Linear(64*7*7, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output
# Torch Dataset
class Torch_Dataset(data.Dataset):
    def __init__(self, root_dir, csvfile, img_rows, img_cols, train=True, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.train = train
        if self.train:
            y_data0 = pd.read_csv(csvfile, header=0, usecols=['label'])
            y_data1 = np.array(y_data0)
            self.y_data = torch.from_numpy(y_data1)
            x_data0 = pd.read_csv(csvfile, header=0, usecols=[i for i in range(1, 785)])
            x_data1 = np.array(x_data0)
            x_data1 = x_data1.reshape(x_data1.shape[0], 1, img_rows, img_cols)
            x_data1 = x_data1.astype('float32')
            x_data1 /= 255
            self.x_data = torch.from_numpy(x_data1)
        else:
            x_data0 = pd.read_csv(csvfile, header=0)
            x_data1 = np.array(x_data0)
            x_data1 = x_data1.reshape(x_data1.shape[0], 1, img_rows, img_cols)
            x_data1 = x_data1.astype('float32')
            x_data1 /= 255
            self.x_data = torch.from_numpy(x_data1)

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        if self.train:
            img, target = self.x_data[idx], self.y_data[idx]
        else:
            img = self.x_data[idx]
            target = None
        # sample = {'img': img, 'target': target}
        return img, target
train = Torch_Dataset(
    root_dir='./',        # root
    csvfile='train.csv',  # filename
    img_rows=img_row,     # image rows
    img_cols=img_col,     # image cols
    train=True            # train or test
)

# DataLoader
loader = data.DataLoader(
    dataset=train,           # torch dataset format
    batch_size=BATCH_size,   # mini batch size
    shuffle=True,            # shuffle the data
)
# train the data
cnn = CNN()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)
loss_f = nn.CrossEntropyLoss()
for epoch in range(EPOCH):
    for step, (x, y) in enumerate(loader):
        b_x = Variable(x)
        b_y = Variable(y)
        b_y = b_y.squeeze

        output = cnn(b_x)
        loss = loss_f(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Traceback (most recent call last):
  File "C:/Users/Bryan Zoe/PycharmProjects/MNIST_TEST/PyTorch/test1.py", line 118, in <module>
    loss = loss_f(output, b_y)
  File "C:\Users\Bryan Zoe\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Users\Bryan Zoe\Anaconda3\lib\site-packages\torch\nn\modules\loss.py", line 757, in forward
    _assert_no_grad(target)
  File "C:\Users\Bryan Zoe\Anaconda3\lib\site-packages\torch\nn\modules\loss.py", line 11, in _assert_no_grad
    assert not tensor.requires_grad, \
AttributeError: 'builtin_function_or_method' object has no attribute 'requires_grad'
You are not calling the squeeze method; this should work:
b_y = b_y.squeeze()
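In context, the inner training loop becomes (a minimal sketch of only the changed lines, keeping the rest of the code above as is):

for step, (x, y) in enumerate(loader):
    b_x = Variable(x)
    b_y = Variable(y).squeeze()   # call squeeze() so b_y is a 1-D label tensor, not a bound method
    output = cnn(b_x)
    loss = loss_f(output, b_y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()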
