Here is a simple example that results in an in-place operation error.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
def __init__(self):
super(Loss, self).__init__()
def forward(self, x, target):
return x[0,0,0,0]
def block(in_channels, features, name):
return nn.Conv2d(in_channels=in_channels,
out_channels=features,
kernel_size=3,
padding=1,
bias=False)
class SharedNetwork(nn.Module):
def __init__(self):
super().__init__()
self.shared_layer = block(in_channels=3, features=1, name="wow")
def forward(self, x):
x = self.shared_layer(x)
return x
class Network1(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-1")
def forward(self, x):
return self.conv(x)
class Network2(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-2")
def forward(self, x):
return torch.sigmoid(self.conv(x))
shared_net = SharedNetwork()
net_1 = Network1()
segmentor = Network2()
optimizer = optim.Adam(list(shared_net.parameters()) + list(segmentor.parameters()), lr=1e-6)
optimizer_conf = optim.Adam(list(shared_net.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
optimizer.zero_grad()
optimizer_conf.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
optimizer.step()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
Error message:
UserWarning: Error detected in ConvolutionBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3, 3, 3]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I was able to solve the problem by changing the order in which the optimizers' step functions are called.
optimizer_conf.zero_grad()
optimizer.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
optimizer.step()
The following questions, however, remain:
How does the step method cause an inplace operation in convolution?
Why does moving the steps to the end of the file resolve this error?
NOTE: This loss function is used for simplicity; using dice loss results in the same error!
Before answering the question, I have to mention that having multiple optimizers for one set of parameters seems to be an anti-pattern and is better avoided.
How does the step method cause an inplace operation in convolution?
A: The step method applies the gradients to the parameters in place; for plain SGD it does roughly the following:
param -= lr * param.grad
which is an in-place operation (Adam's update is more involved, but likewise in place). The shared convolution's weight was saved by autograd during the forward pass, so updating it in place bumps its version counter; that is exactly what "is at version 2; expected version 1" reports.
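This is easy to reproduce in isolation. The following minimal sketch (my own, not from the question) triggers the same version-counter error:
import torch

w = torch.ones(3, requires_grad=True)
loss = (w ** 2).sum()  # autograd saves w to compute the gradient later
with torch.no_grad():
    w += 1.0           # in-place update, just like optimizer.step(); bumps w's version counter
loss.backward()        # RuntimeError: one of the variables needed for gradient
                       # computation has been modified by an inplace operation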
Why does moving the steps to the end of the file resolve this error?
A: By moving the step calls after the second backward call, the in-place parameter update no longer happens between the two backward passes. The saved tensors are therefore unchanged when the second backward runs, so there are no in-place modifications to detect and no error is raised.
To sum up, it's best to have only one optimizer per set of parameters; the previous example could be coded in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
def __init__(self):
super(Loss, self).__init__()
def forward(self, x, target):
return x[0,0,0,0]
def block(in_channels, features, name):
return nn.Conv2d(in_channels=in_channels,
out_channels=features,
kernel_size=(3,3),
padding=1,
bias=False)
class SharedNetwork(nn.Module):
def __init__(self):
super().__init__()
self.shared_layer = block(in_channels=3, features=1, name="wow")
def forward(self, x):
x = self.shared_layer(x)
return x
class Network1(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-1")
def forward(self, x):
return self.conv(x)
class Network2(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-2")
def forward(self, x):
return torch.sigmoid(self.conv(x))
torch.manual_seed(0)
shared_net = SharedNetwork()
net_1 = Network1()
net_2 = Network2()
shared_optimizer = optim.Adam(list(shared_net.parameters()), lr=1e-6)
net_1_optimizer = optim.Adam(list(net_1.parameters()), lr=1e-6)
net_2_optimizer = optim.Adam(list(net_2.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
net_2_optimizer.zero_grad()
features = shared_net(fake_data)
net_2_out = net_2(features)
s_loss = loss_fn(net_2_out, target_data_2)
s_loss.backward(retain_graph=True)
net_2_optimizer.step()
net_1_optimizer.zero_grad()
shared_optimizer.zero_grad()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
net_1_optimizer.step()
shared_optimizer.step()
Note: If you want to have two different learning rates for different losses applied to one set of parameters, you can scale each loss by a value reflecting its importance. For example, you can multiply loss_1 by 0.1 and loss_2 by 0.5. Or, you can use backward hooks as mentioned in this comment:
backward-hook
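For example, a single combined update with weighted losses could look like this (a minimal sketch; the weights 0.1 and 0.5 are illustrative):
optimizer.zero_grad()
total_loss = 0.1 * loss_1 + 0.5 * loss_2  # scale each loss by its importance
total_loss.backward()                     # one backward pass, so no retain_graph is needed
optimizer.step()                          # one step updates the shared parameters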
So I attempted the Kaggle MNIST challenge and used PyTorch's custom Dataset class to load the CSV files. Whenever I train the network using a DataLoader with num_workers set to more than 0, I get a BrokenPipeError. I followed many tutorials and even put my code under an if __name__ == "__main__" guard, but nothing fixes the error. With num_workers=0 there is no error; instead I get a UserWarning about named tensors.
Below is the code
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
class mnistdataset(torch.utils.data.Dataset):
def __init__(self, file, transform=None):
self.file = pd.read_csv(file)
self.labels = self.file["label"].values
self.transform = transform
def __len__(self):
return self.file.shape[0]
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
im = self.file.iloc[idx, 1:].to_numpy(dtype="uint8").reshape(-1)
im = np.array([im]).reshape(28,28)
if self.transform:
im = self.transform(im)
return im, self.labels[idx]
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 64, 5, stride=1, padding=1)
self.bn1 = nn.BatchNorm2d(64)
self.conv2 = nn.Conv2d(64, 64, 5, stride=1, padding=1)
self.bn2 = nn.BatchNorm2d(64)
self.m1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.d1 = nn.Dropout2d(0.25)
self.conv3 = nn.Conv2d(64,64,3, stride=1, padding=1)
self.bn3 = nn.BatchNorm2d(64)
self.conv4 = nn.Conv2d(64,64,3, stride=1, padding=1)
self.bn4 = nn.BatchNorm2d(64)
self.conv5 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
self.bn5 = nn.BatchNorm2d(64)
self.m2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.d2 = nn.Dropout2d(0.25)
self.conv6 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
self.bn6 = nn.BatchNorm2d(128)
self.d3 = nn.Dropout2d(0.25)
self.lin1 = nn.Linear(4608, 400)
self.d4 = nn.Dropout(0.4)
self.lin2 = nn.Linear(400, 28)
self.d5 = nn.Dropout(0.2)
self.lin3 = nn.Linear(28, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.bn1(x)
x = F.relu(self.conv2(x))
x = self.bn2(x)
x = torch.max_pool2d(x, kernel_size=2, stride=2)
x = self.d1(x)
x = F.relu(self.conv3(x))
x = self.bn3(x)
x = F.relu(self.conv4(x))
x = self.bn4(x)
x = F.relu(self.conv5(x))
x = self.bn5(x)
x = torch.max_pool2d(x, kernel_size=2, stride=2)
x = self.d2(x)
x = F.relu(self.conv6(x))
x = self.bn6(x)
x = self.d3(x)
x = x.view(x.size(0), -1)
x = F.relu(self.lin1(x))
x = self.d1(x)
x = F.relu(self.lin2(x))
x = self.d2(x)
x = self.lin3(x)
return x
def get_dataloaders():
train_transform = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor(),transforms.Normalize((0.1307), (0.3081))])
train = mnistdataset("train.csv", transform = train_transform)
return torch.utils.data.DataLoader(train, batch_size=20, shuffle=True, num_workers=2)
def train_network(train_loader):
net = Net().cuda()
opt = optim.SGD(net.parameters(), lr= 0.01, momentum=0.5)
loss = nn.CrossEntropyLoss().cuda()
epochs = 2
for epoch in range(epochs):
net.train()
for batch_id, (im, target) in enumerate(train_loader):
im = im.to('cuda', non_blocking=True)
target = target.to('cuda', non_blocking=True).long()
opt.zero_grad()
pred = net(im)
l = loss(pred, target)
l.backward()
opt.step()
if (batch_id + 1)% 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, (batch_id + 1) * len(im), len(train_loader.dataset),
100. * (batch_id + 1) / len(train_loader), l.item()))
if __name__ == '__main__':
train_loader = get_dataloaders()
train_network(train_loader)
The error I am getting is
---------------------------------------------------------------------------
BrokenPipeError Traceback (most recent call last)
<ipython-input-8-5af6b8b22e93> in <module>
2
3 train_loader = get_dataloaders()
----> 4 train_network(train_loader)
<ipython-input-4-24f1b1c4c822> in train_network(train_loader)
8
9 net.train()
---> 10 for batch_id, (im, target) in enumerate(train_loader):
11
12 im = im.to('cuda', non_blocking=True)
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
357 return self._iterator
358 else:
--> 359 return self._get_iterator()
360
361 @property
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in _get_iterator(self)
303 else:
304 self.check_worker_number_rationality()
--> 305 return _MultiProcessingDataLoaderIter(self)
306
307 @property
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
916 # before it starts, and __del__ tries to join but will get:
917 # AssertionError: can only join a started process.
--> 918 w.start()
919 self._index_queues.append(index_queue)
920 self._workers.append(w)
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py in start(self)
103 'daemonic processes are not allowed to have children'
104 _cleanup()
--> 105 self._popen = self._Popen(self)
106 self._sentinel = self._popen.sentinel
107 # Avoid a refcycle if the target function holds an indirect
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
320 def _Popen(process_obj):
321 from .popen_spawn_win32 import Popen
--> 322 return Popen(process_obj)
323
324 class SpawnContext(BaseContext):
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
63 try:
64 reduction.dump(prep_data, to_child)
---> 65 reduction.dump(process_obj, to_child)
66 finally:
67 set_spawning_popen(None)
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60 ForkingPickler(file, protocol).dump(obj)
61
62 #
BrokenPipeError: [Errno 32] Broken pipe
The warning I am getting with num_workers set to 0 is
ipykernel_launcher:33: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at ..\c10/core/TensorImpl.h:1156.)
The model still trains with num_workers set to 0.
My Environment Details:
Windows 10 Home Edition, PyTorch for CUDA 11.2 (installed with pip, not conda), Python 3.6.7 for Windows, GTX 1050 Ti GPU, Intel i5 9th Gen
Edit: The code works when I run it as a Python script, but it fails when run from a Jupyter notebook.
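A common workaround for the notebook case (an assumption on my part, not verified on this exact setup) is to move the Dataset class into a separate importable module, e.g. a hypothetical mnist_dataset.py, so spawned worker processes can pickle it by reference instead of re-importing the notebook's __main__:
# mnist_dataset.py (hypothetical helper module)
import pandas as pd
import torch

class MnistDataset(torch.utils.data.Dataset):
    def __init__(self, file, transform=None):
        self.file = pd.read_csv(file)
        self.labels = self.file["label"].values
        self.transform = transform

    def __len__(self):
        return self.file.shape[0]

    def __getitem__(self, idx):
        # each CSV row is a label followed by 784 pixel values
        im = self.file.iloc[idx, 1:].to_numpy(dtype="uint8").reshape(28, 28)
        if self.transform:
            im = self.transform(im)
        return im, self.labels[idx]

# in the notebook: from mnist_dataset import MnistDataset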
I tried following this tutorial for transformers:
https://www.youtube.com/watch?v=U0s0f995w14
However, when I try to train the code with my own vectors, I get the following error message:
Traceback (most recent call last):
  File "C:\Users\rreichel\Desktop\Smaragd_local\Programming\Scripts\Transformer_selfbuilt.py", line 279, in <module>
    loss = loss_func(outputs, target)
  File "C:\Users\rreichel\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:\Users\rreichel\Anaconda3\lib\site-packages\torch\nn\modules\loss.py", line 1047, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "C:\Users\rreichel\Anaconda3\lib\site-packages\torch\nn\functional.py", line 2693, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "C:\Users\rreichel\Anaconda3\lib\site-packages\torch\nn\functional.py", line 2397, in nll_loss
    raise ValueError("Expected target size {}, got {}".format(out_size, target.size()))
ValueError: Expected target size (3, 199), got torch.Size([3, 119])
when calculating the loss during training.
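For reference, with sequence outputs nn.CrossEntropyLoss expects the class dimension to come second: logits of shape (N, C, seq_len) against integer targets of shape (N, seq_len). A minimal sketch (sizes chosen to mirror the error above):
import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
logits = torch.randn(3, 30, 199)          # (batch, num_classes, seq_len)
targets = torch.randint(0, 30, (3, 199))  # (batch, seq_len) of integer class ids
print(loss_fn(logits, targets))           # works

# A (batch, seq_len, num_classes) tensor, as a transformer typically returns,
# raises the "Expected target size" ValueError; permute it first:
# loss_fn(out.permute(0, 2, 1), targets)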
The code:
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 6 08:13:38 2021
@author: rreichel
"""
import torch
import torch.nn as nn
import pickle
import glob
import os
from SelfbuiltDataset_test import myDataset
import torch.optim as optim
class SelfAttention(nn.Module):
def __init__(self, embed_size, heads):
super(SelfAttention, self).__init__()
self.embed_size = embed_size
self.heads = heads
self.head_dim = embed_size // heads
assert(self.head_dim * heads == embed_size
), "Embedding size needs to be divisible by heads"
self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
self.fc_out = nn.Linear(heads * self.head_dim, embed_size)
def forward(self, values, keys, query, mask):
#Get number of training examples
N = query.shape[0]
value_len, key_len, query_len = values.shape[1], keys.shape[1], \
query.shape[1]
#Split the embedding into self.heads different pieces
values = values.reshape(N, value_len, self.heads, self.head_dim)
keys = keys.reshape(N, key_len, self.heads, self.head_dim)
query = query.reshape(N, query_len, self.heads, self.head_dim)
#(N, value_len, heads, head_dim)
values = self.values(values)
#(N, key_len, heads, head_dim)
keys = self.keys(keys)
#(N, query_len, heads, heads_dim)
queries = self.queries(query)
energy = torch.einsum("nqhd, nkhd -> nhqk", [queries, keys])
#queries shape: (N, query_len, heads, heads_dim),
#keys shape: (N, key_len, heads, heads_dim)
#energy: (N, heads, query_len, key_len)
#Mask padded indices so their weights become 0
if mask is not None:
energy = energy.masked_fill(mask == 0, float("-1e20"))
#Normalize energy values
attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
#attention shape: (N, heads, query_len, key_len)
out = torch.einsum("nhql, nlhd -> nqhd", [attention, values]).reshape(
N, query_len, self.heads * self.head_dim)
#attention shape: (N, heads, query_len, key_len)
#values shape: (N, value_len, heads, heads_dim)
#out after matrix multiply: (N, query_len, heads, head_dim), then
#we reshape and flatten the last two dimensions.
out = self.fc_out(out)
return out
class TransformerBlock(nn.Module):
def __init__(self, embed_size, heads, dropout, forward_expansion):
super(TransformerBlock, self).__init__()
self.attention = SelfAttention(embed_size, heads)
self.norm1 = nn.LayerNorm(embed_size)
self.norm2 = nn.LayerNorm(embed_size)
self.feed_forward = nn.Sequential(
nn.Linear(embed_size, forward_expansion * embed_size),
nn.ReLU(),
nn.Linear(forward_expansion * embed_size, embed_size))
self.dropout = nn.Dropout(dropout)
def forward(self, value, key, query, mask):
attention = self.attention(value, key, query, mask)
# Add skip connection, run through normalization and finally dropout
x = self.dropout(self.norm1(attention + query))
forward = self.feed_forward(x)
out = self.dropout(self.norm2(forward + x))
return out
class Encoder(nn.Module):
def __init__(self, src_vocab_size, embed_size, num_layers, heads, device,
forward_expansion, dropout, max_length):
super(Encoder, self).__init__()
self.embed_size = embed_size
self.device = device
self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
self.position_embedding = nn.Embedding(max_length, embed_size)
self.layers = nn.ModuleList([TransformerBlock(embed_size, heads,
dropout=dropout, forward_expansion=forward_expansion)
for _ in range(num_layers)])
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask):
N, seq_length = x.shape
positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
out = self.dropout(
(self.word_embedding(x) +
self.position_embedding(positions))
)
#In the Encoder the query, key, value are all the same, it's in the
#decoder this will change. This might look a bit odd in this case.
for layer in self.layers:
out = layer(out, out, out, mask)
return out
class DecoderBlock(nn.Module):
def __init__(self, embed_size, heads, forward_expansion, dropout, device):
super(DecoderBlock, self).__init__()
self.norm = nn.LayerNorm(embed_size)
self.attention = SelfAttention(embed_size, heads=heads)
self.transformer_block = TransformerBlock(embed_size, heads, dropout,
forward_expansion)
self.dropout = nn.Dropout(dropout)
def forward(self, x, value, key, src_mask, trg_mask):
attention = self.attention(x, x, x, trg_mask)
query = self.dropout(self.norm(attention + x))
out = self.transformer_block(value, key, query, src_mask)
return out
class Decoder(nn.Module):
def __init__(self, trg_vocab_size, embed_size, num_layers, heads,
forward_expansion, dropout, device, max_length):
super(Decoder, self).__init__()
self.device = device
self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
self.position_embedding = nn.Embedding(max_length, embed_size)
self.layers = nn.ModuleList([DecoderBlock(embed_size, heads,
forward_expansion, dropout,
device)
for _ in range(num_layers)])
self.fc_out = nn.Linear(embed_size, trg_vocab_size)
self.dropout = nn.Dropout(dropout)
def forward(self, x, enc_out, src_mask, trg_mask):
N, seq_length = x.shape
positions = torch.arange(0, seq_length).expand(N,seq_length).to(self.device)
x = self.dropout((self.word_embedding(x) +
self.position_embedding(positions)))
for layer in self.layers:
x = layer(x, enc_out, enc_out, src_mask, trg_mask)
out = self.fc_out(x)
return out
class Transformer(nn.Module):
def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx,
trg_pad_idx, embed_size=512, num_layers=6,
forward_expansion=4, heads=8, dropout=0, device="cpu",
max_length=100):
super(Transformer, self).__init__()
self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads,
device, forward_expansion, dropout, max_length)
self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads,
forward_expansion, dropout, device, max_length)
self.src_pad_idx = src_pad_idx
self.trg_pad_idx = trg_pad_idx
self.device = device
def make_src_mask(self, src):
#(N, 1, 1, src_len)
src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
return src_mask.to(self.device)
def make_trg_mask(self, trg):
N, trg_len = trg.shape
trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(N, 1,
trg_len, trg_len)
return trg_mask.to(self.device)
def forward(self, src, trg):
src_mask = self.make_src_mask(src)
trg_mask = self.make_trg_mask(trg)
enc_src = self.encoder(src, src_mask)
out = self.decoder(trg, enc_src, src_mask, trg_mask)
return out
def nextMultiple(n, x):
n = n + x / 2
n = n - (n % x)
return int(n)
if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"""
#These are the one-hot encoded sentences (word 1, word 4, etc. as a sentence)
train = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0, 1, 11],
[1, 8, 7, 3, 4, 5, 6, 11, 2, 1, 3]]).to(device)
target = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0, 2, 2],
[1, 5, 6, 2, 4, 7, 6, 2, 9, 1]]).to(device)
max_len = max([len(x) for x in train]) + 1
"""
#Loading in data
data = pickle.load(open('Testdaten.pkl', 'rb'))
tmp = myDataset(data, 'POS')
#Calculating maximum sentence length (+ 1 because of start tag)
max_len = max([len(x) for x in tmp.sent_encoded]) + 1
pad_element = len(tmp.lookup_words)
#Padding everything out to maximum sentence length
train_tmp = []
for sent in tmp.sent_encoded:
train_tmp.append([pad_element] + sent + [pad_element] * (max_len - len(sent) - 1))
target_tmp = []
for sent in tmp.tags_encoded:
target_tmp.append(sent + [pad_element] * (max_len - len(sent) - 1))
#Creating tensors for model
train = torch.squeeze(torch.tensor(train_tmp))
target = torch.squeeze(torch.tensor(target_tmp))
#"""
src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = int(torch.max(train)) + 1
trg_vocab_size = int(torch.max(target)) + 1
heads = 8
es = nextMultiple(max(src_vocab_size, trg_vocab_size), heads)
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx,
trg_pad_idx, es, 3, 2, heads, 0.1, device,
max_len).to(device)
#Defining loss function and optimizer
lr = 0.001
num_epochs = 2
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
loss_func = nn.CrossEntropyLoss()
# optimization algorithm
optimizer = optim.Adam(model.parameters(), lr=lr)
# train and evaluation
for cnt in range(num_epochs):
optimizer.zero_grad()
outputs = model(train, target)
#Outputs are now of size [3, 119, 119]
#CrossEntropyLoss doesn't like one-hot encoding, how do I deal with this?
loss = loss_func(outputs, target)
loss.backward()
optimizer.step()
#out = model(train, target)
#print(out.shape)
I am confused since the code works with the vectors from the tutorial, but once I try to run the model with my own vocabulary, it produces this strange error. The data is just integer values encoding the corresponding words; e.g. "Hello World" would result in the training vector [1, 2].
There are no differences between my data and the data from the tutorial as far as I can see. The tensor types are the same (torch.LongTensor), both are integer values in a specified range. The difference is in dimensionality: the tutorial uses vectors with dimension (2, 10), while mine are (3, 199).
Also, I am sorry, but I can't reduce the code any further, since otherwise the error might not be reproducible.
Did anyone encounter this error before?
I'm using a stacked autoencoder, which is a stack of Conv layers.
However, I'm getting a tensor mismatch error, and I'm not sure about the reason. Everything done in the encoder is reversed in the decoder!
This is for time-series data. The input shape is (batch_size, 1, 3000).
Here's the code
class CDAutoEncoder(nn.Module):
def __init__(self, input_size, output_size, kernel, stride):
super(CDAutoEncoder, self).__init__()
self.forward_pass = nn.Sequential(
nn.Conv1d(input_size, output_size, kernel_size=kernel, stride=stride, padding=0),
nn.PReLU(),
)
self.backward_pass = nn.Sequential(
nn.ConvTranspose1d(output_size, input_size, kernel_size=kernel, stride=stride, padding=0),
nn.PReLU(),
)
def forward(self, x):
y = self.forward_pass(x)
return y
def reconstruct(self, x):
return self.backward_pass(x)
class StackedAutoEncoder(nn.Module):
def __init__(self):
super(StackedAutoEncoder, self).__init__()
self.ae1 = CDAutoEncoder(1, 32, 50, 10)
self.ae2 = CDAutoEncoder(32, 64, 10, 3)
self.ae3 = CDAutoEncoder(64, 64, 5, 1)
def forward(self, x):
a1 = self.ae1(x)
a2 = self.ae2(a1)
a3 = self.ae3(a2)
return self.reconstruct(a3)
def reconstruct(self, x):
a2_reconstruct = self.ae3.reconstruct(x)
a1_reconstruct = self.ae2.reconstruct(a2_reconstruct)
x_reconstruct = self.ae1.reconstruct(a1_reconstruct)
return x_reconstruct
The error:
RuntimeError: The size of tensor a (2990) must match the size of tensor b (3000) at non-singleton dimension 2
I've tried adding padding and it worked, but when I changed the kernel size I got a different tensor-size-mismatch error.
Apparently there is nothing like 'same' padding for these layers, so is there an automated solution for this?
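For reference, the mismatch can be traced with the standard Conv1d/ConvTranspose1d length formulas. A minimal sketch (assuming the input length of 3000 above) showing where the 2990 comes from:
def conv_out(L, k, s, p=0):
    # Conv1d output length: floor((L + 2p - k) / s) + 1
    return (L + 2 * p - k) // s + 1

def convT_out(L, k, s, p=0, op=0):
    # ConvTranspose1d output length: (L - 1) * s - 2p + k + output_padding
    return (L - 1) * s - 2 * p + k + op

L = 3000
e1 = conv_out(L, 50, 10)    # 296
e2 = conv_out(e1, 10, 3)    # 96  <- flooring discards a remainder here
e3 = conv_out(e2, 5, 1)     # 92
d3 = convT_out(e3, 5, 1)    # 96
d2 = convT_out(d3, 10, 3)   # 295, not 296
d1 = convT_out(d2, 50, 10)  # 2990, not 3000 -> the reported mismatch
Because the strided encoder layers floor away remainders, the transposed layers cannot recover the original length on their own; ConvTranspose1d's output_padding argument (or passing output_size to its forward call) exists to compensate for exactly this.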
I am trying to use PyTorch to train a model in a (multi-CPU) distributed manner. However, when I ran the code, I got the following error message
RuntimeError: the Gloo backend is not available; try to recompile the THD package with Gloo support at /opt/conda/conda-bld/pytorch-cpu_1544218188686/work/torch/lib/THD/process_group/General.cpp:20
I tried this both on my own MacBook and on the university's Slurm cluster (Red Hat 4.8.5-16). The Python version is 3.6. I installed PyTorch using the command conda install pytorch-cpu torchvision-cpu -c pytorch (the version without CUDA support).
I was wondering if I have to re-install PyTorch from source or install Gloo manually. I was a little confused since, according to PyTorch's documentation,
Since version 0.2.0, the Gloo backend is automatically included with the pre-compiled binaries of PyTorch
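As a quick sanity check, recent PyTorch releases expose availability flags on the torch.distributed package (note these report on the newer c10d backends, not the deprecated THD module imported below):
import torch.distributed as dist

print(dist.is_available())       # distributed package compiled in?
print(dist.is_gloo_available())  # Gloo backend compiled in?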
The code is exactly the example code available at https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py
import os
import torch
import torch.distributed.deprecated as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from math import ceil
from random import Random
from torch.multiprocessing import Process
from torch.autograd import Variable
from torchvision import datasets, transforms
class Partition(object):
""" Dataset-like object, but only access a subset of it. """
def __init__(self, data, index):
self.data = data
self.index = index
def __len__(self):
return len(self.index)
def __getitem__(self, index):
data_idx = self.index[index]
return self.data[data_idx]
class DataPartitioner(object):
""" Partitions a dataset into different chuncks. """
def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234):
self.data = data
self.partitions = []
rng = Random()
rng.seed(seed)
data_len = len(data)
indexes = [x for x in range(0, data_len)]
rng.shuffle(indexes)
for frac in sizes:
part_len = int(frac * data_len)
self.partitions.append(indexes[0:part_len])
indexes = indexes[part_len:]
def use(self, partition):
return Partition(self.data, self.partitions[partition])
class Net(nn.Module):
""" Network architecture. """
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x, dim=1)
def partition_dataset():
""" Partitioning MNIST """
dataset = datasets.MNIST(
'./data',
train=True,
download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307, ), (0.3081, ))
]))
size = dist.get_world_size()
bsz = int(128 / float(size))
partition_sizes = [1.0 / size for _ in range(size)]
partition = DataPartitioner(dataset, partition_sizes)
partition = partition.use(dist.get_rank())
train_set = torch.utils.data.DataLoader(
partition, batch_size=bsz, shuffle=True)
return train_set, bsz
def average_gradients(model):
""" Gradient averaging. """
size = float(dist.get_world_size())
for param in model.parameters():
dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
param.grad.data /= size
def run(rank, size):
""" Distributed Synchronous SGD Example """
torch.manual_seed(1234)
train_set, bsz = partition_dataset()
model = Net()
model = model
# model = model.cuda(rank)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
num_batches = ceil(len(train_set.dataset) / float(bsz))
for epoch in range(10):
epoch_loss = 0.0
for data, target in train_set:
data, target = Variable(data), Variable(target)
# data, target = Variable(data.cuda(rank)), Variable(target.cuda(rank))
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
epoch_loss += loss.data.item()
loss.backward()
average_gradients(model)
optimizer.step()
print('Rank ',
dist.get_rank(), ', epoch ', epoch, ': ',
epoch_loss / num_batches)
def init_processes(rank, size, fn, backend='gloo'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend, rank=rank, world_size=size)
fn(rank, size)
if __name__ == "__main__":
size = 2
processes = []
for rank in range(size):
p = Process(target=init_processes, args=(rank, size, run))
p.start()
processes.append(p)
for p in processes:
p.join()