Embedding 3D data in Pytorch - nlp

I want to implement character-level embedding.
This is the usual word embedding:
Word Embedding
Input: [ ['who', 'is', 'this'] ]
-> [ [3, 8, 2] ]        # (batch_size, sentence_len)
-> // Embedding(Input)  # (batch_size, sentence_len, embedding_dim)
This is what I want to do:
Character Embedding
Input: [ [ ['w', 'h', 'o', 0], ['i', 's', 0, 0], ['t', 'h', 'i', 's'] ] ]
-> [ [ [2, 3, 9, 0], [11, 4, 0, 0], [21, 10, 8, 9] ] ]  # (batch_size, sentence_len, word_len)
-> // Embedding(Input)                                  # (batch_size, sentence_len, word_len, embedding_dim)
-> // sum the character embeddings per word             # (batch_size, sentence_len, embedding_dim)
The final output shape is the same as for word embedding, because I want to concatenate the two later.
I tried the following, but I am not sure how to implement an embedding for 3-D input. Do you know how to handle such data?
def forward(self, x):
    print('x', x.size())  # (N, seq_len, word_len)
    bs = x.size(0)
    seq_len = x.size(1)
    word_len = x.size(2)
    embd_list = []
    for i, elm in enumerate(x):
        tmp = torch.zeros(1, word_len, self.embd_size)
        for chars in elm:
            tmp = torch.add(tmp, 1.0, self.embedding(chars.unsqueeze(0)))
The code above raises an error because the output of self.embedding is a Variable:
TypeError: torch.add received an invalid combination of arguments - got (torch.FloatTensor, float, Variable), but expected one of:
* (torch.FloatTensor source, float value)
* (torch.FloatTensor source, torch.FloatTensor other)
* (torch.FloatTensor source, torch.SparseFloatTensor other)
* (torch.FloatTensor source, float value, torch.FloatTensor other)
didn't match because some of the arguments have invalid types: (torch.FloatTensor, float, Variable)
* (torch.FloatTensor source, float value, torch.SparseFloatTensor other)
didn't match because some of the arguments have invalid types: (torch.FloatTensor, float, Variable)
Update
I could do this, but the for loops are not efficient for batching. Do you know a more efficient way?
def forward(self, x):
    print('x', x.size())  # (N, seq_len, word_len)
    bs = x.size(0)
    seq_len = x.size(1)
    word_len = x.size(2)
    embd = Variable(torch.zeros(bs, seq_len, self.embd_size))
    for i, elm in enumerate(x):  # every sample
        for j, chars in enumerate(elm):  # every sentence, e.g. [['w','h','o',0], ['i','s',0,0], ['t','h','i','s']]
            chars_embd = self.embedding(chars.unsqueeze(0))  # (1, word_len, embd_size), e.g. ['w','h','o',0]
            chars_embd = torch.sum(chars_embd, 1)            # (1, embd_size): sum each char's embedding
            embd[i, j] = chars_embd[0]                       # use the summed char embedding as a word-like embedding
    x = embd  # (N, seq_len, embd_size)
Update2
This is my final code. Thank you, Wasi Ahmad!
def forward(self, x):
    # x: (N, seq_len, word_len)
    input_shape = x.size()
    bs = x.size(0)
    seq_len = x.size(1)
    word_len = x.size(2)
    x = x.view(-1, word_len)      # (N*seq_len, word_len)
    x = self.embedding(x)         # (N*seq_len, word_len, embd_size)
    x = x.view(*input_shape, -1)  # (N, seq_len, word_len, embd_size)
    x = x.sum(2)                  # (N, seq_len, embd_size)
    return x
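For reference, a quick shape check of this forward pass; the module below is a minimal, hypothetical stand-in (character vocabulary of 30, embedding size 8):
import torch
import torch.nn as nn

class CharEmbedding(nn.Module):
    def __init__(self, vocab_size=30, embd_size=8):
        super().__init__()
        self.embd_size = embd_size
        self.embedding = nn.Embedding(vocab_size, embd_size)

    def forward(self, x):
        # x: (N, seq_len, word_len) of character indices
        input_shape = x.size()
        word_len = x.size(2)
        x = x.view(-1, word_len)      # (N*seq_len, word_len)
        x = self.embedding(x)         # (N*seq_len, word_len, embd_size)
        x = x.view(*input_shape, -1)  # (N, seq_len, word_len, embd_size)
        return x.sum(2)               # (N, seq_len, embd_size)

x = torch.randint(0, 30, (2, 3, 4))  # 2 sentences, 3 words each, 4 chars per word
print(CharEmbedding()(x).shape)      # torch.Size([2, 3, 8])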

I am assuming you have a 3d tensor of shape BxSxW where:
B = Batch size
S = Sentence length
W = Word length
And you have declared the embedding layer as follows:
self.embedding = nn.Embedding(dict_size, emsize)
Where:
dict_size = No. of unique characters in the training corpus
emsize = Expected size of embeddings
So, now you need to convert the 3d tensor of shape BxSxW to a 2d tensor of shape BSxW and give it to the embedding layer.
emb = self.embedding(input_rep.view(-1, input_rep.size(2)))
The shape of emb will be BSxWxE where E is the embedding size. You can convert the resulting 3d tensor to a 4d tensor as follows.
emb = emb.view(*input_rep.size(), -1)
The final shape of emb will be BxSxWxE which is what you are expecting.
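From there, to get back to the word-embedding shape BxSxE you can reduce over the word-length dimension, e.g. by summing (a short sketch following on from the above; word_emb is an assumed word-level embedding of shape BxSxE_word):
char_word_emb = emb.sum(2)  # (B, S, E): one vector per word
combined = torch.cat([word_emb, char_word_emb], dim=2)  # (B, S, E_word + E)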

What you are looking for is implemented in the allennlp TimeDistributed layer.
Here is a demonstration:
from allennlp.modules.time_distributed import TimeDistributed
import torch

batch_size = 16
sent_len = 30
word_len = 5
char_vocab_size = 128  # placeholder values for the character vocabulary
char_emb_dim = 50
char_pad_idx = 0
Consider an input batch of sentences encoded as character indices (nn.Embedding expects integer indices, not floats; the input is assumed to already be padded with char_pad_idx):
sentence = torch.randint(0, char_vocab_size, (batch_size, sent_len, word_len))  # suppose this is your data
Define a char embedding layer:
char_embedding = torch.nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=char_pad_idx)
Wrap it!
embedding_sentence = TimeDistributed(char_embedding)(sentence)
embedding_sentence has shape (batch_size, sent_len, word_len, char_emb_dim).
Actually, you can easily redefine a module in PyTorch to do this.
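For instance, here is a minimal sketch of such a wrapper (an assumed TimeDistributed-style module for this use case: flatten the leading batch and sentence dimensions, apply the wrapped module, then restore them):
import torch
import torch.nn as nn

class TimeDistributedLike(nn.Module):
    """Flattens all leading dimensions except the last, applies the wrapped
    module, then restores the leading dimensions on the output."""
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        lead_shape = x.shape[:-1]                        # (batch_size, sent_len)
        out = self.module(x.reshape(-1, x.shape[-1]))    # (batch_size*sent_len, word_len, emb_dim)
        return out.reshape(*lead_shape, *out.shape[1:])  # (batch_size, sent_len, word_len, emb_dim)

char_emb = TimeDistributedLike(nn.Embedding(128, 50))
chars = torch.randint(0, 128, (16, 30, 5))
print(char_emb(chars).shape)  # torch.Size([16, 30, 5, 50])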

Related

CrossEntropyLoss on sequences

I need to compute the torch.nn.CrossEntropyLoss on sequences.
The output tensor y_est has shape [batch_size, sequence_length, embedding_dim]. Along the last dimension, each element of the sequence is encoded one-hot-style over embedding_dim classes (y_est contains real-valued scores, however, not binary values).
The target tensor y has shape: [batch_size, sequence_length] and contains the integer index of the correct class in the range [0, embedding_dim).
If I compute the loss on the two input data, with the shape described above, I get an error 1.
What I would like to do is described by the cycle at [2]. For each sequence in the batch, I would like the sum of the losses computed on each element in the sequence.
After reading the documentation of torch.nn.CrossEntropyLoss I came up with the solution [3], which seems to compute exactly what I want: the losses computed at points [2] and [3] are equal.
However, since .permute(.) returns a view of the original tensor, I am afraid it might mess up the backward propagation on the loss. Somewhere (I do not remember where, sorry) I have read that views should not be used in computing the loss.
Is my solution correct?
import torch

batch_size = 5
seq_len = 10
emb_dim = 100
y_est = torch.randn((batch_size, seq_len, emb_dim))
y = torch.randint(0, emb_dim, (batch_size, seq_len))
print("y_est, batch x seq x emb:", y_est.shape)
print("y, batch x seq", y.shape)
loss_fn = torch.nn.CrossEntropyLoss(reduction="none")

# [1]
# loss = loss_fn(y_est, y)
# error:
# RuntimeError: Expected target size [5, 100], got [5, 10]

# [2]
loss = 0
for i in range(y_est.shape[1]):
    loss += loss_fn(y_est[:, i, :], y[:, i]).sum()
print(loss)

# [3]
y_est_2 = torch.permute(y_est, (0, 2, 1))
print("y_est_2", y_est_2.shape)
loss2 = loss_fn(y_est_2, y).sum()
print(loss2)
whose output is:
y_est, batch x seq x emb: torch.Size([5, 10, 100])
y, batch x seq torch.Size([5, 10])
tensor(253.9994)
y_est_2 torch.Size([5, 100, 10])
tensor(253.9994)
Is the solution correct (also for what concerns the backward pass)? Is there a better way?
If y_est contains probabilities and you really want to compute the error/loss of a categorical output at each timestep/element of a sequence, then y and y_est have to have the same shape. To do so, the categories/classes in y can be expanded to the same dimensionality as y_est with one-hot encoding:
import torch
batch_size = 5
seq_len = 10
emb_dim = 100
y_est = torch.randn( (batch_size, seq_len, emb_dim))
y = torch.randint(0, emb_dim, (batch_size, seq_len) )
y = torch.nn.functional.one_hot(y, num_classes=emb_dim).type(torch.float)
loss_fn = torch.nn.CrossEntropyLoss()
loss = loss_fn(y_est, y)
print(loss)
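Regarding the original concern about views: permute is a differentiable operation and autograd handles views correctly, so solution [3] produces the same gradients as the loop in [2]. A quick sanity check (a sketch reusing the shapes above):
import torch

batch_size, seq_len, emb_dim = 5, 10, 100
y_est = torch.randn(batch_size, seq_len, emb_dim, requires_grad=True)
y = torch.randint(0, emb_dim, (batch_size, seq_len))
loss_fn = torch.nn.CrossEntropyLoss(reduction="none")

# loop version, as in [2]
loss_loop = sum(loss_fn(y_est[:, i, :], y[:, i]).sum() for i in range(seq_len))
grad_loop, = torch.autograd.grad(loss_loop, y_est)

# permute version, as in [3]
loss_perm = loss_fn(y_est.permute(0, 2, 1), y).sum()
grad_perm, = torch.autograd.grad(loss_perm, y_est)

print(torch.allclose(grad_loop, grad_perm))  # True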

How to increase batch size in GPT2 training for translation task?

I am developing code to use the pre-trained GPT2 model for a machine translation task. My data's word-to-id dictionary has 91 entries, and I developed the following code for my model:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers.models.gpt2.modeling_gpt2 import GPT2Model

# data preparation code

def batch_sequences(x, y, env):
    """
    Take as input a list of n sequences (torch.LongTensor vectors) and return
    a tensor of size (slen, n) where slen is the length of the longest
    sentence, and a vector lengths containing the length of each sentence.
    """
    lengths_x = torch.LongTensor([len(s) + 2 for s in x])
    lengths_y = torch.LongTensor([len(s) + 2 for s in y])
    max_length = max(lengths_x.max().item(), lengths_y.max().item())
    sent_x = torch.LongTensor(max_length, lengths_x.size(0)).fill_(env.pad_index)
    sent_y = torch.LongTensor(max_length, lengths_y.size(0)).fill_(env.pad_index)
    assert lengths_x.min().item() > 2
    assert lengths_y.min().item() > 2
    sent_x[0] = env.eos_index
    for i, s in enumerate(x):
        sent_x[1:lengths_x[i] - 1, i].copy_(s)
        sent_x[lengths_x[i] - 1, i] = env.eos_index
    sent_y[0] = env.eos_index
    for i, s in enumerate(y):
        sent_y[1:lengths_y[i] - 1, i].copy_(s)
        sent_y[lengths_y[i] - 1, i] = env.eos_index
    return sent_x, sent_y, max_length

def collate_fn(elements):
    """
    Collate samples into a batch.
    """
    x, y = zip(*elements)
    x = [torch.LongTensor([env.word2id[w] for w in seq if w in env.word2id]) for seq in x]
    y = [torch.LongTensor([env.word2id[w] for w in seq if w in env.word2id]) for seq in y]
    x, y, length = batch_sequences(x, y, env)
    return (x, length), (y, length), torch.LongTensor(nb_ops)

loader = DataLoader(data, batch_size=1, shuffle=False, collate_fn=collate_fn)
gpt2 = GPT2Model.from_pretrained('gpt2')
in_layer = nn.Embedding(len(env.word2id), 768)
out_layer = nn.Linear(768, len(env.word2id))
parameters = list(gpt2.parameters()) + list(in_layer.parameters()) + list(out_layer.parameters())
optimizer = torch.optim.Adam(parameters)
loss_fn = nn.CrossEntropyLoss()

for layer in (gpt2, in_layer, out_layer):
    layer.train()

accuracies = list()
n_epochs = 5
for i in range(n_epochs):
    for (x, x_len), (y, y_len) in loader:
        x = x.to(device=device)
        y = y.to(device=device)
        embeddings = in_layer(x.reshape(1, -1))
        hidden_state = gpt2(inputs_embeds=embeddings).last_hidden_state[:, :]
        logits = out_layer(hidden_state)[0]
        loss = loss_fn(logits, y.reshape(-1))
        accuracies.append((logits.argmax(dim=-1) == y.reshape(-1)).float().mean().item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if len(accuracies) % 500 == 0:
            accuracy = sum(accuracies[-50:]) / len(accuracies[-50:])
            print(f'Samples: {len(accuracies)}, Accuracy: {accuracy}')
This code works pretty well when the batch size is 1, but it is very slow. I wanted to increase the batch size from 1 to 32, but I get some dimension-compatibility problems. How can I increase the batch size without errors?
My data consists of pairs of sentences: the first one is a sentence in the first language and the second one is its translation in the second language.
For example, assume that x.shape is (batch_size, 12), meaning we have batch_size sentences of length 12 as input, and y.shape is also (batch_size, 12) (the translations). We also have a word-to-id dictionary of length 90 that matches each word in a sentence with its index.
This problem can be solved using padding. We need two special symbols:
code 0 in inputs (x) will denote "blank" tokens that should not be translated.
code -100 in outputs (y) will denote "blank" tokens that should not participate in the calculation of loss. nn.CrossEntropyLoss() is programmed to ignore this value (by the argument ignore_index).
The batch of size 3 could look like this:
x:
[[1, 2, 3, 0, 0],
[ 4, 5, 6, 7, 8],
[ 9, 8, 0, 0, 0]]
y:
[[1, 2, 3, -100, -100],
[ 4, 5, 6, 7, 8],
[ 9, 8, -100, -100, -100]]
You could generate it with code such as:
def pad_sequences(batch, pad_value=0):
    n = max(len(v) for v in batch)
    return torch.tensor([v + [pad_value] * (n - len(v)) for v in batch])
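For example, a toy batch padded this way and fed to nn.CrossEntropyLoss (a sketch; the 91-entry vocabulary and the random logits are stand-ins for the real model output):
import torch

x_batch = [[1, 2, 3], [4, 5, 6, 7, 8], [9, 8]]
y_batch = [[1, 2, 3], [4, 5, 6, 7, 8], [9, 8]]
x = pad_sequences(x_batch, pad_value=0)     # (3, 5), blanks encoded as 0
y = pad_sequences(y_batch, pad_value=-100)  # (3, 5), blanks encoded as -100

vocab_size = 91
logits = torch.randn(x.shape[0], x.shape[1], vocab_size)       # stand-in for the model output
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)         # -100 is also the default
loss = loss_fn(logits.reshape(-1, vocab_size), y.reshape(-1))  # padded positions are ignored
print(loss)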
However, I feel there is an issue with your problem statement. If you perform machine translation, then your inputs and outputs can have different lengths, but your architecture only allows x and y to have the same length. If you want to support x and y of different lengths, I would suggest using a seq2seq architecture such as T5 instead.
Another issue is that GPT is autoregressive, so if y is completely aligned with x, then we cannot use the suffix of x while generating the left part of y. So if you wish your x and y to be perfectly aligned, but still would like to use the full information about x when generating y, I would recommend using a bidirectional encoder such as BERT.

How to properly implement data reorganization using PyTorch?

It's going to be a long post, sorry in advance...
I'm working on a denoising algorithm and my goal is to:
Use PyTorch to design / train the model
Convert the PyTorch model into a CoreML model
The denoising algorithm consists of the following 3 parts:
A "down-sampling" + noise level map
A regular convnet
An "up-sampling"
The first part is quite simple in its idea, but not so easy to explain. It takes, for instance, an input color image and an input value "sigma" that represents the standard deviation of the image noise.
The "down-sampling" part is in fact a space-to-depth. In short, for a given channel and for a subset of 2x2 pixels, the space-to-depth creates a single pixel composed of 4 channels. The number of channels is multiplied by 4 while the height and width are divided by 2. The data is simply reorganized.
The noise level map consists in creating 3 channels containing the standard deviation value so that the convnet knows how to properly denoise the input image.
This will be maybe more clear with some code:
def downsample_and_noise_map(input, sigma):
    # Input tensor size (batch, channels, height, width)
    in_n, in_c, in_h, in_w = input.size()
    # Output tensor size
    out_h = in_h // 2
    out_w = in_w // 2
    sigma_c = in_c      # nb of channels of the standard deviation tensor
    image_c = in_c * 4  # nb of channels of the image tensor
    # Standard deviation tensor
    output_sigma = sigma.view(1, 1, 1, 1).repeat(in_n, sigma_c, out_h, out_w)
    # Image tensor
    output_image = torch.zeros((in_n, image_c, out_h, out_w))
    output_image[:, 0::4, :, :] = input[:, :, 0::2, 0::2]
    output_image[:, 1::4, :, :] = input[:, :, 0::2, 1::2]
    output_image[:, 2::4, :, :] = input[:, :, 1::2, 0::2]
    output_image[:, 3::4, :, :] = input[:, :, 1::2, 1::2]
    # Concatenate standard deviation and image tensors
    return torch.cat((output_sigma, output_image), dim=1)
This function is then called as the first step in the model's forward function:
def forward(self, x, sigma):
    x = downsample_and_noise_map(x, sigma)
    x = self.convnet(x)
    x = upsample(x)
    return x
Let's consider an input tensor of size 1x3x100x100 (PyTorch standard: batch, channels, height, width) and a sigma value of 0.1. The output tensor has the following properties:
Tensor's shape is 1x15x50x50
Tensor's values for channels 0, 1 and 2 are all equal to sigma = 0.1
Tensor's values for channels 3, 4, 5, 6 are composed of the input image values of channel 0
Tensor's values for channels 7, 8, 9, 10 are composed of the input image values of channel 1
Tensor's values for channels 11, 12, 13, 14 are composed of the input image values of channel 2
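A quick check of those properties (a sketch, reusing downsample_and_noise_map as defined above):
import torch

x = torch.randn(1, 3, 100, 100)
out = downsample_and_noise_map(x, torch.tensor(0.1))
print(out.shape)                                    # torch.Size([1, 15, 50, 50])
print(out[:, :3].unique())                          # tensor([0.1000]): the three sigma channels
print(torch.equal(out[0, 3], x[0, 0, 0::2, 0::2]))  # True: channel 3 holds channel 0's (0, 0) offsets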
If this code is not clear enough, I can post an even more naive version.
The up-sampling part is the reciprocal function of the downsampling one.
I was able to use this function for training and testing in PyTorch.
Then, I tried to convert the model to CoreML with ONNX as an intermediate step.
The conversion to ONNX generated "TracerWarning". Conversion from ONNX to CoreML failed (TypeError: 1.0 has type numpy.float64, but expected one of: int, long). The problem came from the down-sampling + noise level map (and from up-sampling too).
When I removed the down-sampling + noise level map and up-sampling layers, I was able to convert to ONNX and to CoreML very easily since only a simple convnet remained. This means I have a solution to my problem: implement these 2 layers using 2 shaders on the mobile side. But I'm not satisfied with this solution as I want my model to contain all layers ^^
Before considering writing a post here, I crawled the Internet to find an answer, and I was able to write a better version of the previous function using reshape and permute. This version removed all the ONNX warnings, but the CoreML conversion still failed...
def downsample_and_noise_map(input, sigma):
    # Input image size
    in_n, in_c, in_h, in_w = input.size()
    # Output tensor size
    out_n = in_n
    out_h = in_h // 2
    out_w = in_w // 2
    # Create standard deviation tensor
    output_sigma = sigma.view(out_n, 1, 1, 1).repeat(out_n, in_c, out_h, out_w)
    # Split RGB channels
    channels_rgb = torch.split(input, 1, dim=1)
    # Reshape (space-to-depth) each image channel
    channels_reshaped = []
    for channel in channels_rgb:
        channel = channel.reshape(1, out_h, 2, out_w, 2)
        channel = channel.permute(2, 4, 0, 1, 3)
        channel = channel.reshape(1, 4, out_h, out_w)
        channels_reshaped.append(channel)
    # Concatenate all reshaped image channels together
    output_image = torch.cat(channels_reshaped, dim=1)
    # Concatenate standard deviation and image tensors
    output = torch.cat([output_sigma, output_image], dim=1)
    return output
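As a side note, recent PyTorch versions expose this space-to-depth reorganization directly as torch.nn.functional.pixel_unshuffle / pixel_shuffle. A minimal sketch (whether the ONNX/CoreML toolchain supports these ops, and whether the channel ordering exactly matches the indexing above, would still need checking):
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 100, 100)
s2d = F.pixel_unshuffle(x, downscale_factor=2)     # (1, 12, 50, 50): space-to-depth
restored = F.pixel_shuffle(s2d, upscale_factor=2)  # (1, 3, 100, 100): depth-to-space
print(torch.equal(restored, x))                    # True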
So here are (some of) my questions:
What is the preferred PyTorch way to implement a function such as downsample_and_noise_map function within a model?
Same question but when the conversion to ONNX and then to CoreML is part of the equation?
Is PyTorch -> ONNX -> CoreML still the best path to deploy the model for iOS production?
Thanks for your help (and your patience) ^^
Disclaimer: I'm not familiar with CoreML or deploying to iOS, but I do have experience deploying PyTorch models in TensorRT and OpenVINO via ONNX.
The main issues I've faced when deploying to other frameworks is that operations like slicing and repeating tensors tend to have limited support in other frameworks. Often we can construct equivalent conv or transpose-conv operations which achieve the desired behavior.
In order to ensure we don't export the logic used to construct the conv weights I've separated the weight initialization from the application of the weights. This makes the ONNX export much more straightforward since all it sees is some constant tensors being applied.
class DownsampleAndNoiseMap():
    def __init__(self):
        self.initialized = False
        self.weight = None
        self.zeros = None

    def init_weights(self, input):
        with torch.no_grad():
            in_n, in_c, in_h, in_w = input.size()
            out_h = int(in_h // 2)
            out_w = int(in_w // 2)
            sigma_c = in_c
            image_c = in_c * 4
            # conv weights used for downsampling
            self.weight = torch.zeros(image_c, in_c, 2, 2).to(input)
            for c in range(in_c):
                self.weight[4 * c, c, 0, 0] = 1
                self.weight[4 * c + 1, c, 0, 1] = 1
                self.weight[4 * c + 2, c, 1, 0] = 1
                self.weight[4 * c + 3, c, 1, 1] = 1
            # zeros used to replace repeat
            self.zeros = torch.zeros(in_n, sigma_c, out_h, out_w).to(input)
        self.initialized = True

    def __call__(self, input, sigma):
        assert self.initialized
        output_sigma = self.zeros + sigma
        output_image = torch.nn.functional.conv2d(input, self.weight, stride=2)
        return torch.cat((output_sigma, output_image), dim=1)
class Upsample():
    def __init__(self):
        self.initialized = False
        self.weight = None

    def init_weights(self, input):
        with torch.no_grad():
            in_n, in_c, in_h, in_w = input.size()
            image_c = in_c * 4
            self.weight = torch.zeros(in_c + image_c, in_c, 2, 2).to(input)
            for c in range(in_c):
                self.weight[in_c + 4 * c, c, 0, 0] = 1
                self.weight[in_c + 4 * c + 1, c, 0, 1] = 1
                self.weight[in_c + 4 * c + 2, c, 1, 0] = 1
                self.weight[in_c + 4 * c + 3, c, 1, 1] = 1
        self.initialized = True

    def __call__(self, input):
        assert self.initialized
        return torch.nn.functional.conv_transpose2d(input, self.weight, stride=2)
I made the assumption that upsample was the reciprocal of downsample in the sense that x == upsample(downsample_and_noise_map(x, sigma)) (correct me if I'm wrong in this assumption). I also verified that my version of downsample agrees with yours.
# consistency checking code
x = torch.randn(1, 3, 100, 100)
sigma = torch.randn(1)
# OP downsampling
y1 = downsample_and_noise_map(x, sigma)
ds = DownsampleAndNoiseMap()
ds.init_weights(x)
y2 = ds(x, sigma)
print('downsample diff:', torch.sum(torch.abs(y1 - y2)).item())
us = Upsample()
us.init_weights(x)
x_recov = us(ds(x, sigma))
print('recovery error:', torch.sum(torch.abs(x - x_recov)).item())
which results in
downsample diff: 0.0
recovery error: 0.0
Exporting to ONNX
When exporting we need to invoke init_weights for the new classes before using torch.onnx.export. For example
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.downsample = DownsampleAndNoiseMap()
        self.upsample = Upsample()
        self.convnet = lambda x: x  # placeholder

    def init_weights(self, x):
        self.downsample.init_weights(x)
        self.upsample.init_weights(x)

    def forward(self, x, sigma):
        x = self.downsample(x, sigma)
        x = self.convnet(x)
        x = self.upsample(x)
        return x

x = torch.randn(1, 3, 100, 100)
sigma = torch.randn(1)
model = Model()
# ... load state dict here
model.init_weights(x)
torch.onnx.export(model, (x, sigma), 'deploy.onnx', verbose=True,
                  input_names=["input", "sigma"], output_names=["output"])
which gives the ONNX graph
graph(%input : Float(1, 3, 100, 100)
%sigma : Float(1)) {
%2 : Float(1, 3, 50, 50) = onnx::Constant[value=<Tensor>](), scope: Model
%3 : Float(1, 3, 50, 50) = onnx::Add(%2, %sigma), scope: Model
%4 : Float(12, 3, 2, 2) = onnx::Constant[value=<Tensor>](), scope: Model
%5 : Float(1, 12, 50, 50) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[2, 2], pads=[0, 0, 0, 0], strides=[2, 2]](%input, %4), scope: Model
%6 : Float(1, 15, 50, 50) = onnx::Concat[axis=1](%3, %5), scope: Model
%7 : Float(15, 3, 2, 2) = onnx::Constant[value=<Tensor>](), scope: Model
%output : Float(1, 3, 100, 100) = onnx::ConvTranspose[dilations=[1, 1], group=1, kernel_shape=[2, 2], pads=[0, 0, 0, 0], strides=[2, 2]](%6, %7), scope: Model
return (%output);
}
As for the last question about the recommended way to deploy on iOS I can't answer that since I don't have experience in that area.

Prepare Decoder of a Sequence to Sequence Network in PyTorch

I was working with sequence-to-sequence models in PyTorch. A sequence-to-sequence model comprises an Encoder and a Decoder.
The Encoder converts a (batch_size X input_features X num_of_one_hot_encoded_classes) tensor into (batch_size X input_features X hidden_size).
The Decoder takes this input sequence and converts it into (batch_size X output_features X num_of_one_hot_encoded_classes).
For example, I would need to convert the 22 input features to 10 output features. In Keras it could be done with a RepeatVector(10).
An Example -
model.add(LSTM(256, input_shape=(22, 98)))
model.add(RepeatVector(10))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True))
Although I'm not sure if it's the proper way to convert the input sequences into the output ones.
So, my question is -
What's the standard way to convert the input sequences to output ones, e.g. converting from (batch_size, 22, 98) -> (batch_size, 10, 98)? Or how should I prepare the Decoder?
Encoder code snippet (written in PyTorch):
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=1, batch_first=True)

    def forward(self, input):
        output, hidden = self.lstm(input)
        return output, hidden
Well, you have two options. The first one is to repeat the encoder's last state 10 times and give it as input to the decoder, like this:
import torch
input = torch.randn(64, 22, 98)
encoder = torch.nn.LSTM(98, 256, batch_first=True)
encoded, _ = encoder(input)
decoder_input = encoded[:, -1:].repeat(1, 10, 1)
decoder = torch.nn.LSTM(256, 98, batch_first=True)
decoded, _ = decoder(decoder_input)
print(decoded.shape) #torch.Size([64, 10, 98])
Another option is to use an attention mechanism, like this:
# assuming we have obtained the encoded sequence and declared the decoder as before
attention_calculator = torch.nn.Conv1d(256 + 98, 1, kernel_size=1)
hidden = (torch.zeros(1, 64, 98), torch.zeros(1, 64, 98))
outputs = []
for i in range(10):
    attention_input = torch.cat([hidden[0][0][:, None, :].expand(-1, 22, -1), encoded], dim=2).permute(0, 2, 1)
    attention_value = torch.nn.functional.softmax(attention_calculator(attention_input).squeeze(), dim=1)
    decoder_input = (attention_value[:, :, None] * encoded).sum(dim=1, keepdim=True)
    output, hidden = decoder(decoder_input, hidden)
    outputs.append(output)
outputs = torch.cat(outputs, dim=1)  # torch.Size([64, 10, 98])

Pytorch: How to compute IoU (Jaccard Index) for semantic segmentation

Can someone provide a toy example of how to compute IoU (intersection over union) for semantic segmentation in pytorch?
As of 2021, there's no need to implement your own IoU, as torchmetrics comes equipped with it (see the torchmetrics documentation).
It is named torchmetrics.JaccardIndex (previously torchmetrics.IoU) and calculates what you want.
It works with PyTorch and PyTorch Lightning, also with distributed training.
From the documentation:
torchmetrics.JaccardIndex(num_classes, ignore_index=None, absent_score=0.0, threshold=0.5, multilabel=False, reduction='elementwise_mean', compute_on_step=None, **kwargs)
Computes Intersection over union, or Jaccard index calculation:
J(A,B) = \frac{|A\cap B|}{|A\cup B|}
Where: A and B are both tensors of the same size, containing integer class values. They may be subject to conversion from input data (see description below). Note that it is different from box IoU.
Works with binary, multiclass and multi-label data. Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target.
Forward accepts
preds (float or long tensor): (N, ...) or (N, C, ...) where C is the number of classes
target (long tensor): (N, ...)
If preds and target are the same shape and preds is a float tensor, we use the self.threshold argument to convert into integer labels. This is the case for binary and multi-label probabilities. If preds has an extra dimension as in the case of multi-class scores we perform an argmax on dim=1.
Official example:
>>> from torchmetrics import JaccardIndex
>>> target = torch.randint(0, 2, (10, 25, 25))
>>> pred = torch.tensor(target)
>>> pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15]
>>> jaccard = JaccardIndex(num_classes=2)
>>> jaccard(pred, target)
tensor(0.9660)
I found this somewhere and adapted it for me. I'll post the link if I can find it again. Sorry in case this is a duplicate.
The key function here is the function called iou. The wrapping function evaluate_performance is not universal, but it shows that one needs to iterate over all results before computing IoU.
import torch
import numpy as np
import pandas as pd  # For filelist reading
from torch.autograd import Variable
import myPytorchDatasetClass  # Custom dataset class, inherited from torch.utils.data.dataset

def iou(pred, target, n_classes=12):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    # Ignore IoU for background class ("0")
    for cls in range(1, n_classes):  # This goes from 1:n_classes-1 -> class "0" is ignored
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = (pred_inds[target_inds]).long().sum().item()  # Cast to long to prevent overflows
        union = pred_inds.long().sum().item() + target_inds.long().sum().item() - intersection
        if union == 0:
            ious.append(float('nan'))  # If there is no ground truth, do not include in evaluation
        else:
            ious.append(float(intersection) / float(max(union, 1)))
    return np.array(ious)

def evaluate_performance(net):
    # Dataloader for test data
    batch_size = 1
    filelist_name_test = '/path/to/my/test/filelist.txt'
    data_root_test = '/path/to/my/data/'
    dset_test = myPytorchDatasetClass.CustomDataset(filelist_name_test, data_root_test)
    test_loader = torch.utils.data.DataLoader(dataset=dset_test,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    data_info = pd.read_csv(filelist_name_test, header=None)
    num_test_files = data_info.shape[0]
    sample_size = num_test_files
    # Containers for results
    preds = Variable(torch.zeros((sample_size, 60, 36, 60)))
    gts = Variable(torch.zeros((sample_size, 60, 36, 60)))
    dataiter = iter(test_loader)
    for i in range(sample_size):
        images, labels, filename = next(dataiter)
        images = Variable(images).cuda()
        labels = Variable(labels)
        gts[i:i+batch_size, :, :, :] = labels
        outputs = net(images)
        outputs = outputs.permute(0, 2, 3, 4, 1).contiguous()
        val, pred = torch.max(outputs, 4)
        preds[i:i+batch_size, :, :, :] = pred.cpu()
    acc = iou(preds, gts)
    return acc
Say your outputs are of shape [32, 256, 256] (32 is the minibatch size and 256x256 is the image's height and width), and the labels have the same shape.
Then you can use sklearn's jaccard_similarity_score after some reshaping.
If both are torch tensors, then:
lbl = labels.cpu().numpy().reshape(-1)
target = output.cpu().numpy().reshape(-1)
Now:
from sklearn.metrics import jaccard_similarity_score as jsc
print(jsc(target,lbl))
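Note that recent scikit-learn versions have replaced jaccard_similarity_score with jaccard_score, which requires an averaging mode for multiclass labels (a sketch under that assumption):
from sklearn.metrics import jaccard_score

# lbl: flattened ground-truth labels, target: flattened predictions (both from above)
print(jaccard_score(lbl, target, average='macro'))  # per-class IoU averaged over the classes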
