How to convert torch.norm to cosine distance - pytorch

I want to switch from the L2-norm distance to cosine distance. Can someone help me convert this function to use cosine distance?
def feat_prototype_distance(self, feat):
    N, C, H, W = feat.shape
    feat_proto_distance = -torch.ones((N, self.class_numbers, H, W)).to(feat.device)
    for i in range(self.class_numbers):
        feat_proto_distance[:, i, :, :] = torch.norm(self.objective_vectors[i].reshape(-1, 1, 1).expand(-1, H, W) - feat, 2, dim=1)
    return feat_proto_distance
This is the original function, which uses the norm distance. The relevant shapes are:
self.objective_vectors[i].reshape(-1,1,1).expand(-1, H, W): torch.Size([256, 128, 224])
feat: torch.Size([8, 256, 128, 224]), where 8 is the batch size

You could use torch.nn.CosineSimilarity.
The structure of your code is not very clear to me, but you could probably do something like:
def feat_prototype_distance(self, feat):
    distance_metric = torch.nn.CosineSimilarity(dim=1)
    N, C, H, W = feat.shape
    feat_proto_distance = -torch.ones((N, self.class_numbers, H, W)).to(feat.device)
    for i in range(self.class_numbers):
        feat_proto_distance[:, i, :, :] = distance_metric(self.objective_vectors[i].reshape(-1, 1, 1).expand(-1, H, W), feat)
    return feat_proto_distance
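Note that cosine similarity lies in [-1, 1]; if you really need a distance, use 1 minus the similarity. For reference, here is a minimal vectorized sketch that drops the per-class loop, assuming self.objective_vectors can be stacked into a (class_numbers, C) tensor (names follow the snippet above):

import torch
import torch.nn.functional as F

def feat_prototype_cosine(objective_vectors, feat):
    # objective_vectors: (K, C) class prototypes, feat: (N, C, H, W) features
    proto = F.normalize(objective_vectors, dim=1)   # unit-length prototypes, (K, C)
    feat = F.normalize(feat, dim=1)                 # unit-length features along the channel dim
    # einsum contracts the channel dimension: result is (N, K, H, W) cosine similarities
    return torch.einsum('kc,nchw->nkhw', proto, feat)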

Related

How to implement pair-wise calculation of attention within a batch?

Suppose I have the following code to calculate source-target attention for two variables, x and y:
import math
from typing import Optional

import numpy as np
import torch
import torch.nn as nn

class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer

    :param int n_head: the number of heads
    :param int n_feat: the number of features
    :param float dropout_rate: dropout rate
    """
    def __init__(self, n_head: int, n_feat: int, dropout_rate: float):
        super(MultiHeadedAttention, self).__init__()
        assert n_feat % n_head == 0
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute 'Scaled Dot Product Attention'.

        :param torch.Tensor query: (batch, x_len, size)
        :param torch.Tensor key: (batch, y_len, size)
        :param torch.Tensor value: (batch, y_len, size)
        :param torch.Tensor mask: (batch, x_len, y_len)
        :return torch.Tensor: attended and transformed `value` (batch, x_len, depth),
            weighted by the query-key attention (batch, head, x_len, y_len)
        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, x_len, d_k)
        k = k.transpose(1, 2)  # (batch, head, y_len, d_k)
        v = v.transpose(1, 2)  # (batch, head, y_len, d_k)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(
            self.d_k
        )  # (batch, head, x_len, y_len)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, x_len, y_len)
            mask = mask.to(device=scores.device)
            scores = scores.masked_fill_(mask, -np.inf)
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0
            )  # (batch, head, x_len, y_len)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, x_len, y_len)
        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, v)  # (batch, head, x_len, d_k)
        x = (
            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
        )  # (batch, x_len, depth)
        return self.linear_out(x)  # (batch, x_len, depth)
So this class calculates attention for batch_size = B pairs (x, y)_i and gives an output of dim (batch, x_len, depth). So far so good.
The question is: what if I wanted to extend this class to calculate NOT ONLY (x1, y1), (x2, y2)..., but also all combinations of x and y, i.e. (x1, y2), (x1, y3)... within the batch, so that I get an output of dim (batch, batch, x_len, depth) WITHOUT LOOPING? How would you implement this? Any recommendation, suggestion, or example is appreciated.
EDITED
I just came up with an idea that does the desired job at the expense of extra memory: simply copy x and y along the batch dimension so that they represent all pairs of x_i and y_j. Specifically:
b = torch.tensor(list(range(batch_size)))
comb = torch.cartesian_prod(b, b)
x = x[comb[:, 0], :, :]
y = y[comb[:, 1], :, :]
and then, after the calculation, view or reshape the first dimension so that the output has dim (batch_size, batch_size, x_len, depth).
I have tested this on a toy example and am quite sure it does the job.
Unfortunately, in my case it runs out of CUDA memory.
What would you do in this situation? Should I give up on parallelism and just use a loop to make it work?
If I understand you correctly, you might want to check out torch.cdist, which is a torch implementation of pairwise distances, similar to scipy.spatial.distance.cdist. You might have to do some tweaking of your tensor dimensions, as described in the torch.cdist documentation.
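For illustration, a small sketch of how torch.cdist can expose all cross-batch pairs by flattening the batch and length dimensions first (shapes are toy values, and this yields pairwise Euclidean distances rather than full attention):

import torch

batch, x_len, y_len, feat = 4, 6, 7, 16
x = torch.randn(batch, x_len, feat)
y = torch.randn(batch, y_len, feat)

# Flatten so cdist sees one big set of rows per side, then reshape to expose the pairs.
d = torch.cdist(x.reshape(-1, feat), y.reshape(-1, feat), p=2)  # (batch*x_len, batch*y_len)
d = d.view(batch, x_len, batch, y_len).permute(0, 2, 1, 3)      # (batch, batch, x_len, y_len)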

PyTorch gradient computing of multidim tensor

Is there an easier way of computing the gradients of a matrix in PyTorch?
import torch
from torch.autograd import grad

x = torch.linspace(0, 1, 50, requires_grad=True).view(-1, 1)
inputs = torch.hstack([torch.cos(w * x) for w in range(5)])
d_inputs = torch.hstack([grad(f.sum(), x, create_graph=True, retain_graph=True)[0] for f in inputs.T])
Thank you.
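For comparison, a sketch using torch.autograd.functional.jacobian: it materializes the full Jacobian, and because each output row depends only on its matching input row, the same per-sample derivatives sit on its diagonal (this trades extra memory for not calling grad per column):

import torch

x = torch.linspace(0, 1, 50).view(-1, 1)

def features(x):
    return torch.hstack([torch.cos(w * x) for w in range(5)])

# Full Jacobian: shape (50, 5, 50, 1). Each output row depends only on the
# matching input row, so the per-sample derivatives lie on the diagonal.
J = torch.autograd.functional.jacobian(features, x)
d_inputs = torch.stack([J[:, w, :, 0].diagonal() for w in range(5)], dim=1)  # (50, 5)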

Computer vision - 2D Convolution with Pytorch

Yesterday I saw an exercise along with its solution.
The text:
Your code will take an input tensor input with shape (n, iC, H, W) and a kernel kernel with shape (oC, iC, kH, kW). It then needs to apply a 2D convolution over input, using kernel as the kernel tensor with no bias, a stride of 1, no dilation, no grouping, and no padding, and store the result in out. Both input and kernel have dtype torch.float32.
The solution:
#set-up code
import random
import torch

n = random.randint(2, 6)
iC = random.randint(2, 6)
oC = random.randint(2, 6)
H = random.randint(10, 20)
W = random.randint(10, 20)
kH = random.randint(2, 6)
kW = random.randint(2, 6)
input = torch.rand(n, iC, H, W, dtype=torch.float32)
kernel = torch.rand(oC, iC, kH, kW, dtype=torch.float32)

#solution code
oH, oW = H - (kH - 1), W - (kW - 1)
out = torch.zeros((n, oC, oH, oW), dtype=torch.float32)
for i in range(oH):
    for j in range(oW):
        inp = input.unsqueeze(1)[:, :, :, i:i+kH, j:j+kW]  # shape inp => (n, 1, iC, kH, kW)
        ker = kernel.unsqueeze(0)                          # shape ker => (1, oC, iC, kH, kW)
        out[:, :, i, j] = (inp*ker).sum((-1, -2, -3))      # ??
My question is:
Why do we apply unsqueeze() in this manner?
I know how unsqueeze() works, but I can't figure out what problem this particular unsqueeze() solves.
Just for a visual reference of the convolution (image omitted).
Thanks!
So a bit of elaboration on the comment I made.
First, if we unroll this ALL the way, we can represent this convolution operation with the following septuple nested for-loop.
out = torch.zeros((n, oC, oH, oW), dtype=torch.float32)
for out_i in range(oH):
    for out_j in range(oW):
        for b_idx in range(n):
            for out_ch in range(oC):
                for in_ch in range(iC):
                    for ker_i in range(kH):
                        for ker_j in range(kW):
                            out[b_idx, out_ch, out_i, out_j] += \
                                input[b_idx, in_ch, out_i + ker_i, out_j + ker_j] \
                                * kernel[out_ch, in_ch, ker_i, ker_j]
Of course this is probably going to be pretty slow. Instead we can aggregate the inner three for-loops into a single operation that takes a kH by kW slice of the input tensor spanning all the input channels and multiply that by a slice of the kernel at the desired output channel.
out = torch.zeros((n, oC, oH, oW), dtype=torch.float32)
for out_i in range(oH):
    for out_j in range(oW):
        for b_idx in range(n):
            for out_ch in range(oC):
                # input_slice -> [iC, kH, kW]
                input_slice = input[b_idx, :, out_i:out_i+kH, out_j:out_j+kW]
                # kernel_slice -> [iC, kH, kW]
                kernel_slice = kernel[out_ch, :, :, :]
                out[b_idx, out_ch, out_i, out_j] = (input_slice * kernel_slice).sum()
Observe that in this latest version the input_slice is taken from a single batch index (b_idx) and the kernel_slice is taken from a single output channel (out_ch). We compute all combinations of b_idx and out_ch to fill the output. When we see this type of pattern, broadcasting should come to mind.
First, if we just took the input slice over all the batches (e.g. input[:, :, out_i:out_i + kH, out_j:out_j + kW]) then this would be a [n, iC, kH, kW] tensor. And since the kernel has shape [oC, iC, kH, kW], these can't be broadcast together because they don't agree in the first dimension. To deal with this we need to insert some unitary dimensions so that they agree everywhere both have non-unitary dimensions.
Since we want the output of the broadcasted product to be reduced and stored in out, which has shape [n, oC, ...], we want to insert the unitary dimensions as follows:
out = torch.zeros((n, oC, oH, oW), dtype=torch.float32)
for out_i in range(oH):
    for out_j in range(oW):
        # input_slice -> [n, 1, iC, kH, kW]
        input_slice = input[:, :, out_i:out_i+kH, out_j:out_j+kW].unsqueeze(1)
        # kernel_slice -> [1, oC, iC, kH, kW]
        kernel_slice = kernel[:, :, :, :].unsqueeze(0)
        # broadcasting shape [n, 1, ...] against shape [1, oC, ...] -> [n, oC, ...]
        # therefore prod_slice -> [n, oC, iC, kH, kW]
        prod_slice = input_slice * kernel_slice
        # sum over the last three dimensions, producing reduced_slice -> [n, oC]
        reduced_slice = prod_slice.sum((-1, -2, -3))
        out[:, :, out_i, out_j] = reduced_slice
Note that we could have achieved a valid broadcast by using .unsqueeze(0) on the input slice and .unsqueeze(1) on the kernel slice. However, this would have resulted in reduced_slice being shape [oC, n] instead of [n, oC] which would have been the transpose of what we wanted to store in out.
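As a quick sanity check (a sketch using the same variable names as above), the loop version should match PyTorch's built-in convolution with stride 1, no padding, no dilation, no groups, and no bias:

import torch.nn.functional as F

# Compare the hand-rolled result against the reference implementation.
ref = F.conv2d(input, kernel)
print(torch.allclose(out, ref, atol=1e-4))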

Using autograd to compute Jacobian matrix of outputs with respect to inputs

I apologize if this question is obvious or trivial. I am very new to pytorch and I am trying to understand the autograd.grad function in pytorch. I have a neural network G that takes in inputs (x,t) and outputs (u,v). Here is the code for G:
import torch
import torch.nn as nn

class GeneratorNet(torch.nn.Module):
    """
    A three hidden-layer generative neural network
    """
    def __init__(self):
        super(GeneratorNet, self).__init__()
        self.hidden0 = nn.Sequential(
            nn.Linear(2, 100),
            nn.LeakyReLU(0.2)
        )
        self.hidden1 = nn.Sequential(
            nn.Linear(100, 100),
            nn.LeakyReLU(0.2)
        )
        self.hidden2 = nn.Sequential(
            nn.Linear(100, 100),
            nn.LeakyReLU(0.2)
        )
        self.out = nn.Sequential(
            nn.Linear(100, 2),
            nn.Tanh()
        )

    def forward(self, x):
        x = self.hidden0(x)
        x = self.hidden1(x)
        x = self.hidden2(x)
        x = self.out(x)
        return x
Or simply G(x,t) = (u(x,t), v(x,t)), where u(x,t) and v(x,t) are scalar-valued. Goal: compute $\frac{\partial u(x,t)}{\partial x}$ and $\frac{\partial u(x,t)}{\partial t}$. At every training step I have a minibatch of size $100$, so u(x,t) is a [100,1] tensor. Here is my attempt to compute the partial derivatives, where coords is the input (x,t); as in the code below, I added the requires_grad_(True) flag to coords as well:
tensor = GeneratorNet(coords)
tensor.requires_grad_(True)
u, v = torch.split(tensor, 1, dim=1)
du = autograd.grad(u, coords, grad_outputs=torch.ones_like(u), create_graph=True,
                   retain_graph=True, only_inputs=True, allow_unused=True)[0]
du is now a [100,2] tensor.
Question: Is this the tensor of the partials for the 100 input points of the minibatch?
There are similar questions, like computing derivatives of the output with respect to the inputs, but I could not really figure out what's going on. I apologize once again if this is already answered or trivial. Thank you very much.
The code you posted should give you the partial derivative of your first output w.r.t. the input. However, you also have to set requires_grad_(True) on the inputs, as otherwise PyTorch does not build up the computation graph starting at the input and thus it cannot compute the gradient for them.
This version of your code example computes du and dv:
net = GeneratorNet()
coords = torch.randn(10, 2)
coords.requires_grad = True
tensor = net(coords)
u, v = torch.split(tensor, 1, dim=1)
du = torch.autograd.grad(u, coords, grad_outputs=torch.ones_like(u))[0]
dv = torch.autograd.grad(v, coords, grad_outputs=torch.ones_like(v))[0]
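Since coords stacks the two inputs column-wise, the columns of du are the two partials (a small note, assuming column 0 of coords holds x and column 1 holds t):

du_dx = du[:, 0]  # ∂u/∂x for each sample in the batch
du_dt = du[:, 1]  # ∂u/∂t for each sample in the batch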
You can also compute the partial derivative for a single output:
net = GeneratorNet()
coords = torch.randn(10, 2)
coords.requires_grad = True
tensor = net(coords)
u, v = torch.split(tensor, 1, dim=1)
du_0 = torch.autograd.grad(u[0], coords)[0]
where du_0 == du[0].

Embedding 3D data in Pytorch

I want to implement character-level embedding.
This is the usual word embedding.
Word Embedding
Input: [ [‘who’, ‘is’, ‘this’] ]
-> [ [3, 8, 2] ] # (batch_size, sentence_len)
-> // Embedding(Input) # (batch_size, seq_len, embedding_dim)
This is what I want to do.
Character Embedding
Input: [ [ [‘w’, ‘h’, ‘o’, 0], [‘i’, ‘s’, 0, 0], [‘t’, ‘h’, ‘i’, ‘s’] ] ]
-> [ [ [2, 3, 9, 0], [ 11, 4, 0, 0], [21, 10, 8, 9] ] ] # (batch_size, sentence_len, word_len)
-> // Embedding(Input) # (batch_size, sentence_len, word_len, embedding_dim)
-> // sum each character embeddings # (batch_size, sentence_len, embedding_dim)
The final output shape is the same as the word embedding, because I want to concatenate them later.
I tried, but I am not sure how to implement a 3-D embedding. Do you know how to handle data like this?
def forward(self, x):
    print('x', x.size())  # (N, seq_len, word_len)
    bs = x.size(0)
    seq_len = x.size(1)
    word_len = x.size(2)
    embd_list = []
    for i, elm in enumerate(x):
        tmp = torch.zeros(1, word_len, self.embd_size)
        for chars in elm:
            tmp = torch.add(tmp, 1.0, self.embedding(chars.unsqueeze(0)))
The above code raises an error because the output of self.embedding is a Variable.
TypeError: torch.add received an invalid combination of arguments - got (torch.FloatTensor, float, Variable), but expected one of:
* (torch.FloatTensor source, float value)
* (torch.FloatTensor source, torch.FloatTensor other)
* (torch.FloatTensor source, torch.SparseFloatTensor other)
* (torch.FloatTensor source, float value, torch.FloatTensor other)
didn't match because some of the arguments have invalid types: (torch.FloatTensor, float, Variable)
* (torch.FloatTensor source, float value, torch.SparseFloatTensor other)
didn't match because some of the arguments have invalid types: (torch.FloatTensor, float, Variable)
Update
I could do this, but the for loops are not efficient for batched input. Do you know a more efficient way?
def forward(self, x):
    print('x', x.size())  # (N, seq_len, word_len)
    bs = x.size(0)
    seq_len = x.size(1)
    word_len = x.size(2)
    embd = Variable(torch.zeros(bs, seq_len, self.embd_size))
    for i, elm in enumerate(x):  # every sample
        for j, chars in enumerate(elm):  # every sentence. [ [‘w’, ‘h’, ‘o’, 0], [‘i’, ‘s’, 0, 0], [‘t’, ‘h’, ‘i’, ‘s’] ]
            chars_embd = self.embedding(chars.unsqueeze(0))  # (1, word_len, embd_size), e.g. [‘w’,‘h’,‘o’,0]
            chars_embd = torch.sum(chars_embd, 1)  # (1, embd_size). sum each char's embedding
            embd[i, j] = chars_embd[0]  # set char_embd as word-like embedding
    x = embd  # (N, seq_len, embd_dim)
Update2
This is my final code. Thank you, Wasi Ahmad!
def forward(self, x):
    # x: (N, seq_len, word_len)
    input_shape = x.size()
    bs = x.size(0)
    seq_len = x.size(1)
    word_len = x.size(2)
    x = x.view(-1, word_len)      # (N*seq_len, word_len)
    x = self.embedding(x)         # (N*seq_len, word_len, embd_size)
    x = x.view(*input_shape, -1)  # (N, seq_len, word_len, embd_size)
    x = x.sum(2)                  # (N, seq_len, embd_size)
    return x
I am assuming you have a 3d tensor of shape BxSxW where:
B = Batch size
S = Sentence length
W = Word length
And you have declared embedding layer as follows.
self.embedding = nn.Embedding(dict_size, emsize)
Where:
dict_size = No. of unique characters in the training corpus
emsize = Expected size of embeddings
So, now you need to convert the 3d tensor of shape BxSxW to a 2d tensor of shape BSxW and give it to the embedding layer.
emb = self.embedding(input_rep.view(-1, input_rep.size(2)))
The shape of emb will be BSxWxE where E is the embedding size. You can convert the resulting 3d tensor to a 4d tensor as follows.
emb = emb.view(*input_rep.size(), -1)
The final shape of emb will be BxSxWxE which is what you are expecting.
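To make the reshape trick concrete, here is a tiny end-to-end sketch with toy sizes (the values are only illustrative):

import torch
import torch.nn as nn

B, S, W, dict_size, emsize = 2, 3, 4, 30, 8   # toy sizes, for illustration only
embedding = nn.Embedding(dict_size, emsize)
input_rep = torch.randint(0, dict_size, (B, S, W))

emb = embedding(input_rep.view(-1, input_rep.size(2)))  # (B*S, W, E)
emb = emb.view(*input_rep.size(), -1)                   # (B, S, W, E)
word_emb = emb.sum(2)                                   # (B, S, E): sum char embeddings per word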
What you are looking for is implemented in the allennlp TimeDistributed layer.
Here is a demonstration:
import torch
from allennlp.modules.time_distributed import TimeDistributed

batch_size = 16
sent_len = 30
word_len = 5
char_vocab_size = 100  # example value: number of distinct characters
char_emd_dim = 8       # example value: character embedding size
char_pad_idx = 0       # example value: padding index
Consider an input sentence (character indices, so it must be an integer tensor):
sentence = torch.randint(0, char_vocab_size, (batch_size, sent_len, word_len))  # suppose this is your data
Define a char embedding layer (suppose the input is already padded):
char_embedding = torch.nn.Embedding(char_vocab_size, char_emd_dim, padding_idx=char_pad_idx)
Wrap it!
embedding_sentence = TimeDistributed(char_embedding)(sentence)  # shape: (batch_size, sent_len, word_len, char_emb_dim)
embedding_sentence has shape batch_size, sent_len, word_len, char_emb_dim
Actually, you can easily redefine a module in PyTorch to do this.
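For instance, a minimal sketch of such a wrapper (not the AllenNLP implementation, just the idea): merge the batch and sentence dimensions, apply the wrapped module, then restore them.

import torch
import torch.nn as nn

class SimpleTimeDistributed(nn.Module):
    """Apply `module` to a (batch, time, ...) input by folding time into the batch."""
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        batch, time = x.shape[0], x.shape[1]
        out = self.module(x.reshape(batch * time, *x.shape[2:]))
        return out.view(batch, time, *out.shape[1:])

# Usage sketch: character indices -> (batch, sent_len, word_len, emb_dim)
emb = SimpleTimeDistributed(nn.Embedding(100, 8, padding_idx=0))
chars = torch.randint(0, 100, (16, 30, 5))
print(emb(chars).shape)  # torch.Size([16, 30, 5, 8])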
