Learnable scalar weight in PyTorch - pytorch

I have two neural networks running in parallel. Each gives a features map of same size say Nx1. Now I want weighted average of these embedding like this w1 * embed1 + w2 * embed2. I have tried these 1 2.But the weights are not updating. Any help would be appreciated. Here is how I am trying to do it:
class LinearWeightedAvg(nn.Module):
def __init__(self, n_inputs):
super(LinearWeightedAvg, self).__init__()
self.weight1 = Variable(torch.randn(1), requires_grad=True).cuda()
self.weight2 = Variable(torch.randn(1), requires_grad=True).cuda()
def forward(self, inp_embed):
return self.weight1 * inp_embed[0] + self.weight2 * inp_embed[1]
class EmbedBranch(nn.Module):
def __init__(self, feat_dim, embedding_dim):
super(EmbedBranch, self).__init__()
fc_layer1 = fc_layer
def forward(self, x):
x = self.fc_layer1(x)
return x
class EmbeddingNetwork(nn.Module):
def __init__(self, args, N):
super(EmbeddingNetwork, self).__init__()
embedding_dim = N
self.embed1 = EmbedBranch(N, N)
self.embed2 = EmbedBranch(N, N)
self.comb_branch = LinearWeightedAvg(metric_dim)
self.args = args
if args.cuda:
self.cuda()
def forward(self, emb1, emb2):
embeds1 = self.text_branch(emb1)
embeds2 = self.image_branch(emb2)
combined = self.comb_branch([embeds1, embeds2])
return combined
def train_forward(self, embed1, embed2):
combined = self(embed1, embed2)
embeds = model.train_forward(embed1, embed2)
loss = loss_func(embeds, labels)
running_loss.update(loss.data.item())
optimizer.zero_grad()
loss.backward()
Also I want the weight to be within 0-1 range.
Thanks,

You should use self.weightx = torch.nn.Parameter(your_inital_tensor) to register a tensor as a learnable parameter of the model.

Related

Does Keras official sample code about Transformer applied in time-series contain Position Embedding part?

The sample code for referring from url:https://keras.io/examples/timeseries/timeseries_transformer_classification/
I could not find out any description about "Position Embedding" content in full page of above url. When I looked through Transformer applied in NLP, I can clearly see the class named "TokenAndPositionEmbedding".
If it does not contain "Position Embedding", how can I apply Position Embedding in time series in sample code?
From what I can tell it does not contain the positional embedding. Something like this should work.
class PositionEmbeddingFixedWeights(Layer):
def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
self.word_embedding_layer = Embedding(
input_dim=vocab_size, output_dim=output_dim,
weights=[word_embedding_matrix],
trainable=False
)
self.position_embedding_layer = Embedding(
input_dim=sequence_length, output_dim=output_dim,
weights=[position_embedding_matrix],
trainable=False
)
def get_position_encoding(self, seq_len, d, n=10000):
P = np.zeros((seq_len, d))
for k in range(seq_len):
for i in np.arange(int(d/2)):
denominator = np.power(n, 2*i/d)
P[k, 2*i] = np.sin(k/denominator)
P[k, 2*i+1] = np.cos(k/denominator)
return P
def call(self, inputs):
position_indices = tf.range(tf.shape(inputs)[-1])
embedded_words = self.word_embedding_layer(inputs)
embedded_indices = self.position_embedding_layer(position_indices)
return embedded_words + embedded_indices
This class originated from https://machinelearningmastery.com/the-transformer-positional-encoding-layer-in-keras-part-2/

Custom layer from keras to pytorch

Coming from TensorFlow background, I am trying to convert a snippet of code of the custom layer from Keras to PyTorch.
The custom layer in Keras looks like this:
class Attention_module(tf.keras.layers.Layer):
def __init__(self, class_num):
super(Attention_module,self).__init__(class_num)
self.class_num = class_num
self.Ws = None
def build(self, input_shape):
embedding_length = int(input_shape[2])
self.Ws = self.add_weight(shape=(self.class_num, embedding_length),
initializer=tf.keras.initializers.get('glorot_uniform'), trainable=True)
super(Attention_module, self).build(input_shape)
def call(self, inputs):
sentence_trans = tf.transpose(inputs, [0, 2, 1])
at = tf.matmul(self.Ws, sentence_trans)
at = tf.math.tanh(at)
at = K.exp(at - K.max(at, axis=-1, keepdims=True))
at = at / K.sum(at, axis=-1, keepdims=True)
v = K.batch_dot(at, inputs)
return v
I want to implement the same in the torch; I have already done the forward pass block but am confused about how to do the embedding and weight initialization the same as the above layer in PyTorch?
class Attention_module(torch.nn.Module):
def __init__(self, class_num):
# how to initialize weight with same as above keras layer?
def forward(self, inputs):
sentence_trans = inputs.permute(0, 2, 1)
at = torch.mm(self.Ws, sentence_trans)
at = torch.nn.Tanh(at)
at = torch.exp(at - torch.max(torch.Tensor(at), dim=-1, keepdims=True).values)
at = at / torch.sum(at, dim = -1, keepdims=True)
v = torch.einsum('ijk,ikl->ijl', at, inputs)
return v
Thank you!
class Attention_module(torch.nn.Module):
def __init__(self, class_num, input_shape):
super().__init__()
self.class_num = class_num
embedding_length = int(input_shape[2])
self.Ws = torch.nn.Embedding(num_embeddings=class_num,
embedding_dim=embedding_length) # Embedding layer
torch.nn.init.xavier_uniform_(self.Ws.weight) # Glorot initialization
Here's the reference for layer initialization methods. Xavier init is another name for Glorot init.
The _ at the end of torch.nn.init.xavier_uniform_ is a pytorch convention that signifies an inplace operation.
You can also use torch.nn.init at runtime. It doesn't have to be within __init__(). Like:
att = Attention_module(class_num, input_shape)
torch.nn.init.xavier_uniform_(att.Ws.weight)
or :
for param in att.parameters():
torch.nn.init.xavier_uniform_(param)

In Pytorch, when transferring to GPU, I get an error "is on CPU, but expected to be on GPU"

Error example: "Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU". I was stuck on the tutorial for classification:
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
Note: The code is for regression.
Code is below:
class Net(nn.Module):
def __init__(self, num_features, size_hidden_layer, n_hidden_layer):
super(Net, self).__init__()
self.size_hidden_layer = size_hidden_layer
self.n_hidden_layer = n_hidden_layer
self.hidden_layers = list()
self.hidden_layers.append(nn.Linear(num_features, size_hidden_layer))
for _ in range(n_hidden_layer-1):
self.hidden_layers.append(nn.Linear(size_hidden_layer, size_hidden_layer))
self.last_layer = nn.Linear(size_hidden_layer, 1)
def forward(self, x):
for i in range(self.n_hidden_layer):
x = torch.relu(self.hidden_layers[i](x))
return self.last_layer(x)
What does the tutorial section not mention is that the parameters have to be wrapped in order to be read by the GPU. For example, look at __init__ where normal and neural network layers are wrapped in nn.Sequential.
class Net(nn.Module):
def __init__(self, num_features, size_hidden_layer, n_hidden_layer):
super(Net, self).__init__()
self.size_hidden_layer = size_hidden_layer
self.n_hidden_layer = n_hidden_layer
hidden_layers = list()
hidden_layers.append(nn.Linear(num_features, size_hidden_layer))
for _ in range(n_hidden_layer-1):
hidden_layers.append(nn.Linear(size_hidden_layer, size_hidden_layer))
self.hidden_layers = nn.Sequential(*hidden_layers)
self.last_layer = nn.Linear(size_hidden_layer, 1)
def forward(self, x):
for i in range(self.n_hidden_layer):
x = torch.relu(self.hidden_layers[i](x))
return self.last_layer(x)

Using Captum with Pytorch Lightning?

So I tried to use Captum with PyTorch Lightning. I am having issues when passing the Module to Captum, since it seems to do weird reshaping of the tensors.
For example in the below minimal example, the lightning code works easy and well.
But when I use IntegratedGradient with "n_step>=1" I get an issue.
The code of the LighningModule is not that important I would say, I wonder more at the code line at the very bottom.
Does anyone know how to work around this?
from captum.attr import IntegratedGradients
from torch import nn, optim, rand, sum as tsum, reshape, device
import torch.nn.functional as F
from pytorch_lightning import seed_everything, LightningModule, Trainer
from torch.utils.data import DataLoader, Dataset
SAMPLE_DIM = 3
class CustomDataset(Dataset):
def __init__(self, samples=42):
self.dataset = rand(samples, SAMPLE_DIM).cuda().float() * 2 - 1
def __getitem__(self, index):
return (self.dataset[index], (tsum(self.dataset[index]) > 0).cuda().float())
def __len__(self):
return self.dataset.size()[0]
class OurModel(LightningModule):
def __init__(self):
super(OurModel, self).__init__()
# Network layers
self.linear = nn.Linear(SAMPLE_DIM, 2048)
self.linear2 = nn.Linear(2048, 1)
self.output = nn.Sigmoid()
# Hyper-parameters, that we will auto-tune using lightning!
self.lr = 0.001
self.batch_size = 512
def forward(self, x):
x = self.linear(x)
x = self.linear2(x)
output = self.output(x)
return reshape(output, (-1,))
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.lr)
def train_dataloader(self):
loader = DataLoader(CustomDataset(samples=1000), batch_size=self.batch_size, shuffle=True)
return loader
def training_step(self, batch, batch_nb):
x, y = batch
loss = F.binary_cross_entropy(self(x), y)
return {'loss': loss, 'log': {'train_loss': loss}}
if __name__ == '__main__':
seed_everything(42)
device = device("cuda")
model = OurModel().to(device)
trainer = Trainer(max_epochs=2, min_epochs=1, auto_lr_find=False,
progress_bar_refresh_rate=10)
trainer.fit(model)
# ok Now the Problem
test_input = CustomDataset(samples=1).__getitem__(0)[0].requires_grad_()
ig = IntegratedGradients(model)
attr, delta = ig.attribute(test_input, target=1, return_convergence_delta=True)
The solution was to wrap the forward function. Make sure that the shape going into the mode.foward() is correct!
# Solution is this wrapper function
def modified_f(in_vec):
# Shape here is wrong
print("IN:", in_vec.size())
x = torch.reshape(in_vec, (int(in_vec.size()[0]/SAMPLE_DIM), SAMPLE_DIM))
print("x:", x.size())
res = model.forward(x)
print("res:", res.size())
res = torch.reshape(res, (res.size()[0], 1))
print("res2:", res.size())
return res
ig = IntegratedGradients(modified_f)
attr, delta = ig.attribute(test_input, return_convergence_delta=True, n_steps=STEP_AMOUNT)

optimizer got an empty parameter list (skorch)

So, I am used to use PyTorch and now decided to give Skorch a shot.
Here they define the network as
class ClassifierModule(nn.Module):
def __init__(
self,
num_units=10,
nonlin=F.relu,
dropout=0.5,
):
super(ClassifierModule, self).__init__()
self.num_units = num_units
self.nonlin = nonlin
self.dropout = dropout
self.dense0 = nn.Linear(20, num_units)
self.nonlin = nonlin
self.dropout = nn.Dropout(dropout)
self.dense1 = nn.Linear(num_units, 10)
self.output = nn.Linear(10, 2)
def forward(self, X, **kwargs):
X = self.nonlin(self.dense0(X))
X = self.dropout(X)
X = F.relu(self.dense1(X))
X = F.softmax(self.output(X), dim=-1)
return X
I prefer inputting lists of neurons in each layer i.e num_units=[30,15,5,2] would have 2 hidden layers with 15 and 5 neurons. Furthermore we have 30 features and 2 classes, thus re-writing it to something like this
class Net(nn.Module):
def __init__(
self,
num_units=[30,15,5,2],
nonlin=[F.relu,F.relu,F.relu],
dropout=[0.5,0.5,0.5],
):
super(Net, self).__init__()
self.layer_units = layer_units
self.nonlin = nonlin #Activation function
self.dropout = dropout #Drop-out rates in each layer
self.layers = [nn.Linear(i,p) for i,p in zip(layer_units,layer_units[1:])] #Dense layers
def forward(self, X, **kwargs):
print("Forwards")
for layer,func,drop in zip(self.layers[:-1],self.nonlin,self.dropout):
print(layer,func,drop)
X=drop(func(layer(X)))
X = F.softmax(X, dim=-1)
return X
should do the trick. The problem is that when calling
net = NeuralNetClassifier(Net,max_epochs=20,lr=0.1,device="cuda")
net.fit(X,y)
I get the error "ValueError: optimizer got an empty parameter list". I have narrowed it down to the removal of self.output = nn.Linear(10, 2) simply makes the net not enter forward i.e it seems like output is some kind of "trigger" variable. Is that really the case the network need a variable called output (being a layer) at the end, and that we are not free to define the variable-names ourself ?
Pytorch will look for subclasses of nn.Module, so changing
self.layers = [nn.Linear(i,p) for i,p in zip(layer_units,layer_units[1:])]
to
self.layers = nn.ModuleList([nn.Linear(i,p) for i,p in zip(layer_units,layer_units[1:])])
should work fine

Resources