ValueError: optimizer got an empty parameter list - pytorch

I created the following simple linear class:
import torch.nn as nn
import torch.nn.functional as F

class Decoder(nn.Module):
    def __init__(self, K, h=()):
        super().__init__()
        h = (K,) + h + (K,)
        self.layers = [nn.Linear(h1, h2) for h1, h2 in zip(h, h[1:])]

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        return self.layers[-1](x)
However, when I try to pass the parameters to an optimizer, I get the error ValueError: optimizer got an empty parameter list.
decoder = Decoder(4)
LR = 1e-3
opt = optim.Adam(decoder.parameters(), lr=LR)
Is there something I'm doing obviously wrong with the class definition?

Since you store your layers in a regular Python list inside your Decoder, PyTorch has no way of telling that the members of this list are actually submodules. Convert the list to PyTorch's nn.ModuleList and your problem will be solved:
class Decoder(nn.Module):
    def __init__(self, K, h=()):
        super().__init__()
        h = (K,) + h + (K,)
        self.layers = nn.ModuleList(nn.Linear(h1, h2) for h1, h2 in zip(h, h[1:]))
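A quick sanity check, as a minimal sketch: once the layers live in an nn.ModuleList, decoder.parameters() is no longer empty and the optimizer can be constructed.
import torch.optim as optim

decoder = Decoder(4)
print(sum(p.numel() for p in decoder.parameters()))  # non-zero: the Linear layers are now registered
opt = optim.Adam(decoder.parameters(), lr=1e-3)      # no longer raises ValueError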

Related

Does the official Keras sample code for a Transformer applied to time series contain a Position Embedding part?

The sample code I am referring to: https://keras.io/examples/timeseries/timeseries_transformer_classification/
I could not find any description of "Position Embedding" anywhere on that page. When I looked at Transformers applied in NLP, I could clearly see the class named "TokenAndPositionEmbedding".
If the sample code does not contain "Position Embedding", how can I apply a position embedding to time series in that sample code?
From what I can tell it does not contain the positional embedding. Something like this should work.
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding

class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=output_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        # Classic sinusoidal encoding: even columns get sin, odd columns get cos
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d / 2)):
                denominator = np.power(n, 2 * i / d)
                P[k, 2 * i] = np.sin(k / denominator)
                P[k, 2 * i + 1] = np.cos(k / denominator)
        return P

    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices
This class originated from https://machinelearningmastery.com/the-transformer-positional-encoding-layer-in-keras-part-2/
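A hypothetical usage sketch (the sequence_length, vocab_size, and output_dim values below are made up): the layer expects a batch of integer token indices and returns the summed word and position embeddings.
layer = PositionEmbeddingFixedWeights(sequence_length=128, vocab_size=1000, output_dim=64)
tokens = tf.random.uniform((2, 128), maxval=1000, dtype=tf.int32)  # (batch, seq_len) of token ids
out = layer(tokens)  # shape (2, 128, 64)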

Custom layer from Keras to PyTorch

Coming from a TensorFlow background, I am trying to convert a snippet of code of a custom layer from Keras to PyTorch.
The custom layer in Keras looks like this:
import tensorflow as tf
from tensorflow.keras import backend as K

class Attention_module(tf.keras.layers.Layer):
    def __init__(self, class_num):
        super(Attention_module, self).__init__(class_num)
        self.class_num = class_num
        self.Ws = None

    def build(self, input_shape):
        embedding_length = int(input_shape[2])
        self.Ws = self.add_weight(shape=(self.class_num, embedding_length),
                                  initializer=tf.keras.initializers.get('glorot_uniform'),
                                  trainable=True)
        super(Attention_module, self).build(input_shape)

    def call(self, inputs):
        sentence_trans = tf.transpose(inputs, [0, 2, 1])
        at = tf.matmul(self.Ws, sentence_trans)
        at = tf.math.tanh(at)
        at = K.exp(at - K.max(at, axis=-1, keepdims=True))
        at = at / K.sum(at, axis=-1, keepdims=True)
        v = K.batch_dot(at, inputs)
        return v
I want to implement the same layer in PyTorch; I have already written the forward pass, but I am confused about how to do the weight creation and initialization the same way as in the Keras layer above.
class Attention_module(torch.nn.Module):
    def __init__(self, class_num):
        # how to initialize the weight the same way as in the Keras layer above?
        ...

    def forward(self, inputs):
        sentence_trans = inputs.permute(0, 2, 1)
        at = torch.mm(self.Ws, sentence_trans)
        at = torch.nn.Tanh(at)
        at = torch.exp(at - torch.max(torch.Tensor(at), dim=-1, keepdims=True).values)
        at = at / torch.sum(at, dim=-1, keepdims=True)
        v = torch.einsum('ijk,ikl->ijl', at, inputs)
        return v
Thank you!
import torch

class Attention_module(torch.nn.Module):
    def __init__(self, class_num, input_shape):
        super().__init__()
        self.class_num = class_num
        embedding_length = int(input_shape[2])
        self.Ws = torch.nn.Embedding(num_embeddings=class_num,
                                     embedding_dim=embedding_length)  # Embedding layer
        torch.nn.init.xavier_uniform_(self.Ws.weight)  # Glorot initialization
Here's the reference for layer initialization methods. Xavier init is another name for Glorot init.
The _ at the end of torch.nn.init.xavier_uniform_ is a PyTorch convention that signifies an in-place operation.
You can also use torch.nn.init at runtime. It doesn't have to be within __init__(). Like:
att = Attention_module(class_num, input_shape)
torch.nn.init.xavier_uniform_(att.Ws.weight)
or:
for param in att.parameters():
    torch.nn.init.xavier_uniform_(param)
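For completeness, a minimal sketch of the whole converted module, assuming the forward pass from the question: it stores Ws as a plain nn.Parameter (the direct analogue of the Keras add_weight call), uses torch.tanh in place of the torch.nn.Tanh module class, and swaps the manual exp/max/sum for torch.softmax, which computes the same normalization.
import torch

class Attention_module(torch.nn.Module):
    def __init__(self, class_num, input_shape):
        super().__init__()
        # Plain parameter matrix of shape (class_num, embedding_length)
        self.Ws = torch.nn.Parameter(torch.empty(class_num, int(input_shape[2])))
        torch.nn.init.xavier_uniform_(self.Ws)  # Glorot/Xavier initialization

    def forward(self, inputs):                      # inputs: (batch, seq, emb)
        sentence_trans = inputs.permute(0, 2, 1)    # (batch, emb, seq)
        at = torch.matmul(self.Ws, sentence_trans)  # broadcasts to (batch, class, seq)
        at = torch.tanh(at)
        at = torch.softmax(at, dim=-1)              # same as the manual exp/sum normalization
        return torch.einsum('ijk,ikl->ijl', at, inputs)  # batched dot, like K.batch_dot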

In Pytorch, when transferring to GPU, I get an error "is on CPU, but expected to be on GPU"

Error example: "Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU". I got stuck on this while following the classification tutorial:
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
Note: my code is for regression.
The code is below:
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self, num_features, size_hidden_layer, n_hidden_layer):
        super(Net, self).__init__()
        self.size_hidden_layer = size_hidden_layer
        self.n_hidden_layer = n_hidden_layer
        self.hidden_layers = list()
        self.hidden_layers.append(nn.Linear(num_features, size_hidden_layer))
        for _ in range(n_hidden_layer - 1):
            self.hidden_layers.append(nn.Linear(size_hidden_layer, size_hidden_layer))
        self.last_layer = nn.Linear(size_hidden_layer, 1)

    def forward(self, x):
        for i in range(self.n_hidden_layer):
            x = torch.relu(self.hidden_layers[i](x))
        return self.last_layer(x)
What the tutorial section does not mention is that layers kept in a plain Python list are not registered as submodules, so their parameters are never moved when the model is transferred to the GPU. Wrapping them in nn.Sequential (or nn.ModuleList) registers them; see __init__ below, where the hidden layers are wrapped in nn.Sequential.
class Net(nn.Module):
    def __init__(self, num_features, size_hidden_layer, n_hidden_layer):
        super(Net, self).__init__()
        self.size_hidden_layer = size_hidden_layer
        self.n_hidden_layer = n_hidden_layer
        hidden_layers = list()
        hidden_layers.append(nn.Linear(num_features, size_hidden_layer))
        for _ in range(n_hidden_layer - 1):
            hidden_layers.append(nn.Linear(size_hidden_layer, size_hidden_layer))
        self.hidden_layers = nn.Sequential(*hidden_layers)
        self.last_layer = nn.Linear(size_hidden_layer, 1)

    def forward(self, x):
        for i in range(self.n_hidden_layer):
            x = torch.relu(self.hidden_layers[i](x))
        return self.last_layer(x)
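A minimal usage sketch (the sizes below are made up): with the hidden layers registered through nn.Sequential, net.to(device) moves every parameter, so the model and its inputs live on the same device.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net(num_features=8, size_hidden_layer=32, n_hidden_layer=3).to(device)
x = torch.randn(4, 8, device=device)  # batch of 4 samples with 8 features
out = net(x)                          # no CPU/GPU mismatch error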

optimizer got an empty parameter list (skorch)

I am used to PyTorch and decided to give skorch a shot. Here they define the network as:
import torch.nn as nn
import torch.nn.functional as F

class ClassifierModule(nn.Module):
    def __init__(
            self,
            num_units=10,
            nonlin=F.relu,
            dropout=0.5,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin
        self.dropout = dropout

        self.dense0 = nn.Linear(20, num_units)
        self.nonlin = nonlin
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 2)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = F.relu(self.dense1(X))
        X = F.softmax(self.output(X), dim=-1)
        return X
I prefer passing a list with the number of neurons in each layer, i.e. num_units=[30,15,5,2] would give two hidden layers with 15 and 5 neurons (with 30 features and 2 classes), so I rewrote it to something like this:
class Net(nn.Module):
    def __init__(
            self,
            num_units=[30, 15, 5, 2],
            nonlin=[F.relu, F.relu, F.relu],
            dropout=[0.5, 0.5, 0.5],
    ):
        super(Net, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin    # activation functions
        self.dropout = dropout  # dropout rates in each layer
        self.layers = [nn.Linear(i, p) for i, p in zip(num_units, num_units[1:])]  # dense layers

    def forward(self, X, **kwargs):
        print("Forwards")
        for layer, func, drop in zip(self.layers[:-1], self.nonlin, self.dropout):
            print(layer, func, drop)
            X = drop(func(layer(X)))
        X = F.softmax(X, dim=-1)
        return X
should do the trick. The problem is that when calling
net = NeuralNetClassifier(Net, max_epochs=20, lr=0.1, device="cuda")
net.fit(X, y)
I get the error "ValueError: optimizer got an empty parameter list". I have narrowed it down to the removal of self.output = nn.Linear(10, 2): without it, the net never enters forward, i.e. it seems like output is some kind of "trigger" variable. Is it really the case that the network needs a variable called output (being a layer) at the end, and that we are not free to choose the variable names ourselves?
PyTorch registers only attributes that are themselves nn.Module instances (a plain Python list is not inspected), so changing
self.layers = [nn.Linear(i, p) for i, p in zip(num_units, num_units[1:])]
to
self.layers = nn.ModuleList([nn.Linear(i, p) for i, p in zip(num_units, num_units[1:])])
should work fine.
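As a quick sanity check, a minimal sketch reusing the Net class above: with the nn.ModuleList in place, the module exposes its parameters, so NeuralNetClassifier can build its optimizer. output is not a magic name; any registered submodule works.
net_module = Net()
print(sum(p.numel() for p in net_module.parameters()))  # > 0 once the layers are registered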

How to use the output of sklearn pipeline elements

I have three features:
feature_one -> number of tokens in the given sentence.
feature_two -> number of verbs in the given sentence.
feature_three -> number of tokens minus number of verbs in the given sentence (feature_one - feature_two).
I have written custom transformers for feature_one and feature_two and want to write a custom transformer for feature_three so that I can use the results of feature_one and feature_two when running the pipeline as:
from sklearn.pipeline import Pipeline, FeatureUnion

Pipeline([
    # input to feature_one and feature_two is a list of sentences
    ("feature", FeatureUnion([
        ("feature_one", feature_one_transformer()),
        ("feature_two", feature_two_transformer()),
    ])),
    ("feature_three", feature_three_transformer()),
])
feature_one_transformer:
import pandas
from sklearn.base import BaseEstimator, TransformerMixin

class feature_one_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x, y=None):
        return self

    def transform(self, sentence_list):
        number_of_tokens_in_sentence_list = list()
        for sentence in sentence_list:
            number_of_tokens = compute_number_of_tokens(sentence)
            number_of_tokens_in_sentence_list.append(number_of_tokens)
        return pandas.DataFrame(number_of_tokens_in_sentence_list)
feature_two_transformer:
class feature_two_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x, y=None):
        return self

    def transform(self, sentence_list):
        number_of_verbs_in_sentence_list = list()
        for sentence in sentence_list:
            number_of_verbs = compute_number_of_verbs_in_sentence(sentence)
            number_of_verbs_in_sentence_list.append(number_of_verbs)
        return pandas.DataFrame(number_of_verbs_in_sentence_list)
Can somebody tell me how I should write the custom transformer for feature_three, and how to use it in the pipeline so that I can use the results of the feature_one and feature_two transformers? Thank you.
It's not clear to me why you would want to make this so complicated. I would just use one transformer that does everything you want. Something like this:
class features_transformer(BaseEstimator, TransformerMixin):
    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['number_of_tokens'] = X[self.variable].apply(lambda cell: compute_number_of_tokens(cell))
        X['number_of_verbs'] = X[self.variable].apply(lambda cell: compute_number_of_verbs(cell))
        X['tokens_minus_verbs'] = X['number_of_tokens'] - X['number_of_verbs']
        return X

new_X = features_transformer('sentences').fit_transform(X)
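If you do want to keep the FeatureUnion layout from the question, here is a minimal sketch of feature_three_transformer, assuming the union outputs a two-column array with token counts first and verb counts second; it appends the difference as a third column.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class feature_three_transformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.asarray(X)
        diff = (X[:, 0] - X[:, 1]).reshape(-1, 1)  # tokens minus verbs
        return np.hstack([X, diff])                # keep both counts plus the difference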
