I have a question about getting all parameters of the network. My network is defined as follow:
activation = nn.ReLU()
class OneInputBasis(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.l1 = nn.Linear(200, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_last).to(device)
def forward(self, v):
v = activation ( self.l1(v) )
v = ( self.l4(v) )
return v
and
class node(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.set_lay = []
for jj in range(dim_output_space_basis):
self.set_lay.append(OneInputBasis())
def forward(self, v):
w = self.set_lay[0](v)
for ii in range(dim_output_space_basis-1):
w = torch.cat((w, self.set_lay[ii+1](v)), dim = 1 )
return w
and
class mesh(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.l3 = nn.Linear(2, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_b).to(device)
self.l7 = nn.Linear(100,10, bias = bo_last).to(device)
def forward(self, w):
w = activation ( self.l3(w) )
w = activation ( self.l4(w) )
w = ( self.l7(w) )
return w
finally, I have
activation = nn.ReLU()
class Test(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
self.top = node()
self.bottom = mesh()
def forward(self, v, w, y):
v = self.top(v)
w = self.bottom(w)
e = torch.bmm(w ,torch.bmm(v, y))
return e[:, :, 0]
Now I define the network:
fnn_adam = Test()
When I print the parameters of the network, as
for p in fnn_adam.parameters():
print(p)
I can only see the parameters associated with fnn_adam.bottom, how can I print out the parameters associated with fnn_adam.top? Are the parameters associated with .top trainable? Thank you!
Calling self.set_lay.append(OneInputBasis()) with the instantiation of node does not register the fully-connected layers
self.l1 = nn.Linear(200, 100, bias = bo_b).to(device)
self.l4 = nn.Linear(100, 100, bias = bo_last).to(device)
to the instance fnn_adam of class Test. This is why the respective parameters do not show up in your code above.
Without loss of generality, I chose
import torch
import torch.nn as nn
import torch.nn.functional as F
dim_output_space_basis = 2
device ='cpu'
and modified the init method of class node. The remainder of your code is perfectly fine. Please see below:
class node(nn.Module):
def __init__(self):
super().__init__()
bo_b = True
bo_last = False
# self.set_lay = [] # Legacy
attributeNames = ['l_btm{}'.format(i) for i in range(dim_output_space_basis)]
for jj_index, jj in enumerate(range(dim_output_space_basis)):
# self.set_lay.append(OneInputBasis()) # Legacy
setattr(self, attributeNames[jj_index], OneInputBasis())
Now, the parameters register as evidenced by running fnn_adam._modules and observing its output
OrderedDict([('top',
node(
(l_btm0): OneInputBasis(
(l1): Linear(in_features=200, out_features=100, bias=True)
(l4): Linear(in_features=100, out_features=100, bias=False)
)
(l_btm1): OneInputBasis(
(l1): Linear(in_features=200, out_features=100, bias=True)
(l4): Linear(in_features=100, out_features=100, bias=False)
)
)),
('bottom',
mesh(
(l3): Linear(in_features=2, out_features=100, bias=True)
(l4): Linear(in_features=100, out_features=100, bias=True)
(l7): Linear(in_features=100, out_features=10, bias=False)
))])
Related
When I am training a model, I should use only 10% of data for trainer.fit(model,datamodule)
so I should call DataModule just for 10% of data
Part of DataModule is:
class DataModule(pl.LightningDataModule):
def __init__(self, train_dataset, val_dataset, batch_size = 1):
super(DataModule, self).__init__()
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.batch_size = batch_size
def train_dataloader(self):
return DataLoader(self.train_dataset, batch_size = self.batch_size,
collate_fn = collate_fn, shuffle = True, num_workers = 2, pin_memory = True)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size = self.batch_size,
collate_fn = collate_fn, shuffle = False, num_workers = 2, pin_memory = True)
So I use a for loop
datamodule = DataModule(train_ds, val_ds)
for i,data in enumerate(datamodule.train_dataloader()):
print( datamodule.train_dataloader(i,data))
But it doesn't work. How can I change it?
I have made a custom model in tensorflow 2, which uses eager execution.
The model is trained using the inherited .fit() function, about 600k training samples are used in a 10 epoch cycle with a batch size of 128 (up to 8k batch has been done). After training the model is saved as a SavedModel format. This is then used in C++ by using the cppflow library. However, this process requires the inference to use the same batch size as the training of the model, while only requiring to do inference on a single sample at a time. The application requires that things are fast and padding a feature vector array with 127 dummy vectors is slowing everyting down.
The batch size is also used in the NormalizeLayer at the end, which is using a hardcoded units value at the moment to initialize a matrix.
I have searched for a way to use variable batch sizes in Tensorflow 2 custom models, but the only thing that is remotely close are TF1 examples; which are so outdated they are unusable.
My model:
class IndividualFeaturesLayer(tf.keras.layers.Layer):
def __init__(self):
super(IndividualFeaturesLayer, self).__init__()
def build(self, input_shape):
stddev = 2 / np.sqrt(input_shape[-1] + input_shape[-1])
self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], input_shape[-1]), dtype='float64'), trainable=True)
b_init = tf.zeros_initializer()
self.b = tf.Variable(initial_value=b_init(shape=(input_shape[-1]), dtype='float64'), trainable=True)
def call(self, input):
returnVar = tf.math.add(tf.matmul(input, self.w), self.b)
return returnVar
class FullFeatureLayer(tf.keras.layers.Layer):
def __init__(self):
super(FullFeatureLayer, self).__init__()
self.globalFeatures = IndividualFeaturesLayer()
self.pieceFeatures = IndividualFeaturesLayer()
self.squareFeatures = IndividualFeaturesLayer()
def call(self, input):
globalFeature = input[:, :17]
pieceFeature = input[:, 17:225]
squareFeature = input[:, 225:353]
x = self.globalFeatures(globalFeature)
y = self.pieceFeatures(pieceFeature)
z = self.squareFeatures(squareFeature)
returnVar = tf.concat([x, y, z], 1)
return tf.nn.relu(returnVar)
class FullFullyConnectedFeatureLayer(tf.keras.layers.Layer):
def __init__(self):
super(FullFullyConnectedFeatureLayer, self).__init__()
def build(self, input_shape):
stddev = 2 / np.sqrt(input_shape[-1] + input_shape[-1])
self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], input_shape[-1]), dtype='float64'), trainable=True)
b_init = tf.zeros_initializer()
self.b = tf.Variable(initial_value=b_init(shape=(input_shape[-1]), dtype='float64'), trainable=True)
def call(self, input):
return tf.nn.relu(tf.math.add(tf.matmul(input, self.w), self.b))
class FullFullyConnectedOutputLayer(tf.keras.layers.Layer):
def __init__(self):
super(FullFullyConnectedOutputLayer, self).__init__()
def build(self, input_shape):
stddev = 2 / np.sqrt(input_shape[-1] + 1)
self.w = tf.Variable(tf.random.truncated_normal((input_shape[-1], 1), dtype='float64'), trainable=True)
b_init = tf.zeros_initializer()
self.b = tf.Variable(initial_value=b_init(shape=(1), dtype='float64'), trainable=True)
def call(self, input):
return tf.matmul(input, self.w) + self.b
class NormalizeLayer(tf.keras.layers.Layer):
def __init__(self, units=128):
super(NormalizeLayer, self).__init__()
self.units = units
def build(self, input_shape):
self.divideTensor = tf.fill((self.units, 1), tf.constant(1500, dtype='float64'))
self.minTensor = tf.fill((self.units, 1), tf.constant(-1, dtype='float64'))
self.maxTensor = tf.fill((self.units, 1), tf.constant(1, dtype='float64'))
def call(self, input):
dividedTensor = tf.divide(input, self.divideTensor)
minimizedTensor = tf.math.minimum(dividedTensor, self.maxTensor)
maximizedTensor = tf.math.maximum(minimizedTensor, self.minTensor)
return maximizedTensor
class FullNetwork(tf.keras.Model):
def __init__(self, batch_size):
super(FullNetwork, self).__init__(name='')
self.inputLayer = FullFeatureLayer()
self.hiddenLayer1 = FullFeatureLayer()
self.hiddenLayer2 = FullFullyConnectedFeatureLayer()
self.outputLayer = FullFullyConnectedOutputLayer()
self.normalizeLayer = NormalizeLayer()
def call(self, input, batch_size):
print(batch_size)
x = self.inputLayer(input)
x = self.hiddenLayer1(x)
x = self.hiddenLayer2(x)
x = self.outputLayer(x)
x = self.normalizeLayer(x)
return x
tf.keras.backend.set_floatx('float64')
fullNetwork = FullNetwork()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
fullNetwork.compile(optimizer, loss=tf.keras.losses.MeanSquaredError(), metrics=["MeanAbsoluteError"], run_eagerly=True)
fullNetwork.fit(training_feature_array, training_score_array, epochs=10, batch_size=128)
Is there a way to add conditional statements inside the nn.Sequential(). Something similar to the code below.
import torch
class Building_Blocks(torch.nn.Module):
def conv_block (self, in_features, out_features, kernal_size, upsample=False):
block = torch.nn.Sequential(
torch.nn.Conv2d(in_features, out_features, kernal_size),
torch.nn.ReLU(inplace = True),
torch.nn.Conv2d(out_features, out_features, kernal_size),
torch.nn.ReLU(inplace = True),
if(upsample):
torch.nn.ConvTranspose2d(out_features, out_features, kernal_size)
)
return block
def __init__(self):
super(Building_Blocks, self).__init__()
self.contracting_layer1 = self.conv_block(3, 64, 3, upsample=True)
def forward(self, x):
x=self.contracting_layer1(x)
return x
No, but in your case it's easy to take if out of nn.Sequential:
class Building_Blocks(torch.nn.Module):
def conv_block(self, in_features, out_features, kernal_size, upsample=False):
layers = [
torch.nn.Conv2d(in_features, out_features, kernal_size),
torch.nn.ReLU(inplace=True),
torch.nn.Conv2d(out_features, out_features, kernal_size),
torch.nn.ReLU(inplace=True),
]
if upsample:
layers.append(
torch.nn.ConvTranspose2d(out_features, out_features, kernal_size)
)
block = torch.nn.Sequential(*layers)
return block
def __init__(self):
super(Building_Blocks, self).__init__()
self.contracting_layer1 = self.conv_block(3, 64, 3, upsample=True)
def forward(self, x):
x = self.contracting_layer1(x)
return x
You can always construct a list containing layers however you want and unpack it into torch.nn.Sequential afterwards.
We must first use the list in this case, so we put the condition in the list:(you can use this code:
from typing import Optional
import torch
class Building_Blocks(torch.nn.Module):
def conv_block (self, in_features, out_features, kernal_size, upsample :Optional[nn.Module] = None):
block = torch.nn.Sequential(
torch.nn.Conv2d(in_features, out_features, kernal_size),
torch.nn.ReLU(inplace = True),
torch.nn.Conv2d(out_features, out_features, kernal_size),
torch.nn.ReLU(inplace = True),
upsample = torch.nn.ConvTranspose2d(out_features, out_features, kernal_size)
if upsample = True
else None,
)
return block
def __init__(self):
super(Building_Blocks, self).__init__()
self.contracting_layer1 = self.conv_block(3, 64, 3, upsample=True)
def forward(self, x):
x=self.contracting_layer1(x)
return x
In general:
[f(x) if condition else g(x) for x in sequence]
Why are these two code segments not equivalent:
Segment 1: Creating a model with 2 layers.
class FNNModule(nn.Module):
def __init__(self, input_dim, output_dim, hidden_dim1, hidden_dim2, non_linear_function):
super().__init__()
self.hidden1 = nn.Linear(input_dim, hidden_dim1)
self.hidden2 = nn.Linear(hidden_dim1, hidden_dim2)
self.non_linear_function = non_linear_function()
self.final_linear = nn.Linear(hidden_dim2, output_dim)
def forward(self, x):
out = self.hidden1(x)
out = self.non_linear_function(out)
out = self.hidden2(x)
out = self.non_linear_function(out)
out = self.final_linear(out)
return out
Segment Two: Creating the same model but changing code where hidden_layers is a variable:
class FNNModuleVar(nn.Module):
def __init__(self, input_dim, output_dim, hidden_dim_array = [], non_linear_function_array=[]):
super().__init__()
self.linear_functions = []
self.non_linear_functions = [x() for x in non_linear_function_array]
self.hidden_layers = len(hidden_dim_array)
for l in range(self.hidden_layers):
self.linear_functions.append(nn.Linear(input_dim, hidden_dim_array[l]))
input_dim = hidden_dim_array[l]
self.final_linear = nn.Linear(input_dim, output_dim)
def forward(self, x):
out = x
for i in range(self.hidden_layers):
out = self.linear_functions[i](out)
out = self.non_linear_functions[i](out)
out = self.final_linear(x)
return out
modelVar = FNNModuleVar(input_dim, output_dim, [100, 50], [nn.Tanh, nn.Tanh])
model = FNNModule(input_dim, output_dim, 100, 50, nn.Tanh)
When I try to iterate through modelVar.parameters() and model.parameters() I see that I have very different models.
What am I doing wrong in modelVar?
Those modules are called as you would expect them to be called they are just not visible to the Module. In order to make them visible you can wrap them in a nn.ModuleList like this:
class FNNModuleVar(nn.Module):
def __init__(self, input_dim, output_dim, hidden_dim_array = [], non_linear_function_array=[]):
super().__init__()
self.linear_functions = []
self.non_linear_functions = [x() for x in non_linear_function_array]
self.hidden_layers = len(hidden_dim_array)
for l in range(self.hidden_layers):
self.linear_functions.append(nn.Linear(input_dim, hidden_dim_array[l]))
input_dim = hidden_dim_array[l]
self.linear_functions = nn.ModuleList(self.linear_functions)
self.final_linear = nn.Linear(input_dim, output_dim)
def forward(self, x):
out = x
for i in range(self.hidden_layers):
out = self.linear_functions[i](out)
out = self.non_linear_functions[i](out)
out = self.final_linear(out)
return out
printing the models now would yield:
FNNModule(
(hidden1): Linear(in_features=50, out_features=100, bias=True)
(hidden2): Linear(in_features=100, out_features=50, bias=True)
(non_linear_function): Tanh()
(final_linear): Linear(in_features=50, out_features=100, bias=True)
)
FNNModuleVar(
(linear_functions): ModuleList(
(0): Linear(in_features=50, out_features=100, bias=True)
(1): Linear(in_features=100, out_features=50, bias=True)
)
(final_linear): Linear(in_features=50, out_features=100, bias=True)
)
More details: https://pytorch.org/docs/stable/nn.html#torch.nn.ModuleList
I am trying to implement the attention described in Luong et al. 2015 in PyTorch myself, but I couldn't get it work. Below is my code, I am only interested in the "general" attention case for now. I wonder if I am missing any obvious error. It runs, but doesn't seem to learn.
class AttnDecoderRNN(nn.Module):
def __init__(self, hidden_size, output_size, dropout_p=0.1):
super(AttnDecoderRNN, self).__init__()
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_p = dropout_p
self.embedding = nn.Embedding(
num_embeddings=self.output_size,
embedding_dim=self.hidden_size
)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(self.hidden_size, self.hidden_size)
self.attn = nn.Linear(self.hidden_size, self.hidden_size)
# hc: [hidden, context]
self.Whc = nn.Linear(self.hidden_size * 2, self.hidden_size)
# s: softmax
self.Ws = nn.Linear(self.hidden_size, self.output_size)
def forward(self, input, hidden, encoder_outputs):
embedded = self.embedding(input).view(1, 1, -1)
embedded = self.dropout(embedded)
gru_out, hidden = self.gru(embedded, hidden)
# [0] remove the dimension of directions x layers for now
attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
attn_weights = F.softmax(attn_prod, dim=1) # eq. 7/8
context = torch.mm(attn_weights, encoder_outputs)
# hc: [hidden: context]
out_hc = F.tanh(self.Whc(torch.cat([hidden[0], context], dim=1)) # eq.5
output = F.log_softmax(self.Ws(out_hc), dim=1) eq. 6
return output, hidden, attn_weights
I have studied the attention implemented in
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
and
https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
The first one isn't the exact attention mechanism I am looking for. A major disadvantage is that its attention depends on the sequence length (self.attn = nn.Linear(self.hidden_size * 2, self.max_length)), which could be expensive for long sequences.
The second one is more similar to what's described in the paper, but still not the same as there is not tanh. Besides, it is really slow after updating it to latest version of pytorch (ref). Also I don't know why it takes the last context (ref).
This version works, and it follows the definition of Luong Attention (general), closely. The main difference from that in the question is the separation of embedding_size and hidden_size, which appears to be important for training after experimentation. Previously, I made both of them the same size (256), which creates trouble for learning, and it seems that the network could only learn half the sequence.
class EncoderRNN(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size,
num_layers=1, bidirectional=False, batch_size=1):
super(EncoderRNN, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bidirectional = bidirectional
self.batch_size = batch_size
self.embedding = nn.Embedding(input_size, embedding_size)
self.gru = nn.GRU(embedding_size, hidden_size, num_layers,
bidirectional=bidirectional)
def forward(self, input, hidden):
embedded = self.embedding(input).view(1, 1, -1)
output, hidden = self.gru(embedded, hidden)
return output, hidden
def initHidden(self):
directions = 2 if self.bidirectional else 1
return torch.zeros(
self.num_layers * directions,
self.batch_size,
self.hidden_size,
device=DEVICE
)
class AttnDecoderRNN(nn.Module):
def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0):
super(AttnDecoderRNN, self).__init__()
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_p = dropout_p
self.embedding = nn.Embedding(
num_embeddings=output_size,
embedding_dim=embedding_size
)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(embedding_size, hidden_size)
self.attn = nn.Linear(hidden_size, hidden_size)
# hc: [hidden, context]
self.Whc = nn.Linear(hidden_size * 2, hidden_size)
# s: softmax
self.Ws = nn.Linear(hidden_size, output_size)
def forward(self, input, hidden, encoder_outputs):
embedded = self.embedding(input).view(1, 1, -1)
embedded = self.dropout(embedded)
gru_out, hidden = self.gru(embedded, hidden)
attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
attn_weights = F.softmax(attn_prod, dim=1)
context = torch.mm(attn_weights, encoder_outputs)
# hc: [hidden: context]
hc = torch.cat([hidden[0], context], dim=1)
out_hc = F.tanh(self.Whc(hc))
output = F.log_softmax(self.Ws(out_hc), dim=1)
return output, hidden, attn_weights