I'm experimenting with TF 2.0.
I want to record the norms of the gradients and of the weights across my network. To do so, I'm using the following code:
def get_weights_norm(layer, optim_iters, log=False):
    """Record the norm of every trainable weight tensor of ``layer``.

    For each tensor in ``layer.trainable_weights`` the norm is computed with
    ``tf.norm`` and passed to ``tf.summary.scalar`` with ``optim_iters`` as
    the step. When ``log`` is true the layer name and each norm are printed
    as well.
    """
    layer_name = layer.name
    if log:
        print("Layer " + layer_name)
    for weight in layer.trainable_weights:
        shape_str = str(weight.shape.as_list())
        weight_norm = tf.norm(weight.numpy(), name="norm").numpy()
        # The shape is part of the tag so tensors of one layer stay apart.
        tf.summary.scalar(
            layer_name + "_layer_norm/ shape-" + shape_str,
            weight_norm,
            step=optim_iters,
        )
        if log:
            print("\tWeights norm: %s shape: %s" % (weight_norm, shape_str))
def get_grad_norm(g_tape, loss_value, layer, optim_iters, log=False):
    """Record the norm of the loss gradient for each trainable weight of ``layer``.

    Args:
        g_tape: gradient tape queried through ``g_tape.gradient``. The caller
            invokes this helper after other ``gradient`` calls on the same
            tape, so the tape has to allow repeated queries.
        loss_value: the loss to differentiate.
        layer: layer whose ``trainable_weights`` are the differentiation
            targets; its ``name`` prefixes the summary tags.
        optim_iters: step passed to ``tf.summary.scalar``.
        log: when true, the layer name and every norm are printed as well.

    Weights for which the tape returns ``None`` (no gradient available) are
    skipped instead of raising an ``AttributeError``.
    """
    weights = layer.trainable_weights
    grads = g_tape.gradient(loss_value, weights)
    name = layer.name
    if log:
        print("Layer " + name)
    for weight, grad in zip(weights, grads):
        # A gradient has the shape of its weight; reading it from the weight
        # also works when no gradient is available.
        shape = str(weight.shape.as_list())
        if grad is None:
            # Nothing to measure or record for this weight.
            if log:
                print("\tGrad norm: None shape: %s" % shape)
            continue
        norm = tf.norm(grad, name="norm").numpy()
        s_name = name + "_layer_grad_norm/ shape-" + shape
        tf.summary.scalar(s_name, norm, step=optim_iters)
        if log:
            print("\tGrad norm: %s shape: %s" % (norm, shape))
            # Scientific notation makes very small norms readable.
            print("{:.2E}".format(norm))
And here is the training loop:
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Walk over the mini-batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # The tape is persistent because gradients are requested from it
        # several times below (training step plus per-layer diagnostics).
        with tf.GradientTape(persistent=True) as tape:
            logits = model(x_batch_train)  # forward pass for this mini-batch
            loss_value = loss_fn(y_batch_train, logits)  # loss for this mini-batch

        # Training step: gradients w.r.t. all trainable weights, then the update.
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Per-layer gradients for the diagnostics (requested after the update).
        bi_lstm_layer = model.get_layer("bi_lstm")
        g_bidi = tape.gradient(loss_value, bi_lstm_layer.trainable_weights)
        output_layer = model.get_layer("output")
        g_out = tape.gradient(loss_value, output_layer.trainable_weights)
        g_dense = tape.gradient(loss_value, model.get_layer("dense").trainable_weights)

        # Output layer: norms computed inline, then through the helpers.
        print("Out Layer")
        w_out = output_layer.trainable_weights
        print(output_layer.name)
        print(float(tf.norm(w_out[0].numpy(), name="norm")))
        for idx in (0, 1):
            print("\tWeights norm: %s shape: %s" % (tf.norm(w_out[idx].numpy(), name="norm"), w_out[idx].shape))
        print()
        for idx, label in enumerate(("dw_out", "db_out")):
            print("\t ||dE/%s|| = %s shape: %s" % (label, tf.norm(g_out[idx].numpy(), name='norm'), g_out[idx].shape))
        get_weights_norm(output_layer, optimizer.iterations, True)
        get_grad_norm(tape, loss_value, output_layer, optimizer.iterations, True)
        print()
        print()

        # Bidirectional LSTM: same comparison, six tensors (three per direction).
        print("Bidirect")
        w_bid = bi_lstm_layer.trainable_weights
        for idx, label in enumerate(("fwd", "fwd_rec", "fwd bias", "bwd", "bwd_rec", "bwd bias")):
            print("\tWeights %s norm: %s shape %s:" % (label, tf.norm(w_bid[idx].numpy(), name="norm"), w_bid[idx].shape))
        print()
        grad_labels = ("dw_forw", "dw_forw_rec", "dw_forw_bias", "dw_bckw", "dw_bkw_rec", "dw_bkw_bias")
        for idx, label in enumerate(grad_labels):
            print("\t ||dE/%s|| = %s shape: %s" % (label, tf.norm(g_bidi[idx].numpy(), name='norm'), g_bidi[idx].shape))
        get_weights_norm(bi_lstm_layer, optimizer.iterations, True)
        get_grad_norm(tape, loss_value, bi_lstm_layer, optimizer.iterations, True)
The Problem:
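When running the script, I get the same gradient-norm values for the output layer whether I compute them inline or through get_grad_norm, but for the bidirectional layer (bi_lstm) the two approaches give different values: the inline norms are small but non-zero, while get_grad_norm reports 0.0.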
Here's the output:
Weights norm: tf.Tensor(0.33847392, shape=(), dtype=float32) shape: (64, 1)
Weights norm: tf.Tensor(88.14, shape=(), dtype=float32) shape: (1,)
||dE/dw_out|| = tf.Tensor(1.7349662, shape=(), dtype=float32) shape: (64, 1)
||dE/db_out|| = tf.Tensor(0.31759995, shape=(), dtype=float32) shape: (1,)
Layer output
Weights norm: 0.33847392 shape: [64, 1]
Weights norm: 88.14 shape: [1]
Bidirect
Weights fwd norm: tf.Tensor(13.112313, shape=(), dtype=float32) shape (256, 128):
Weights fwd_rec norm: tf.Tensor(5.691354, shape=(), dtype=float32) shape (32, 128):
Weights fwd bias norm: tf.Tensor(11.340048, shape=(), dtype=float32) shape (128,):
Weights bwd norm: tf.Tensor(13.147353, shape=(), dtype=float32) shape (256, 128):
Weights bwd_rec norm: tf.Tensor(5.685838, shape=(), dtype=float32) shape (32, 128):
Weights bwd bias norm: tf.Tensor(11.3102255, shape=(), dtype=float32) shape (128,):
||dE/dw_forw|| = tf.Tensor(9.418793e-07, shape=(), dtype=float32) shape: (256, 128)
||dE/dw_forw_rec|| = tf.Tensor(3.8971484e-06, shape=(), dtype=float32) shape: (32, 128)
||dE/dw_forw_bias|| = tf.Tensor(1.0172046e-06, shape=(), dtype=float32) shape: (128,)
||dE/dw_bckw|| = tf.Tensor(9.837944e-07, shape=(), dtype=float32) shape: (256, 128)
||dE/dw_bkw_rec|| = tf.Tensor(4.134917e-06, shape=(), dtype=float32) shape: (32, 128)
||dE/dw_bkw_bias|| = tf.Tensor(1.0577168e-06, shape=(), dtype=float32) shape: (128,)
Layer bi_lstm
Weights norm: 13.112313 shape: [256, 128]
Weights norm: 5.691354 shape: [32, 128]
Weights norm: 11.340048 shape: [128]
Weights norm: 13.147353 shape: [256, 128]
Weights norm: 5.685838 shape: [32, 128]
Weights norm: 11.3102255 shape: [128]
Layer bi_lstm
Grad norm: 0.0 shape: [256, 128]
0.00E+00
Grad norm: 0.0 shape: [32, 128]
0.00E+00
Grad norm: 0.0 shape: [128]
0.00E+00
Grad norm: 0.0 shape: [256, 128]
0.00E+00
Grad norm: 0.0 shape: [32, 128]
0.00E+00
Grad norm: 0.0 shape: [128]
0.00E+0
What am I missing here?
Thanks in advance
Related
I am trying to use Transfer Learning using ResNet-50 in TensorFlow2 and Keras on CIFAR-10 dataset which has (32, 32, 3) images.
The first conv layer of the default ResNet-50 uses a (7, 7) filter with stride = 2, so the already small CIFAR-10 images are reduced too much spatially, which I want to avoid. As a 'hack', I try to upscale the images from (32, 32) to (224, 224). The code is:
import tensorflow.keras as K
# Input placeholder with the native CIFAR-10 image shape.
input_t = K.Input(shape=(32, 32, 3))

# ImageNet-pretrained ResNet-50 without its classification head, wired to input_t.
res_model = K.applications.ResNet50(include_top=False, weights="imagenet", input_tensor=input_t)

# CIFAR-10 images are small compared to ImageNet ones, so they are upscaled to (224, 224).
to_res = (224, 224)

# Resize -> ResNet-50 features -> flatten -> batch norm -> 10-way softmax.
model = K.models.Sequential([
    K.layers.Lambda(lambda image: tf.image.resize(image, to_res)),
    res_model,
    K.layers.Flatten(),
    K.layers.BatchNormalization(),
    K.layers.Dense(units=10, activation='softmax'),
])

# Choose an optimizer and loss function for training.
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)

model.compile(
    # loss = 'categorical_crossentropy',
    loss=loss_fn,
    # optimizer = K.optimizers.RMSprop(lr=2e-5),
    optimizer=optimizer,
    metrics=['accuracy'],
)

history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=10,
    validation_data=(X_test, y_test),
    # callbacks=[check_point]
)
To which I get the error:
Epoch 1/10 WARNING:tensorflow:Model was constructed with shape (None,
32, 32, 3) for input KerasTensor(type_spec=TensorSpec(shape=(None, 32,
32, 3), dtype=tf.float32, name='input_1'), name='input_1',
description="created by layer 'input_1'"), but it was called on an
input with incompatible shape (None, 224, 224, 3).
ValueError Traceback (most recent call
last)
in ()
2 x = X_train, y = y_train,
3 batch_size = batch_size, epochs = 10,
----> 4 validation_data = (X_test, y_test),
5 # callbacks=[check_point]
6 )
9 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py
in wrapper(*args, **kwargs)
975 except Exception as e: # pylint:disable=broad-except
976 if hasattr(e, "ag_error_metadata"):
--> 977 raise e.ag_error_metadata.to_exception(e)
978 else:
979 raise
ValueError: in user code:
ValueError: Input 0 is incompatible with layer resnet50: expected
shape=(None, 32, 32, 3), found shape=(None, 224, 224, 3)
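The input of the ResNet-50 base is still (32, 32, 3): res_model was built on input_tensor = input_t, and input_t has shape (32, 32, 3), while the Lambda layer in front of it now outputs (224, 224, 3) images. Build the base for the resized shape instead (for example with input_shape = (224, 224, 3)) and keep the (32, 32, 3) input only at the very front of the Sequential model. The offending line is: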
input_t = K.Input(shape = (32, 32, 3))
I try to run the following programe for images classification problem in Pytorch:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
# Device configuration: first CUDA GPU when available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Hyper parameters
num_epochs = 5
num_classes = 10
batch_size = 100
learning_rate = 0.001
# Root folders handed to ImageFolder below
TRAIN_DATA_PATH = "train/"
TEST_DATA_PATH = "test/"
# Preprocessing applied to every image: resize, centre-crop to 256, convert to
# a tensor, then normalise with three per-channel mean/std values.
# NOTE(review): three mean/std entries imply 3-channel images, which matches
# the [100, 3, 256, 256] input reported in the error message of this question.
TRANSFORM_IMG = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225] )
])
# Training images and their shuffled mini-batch loader
train_dataset = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
# Test images use the same transform; this loader is shuffled as well
test_dataset = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
    """Two conv/batch-norm/ReLU/max-pool stages followed by a linear classifier.

    The first convolution takes single-channel input, and the linear layer
    expects ``7 * 7 * 32`` flattened features.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.layer1 = self._conv_stage(1, 16)
        self.layer2 = self._conv_stage(16, 32)
        self.fc = nn.Linear(7 * 7 * 32, num_classes)

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one 5x5 conv -> batch norm -> ReLU -> 2x2 max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        features = self.layer2(self.layer1(x))
        # Flatten everything except the batch dimension for the classifier.
        flat = features.reshape(features.size(0), -1)
        return self.fc(flat)
model = ConvNet(num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Move the mini-batch to the device the model lives on.
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress every 100 mini-batches.
        if (i + 1) % 100 == 0:
            progress = 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                epoch + 1, num_epochs, i + 1, total_step, loss.item())
            print(progress)

# Test the model
model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Index of the largest score per sample is the predicted class.
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
torch.save(model.state_dict(), 'model/model.ckpt')
But I get a RuntimeError:
Traceback (most recent call last):
RuntimeError: Given groups=1, weight of size 16 1 5 5, expected input[100, 3, 256, 256] to have 1 channels, but got 3 channels instead
Someone could help to fix the bug? Thanks a lot.
Reference related:
https://discuss.pytorch.org/t/given-groups-1-weight-16-1-5-5-so-expected-input-100-3-64-64-to-have-1-channels-but-got-3-channels-instead/28831/17
RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[3, 1, 224, 224] to have 3 channels, but got 1 channels instead
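Your input layer self.layer1 starts with a 2d convolution nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2). This conv layer expects an input with two spatial dimensions and one channel, and outputs a tensor with the same spatial dimensions and 16 channels.
However, your input has three channels and not one (RGB image instead of gray level image).
Make sure your net and data are in sync: either change the first layer to nn.Conv2d(3, 16, ...), or convert the images to a single channel (e.g. add transforms.Grayscale() to the transform and use single-value mean/std in Normalize). Note also that self.fc expects 7 * 7 * 32 features, which corresponds to 28x28 inputs; with 256x256 images the two 2x2 poolings leave 64x64 feature maps, so the linear layer would need 64 * 64 * 32 input features.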
I have been trying to obtaining a vector representation of a sequence of vectors using an LSTM autoencoder so that I can classify the sequence using a SVM or other such supervised algorithms. The amount of data is preventing me from using a fully connected dense layer for classification.
The shortest size of my input is 7 timesteps and the longest sequence is 356 timesteps. Accordingly, I have padded the shorter sequences with zeros to obtain a final x_train of shape (1326, 356, 8) where 1326 is the number of training samples and 8 is the dimension of one timestep. I am trying to encode these sequences into a single vector using the given LSTM autoencoder.
# Skip timesteps whose features all equal the padding value 0.0
model.add(Masking(mask_value=0.0, input_shape=(max_len, 8)))
# Encoder: return_sequences is not set, so only the final output is returned
model.add(LSTM(100, activation='relu'))
# Repeat the encoded vector max_len times to give the decoder a sequence again
model.add(RepeatVector(max_len))
# Decoder: one 8-dimensional output per timestep
model.add(LSTM(8, activation='relu', return_sequences=True))
model.compile(optimizer='adam', loss='mse')
# Autoencoder training: the input sequences are also the targets
model.fit(x_train, x_train, batch_size=32, callbacks=[chk], epochs=1000, validation_split=0.05, shuffle=True)
I am trying to mask the zero-padded timesteps, but the RepeatVector() layer may be hindering the process. As a result, after some time the mean squared error loss becomes nan. Can anyone help me with how to include only the relevant timesteps when calculating the loss and ignore the other timesteps?
Each layer in Keras has an input_mask and an output_mask. In your example, the mask is already lost right after the first LSTM layer (because return_sequences=False). Let me explain this with the following example and show 2 solutions to achieve masking in an LSTM autoencoder.
# Toy dimensions for the demonstration
# (tfk / tfkl are presumably aliases for tf.keras / tf.keras.layers; their
# imports are not shown here)
time_steps = 3
n_features = 2
input_layer = tfkl.Input(shape=(time_steps, n_features))
# I want to mask the timestep where all the feature values are 1 (usually we pad by 0)
x = tfk.layers.Masking(mask_value=1)(input_layer)
# Encoder: the second LSTM returns only its last output (return_sequences=False)
x = tfkl.LSTM(2, return_sequences=True)(x)
x = tfkl.LSTM(2, return_sequences=False)(x)
# Bottleneck: repeat the encoded vector once per timestep
x = tfkl.RepeatVector(time_steps)(x)
# Decoder: two sequence-returning LSTMs, then a Dense layer back to n_features
x = tfkl.LSTM(2, return_sequences=True)(x)
x = tfkl.LSTM(2, return_sequences=True)(x)
x = tfk.layers.Dense(n_features)(x)
lstm_ae = tfk.models.Model(inputs=input_layer, outputs=x)
lstm_ae.compile(optimizer='adam', loss='mse')
print(lstm_ae.summary())
Model: "model_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_3 (InputLayer) [(None, 3, 2)] 0
_________________________________________________________________
masking_2 (Masking) (None, 3, 2) 0
_________________________________________________________________
lstm_8 (LSTM) (None, 3, 2) 40
_________________________________________________________________
lstm_9 (LSTM) (None, 2) 40
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 3, 2) 0
_________________________________________________________________
lstm_10 (LSTM) (None, 3, 2) 40
_________________________________________________________________
lstm_11 (LSTM) (None, 3, 2) 40
_________________________________________________________________
dense_2 (Dense) (None, 3, 2) 6
=================================================================
Total params: 166
Trainable params: 166
Non-trainable params: 0
_________________________________________________________________
# Show, layer by layer, which mask comes in and which mask goes out.
for idx, layer in enumerate(lstm_ae.layers):
    print(f'layer {idx}: {layer}')
    print(f'has input mask: {layer.input_mask}')
    print(f'has output mask: {layer.output_mask}')
layer 0: <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x645b49cf8>
has input mask: None
has output mask: None
layer 1: <tensorflow.python.keras.layers.core.Masking object at 0x645b49c88>
has input mask: None
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 2: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x645b4d0b8>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 3: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x645b4dba8>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: None
layer 4: <tensorflow.python.keras.layers.core.RepeatVector object at 0x645db0390>
has input mask: None
has output mask: None
layer 5: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x6470b5da0>
has input mask: None
has output mask: None
layer 6: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x6471410f0>
has input mask: None
has output mask: None
layer 7: <tensorflow.python.keras.layers.core.Dense object at 0x647dfdf60>
has input mask: None
has output mask: None
As you can see above, the second LSTM layer (return_sequences=False) returns None as its output mask. This makes sense: the time dimension is dropped (the shape changes), so the layer cannot pass the mask on. You can also check the source code: it returns the input mask if return_sequences=True, and None otherwise. Another problem is, of course, the RepeatVector layer, which does not explicitly support masking at all, again because the shape changes. Apart from this bottleneck part (the second LSTM + RepeatVector), the other parts of the model are able to pass the mask, so we only have to deal with the bottleneck part.
Here are 2 possible solutions, I will also validate based on calculating the loss.
First solution: ignore the timesteps explicitly by passing sample_weight
# last timestep should be masked because all feature values are 1
x = np.array([1, 2, 1, 2, 1, 1], dtype='float32').reshape(1, 3, 2)
print(x)
array([[[1., 2.],
[1., 2.],
[1., 1.]]], dtype=float32)
y = lstm_ae.predict(x)
print(y)
array([[[0.00020542, 0.00011909],
[0.0007361 , 0.00047323],
[0.00158514, 0.00107504]]], dtype=float32)
# the expected loss should be the sum of square error between the first 2 timesteps
# (2 features each timestep) divided by 6. you might expect that this should be
# divided by 4, but in the source code this is actually divided by 6, which doesn't
# matter a lot because only the gradient of loss matter, but not the loss itself.
expected_loss = np.square(x[:, :2, :] - y[:, :2, :]).sum()/6
print(expected_loss)
1.665958086649577
actual_loss_with_masking = lstm_ae.evaluate(x=x, y=x)
print(actual_loss_with_masking)
1.9984053373336792
# the actual loss still includes the last timestep, which means the masking is not
# effectively passed to the output layer for calculating the loss
print(np.square(x-y).sum()/6)
1.9984052975972493
# if we provide the sample_weight 0 for each timestep that we want to mask, the
# loss will be ignored correctly
lstm_ae.compile(optimizer='adam', loss='mse', sample_weight_mode='temporal')
sample_weight_array = np.array([1, 1, 0]).reshape(1, 3) # it means to ignore the last timestep
actual_loss_with_sample_weight = lstm_ae.evaluate(x=x, y=x, sample_weight=sample_weight_array)
# the actual loss now is correct
print(actual_loss_with_sample_weight)
1.665958046913147
Second solution: make a customized bottleneck layer to pass the mask manually
class lstm_bottleneck(tf.keras.layers.Layer):
    """Encoder bottleneck (LSTM + RepeatVector) that keeps the input mask alive.

    The inner LSTM returns only its last output and the RepeatVector repeats
    it ``time_steps`` times. ``compute_mask`` hands the incoming mask on
    unchanged, so the layers after the bottleneck still receive it.

    Args:
        lstm_units: number of units of the inner LSTM.
        time_steps: number of repetitions of the encoded vector. The mask
            passed through is per input timestep, so this should equal the
            input sequence length.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, lstm_units, time_steps, **kwargs):
        # Base-class initialisation comes first, as is conventional for Keras
        # subclasses, before any sub-layer is attached to the instance.
        super(lstm_bottleneck, self).__init__(**kwargs)
        self.lstm_units = lstm_units
        self.time_steps = time_steps
        self.lstm_layer = tfkl.LSTM(lstm_units, return_sequences=False)
        self.repeat_layer = tfkl.RepeatVector(time_steps)

    def call(self, inputs, mask=None):
        # Forward the mask explicitly so the LSTM skips masked timesteps
        # without relying on implicit mask metadata attached to ``inputs``.
        encoded = self.lstm_layer(inputs, mask=mask)
        return self.repeat_layer(encoded)

    def compute_mask(self, inputs, mask=None):
        # return the input_mask directly
        return mask

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config.
        config = super(lstm_bottleneck, self).get_config()
        config.update({"lstm_units": self.lstm_units, "time_steps": self.time_steps})
        return config
# Same toy model as before, with the custom bottleneck layer replacing the
# second encoder LSTM and the RepeatVector
time_steps = 3
n_features = 2
input_layer = tfkl.Input(shape=(time_steps, n_features))
# I want to mask the timestep where all the feature values are 1 (usually we pad by 0)
x = tfk.layers.Masking(mask_value=1)(input_layer)
x = tfkl.LSTM(2, return_sequences=True)(x)
# Custom layer: LSTM + RepeatVector that passes the mask on
x = lstm_bottleneck(lstm_units=2, time_steps=3)(x)
# The two layers below are what the bottleneck replaces
# x = tfkl.LSTM(2, return_sequences=False)(x)
# x = tfkl.RepeatVector(time_steps)(x)
# Decoder: two sequence-returning LSTMs, then a Dense layer back to n_features
x = tfkl.LSTM(2, return_sequences=True)(x)
x = tfkl.LSTM(2, return_sequences=True)(x)
x = tfk.layers.Dense(n_features)(x)
lstm_ae = tfk.models.Model(inputs=input_layer, outputs=x)
lstm_ae.compile(optimizer='adam', loss='mse')
print(lstm_ae.summary())
Model: "model_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_3 (InputLayer) [(None, 3, 2)] 0
_________________________________________________________________
masking_2 (Masking) (None, 3, 2) 0
_________________________________________________________________
lstm_10 (LSTM) (None, 3, 2) 40
_________________________________________________________________
lstm_bottleneck_3 (lstm_bott (None, 3, 2) 40
_________________________________________________________________
lstm_12 (LSTM) (None, 3, 2) 40
_________________________________________________________________
lstm_13 (LSTM) (None, 3, 2) 40
_________________________________________________________________
dense_2 (Dense) (None, 3, 2) 6
=================================================================
Total params: 166
Trainable params: 166
Non-trainable params: 0
_________________________________________________________________
# Show, layer by layer, which mask comes in and which mask goes out.
for idx, layer in enumerate(lstm_ae.layers):
    print(f'layer {idx}: {layer}')
    print(f'has input mask: {layer.input_mask}')
    print(f'has output mask: {layer.output_mask}')
layer 0: <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x64dbf98d0>
has input mask: None
has output mask: None
layer 1: <tensorflow.python.keras.layers.core.Masking object at 0x64dbf9f60>
has input mask: None
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 2: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x64dbf9550>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 3: <__main__.lstm_bottleneck object at 0x64dbf91d0>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 4: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x64e04ca20>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 5: <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x64eeb8b00>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
layer 6: <tensorflow.python.keras.layers.core.Dense object at 0x64ef43208>
has input mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
has output mask: Tensor("masking_2/Identity_1:0", shape=(None, 3), dtype=bool)
As we can already see, the masks are now passed successfully to the output layer. We will also validate that the loss do not include the masked timesteps.
# last timestep should be masked because all feature values are 1
x = np.array([1, 2, 1, 2, 1, 1], dtype='float32').reshape(1, 3, 2)
print(x)
array([[[1., 2.],
[1., 2.],
[1., 1.]]], dtype=float32)
y = lstm_ae.predict(x)
print(y)
array([[[ 0.00065455, -0.00294413],
[ 0.00166675, -0.00742249],
[ 0.00166675, -0.00742249]]], dtype=float32)
# the expected loss should be the square error between the first 2 timesteps divided by 6
expected_loss = np.square(x[:, :2, :] - y[:, :2, :]).sum()/6
print(expected_loss)
1.672815163930257
# now the loss is correct with a custom layer
actual_loss_with_masking = lstm_ae.evaluate(x=x, y=x)
print(actual_loss_with_masking)
1.672815203666687
I'm trying to train a model Keras but I'm having a problem:
# Augmenting generator; validation_split reserves part of the data for the
# 'validation' subset requested below.
# NOTE(review): per the Keras docs, featurewise_center and
# featurewise_std_normalization only take effect once the generator's
# statistics have been computed with g.fit(...); no such call is visible
# here, so confirm it happens elsewhere.
g = ImageDataGenerator(featurewise_center=True,
                       featurewise_std_normalization=True,
                       rotation_range=45,
                       width_shift_range=0.2,
                       height_shift_range=0.2,
                       horizontal_flip=True,
                       validation_split=validation_split,
                       preprocessing_function=lambda x: x / 127 - 1)

# Training-subset batches.
g_train = g.flow(x_train, y_train,
                 batch_size=batch_size,
                 subset='training')

# Validation-subset batches, in fixed order.
# NOTE(review): g_valid is not passed to fit_generator below.
g_valid = g.flow(x_train, y_train,
                 batch_size=batch_size,
                 shuffle=False,
                 subset='validation')

# One epoch is one pass over the training subset. len(g_train) is its number
# of batches; the previous len(x_train) / 32 was a float and ignored both
# batch_size and the validation split.
history = network.fit_generator(g_train,
                                steps_per_epoch=len(g_train),
                                epochs=epochs)
ValueError: Error when checking target: expected predictions to have 4 dimensions, but got array with shape (256, 1)
Does anyone have any idea why? It looks very much like the example in the documentation to me.
x_train.shape
(50000, 32, 32, 1)
y_train.shape
(50000, 1, 1)
I was doing some classification with keras, when met this error:
InvalidArgumentError: Dimensions must be equal, but are 256 and 8 for 'dense_185/MatMul' (op: 'MatMul') with input shapes: [?,256], [8,300].
It surprised me because the dimension of the input to the dense is 1.
This is a sequential model with a few custom layers. I have no idea why 8 appears in the error of dense layer.
class Residual(Layer):
    """Three stacked Conv1D layers with a skip connection from the first one.

    The output of the third convolution is zero-padded back to the length of
    the first convolution's output, added to it and passed through a ReLU.
    The trailing comments show the shapes printed for a (?, 128, 8) input.
    """

    def __init__(self, input_shape, **kwargs):
        super(Residual, self).__init__(**kwargs)
        # Stored under a different name and forwarded to the first Conv1D in call().
        self.input_shapes = input_shape

    def call(self, x):
        # NOTE(review): the sub-layers below are instantiated on every call.
        print(np.shape(x))  # (?, 128, 8)
        first_layer = Conv1D(256, 4, activation='relu', input_shape=self.input_shapes)(x)
        print(np.shape(first_layer))  # (?, 125, 256)
        x = Conv1D(256, 4, activation='relu')(first_layer)
        print(np.shape(x))  # (?, 122, 256)
        x = Conv1D(256, 4, activation='relu')(x)
        print(np.shape(x))  # (?, 119, 256)
        # Pad 3 steps on each side: 119 -> 125, matching first_layer for the Add.
        x = ZeroPadding1D(padding=3)(x)
        residual = Add()([x, first_layer])
        x = Activation("relu")(residual)
        return x
class Pooling(Layer):
    """Sum of the global max pooling and global average pooling of the input."""

    def __init__(self, **kwargs):
        super(Pooling, self).__init__(**kwargs)

    def call(self, x):
        # Both poolings are applied to the same input and then added.
        first_layer = GlobalMaxPooling1D(data_format='channels_last')(x)
        second_layer = GlobalAveragePooling1D(data_format='channels_last')(x)
        pooling = Add()([first_layer, second_layer])
        print(np.shape(pooling))  # (?, 256)
        return pooling
# Custom residual block and pooling, followed by a dense classifier with 10 outputs
model = Sequential()
model.add(Residual(input_shape=(128,8)))
model.add(Pooling())
model.add(Dense(300, activation='relu'))
model.add(Dense(150, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
# 10% of the samples are held out for validation
model.fit(np.array(dataset_data), dataset_target, epochs=1000, validation_split=0.1, verbose=1, batch_size=8)
Dimensions:
(1000, 128, 8) - input (1000 audio, 8 features, 128 seq_length)
(1000, 10) - target (1000 audio, 10 classes)
I think two edits are required:
1. Add an InputLayer as the entry point for the data.
2. Define a compute_output_shape method, at least for the Pooling layer (link). If this method is not defined, the Dense layer can't figure out its input shape, I guess, and then fails.
There is also a minor edit: since the model now has an InputLayer, the input_shape kwarg of the Residual layer is no longer needed.
class Residual(Layer):
    """Three stacked Conv1D layers with a skip connection from the first one."""

    def __init__(self, **kwargs):  # the input shape argument is gone
        super(Residual, self).__init__(**kwargs)

    def call(self, x):
        print(np.shape(x))
        shortcut = Conv1D(256, 4, activation='relu')(x)
        print(np.shape(shortcut))
        x = Conv1D(256, 4, activation='relu')(shortcut)
        print(np.shape(x))
        x = Conv1D(256, 4, activation='relu')(x)
        print(np.shape(x))
        # Pad the shortened sequence so it lines up with the shortcut branch.
        padded = ZeroPadding1D(padding=3)(x)
        merged = Add()([padded, shortcut])
        return Activation("relu")(merged)
class Pooling(Layer):
    """Sum of global max and global average pooling, with an explicit output shape."""

    def __init__(self, **kwargs):
        super(Pooling, self).__init__(**kwargs)

    def call(self, x):
        # !!! I build model without data_format argument - my version of keras
        # doesn't support it !!!
        max_pooled = GlobalMaxPooling1D(data_format='channels_last')(x)
        avg_pooled = GlobalAveragePooling1D(data_format='channels_last')(x)
        pooled = Add()([max_pooled, avg_pooled])
        print(np.shape(pooled))
        # Remember the feature dimension; compute_output_shape reports it.
        self.output_dim = int(np.shape(pooled)[-1])
        return pooled

    def compute_output_shape(self, input_shape):
        # Batch dimension comes from the input, features from the last call().
        batch_dim = input_shape[0]
        return (batch_dim, self.output_dim)
Initialize model:
model = Sequential()
# Explicit entry point for the data; Residual no longer takes an input shape
model.add(InputLayer((128,8)))
model.add(Residual())
model.add(Pooling())
# Dense classifier with 10 softmax outputs
model.add(Dense(300, activation='relu'))
model.add(Dense(150, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer=keras.optimizers.Adadelta(),
metrics=['accuracy'])
Out:
(?, 128, 8)
(?, 125, 256)
(?, 122, 256)
(?, 119, 256)
(?, 256)
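Summary of the model. Pooling has no parameters of its own, but Residual shows 0 params as well (and reports its input shape as its output shape). The likely reason is that its Conv1D layers are instantiated inside call() instead of being registered as sub-layers of the custom layer, so Keras does not count their weights, and weights that are not tracked by the model are not updated by the optimizer either. Creating the sub-layers in __init__ (or build) and defining compute_output_shape for Residual as well should fix this: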
model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
residual_10 (Residual) (None, 128, 8) 0
_________________________________________________________________
pooling_8 (Pooling) (None, 256) 0
_________________________________________________________________
dense_15 (Dense) (None, 300) 77100
_________________________________________________________________
dense_16 (Dense) (None, 150) 45150
_________________________________________________________________
dense_17 (Dense) (None, 10) 1510
=================================================================
Total params: 123,760
Trainable params: 123,760
Non-trainable params: 0
_________________________________________________________________
Create fake data and check training process:
# Random inputs shaped (1000, 128, 8)
dataset_data = np.random.randn(1000, 128, 8)
# One-hot targets: every sample is assigned to class 0
dataset_target = np.zeros((1000, 10))
dataset_target[:, 0] = 1
# Smoke-test the training loop; 10% of the samples are held out for validation
model.fit(np.array(dataset_data), dataset_target, epochs=1000,
validation_split=0.1, verbose=1, batch_size=8)
Train on 900 samples, validate on 100 samples
Epoch 1/1000
900/900 [==============================] - 2s 2ms/step - loss: 0.0235 - acc: 0.9911 - val_loss: 9.4426e-05 - val_acc: 1.0000
Epoch 2/1000
900/900 [==============================] - 1s 1ms/step - loss: 4.2552e-05 - acc: 1.0000 - val_loss: 1.7458e-05 - val_acc: 1.0000
Epoch 3/1000
900/900 [==============================] - 1s 1ms/step - loss: 1.1342e-05 - acc: 1.0000 - val_loss: 7.3141e-06 - val_acc: 1.0000
... and so on
Looks like it works.