Keras multiply parallel layers' outputs with constrained weigths

Keras multiply parallel layers' outputs with constrained weigths - keras

I have 3 parallel MLPs and want to obtain the following in Keras:
Out = W1 * Out_MLP1 + W2 * Out_MLP2 + W3 * Out_MLP3
where Out_MLPs are output layer of each MLP and have dimension of (10,) and W1, W2 and W3 are three trainable weights (floats) where they satisfy the following condition:
W1 + W2 + W3 = 1
What is the best way to implement this with Keras functional API? What if we had N parallel layers?

what you need is to apply a softmax on a set of learnable weights, in order to grant that they sum up to 1.
We initialize our learnable weights in a custom layer. this layer receives the output of our MLPs and combines them following our logic W1 * Out_MLP1 + W2 * Out_MLP2 + W3 * Out_MLP3. the output will be a tensor of shape (10,).
class W_ADD(Layer):
def __init__(self, n_output):
super(W_ADD, self).__init__()
self.W = tf.Variable(initial_value=tf.random.uniform(shape=[1,1,n_output], minval=0, maxval=1),
trainable=True) # (1,1,n_inputs)
def call(self, inputs):
# inputs is a list of tensor of shape [(n_batch, n_feat), ..., (n_batch, n_feat)]
# expand last dim of each input passed [(n_batch, n_feat, 1), ..., (n_batch, n_feat, 1)]
inputs = [tf.expand_dims(i, -1) for i in inputs]
inputs = Concatenate(axis=-1)(inputs) # (n_batch, n_feat, n_inputs)
weights = tf.nn.softmax(self.W, axis=-1) # (1,1,n_inputs)
# weights sum up to one on last dim
return tf.reduce_sum(weights*inputs, axis=-1) # (n_batch, n_feat)
in this dummy example, I create a network that has 3 parallel MLPs
inp1 = Input((100))
inp2 = Input((100))
inp3 = Input((100))
x1 = Dense(32, activation='relu')(inp1)
x2 = Dense(32, activation='relu')(inp2)
x3 = Dense(32, activation='relu')(inp3)
x1 = Dense(10, activation='linear')(x1)
x2 = Dense(10, activation='linear')(x2)
x3 = Dense(10, activation='linear')(x3)
mlp_outputs = [x1,x2,x3]
out = W_ADD(n_output=len(mlp_outputs))(mlp_outputs)
m = Model([inp1,inp2,inp3], out)
m.compile('adam','mse')
X1 = np.random.uniform(0,1, (1000,100))
X2 = np.random.uniform(0,1, (1000,100))
X3 = np.random.uniform(0,1, (1000,100))
y = np.random.uniform(0,1, (1000,10))
m.fit([X1,X2,X3], y, epochs=10)
as you can see this is easily generalizable in case of N parallel layers

Related

How to create a custom loss that does not directly use the output of the network with pytorch

I would like to create a custom loss that does not directly use the output of my network. Indeed, I need to create a loss that returns the difference between the result of a function f(x) (where x is the output of my network) and max(f(x)). Unfortunately my code doesn't work and I don't know how to proceed... Here is my code:
def forward(self, x, y, hidden):
c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))
y = torch.reshape(y, (y.shape[0], 1, 1))
tmp = torch.cat((x, y), 2)
output, (hn, cn) = self.lstm(tmp, (hidden, c_0))
out = self.fc(output)
return out, hn
def _train(self):
num_epochs = 10
num_iteration = 10
save_loss_global = []
save_loss_epoch = []
for epoch in range(num_epochs):
print("NOUVELLE EPOCH")
X_train, Y_train = donneesAleatoires()
self.maxRes = 0
self.hidden = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))
tabY = torch.Tensor()
tabY = torch.cat((tabY, Y_train), 1)
for iteration in range(num_iteration):
x_i = X_train[0]
x_i = torch.reshape(x_i, (x_i.shape[0], 1, x_i.shape[1]))
y_i = Y_train[0]
outputs, self.hidden = self(x_i, y_i, self.hidden)
YiPlus1 = self.function(outputs.detach().numpy().reshape(1, -1))
self.optimizer.zero_grad()
Yadd = Variable(torch.Tensor(YiPlus1))
tabY = torch.cat((tabY, Yadd), 1)
loss = self.my_loss(tabY, iteration)
if YiPlus1 > self.maxRes:
self.maxRes = YiPlus1
if y_i.detach().numpy() > self.maxRes:
self.maxRes = y_i.detach().numpy()
#loss = Variable(loss, requires_grad=True)
loss.backward(retain_graph=True)
X_train = outputs
Y_train = YiPlus1
Y_train = Variable(torch.Tensor(Y_train))
self.optimizer.step()
save_loss_global.append(loss.item())
if iteration == num_iteration -1:
save_loss_epoch.append(loss.item())
print(X_train)
def my_loss(self, target, epoch):
if isinstance(target, np.ndarray):
target = Variable(torch.Tensor(target))
tmp = self.maxRes
loss = target[0][0] - tmp
if epoch > 0:
for i in range(1, epoch + 1):
loss = loss + (target[0][i] - tmp)
loss = -loss
return loss / (epoch+1)

To calculate gradients based on loss, toolchain needs computation graph. Said graph is builded implicitly on forward pass, but to do so, all computations must use toolchain's tensors (no .numpy()s!) with preserved gradients (no .detach()s!). Try to rewrite your code accordingly, don't wory about doing computations outside forward, it is normal.
You can check your tensors are computed right way, printing them, should look like
print( myTensor )
tensor([[-2.9016, -2.8739, ... ,-2.8929, -2.9033]], grad_fn=<AliasBackward0>)

How to code softmax function using Python on tensorflow graph

I want to program the softmax function from scratch using Python on tensorflow mood.
def sigmoid(p):
return tf.cond(p >= 0, lambda: 1 / (1 + tf.exp(-p)), \
lambda: tf.exp(p) / (1 + tf.exp(p)))
While running this code chunk, I got this traceback:
InvalidArgumentError: Shape must be rank 0 but is rank 2 for 'cond/Switch' (op: 'Switch') with input shapes: [?,256], [?,256].
Here's my reproducible code:
# 1st hidden layer
W1 = tf.get_variable("W1", shape=(784, 256), dtype=tf.float32, initializer = tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable("b1", shape=(256), dtype=tf.float32, initializer = tf.zeros_initializer)
# 2nd hidden layer
W2 = tf.get_variable("W2", shape=(256, 10), dtype=tf.float32, initializer = tf.contrib.layers.xavier_initializer())
b2 = tf.get_variable("b2", shape=(10), dtype=tf.float32, initializer = tf.zeros_initializer)
def sigmoid(z):
"""Numerically stable sigmoid function."""
return tf.where(z >= 0, 1 / (1 + tf.exp(-z)), tf.exp(z) / (1 + tf.exp(z)))
### Compute predictions
logits = X # W1 + b1
probas = sigmoid(logits)
y_pred = probas # W2 + b2
def softmax(z): ## this approach provides numerical stability
"""Compute softmax values for each sets of scores in z."""
e = tf.exp(z - tf.reduce_max(z))
return e / tf.reduce_sum(e)
### Cross-Entropy loss
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(softmax(y_pred)), reduction_indices=[1]))
lr = 0.01
optimizer = tf.train.AdamOptimizer(lr)
step = optimizer.minimize(cost) #
This gives me a very bad score with BATCH_SIZE = 512 and EPOCHS= 55
Test cost after 10 epochs: 6.3319
Test cost after 30 epochs: 6.2753
Test cost after 50 epochs: nan
OPTIMIZATION IS DONE!
Score = 0.098982

Matrix calculations in SIMPLERNN KERAS

I'm trying to understand the Matrix Calculations involved in SIMPLERNN.
I have understood from some blog posts and stackoverflow that SimpleRNN(units) creates a layer where in each layer contains no.of units of RNN.
SimpleRNN involves the following calculations
W = kernel #shape-(1,units)
U = recurrent_kernel #shape-(units,units)
B = bias #shape -(units,)
output = new_state = act(W * input + U * state + B)
Help me understand what are the input and output dimensions of the following below code snippet.
Function for generating X and Y. Where Y is the Cummulative sum of X.
'def generate_batch(n=256):
X = np.random.choice(a=[0, 1], size = n*seq_len, p=[0.9, 0.1]).reshape(n, -1)
y = np.cumsum(X, axis=1)
X = X.reshape(n, -1, 1)
y = y.reshape(n, -1, 1)
return(X, y) #returns shape X,y-(256,60,1)
model = Sequential()
model.add(SimpleRNN(10, input_shape=(60, 1), return_sequences=True))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
X, y = generate_batch()#(256,60,1)
model.fit(X, y, verbose=0, epochs=1)'
please help me figure out the dimensions of input to RNN ,state and output dimension from RNN.How does the matrix calculations occur i.e (W * input + U * state + B).

Variational autoencoder layers for float and binary (one-hot encoded) columns

Which layers I need for modeling variational autoencoder? I want to predict X_test to detect anomalies in dataset which consist 2 float variables(money) and 40 binaries columns (one-hot encoded).
m = 50
n_z = 2
n_epoch = 10
# Q(z|X) -- encoder
inputs = Input(shape=(42,))
h_q = Dense(21, activation='relu')(inputs)
mu = Dense(n_z, activation='linear')(h_q)
log_sigma = Dense(n_z, activation='linear')(h_q)
def sample_z(args):
mu, log_sigma = args
eps = K.random_normal(shape=(m, n_z), mean=0., std=1.)
return mu + K.exp(log_sigma / 2) * eps
# Sample z ~ Q(z|X)
z = Lambda(sample_z)([mu, log_sigma])
# P(X|z) -- decoder
decoder_hidden = Dense(21, activation='relu')
decoder_out = Dense(42, activation='sigmoid')
h_p = decoder_hidden(z)
outputs = decoder_out(h_p)
# Overall VAE model, for reconstruction and training
vae = Model(inputs, outputs)
# Encoder model, to encode input into latent variable
# We use the mean as the output as it is the center point, the representative of the gaussian
encoder = Model(inputs, mu)
# Generator model, generate new data given latent variable z
d_in = Input(shape=(n_z,))
d_h = decoder_hidden(d_in)
d_out = decoder_out(d_h)
decoder = Model(d_in, d_out)
def vae_loss(y_true, y_pred):
""" Calculate loss = reconstruction loss + KL loss for each data in minibatch """
# E[log P(X|z)]
recon = K.sum(K.binary_crossentropy(y_pred, y_true), axis=1)
# D_KL(Q(z|X) || P(z|X)); calculate in closed form as both dist. are Gaussian
kl = 0.5 * K.sum(K.exp(log_sigma) + K.square(mu) - 1. - log_sigma, axis=1)
return recon + kl
vae.compile(optimizer='adam', loss=vae_loss)
vae.fit(X_train, X_train, batch_size=m, nb_epoch=n_epoch)

Convergence issues in 2D RBF Neuron implemented as a Keras layer

We implemented a 2D Gaussian radial basis layer (RBF) in Keras and are running into convergence issues with batch sizes larger than 1. The Neuron should implement the following function:
f(x,y)=exp(-a((x-x_0)²+(y-y_0)²)
Here x_0, y_0 and a are fit parameters.
Testcase
Currently we are doing correctness tests and are trying to fit just a single Neuron on the 2D function above. The Neuron should be (and is in case of batch_size 1) able to approximate this function exactly. The optimal loss is 0.
Problem
If we choose a batch size of 1 in this code, the prediction with Keras will converge very often and will be nearly independent of the starting parameters.
If we increase the batch size, the fit might produce a random walk, freeze or not converge at all. In all of these cases (even batch_size 2) convergence is a lot worse than in the batch_size 1 case. If we choose the batch_size as the size of the trainingset (i.e. 1296, our desired batch size), the fit will freeze most of the time mostly independent of learning rate.
Code
We implemented this layer in the following code:
# 2D RBF Layer
# In case anybody wants to use this code afterwards:
# Licenses: Apache, MIT, BSD, LGPLv2 and v3 and Public Domain
# Input: x,y Pairs, shape: (2,)
# Output: exp(a* ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called: mean_x, mean_y and opening in the following code:
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
def __init__(self, **kwargs):
super(RBFLayer2D, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.mean_x = K.variable(0.35)
self.constraints[self.mean_x] = NonNeg()
self.mean_y = K.variable(0.35)
self.constraints[self.mean_y] = NonNeg()
self.opening = K.variable(2.0)
self.constraints[self.opening] = NonNeg()
self.trainable_weights = [self.mean_x,self.mean_y,self.opening]
super(RBFLayer2D, self).build(input_shape) # Be sure to call this somewhere!
def call(self, x):
x_m = x[:,0] - self.mean_x
y_m = x[:,1] - self.mean_y
out = x_m*x_m + y_m*y_m
outexp = 50.0*K.exp(-64.8*self.opening*out)
# Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2))
return outexp
def compute_output_shape(self, input_shape):
# If Inputshape is (None, N) Outputshape is (None,N/2)
# In our example we only look at (None, 2), which outputs (None,1)
output_shape = (input_shape[0], input_shape[1]//2)
return output_shape
Reproduction
To reproduce set a batch_size of 1 in the (not-so) minimal example after this section. When you run it, the code will display the target distribution (a circle in the lower left corner), the starting guess for our RBF ANN (a smaller circler in the middle) and then after each iteration the current guess (a circle getting bigger and moving to the lower left corner).
Afterwards set a batch_size of 12 and restart the code and you will not observe convergence anymore.
Minimal Example
from __future__ import print_function
from __future__ import division
import numpy as np
np.random.seed(1234)
import matplotlib.pyplot as plt
from keras.engine import Layer
from keras.optimizers import SGD
from keras.models import Sequential
from keras.constraints import NonNeg
from keras import backend as K
# 2D RBF Layer
# Input: x,y Pairs, shape: (2,)
# Output: exp(a* ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called: mean_x, mean_y and opening in the following code:
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
def __init__(self, **kwargs):
super(RBFLayer2D, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.mean_x = K.variable(0.35)
self.constraints[self.mean_x] = NonNeg()
self.mean_y = K.variable(0.35)
self.constraints[self.mean_y] = NonNeg()
self.opening = K.variable(2.0)
self.constraints[self.opening] = NonNeg()
self.trainable_weights = [self.mean_x,self.mean_y,self.opening]
super(RBFLayer2D, self).build(input_shape)
def call(self, x):
x_m = x[:,0] - self.mean_x
y_m = x[:,1] - self.mean_y
out = x_m*x_m + y_m*y_m
outexp = 50.0*K.exp(-64.8*self.opening*out)
# Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2))
return outexp
def compute_output_shape(self, input_shape):
# If Inputshape is (None, N) Outputshape is (None,N/2)
# In our example we only look at (None, 2), which outputs (None,1)
output_shape = (input_shape[0], input_shape[1]//2)
return output_shape
# The function we want to train.
# It can be exactly represented using a single Neuron.
def twodenergy(phi, psi):
r0 = np.array([-180, -180])
b = 0.00005
return 50.0 * np.exp(- b * ((phi - r0[0]) ** 2 + (psi - r0[1]) ** 2))
# One of two plotting helper functions to show the results
def make_plot(y,numsteps,numbins,minangle,maxangle,plotnum, batch_size):
evaluation = np.zeros((numsteps, numsteps))
for i in range(0, numbins):
mx = i % numsteps
my = int(i / numsteps)
evaluation[mx,my]=y[i]
plt.imshow(evaluation.T, origin='lower',extent=[minangle, maxangle, minangle, maxangle])
plt.xlabel("x")
plt.ylabel("y")
if plotnum == 0:
plt.title("Startconfiguration")
else:
plt.title("RBF for batch_size %i at frame %03d" % (batch_size, plotnum))
plt.show()
# One of two plotting helper functions to show the target function
def plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps ):
eval_matrix_corr = np.zeros((numsteps, numsteps))
for i in range(0, numbins):
mx = i % numsteps
my = int(i / numsteps)
ph = phi[mx] +delta_angle_half
ps = psi[my] +delta_angle_half
eval_matrix_corr[mx,my] = twodenergy(ph,ps)
plt.imshow(eval_matrix_corr.T, origin='lower', extent=[minangle, maxangle, minangle, maxangle])
plt.title("Target Function")
plt.xlabel("phi")
plt.ylabel("psi")
plt.show()
if __name__ == "__main__":
# batch_size == 1: converges very often nearly independent of input parameters
# batch_size == 2: no to slow convergence, but distribution stays in the right place more or less
# batch_size == 3-12: random walk
# batch_size == 1296: no movement in case of low learning_rate, random_walk in case of high learning_rate
# (this is the case where the whole map is evaluated in every step.
# 1296 is our desired testcase, because it evaluates the whole map we want to fit.
batch_size = 1
learning_rate = 1E-5
### Here we generate the target function ###
### f(phi,psi)
### phi is [-180,180]
### psi is [-180,180]
anglestep = 10.0
minangle = -180.0
maxangle = 180.0
numsteps = int((maxangle - minangle)/anglestep)
anglerange = maxangle - minangle
numbins = numsteps*numsteps
delta_angle_half = anglerange /(2.0* numsteps)
phi = np.arange(minangle, maxangle, anglestep)
psi = np.arange(minangle, maxangle, anglestep)
#Target Function Plot, Gaussian in lower left
plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps )
# Input Parameter Regularization
# we map -180..180 to 0..1
# we also calculate the training parameters for our x,y pairs:
x_train = np.zeros((numbins, 2))
y_train = np.zeros((numbins, 1))
for x,ph in enumerate(phi):
for y,ps in enumerate(psi):
myphi = (ph + delta_angle_half - minangle)/(anglerange)
mypsi = (ps + delta_angle_half- minangle)/(anglerange)
x_train[x * numsteps + y, 0] = (ph +delta_angle_half - minangle)/(anglerange)
x_train[x * numsteps + y, 1] = (ps + delta_angle_half- minangle)/(anglerange)
y_train[x * numsteps + y] = twodenergy(ph +delta_angle_half,ps +delta_angle_half)
# Prediction with Keras
model = Sequential()
# Single RBF Layer, only one node
model.add(RBFLayer2D(input_shape=(2,)))
sgd = SGD(lr=learning_rate)
model.compile(loss="mean_squared_error", optimizer=sgd)
# We plot the starting configuration.
y = model.predict(x_train, batch_size=batch_size)
make_plot(y, numsteps, numbins, minangle, maxangle, 0, batch_size)
#Plot the first 15 iterations:
for i in range(0,15):
# For demonstration purposes, we fit 1 epoch and plot the output.
model.fit(x_train,y_train, epochs=1, batch_size=batch_size)
y = model.predict(x_train, batch_size=batch_size)
make_plot(y, numsteps, numbins, minangle, maxangle, 1 + i, batch_size)

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Keras multiply parallel layers' outputs with constrained weigths - keras

Related

How to create a custom loss that does not directly use the output of the network with pytorch

How to code softmax function using Python on tensorflow graph

Matrix calculations in SIMPLERNN KERAS

Variational autoencoder layers for float and binary (one-hot encoded) columns

Convergence issues in 2D RBF Neuron implemented as a Keras layer

Categories

Resources