Efficiently training a network simultaneously on labels and partial derivatives - pytorch

I'm trying to train a network in pytorch along the lines of this idea.
The author creates a simple MLP (4 hidden layers) and then explicitly works out the partial derivatives of the output with respect to the inputs. He then trains the network on the training labels as well as on the gradients of the output with respect to the input data (which are also part of the training data).
To replicate the idea in pytorch, my training loop looks like this:
import torch
import torch.nn.functional as F

class vanilla_net(torch.nn.Module):
    def __init__(self,
                 input_dim,      # dimension of inputs, e.g. 10
                 hidden_units,   # units in hidden layers, assumed constant, e.g. 20
                 hidden_layers): # number of hidden layers, e.g. 4
        super(vanilla_net, self).__init__()
        self.input = torch.nn.Linear(input_dim, hidden_units)
        self.hidden = torch.nn.ModuleList()
        for hl in range(hidden_layers):
            layer = torch.nn.Linear(hidden_units, hidden_units)
            self.hidden.append(layer)
        self.output = torch.nn.Linear(hidden_units, 1)

    def forward(self, x):
        x = self.input(x)
        x = F.softplus(x)
        for h in self.hidden:
            x = h(x)
            x = F.softplus(x)
        x = self.output(x)
        return x
....
def lossfn(x, y, dx, dy):
    # some loss function involving both sets of training data (y and dy);
    # the network outputs x, and what's needed is an efficient way of
    # calculating dx - the partial derivatives of x wrt the batch inputs
    pass
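    # one plausible choice (an assumption, not from the original post), in
    # the spirit of the differential-ML idea being replicated:
    #     return F.mse_loss(x, y) + F.mse_loss(dx, dy)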
def train(net, x_train, y_train, dydx_train, batch_size=256):
    m, n = x_train.shape
    first = 0
    last = min(batch_size, m)
    while first < m:
        xi = x_train[first:last]
        yi = y_train[first:last]
        zi = dydx_train[first:last]
        xi.requires_grad_()
        # Perform forward pass
        outputs = net(xi)
        minimizer.zero_grad()
        outputs.backward(torch.ones_like(outputs), create_graph=True)
        xi_grad = xi.grad
        # Compute loss
        loss = lossfn(outputs, yi, xi_grad, zi)
        minimizer.zero_grad()
        # Perform backward pass
        loss.backward()
        # Perform optimization
        minimizer.step()
        first = last
        last = min(first + batch_size, m)
net = vanilla_net(4, 10, 4)
minimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
...
This seems to work, but is there a more elegant/efficient way to achieve the same thing? Also, I'm not sure where the best place to put the minimizer.zero_grad() calls is.
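(For comparison, here is a sketch of the loop body using torch.autograd.grad, which returns the input gradients directly instead of reading them out of xi.grad, so only one zero_grad() per step is needed; yi and zi are the batch slices from the loop above.)

    xi = x_train[first:last].requires_grad_()
    outputs = net(xi)
    # differentiable d(outputs)/d(xi); create_graph=True so a loss on
    # these gradients can itself be backpropagated through
    xi_grad, = torch.autograd.grad(outputs.sum(), xi, create_graph=True)
    loss = lossfn(outputs, yi, xi_grad, zi)
    minimizer.zero_grad()
    loss.backward()
    minimizer.step()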
Thanks

Related

Nested optimization in pytorch

I wrote a short snippet to train a classification model, and learn the learning rate of its optimization algorithm. In my example I tried to update weights of a network in an inner optimization loop and to learn the learning rate of the weight updates using an outer optimization loop (meta-optimization). I'm getting the error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 10]], which is output 0 of AsStridedBackward0, is at version 12; expected version 2 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
My code snippet is as follows (NOTE: I'm using _stateless, an experimental functional API for nn. You need to run with a nightly build of pytorch.)
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import _stateless

class MyDataset(Dataset):
    def __init__(self, N):
        self.N = N
        self.x = torch.rand(self.N, 10)
        self.y = torch.randint(0, 3, (self.N,))

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 3)
        self.relu = nn.ReLU()
        self.alpha = nn.Parameter(torch.randn(1))
        self.beta = nn.Parameter(torch.randn(1))

    def forward(self, x):
        y = self.relu(self.fc1(x))
        return self.fc2(y)

epochs = 20
N = 100
dataset = DataLoader(dataset=MyDataset(N), batch_size=10)
model = MyModel()
loss_func = nn.CrossEntropyLoss()
optim = optim.Adam([model.alpha], lr=1e-3)
params = dict(model.named_parameters())
for i in range(epochs):
    model.train()
    train_loss = 0
    for batch_idx, (x, y) in enumerate(dataset):
        logits = _stateless.functional_call(model, params, x)  # predict
        loss_inner = loss_func(logits, y)  # loss
        optim.zero_grad()  # reset grad
        loss_inner.backward(create_graph=True, inputs=params.values())  # compute grad
        train_loss += loss_inner.item()  # store loss
        for k, p in params.items():
            if k != 'alpha' and k != 'beta':
                p.update = - model.alpha * p.grad
                params[k] = p + p.update  # update weight
    print('Train Epoch: {}\tLoss: {:.6f}'.format(i, train_loss / N))
    logits = _stateless.functional_call(model, params, x)  # predict
    loss_meta = loss_func(logits, y)
    loss_meta.backward()
    optim.step()
From the error message, I understand that the issue comes from the weight update for the second layer of the network, which points to an error in my inner-loop optimization. Any suggestions would be appreciated.
Check this link, save the params at each epoch, and use the same inner batch:
https://discuss.pytorch.org/t/issue-using-parameters-internal-method/134549/11
for i in range(epochs):
    model.train()
    train_loss = 0
    params = dict(model.named_parameters())  # add this
    for batch_idx, (x, y) in enumerate(dataset):
        params = {k: v.clone() for k, v in params.items()}  # add this
        logits = _stateless.functional_call(model, params, x)  # predict
        loss_inner = loss_func(logits, y)
        ..................
You should be updating params[k].data instead of params[k]
(Deleted the example to avoid distraction)
Let me raise a more fundamental point (not an answer to your question).
If I understand correctly, you want to compute loss(f(w[i], x)), then the inner update w[i+1, j] = w[i, j] + g(v[j], grad of loss w.r.t. w[i, j]), and in the end the meta-update v[j+1] = v[j] + grad of loss w.r.t. v[j].
The gradient of v[j] is computed by backpropagation, as a function of the gradient of w[i, j]. So what you are trying to do is choose the v[j] that results in a good w[i, j]. I would ask: why bother with v[j] if you can control w[i, j] directly? And that's exactly the standard approach.
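To make the meta-gradient concrete, here is a toy one-parameter sketch (an illustration, not from the original posts) of differentiating through a single inner SGD step:

import torch

w = torch.tensor(5.0, requires_grad=True)  # inner parameter
v = torch.tensor(0.1, requires_grad=True)  # meta-parameter (a learning rate)

loss = w ** 2                               # inner loss at w
g, = torch.autograd.grad(loss, w, create_graph=True)
w_new = w - v * g                           # differentiable inner update
meta_loss = w_new ** 2                      # loss after the update
meta_loss.backward()                        # backprop through the update itself
print(v.grad)                               # gradient w.r.t. the learning rate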

Computing the Hessian of a Simple NN in PyTorch wrt Parameters

I am relatively new to PyTorch and trying to compute the Hessian of a very simple feedforward network with respect to its weights. I am trying to get torch.autograd.functional.hessian to work. I have been digging through the forums, and since this is a relatively new function added to PyTorch, I am unable to find much information on it. Here is my simple network architecture, which is from some sample code on Kaggle for MNIST.
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.l3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.l1(x)
        x = self.relu(x)
        x = self.l3(x)
        return F.log_softmax(x, dim=1)

net = Network()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
loss_func = nn.CrossEntropyLoss()
and I am running the NN for a bunch of epochs like:
for e in range(epochs):
    for i in range(0, x.shape[0], batch_size):
        x_mini = x[i:i + batch_size]
        y_mini = y[i:i + batch_size]
        x_var = Variable(x_mini)
        y_var = Variable(y_mini)
        optimizer.zero_grad()
        net_out = net(x_var)
        loss = loss_func(net_out, y_var)
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            loss_log.append(loss.data)
Then, I add all the parameters to a list and make a tensor out of it as below:
param_list = []
for param in net.parameters():
    param_list.append(param.view(-1))
param_list = torch.cat(param_list)
Finally, I am trying to compute the Hessian of the converged network by running:
hessian = torch.autograd.functional.hessian(loss_func, param_list, create_graph=True)
but it gives me this error:
TypeError: forward() missing 1 required positional argument: 'target'
Any help would be appreciated.
Computing the hessian with regard to the parameters of a model (as opposed to the inputs to the model) isn't really well-supported right now. There's some work being done on this at https://github.com/pytorch/pytorch/issues/49171 , but for the moment it's very inconvenient.
Your code has a few other problems -- where you're passing loss_func, you should be passing a function that constructs the computation graph. Also, you never specify the input to the network or the target for the loss function.
Here's some code that cheats a little bit to use the existing functional interface to compute the hessian of the model weights, and concatenates everything together to give the same form as what you were trying to do:
# Pick a random input to the network
src = torch.rand(1, 2)
# Say our target for our loss is all ones
dst = torch.ones(1, dtype=torch.long)

keys = list(net.state_dict().keys())
parameters = list(net.parameters())
sizes = [x.view(-1).shape[0] for x in parameters]
ndims = sum(sizes)

def hessian_hack(*params):
    # Re-bind each passed-in tensor onto the module before the forward
    # pass, so autograd treats those tensors as the differentiable leaves.
    for i in range(len(keys)):
        path = keys[i].split('.')
        cur = net
        for f in range(0, len(path) - 1):
            cur = cur.__getattr__(path[f])
        cur.__delattr__(path[-1])
        cur.__setattr__(path[-1], params[i])
    return loss_func(net(src), dst)

# sub_hessians[i][f] is the hessian of parameter i vs parameter f
sub_hessians = torch.autograd.functional.hessian(
    hessian_hack,
    tuple(parameters),
    create_graph=True)

# We can combine them all into a nice big hessian.
hessian = torch.cat([
    torch.cat([
        sub_hessians[i][f].reshape(sizes[i], sizes[f])
        for f in range(len(sub_hessians[i]))
    ], axis=1)
    for i in range(len(sub_hessians))
], axis=0)
print(hessian)
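On newer PyTorch versions there is a functional API that avoids the attribute-patching hack entirely; a sketch, assuming a PyTorch release (>= 2.0) where torch.func.functional_call and torch.func.hessian are available:

import torch
from torch.func import functional_call, hessian

params = dict(net.named_parameters())

def loss_wrt_params(p):
    # functional_call runs net's forward pass with the tensors in p
    # substituted for the registered parameters
    return loss_func(functional_call(net, p, (src,)), dst)

# Nested dict of blocks: hess[name1][name2] holds the second derivatives
# of the loss w.r.t. parameters name1 and name2.
hess = hessian(loss_wrt_params)(params)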

Time prediction using specialised setup in Keras

I'm working on a project where I have to predict the future states of a 1D vector with y entries. I'm trying to do this using an ANN setup with LSTM units in combination with a convolution layer. The method I'm using is based on the one used in a pre-release paper. The suggested setup is as follows:
In the picture, c is the 1D vector with y entries. The ANN gets the n previous states as an input and produces the o next states as an output.
Currently, my ANN setup looks like this:
inputLayer = Input(shape=(n, y))
encoder = LSTM(200)(inputLayer)
x = RepeatVector(1)(encoder)
decoder = LSTM(200, return_sequences=True)(x)
x = Conv1D(y, 4, activation='linear', padding='same')(decoder)
model = Model(inputLayer, x)
Here n is the length of the input sequences and y is the length of the state array. As can be seen, I'm repeating the encoded vector only once, as I'm trying to predict only one time step into the future. Is this the right way to set up the network described above?
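(One quick sanity check, as a sketch: print the model summary and confirm the output shape lines up with the targets the generator below produces.)

model.summary()
# The final Conv1D layer should report output shape (None, 1, y),
# which has to match y_batch's shape (batch_size, 1, y) below.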
Furthermore, I have a numpy array (data) with a shape of (Sequences, Time Steps, State Variables) to train with. I was trying to divide this into randomly selected batches with a generator like this:
def BatchGenerator(batch_size, n, y, data):
    # Infinite loop.
    while True:
        # Allocate a new array for the batch of input-signals.
        x_shape = (batch_size, n, y)
        x_batch = np.zeros(shape=x_shape, dtype=np.float16)
        # Allocate a new array for the batch of output-signals.
        y_shape = (batch_size, 1, y)
        y_batch = np.zeros(shape=y_shape, dtype=np.float16)
        # Fill the batch with random sequences of data.
        for i in range(batch_size):
            # Select a random sequence.
            seq_idx = np.random.randint(data.shape[0])
            # Get a random start-index.
            # This points somewhere into the training-data.
            start_idx = np.random.randint(data.shape[1] - n)
            # Copy the sequence of data starting at this index.
            # Each entry of x_batch has a shape of [n, y].
            x_batch[i, :, :] = data[seq_idx, start_idx:start_idx+n, :]
            # Each entry of y_batch has a shape of [1, y] (as we predict only 1 time step ahead).
            y_batch[i, :, :] = data[seq_idx, start_idx+n, :]
        yield (x_batch, y_batch)
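(For reference, a quick shape check of the generator output; a sketch with made-up toy dimensions.)

import numpy as np

data = np.random.rand(5, 100, 8).astype(np.float16)  # toy stand-in: 5 sequences, 100 steps, y = 8
gen = BatchGenerator(batch_size=4, n=10, y=8, data=data)
x_batch, y_batch = next(gen)
print(x_batch.shape)  # (4, 10, 8) - matches Input(shape=(n, y))
print(y_batch.shape)  # (4, 1, 8)  - must match the model's output shape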
The problem is that it gives an error if I'm using a batch_size of more than 1. Could anyone help me to set this data up in a way that it can be used optimally to train my neural network?
The model is now trained using:
generator = BatchGenerator(batch_size, n, y, data)
model.fit_generator(generator=generator, steps_per_epoch=steps_per_epoch, epochs=epochs)
Thanks in advance!

How do I obtain predictions and probabilities from new data input to a CNN in Tensorflow

I'll preface this by saying this is my first posted question on SO. I've just recently started working with Tensorflow, and have been attempting to apply a convolutional-neural-network model approach for classification of .csv records in a file representing images from scans of microarray data. (FYI: Microarrays are a grid of spotted DNA on a glass slide, representing specific DNA target sequences for determining the presence of those DNA targets in a sample. The individual pixels represent fluorescence intensity values from 0-1.)

The file has ~200,000 records in total. Each record (image) has 10816 pixels that represent DNA sequences from known viruses, and one index label which identifies the virus species. The pixels create a pattern which is unique to each of the different viruses. There are 2165 different viruses in total represented within the 200,000 records.

I have trained the network on images of labeled microarray datasets, but when I try to pass a new dataset through to classify it as one of the 2165 different viruses and determine predicted values and probabilities, I don't seem to be having much luck. This is the code that I am currently using for this:
import tensorflow as tf
import numpy as np
import csv

def extract_data(filename):
    print("extracting data...")
    NUM_LABELS = 2165
    NUM_FEATURES = 10816
    labels = []
    fvecs = []
    rowCount = 0
    # iterate over the rows, split the label from the features,
    # convert the labels to integers and the features to floats
    for line in open(filename):
        rowCount = rowCount + 1
        row = line.split(',')
        labels.append(row[3])  # (int(row[7]))  # <<< IT ALWAYS PREDICTS THIS VALUE!
        for x in row[4:10820]:
            fvecs.append(float(x))
    # convert the array of float arrays into a numpy float matrix
    fvecs_np = np.matrix(fvecs).astype(np.float32)
    # convert the array of int labels into a numpy array
    labels_np = np.array(labels).astype(dtype=np.uint8)
    # convert the int numpy array into a one-hot matrix
    labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
    print("arrays converted")
    return fvecs_np, labels_onehot
def TestModels():
    fvecs_np, labels_onehot = extract_data("MicroarrayTestData.csv")
    print('RESTORING NN MODEL')
    weights = {}
    biases = {}
    sess = tf.Session()
    init = tf.global_variables_initializer()
    # Load meta graph and restore weights
    ModelID = "MicroarrayCNN_Data-1000.meta"
    print("RESTORING:::", ModelID)
    saver = tf.train.import_meta_graph(ModelID)
    saver.restore(sess, tf.train.latest_checkpoint('./'))
    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("x:0")
    y = graph.get_tensor_by_name("y:0")
    keep_prob = tf.placeholder(tf.float32)
    y_ = tf.placeholder("float", shape=[None, 2165])
    wc1 = graph.get_tensor_by_name("wc1:0")
    wc2 = graph.get_tensor_by_name("wc2:0")
    wd1 = graph.get_tensor_by_name("wd1:0")
    Wout = graph.get_tensor_by_name("Wout:0")
    bc1 = graph.get_tensor_by_name("bc1:0")
    bc2 = graph.get_tensor_by_name("bc2:0")
    bd1 = graph.get_tensor_by_name("bd1:0")
    Bout = graph.get_tensor_by_name("Bout:0")
    weights = {wc1, wc2, wd1, Wout}
    biases = {bc1, bc2, bd1, Bout}
    print("NEXTArgmax")
    prediction = tf.argmax(y, 1)
    probabilities = y
    predY = prediction.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
    probY = probabilities.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
    accuracy = tf.reduce_mean(tf.cast(prediction, "float"))
    print(sess.run(accuracy, feed_dict={x: fvecs_np, y: labels_onehot}))
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
    print("Predicted::: ", predY, accuracy)
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
    feed_dictTEST = {y: labels_onehot}
    probabilities = probY
    print("probabilities", probabilities.eval(feed_dict={x: fvecs_np}, session=sess))

########## Run Analysis ###########
TestModels()
So, when I run this code I get the correct prediction for the test set, although I am not sure I believe it, because it appears that whatever value I append in line 14 (see below) is the output it predicts:
labels.append(row[3])#<<<IT ALWAYS PREDICTS THIS VALUE!
I don't understand this, and it makes me suspicious that I've set up the CNN incorrectly, as I would have expected it to ignore my input label and determine the best match from the trained network based on the trained patterns. The only thing I can figure is that when I pass the value through for the prediction, it is instead training the model on this data as well and then predicting from it. Is this a correct assumption, or am I misinterpreting how Tensorflow works?
The other issue is that when I try to use code (based on other tutorials) that is supposed to output the probabilities of all of the 2165 possible outputs, I get the error:
InvalidArgumentError (see above for traceback): Shape [-1,2165] has negative dimensions
[[Node: y = Placeholder[dtype=DT_FLOAT, shape=[?,2165], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
To me, it looks like it is the correct layer based on the 2165 value in the Tensor shape, but I don't understand the -1 value. So, to wrap up the summary, my questions are:
Based on the fact that I get the value that I have in the label of the input data, is this the correct method to make a classification using this model?
Am I missing a layer, have I configured the model incorrectly for extracting the probabilities of all of the possible output classes, or am I using the wrong code to extract the information? I tried to print out the accuracy to see if that would work, but it outputs the description of a tensor instead, so clearly that is incorrect as well.
(ADDITIONAL INFORMATION)
As requested, I'm also including the original code that was used to train the model, which is now below. You can see I do a sort of piecemeal training of a limited number of related records at a time, grouped by their taxonomic relationships, as I iterate through the file. This is mostly because the Mac that I'm training on (Mac Pro w/ 64GB RAM) tends to give me the "Killed -9" error due to overuse of resources if I don't do it this way. There may be a better way to do it, but this seems to work.
# Original Author: Aymeric Damien
# Project: https://github.com/aymericdamien/TensorFlow-Examples/
from __future__ import print_function
import tensorflow as tf
import numpy as np
import csv
import random

# Parameters
num_epochs = 2
train_size = 1609
learning_rate = 0.001  # (larger -> faster, lower -> more accurate)
training_iters = 5000  # How much do you want to train (more = better trained)
batch_size = 32  # How many samples to train on, size of the training batch
display_step = 10  # How often to display what is going on during training

# Network Parameters
n_input = 10816  # data input; in my case 104x104 = 10816 (MNIST original: 28*28)
n_classes = 2165  # 3280 # 2307 # 787 # Switched to 100 taxa/training set, dynamic was too wonky.
dropout = 0.75  # Dropout: probability to keep units. Developed by Geoffrey Hinton's group; prevents overfitting and gives a more generalized model.
# Functions
def extract_data(filename):
    print("extracting data...")
    # arrays to hold the labels and feature vectors.
    NUM_LABELS = 2165
    NUM_FEATURES = 10826
    taxCount = 0
    taxCurrent = 0
    labels = []
    fvecs = []
    rowCount = 0
    # iterate over the rows, split the label from the features,
    # convert the labels to integers and the features to floats
    print("entering CNN loop")
    for line in open(filename):
        rowCount = rowCount + 1
        row = line.split(',')
        taxCurrent = row[3]
        print("profile:", row[0:12])
        labels.append(int(row[3]))
        fvecs.append([float(x) for x in row[4:10820]])
    # convert the array of float arrays into a numpy float matrix
    fvecs_np = np.matrix(fvecs).astype(np.float32)
    # convert the array of int labels into a numpy array
    labels_np = np.array(labels).astype(dtype=np.uint8)
    # convert the int numpy array into a one-hot matrix
    labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
    print("arrays converted")
    return fvecs_np, labels_onehot
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):  # Layer 1: convolutional layer
    # Conv2D wrapper, with bias and relu activation
    print("conv2d")
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')  # strides: list of ints, stride per input dimension
    x = tf.nn.bias_add(x, b)  # bias is the tuning knob
    return tf.nn.relu(x)  # rectified linear unit (activation function)

def maxpool2d(x, k=2):  # Layer 2: takes samples from the image (a 4D tensor)
    print("maxpool2d")
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding='SAME')

# Create model
def conv_net(x, weights, biases, dropout):
    print("conv_net setup")
    # Reshape input picture
    x = tf.reshape(x, shape=[-1, 104, 104, 1])  # --> 52x52 --> 26x26x64
    # Convolution Layer
    conv1 = conv2d(x, weights['wc1'], biases['bc1'])  # defined above already
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)
    print(conv1.get_shape)
    # Convolution Layer
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])  # this second conv layer could potentially be skipped
    # Max Pooling (down-sampling)
    conv2 = maxpool2d(conv2, k=2)
    print(conv2.get_shape)
    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)  # activation function for the NN
    # Apply Dropout
    fc1 = tf.nn.dropout(fc1, dropout)
    # Output, class prediction
    out = tf.add(tf.matmul(fc1, weights['Wout']), biases['Bout'])
    return out
def Train_Network(Txid_IN, Sess_File_Name):
    import tensorflow as tf
    tf.reset_default_graph()
    x, y = 0, 0
    weights = {}
    biases = {}
    # tf Graph input
    print("setting placeholders")
    x = tf.placeholder(tf.float32, [None, n_input], name="x")  # Gateway for data (images)
    y = tf.placeholder(tf.float32, [None, n_classes], name="y")  # Gateway for data (labels)
    keep_prob = tf.placeholder(tf.float32)  # Gateway for dropout (keep probability)
    # Store layers weight & bias
    weights = {
        # 5x5 conv, 1 input, 32 outputs
        'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32]), name="wc1"),
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64]), name="wc2"),
        # fully connected, 26*26*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([26*26*64, 1024]), name="wd1"),
        # 1024 inputs, n_classes outputs (class prediction)
        'Wout': tf.Variable(tf.random_normal([1024, n_classes]), name="Wout")
    }
    biases = {
        'bc1': tf.Variable(tf.random_normal([32]), name="bc1"),
        'bc2': tf.Variable(tf.random_normal([64]), name="bc2"),
        'bd1': tf.Variable(tf.random_normal([1024]), name="bd1"),
        'Bout': tf.Variable(tf.random_normal([n_classes]), name="Bout")
    }
    # Construct model
    print("constructing model")
    pred = conv_net(x, weights, biases, keep_prob)
    print(pred)
    # Define loss (cost) and optimizer
    # cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))  # deprecated version of the statement
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))  # added reduce_mean 6/27
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    # Evaluate model
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    print("%%%%%%%%%%%%%%%%%%%%")
    print("%% ", correct_pred)
    print("%% ", accuracy)
    print("%%%%%%%%%%%%%%%%%%%%")
    # Initializing the variables
    # init = tf.initialize_all_variables()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    fvecs_np, labels_onehot = extract_data("MicroarrayDataOUT.csv")  # CHANGE TO PICORNAVIRUS!!!!! AHHHHHH!!!
    print("starting session")
    # Launch the graph
    FitStep = 0
    with tf.Session() as sess:  # graph is encapsulated by its session
        sess.run(init)
        step = 1
        # Keep training until we reach max iterations (training_iters)
        while step * batch_size < training_iters:
            if FitStep >= 5:
                break
            else:
                # iterate and train
                print(step)
                print(fvecs_np, labels_onehot)
                for step in range(num_epochs * train_size // batch_size):
                    sess.run(optimizer, feed_dict={x: fvecs_np, y: labels_onehot, keep_prob: dropout})  # added keep_prob: dropout
                    if FitStep >= 5:
                        break
                    # else:
                    ### batch_x, batch_y = mnist.train.next_batch(batch_size)
                    # Run optimization op (backprop)
                    ### sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
                    ###                                keep_prob: dropout})  # <<<< SOMETHING IS WRONG IN HERE?!!!
                    if step % display_step == 0:
                        # Calculate batch loss and accuracy
                        loss, acc = sess.run([cost, accuracy], feed_dict={x: fvecs_np,
                                                                          y: labels_onehot,
                                                                          keep_prob: 1.})
                        print("Iter " + str(step*batch_size) + ", Minibatch Loss= " +
                              "{:.6f}".format(np.mean(loss)) + ", Training Accuracy= " +
                              "{:.5f}".format(acc))
                        TrainAcc = float("{:.5f}".format(acc))
                        # print("******", TrainAcc)
                        if TrainAcc >= .99:  # Changed from .95 temporarily
                            print(FitStep)
                            FitStep = FitStep + 1
                            saver.save(sess, Sess_File_Name, global_step=1000)
                            print("Saved Session:", Sess_File_Name)
                    step += 1
        print("Optimization Finished!")
        print("Testing Accuracy:",
              sess.run(accuracy, feed_dict={x: fvecs_np[:256],
                                            y: labels_onehot[:256],
                                            keep_prob: 1.}))
        # feed_dictTEST = {x: fvecs_np[50]}
        # prediction = tf.argmax(y, 1)
        # print(prediction)
        # best = sess.run([prediction], feed_dictTEST)
        # print(best)
        print("DONE")
    sess.close()
def Tax_Iterator(CSV_inFile, CSV_outFile):  # Deprecate
    # Need to copy *.csv file to MySQL for sorting
    resultFileINIT = open(CSV_outFile, 'w')
    resultFileINIT.close()
    TaxCount = 0
    TaxThreshold = 2165
    ThresholdStep = 2165
    PrevTax = 0
    linecounter = 0
    # Open all GenBank profile list
    for line in open(CSV_inFile):
        linecounter = linecounter + 1
        print(linecounter)
        resultFile = open(CSV_outFile, 'a')
        wr = csv.writer(resultFile, dialect='excel')
        # Check for new TXID
        row = line.split(',')
        print(row[7], "===", PrevTax)
        if row[7] != PrevTax:
            print("X1")
            TaxCount = TaxCount + 1
            PrevTax = row[7]
        # Check if current Tax count is < or > threshold
        # < threshold
        print(TaxCount, "=+=", TaxThreshold)
        if TaxCount <= 3300:
            print("X2")
            CurrentTax = row[7]
            CurrTxCount = CurrentTax
            print("TaxCount=", TaxCount)
            print("Add to CSV")
            print("row:", CurrentTax, "***", row[0:15])
            wr.writerow(row[0:-1])
        # is > threshold
        else:
            print("X3")
            # but same TXID....
            print(row[7], "=-=", CurrentTax)
            if row[7] == CurrentTax:
                print("X4")
                CurrentTax = row[7]
                print("TaxCount=", TaxCount)
                print("Add to CSV")
                print("row:", CurrentTax, "***", row[0:15])
                wr.writerow(row[0:-1])
            # but different TXID...
            else:
                print(row[7], "=*=", CurrentTax)
                if row[7] > CurrentTax:
                    print("X5")
                    TaxThreshold = TaxThreshold + ThresholdStep
                    resultFile.close()
                    Sess_File_Name = "CNN_VirusIDvSPECIES_XXALL" + str(TaxThreshold - ThresholdStep)
                    print("<<<< Start Training >>>>")
                    print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
                    Train_Network(CurrTxCount, Sess_File_Name)
                    print("Training complete")
                    resultFileINIT = open(CSV_outFile, 'w')
                    resultFileINIT.close()
                    CurrentTax = row[7]
                    # reset tax count
                    CurrTxCount = 0
                    TaxCount = 0
    resultFile.close()
    Sess_File_Name = "MicroarrayCNN_Data" + str(TaxThreshold + ThresholdStep)
    print("<<<< Start Training >>>>")
    print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
    Train_Network(CurrTxCount, Sess_File_Name)
    resultFileINIT = open(CSV_outFile, 'w')
    resultFileINIT.close()
    CurrentTax = row[7]

Tax_Iterator("MicroarrayInput.csv", "MicroarrayOutput.csv")
You defined prediction as prediction=tf.argmax(y,1). And in both feed_dict, you feed labels_onehot for y. Consequently, your "prediction" is always equal to the labels.
As you didn't post the code you used to train your network, I can't tell you what exactly you need to change.
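Roughly, the fix would look like this (a sketch; the tensor names for the logits and the dropout placeholder are assumptions - use whatever names those ops were given when the graph was built and saved):

logits = graph.get_tensor_by_name("logits:0")  # the model output, NOT the label placeholder y
prediction = tf.argmax(logits, 1)
probabilities = tf.nn.softmax(logits)
# Feed only the inputs; the labels are not needed to make a prediction.
predY = sess.run(prediction, feed_dict={x: fvecs_np, keep_prob: 1.0})
probY = sess.run(probabilities, feed_dict={x: fvecs_np, keep_prob: 1.0})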
Edit: I have issues understanding the underlying problem you're trying to solve - based on your code, you're trying to train a neural network with 2165 different classes using 1609 training examples. How is this even possible? If each example had a different class, there would still be classes without any training example. Or does one image belong to many classes? From your statement at the beginning of your question, I had assumed you're trying to output a real-valued number between 0 and 1.
I'm actually surprised that the code worked at all, as it looks like you're adding only a single number to your labels list, but your model expects a list of length 2165 for each training example.

Convergence issues in 2D RBF Neuron implemented as a Keras layer

We implemented a 2D Gaussian radial basis function (RBF) layer in Keras and are running into convergence issues with batch sizes larger than 1. The neuron should implement the following function:
f(x, y) = exp(-a((x - x_0)² + (y - y_0)²))
Here x_0, y_0 and a are fit parameters.
Testcase
Currently we are doing correctness tests and are trying to fit just a single neuron on the 2D function above. The neuron should be able to approximate this function exactly (and, in the batch_size 1 case, it is). The optimal loss is 0.
Problem
If we choose a batch size of 1 in this code, the fit with Keras converges very often, nearly independent of the starting parameters.
If we increase the batch size, the fit might produce a random walk, freeze, or not converge at all. In all of these cases (even batch_size 2), convergence is much worse than in the batch_size 1 case. If we choose the batch_size as the size of the training set (i.e. 1296, our desired batch size), the fit freezes most of the time, mostly independent of the learning rate.
Code
We implemented this layer in the following code:
# 2D RBF Layer
# In case anybody wants to use this code afterwards:
# Licenses: Apache, MIT, BSD, LGPLv2 and v3 and Public Domain
# Input: x,y pairs, shape: (2,)
# Output: exp(-a * ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called mean_x, mean_y and opening in the following code
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
    def __init__(self, **kwargs):
        super(RBFLayer2D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create trainable weight variables for this layer.
        self.mean_x = K.variable(0.35)
        self.constraints[self.mean_x] = NonNeg()
        self.mean_y = K.variable(0.35)
        self.constraints[self.mean_y] = NonNeg()
        self.opening = K.variable(2.0)
        self.constraints[self.opening] = NonNeg()
        self.trainable_weights = [self.mean_x, self.mean_y, self.opening]
        super(RBFLayer2D, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x):
        x_m = x[:, 0] - self.mean_x
        y_m = x[:, 1] - self.mean_y
        out = x_m * x_m + y_m * y_m
        outexp = 50.0 * K.exp(-64.8 * self.opening * out)
        # Output: exp(-a * ((x-x_0)**2 + (y-y_0)**2))
        return outexp

    def compute_output_shape(self, input_shape):
        # If input shape is (None, N), output shape is (None, N/2).
        # In our example we only look at (None, 2), which outputs (None, 1).
        output_shape = (input_shape[0], input_shape[1] // 2)
        return output_shape
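As an aside, here is a sketch (not the code we actually ran) of the same layer using the Layer.add_weight API of Keras 2.x, which registers each parameter together with its NonNeg constraint instead of writing into self.constraints and self.trainable_weights by hand:

from keras.engine import Layer
from keras.constraints import NonNeg
from keras.initializers import Constant
from keras import backend as K

class RBFLayer2Dv2(Layer):
    def build(self, input_shape):
        # add_weight registers the variable, its initializer and its
        # constraint with the layer in one call
        self.mean_x = self.add_weight(name='mean_x', shape=(1,),
                                      initializer=Constant(0.35),
                                      constraint=NonNeg(), trainable=True)
        self.mean_y = self.add_weight(name='mean_y', shape=(1,),
                                      initializer=Constant(0.35),
                                      constraint=NonNeg(), trainable=True)
        self.opening = self.add_weight(name='opening', shape=(1,),
                                       initializer=Constant(2.0),
                                       constraint=NonNeg(), trainable=True)
        super(RBFLayer2Dv2, self).build(input_shape)

    def call(self, x):
        x_m = x[:, 0] - self.mean_x
        y_m = x[:, 1] - self.mean_y
        return 50.0 * K.exp(-64.8 * self.opening * (x_m * x_m + y_m * y_m))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1] // 2)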
Reproduction
To reproduce, set a batch_size of 1 in the (not-so) minimal example after this section. When you run it, the code will display the target distribution (a circle in the lower left corner), the starting guess for our RBF ANN (a smaller circle in the middle) and then, after each iteration, the current guess (a circle getting bigger and moving to the lower left corner).
Afterwards set a batch_size of 12, restart the code, and you will no longer observe convergence.
Minimal Example
from __future__ import print_function
from __future__ import division
import numpy as np
np.random.seed(1234)
import matplotlib.pyplot as plt
from keras.engine import Layer
from keras.optimizers import SGD
from keras.models import Sequential
from keras.constraints import NonNeg
from keras import backend as K

# 2D RBF Layer
# Input: x,y pairs, shape: (2,)
# Output: exp(-a * ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called mean_x, mean_y and opening in the following code
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
    def __init__(self, **kwargs):
        super(RBFLayer2D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create trainable weight variables for this layer.
        self.mean_x = K.variable(0.35)
        self.constraints[self.mean_x] = NonNeg()
        self.mean_y = K.variable(0.35)
        self.constraints[self.mean_y] = NonNeg()
        self.opening = K.variable(2.0)
        self.constraints[self.opening] = NonNeg()
        self.trainable_weights = [self.mean_x, self.mean_y, self.opening]
        super(RBFLayer2D, self).build(input_shape)

    def call(self, x):
        x_m = x[:, 0] - self.mean_x
        y_m = x[:, 1] - self.mean_y
        out = x_m * x_m + y_m * y_m
        outexp = 50.0 * K.exp(-64.8 * self.opening * out)
        # Output: exp(-a * ((x-x_0)**2 + (y-y_0)**2))
        return outexp

    def compute_output_shape(self, input_shape):
        # If input shape is (None, N), output shape is (None, N/2).
        # In our example we only look at (None, 2), which outputs (None, 1).
        output_shape = (input_shape[0], input_shape[1] // 2)
        return output_shape
# The function we want to fit.
# It can be represented exactly by a single neuron.
def twodenergy(phi, psi):
    r0 = np.array([-180, -180])
    b = 0.00005
    return 50.0 * np.exp(-b * ((phi - r0[0]) ** 2 + (psi - r0[1]) ** 2))
# One of two plotting helper functions, to show the results
def make_plot(y, numsteps, numbins, minangle, maxangle, plotnum, batch_size):
    evaluation = np.zeros((numsteps, numsteps))
    for i in range(0, numbins):
        mx = i % numsteps
        my = int(i / numsteps)
        evaluation[mx, my] = y[i]
    plt.imshow(evaluation.T, origin='lower', extent=[minangle, maxangle, minangle, maxangle])
    plt.xlabel("x")
    plt.ylabel("y")
    if plotnum == 0:
        plt.title("Startconfiguration")
    else:
        plt.title("RBF for batch_size %i at frame %03d" % (batch_size, plotnum))
    plt.show()
# One of two plotting helper functions, to show the target function
def plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps):
    eval_matrix_corr = np.zeros((numsteps, numsteps))
    for i in range(0, numbins):
        mx = i % numsteps
        my = int(i / numsteps)
        ph = phi[mx] + delta_angle_half
        ps = psi[my] + delta_angle_half
        eval_matrix_corr[mx, my] = twodenergy(ph, ps)
    plt.imshow(eval_matrix_corr.T, origin='lower', extent=[minangle, maxangle, minangle, maxangle])
    plt.title("Target Function")
    plt.xlabel("phi")
    plt.ylabel("psi")
    plt.show()
if __name__ == "__main__":
    # batch_size == 1: converges very often, nearly independent of the starting parameters
    # batch_size == 2: no or slow convergence, but the distribution stays in the right place more or less
    # batch_size == 3-12: random walk
    # batch_size == 1296: no movement for a low learning_rate, random walk for a high learning_rate
    #   (this is the case where the whole map is evaluated in every step;
    #   1296 is our desired testcase, because it evaluates the whole map we want to fit)
    batch_size = 1
    learning_rate = 1E-5

    ### Here we generate the target function ###
    ### f(phi, psi)
    ### phi is [-180, 180]
    ### psi is [-180, 180]
    anglestep = 10.0
    minangle = -180.0
    maxangle = 180.0
    numsteps = int((maxangle - minangle) / anglestep)
    anglerange = maxangle - minangle
    numbins = numsteps * numsteps
    delta_angle_half = anglerange / (2.0 * numsteps)
    phi = np.arange(minangle, maxangle, anglestep)
    psi = np.arange(minangle, maxangle, anglestep)

    # Target function plot, Gaussian in the lower left
    plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps)

    # Input parameter regularization:
    # we map -180..180 to 0..1
    # and calculate the training targets for our x,y pairs:
    x_train = np.zeros((numbins, 2))
    y_train = np.zeros((numbins, 1))
    for x, ph in enumerate(phi):
        for y, ps in enumerate(psi):
            x_train[x * numsteps + y, 0] = (ph + delta_angle_half - minangle) / anglerange
            x_train[x * numsteps + y, 1] = (ps + delta_angle_half - minangle) / anglerange
            y_train[x * numsteps + y] = twodenergy(ph + delta_angle_half, ps + delta_angle_half)

    # Prediction with Keras
    model = Sequential()
    # Single RBF layer, only one node
    model.add(RBFLayer2D(input_shape=(2,)))
    sgd = SGD(lr=learning_rate)
    model.compile(loss="mean_squared_error", optimizer=sgd)

    # We plot the starting configuration.
    y = model.predict(x_train, batch_size=batch_size)
    make_plot(y, numsteps, numbins, minangle, maxangle, 0, batch_size)

    # Plot the first 15 iterations:
    for i in range(0, 15):
        # For demonstration purposes, we fit 1 epoch and plot the output.
        model.fit(x_train, y_train, epochs=1, batch_size=batch_size)
        y = model.predict(x_train, batch_size=batch_size)
        make_plot(y, numsteps, numbins, minangle, maxangle, 1 + i, batch_size)
