Why is my DQN training with tensorflow becoming slower each iteration? - python-3.x

I’m trying to implement a DQN with experience replay in tensorflow. It seems to be working, i.e. my loss is decreasing. However, as the training loop runs, each training iteration becomes slower and slower. It is as if my tensorflow graph is growing bigger and bigger and slowing down the training. I cannot see myself what the problem with my code is. Can any tensorflow guru out there point it out? I have made a scaled-down version of my code below which operates on random data but reproduces the same issue.
import numpy as np
import tensorflow as tf

# Function which initializes tensorflow weights for a feed-forward NN.
def InitWeights(LayerSizes):
    # Make tensorflow input/output placeholders.
    X = tf.placeholder(shape=(None, LayerSizes[0]), dtype=tf.float32, name='InputData')
    y = tf.placeholder(shape=(None, LayerSizes[-1]), dtype=tf.float32, name='OutputData')
    # Initialize dictionaries for weights and biases.
    W = {}
    b = {}
    for ii in range(len(LayerSizes) - 1):
        layername = f'layer{ii}'
        with tf.variable_scope(layername):
            ny = LayerSizes[ii]
            nx = LayerSizes[ii + 1]
            # Weights (initialized with Xavier initialization).
            W['Weights_' + layername] = tf.get_variable(
                name='Weights_' + layername,
                shape=(ny, nx),
                initializer=tf.contrib.layers.xavier_initializer(),
                dtype=tf.float32
            )
            # Bias (initialized with Xavier initialization).
            b['Bias_' + layername] = tf.get_variable(
                name='Bias_' + layername,
                shape=(nx,),
                initializer=tf.contrib.layers.xavier_initializer(),
                dtype=tf.float32
            )
    return W, b, X, y

# Function which defines the feed-forward neural network operation.
def FeedForward(X, W, b):
    a = X
    # Loop over all layers of the network.
    for ii in range(len(W)):
        # Use the name of each layer as an index.
        layername = f'layer{ii}'
        # Weighted sum: z = input*W + b
        z = tf.add(tf.matmul(a, W['Weights_' + layername], name='WeightedSum_z_' + layername),
                   b['Bias_' + layername])
        # Pass through activation fcn: a = h(z); the output layer stays linear.
        if ii == len(W) - 1:
            a = z
        else:
            a = tf.nn.relu(z, name='activation_a_' + layername)
    return a
# Function used for experience replay.
def ExperienceReplay(s, a, r, s_prime, gamma, TermState, X, y, yhat, yhatNN2, train_op, loss, sess):
    # Inputs:
    # s         - state(s)
    # a         - action(s)
    # r         - reward(s)
    # s_prime   - new state(s)
    # gamma     - discount factor
    # TermState - scalar of which action is terminating
    # X         - tensorflow placeholder for network inputs
    # y         - tensorflow placeholder for network outputs
    # yhat      - tensorflow operation for feed forward with NN 1
    # yhatNN2   - tensorflow operation for feed forward with NN 2
    # train_op  - tensorflow training operation
    # loss      - tensorflow fcn for calculating loss
    # sess      - tensorflow session
    # Forward pass through NN2 using s_prime to find max(Q(s',a',theta')).
    Q = sess.run(yhatNN2, feed_dict={X: s_prime})
    # Actions that NN1 thinks are best in state s_prime.
    a_argmax = np.argmax(sess.run(yhat, feed_dict={X: s_prime}), axis=1)
    # Values from NN2's opinion about the actions NN1 picked.
    Qm = np.zeros(len(r))
    for obs in range(len(r)):
        Qm[obs] = Q[obs, a_argmax[obs]]
    # First make all targets equal to NN1's approximation of Q (so the error is 0 in all unobserved cases).
    Targets = sess.run(yhat, feed_dict={X: s})
    # If the action was experienced, change the target to either the real reward or the discounted future reward.
    for obs in range(len(r)):
        # If the action was episode-terminating, use only the reward as target.
        if int(a[obs]) == TermState:
            Targets[obs, int(a[obs])] = r[obs]
        # Otherwise use the discounted future reward.
        else:
            Targets[obs, int(a[obs])] = r[obs] + gamma * Qm[obs]
    # Gradient descent one step on NN1 weights.
    sess.run(train_op, feed_dict={X: s, y: Targets})
    # Calculate the losses.
    loss_val = sess.run(loss, feed_dict={X: s, y: Targets})
    meanloss = np.mean(loss_val)
    return loss_val, meanloss
if __name__ == "__main__":
    #### Hyperparameter settings
    N = 64           # Minibatch size during training
    gamma = 0.99     # Discount rate
    C = 100          # How many iterations between NN sync NN2 = NN1
    lr = 1e-7        # Learning rate of NN during training
    nstates = 256    # Number of possible states
    nactions = 256   # Number of possible actions
    TermState = 255  # Which state ends the episode
    """
    Initialize tensorflow session and create one NN with two sets of weights
    """
    # Initialize & configure action-value function Q with random weights theta.
    LayerSizes = [nstates, 1024, 1024, nactions]
    W, b, X, y = InitWeights(LayerSizes)
    # Define loss function to optimize. Here: quadratic loss fcn, (Outputdata-a)^2.
    yhat = FeedForward(X, W, b)
    loss = tf.reduce_sum(tf.square(y - yhat), reduction_indices=[0])
    # Define optimizer to use when minimizing loss function.
    all_variables = tf.trainable_variables()
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_op = optimizer.minimize(loss, var_list=all_variables)
    # Initialize target action-value function Qhat with random weights theta_ = theta.
    with tf.device('/gpu:0'):
        W2 = {}
        b2 = {}
        # Make hard copy of tensorflow weights and biases.
        for key in W:
            W2[key] = tf.Variable(W[key].initialized_value())
        for key in b:
            b2[key] = tf.Variable(b[key].initialized_value())
        yhatNN2 = FeedForward(X, W2, b2)
    # Start tf session and initialize variables.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    ## Generate random data representing state transitions <s,a,r,s'>.
    # Random states
    Ds = np.random.rand(100000, nstates) > 0.5
    Ds = Ds.astype(np.float32)
    # Random actions
    Da = np.random.randint(0, nactions, (100000, 1)).astype(np.float32)
    # Random rewards
    Dr = np.random.rand(100000, 1).astype(np.float32)
    # Random new states
    Ds_prime = np.random.rand(100000, nstates) > 0.5
    Ds_prime = Ds_prime.astype(np.float32)
    """
    Pretrain network and report time every C iterations
    """
    import time
    t0 = time.time()
    for i in range(100000):
        # Randomly pick minibatch to use
        MemsToUse = np.random.choice(len(Dr), N)
        s = Ds[MemsToUse, :]
        a = Da[MemsToUse, 0]
        r = Dr[MemsToUse, 0]
        sprime = Ds_prime[MemsToUse, :]
        # Experience replay.
        loss_val, meanloss = ExperienceReplay(s, a, r, sprime, gamma, TermState, X, y, yhat, yhatNN2, train_op, loss, sess)
        # Every C iterations copy NN2 = NN1
        if (i % C) == 0:
            t1 = time.time()
            print('iter: %i meanloss: %0.5f iteration took %0.2f s' % (i, meanloss, t1 - t0))
            t0 = time.time()
            with tf.device('/gpu:0'):
                for key in W:
                    W2[key] = tf.Variable(W[key].initialized_value())
                for key in b:
                    b2[key] = tf.Variable(b[key].initialized_value())
Update: After timing individual segments of the code, it appears that the first line of my experience replay function:
Q = sess.run(yhatNN2, feed_dict={X : s_prime})
is causing most if not all of the slowdown. I don’t understand the logic behind why that is; there are several feedforward passes occurring in the program, but only this one seems to cause a problem.
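For anyone hitting the same symptom, here is a hedged sketch of how the graph growth could be confirmed and avoided (my own reading of the code above, not a verified fix). The tf.Variable(...) calls in the sync step create new graph nodes every C iterations; finalizing the graph makes any such leak raise immediately, and building the copy ops once turns the sync into a plain run call:

    # Build the NN2 = NN1 copy ops ONCE, after both networks exist.
    sync_ops = [tf.assign(W2[key], W[key]) for key in W] + \
               [tf.assign(b2[key], b[key]) for key in b]
    sess.run(tf.global_variables_initializer())
    sess.graph.finalize()  # any op created after this raises a RuntimeError

    # Inside the training loop, syncing no longer creates graph nodes:
    # if (i % C) == 0:
    #     sess.run(sync_ops)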

Related

Efficiently training a network simultaneously on labels and partial derivatives

I'm trying to train a network in pytorch along the lines of this idea.
The author creates a simple MLP (4 hidden layers) and then explicitly works out what the partial derivatives of the output are with respect to the inputs. He then trains the network on the training labels as well as on the gradients of the output with respect to the input data (which are also part of the training data).
To replicate the idea in pytorch, my training loop looks like this:
import torch
import torch.nn.functional as F

class vanilla_net(torch.nn.Module):
    def __init__(self,
                 input_dim,      # dimension of inputs, e.g. 10
                 hidden_units,   # units in hidden layers, assumed constant, e.g. 20
                 hidden_layers): # number of hidden layers, e.g. 4
        super(vanilla_net, self).__init__()
        self.input = torch.nn.Linear(input_dim, hidden_units)
        self.hidden = torch.nn.ModuleList()
        for hl in range(hidden_layers):
            layer = torch.nn.Linear(hidden_units, hidden_units)
            self.hidden.append(layer)
        self.output = torch.nn.Linear(hidden_units, 1)

    def forward(self, x):
        x = self.input(x)
        x = F.softplus(x)
        for h in self.hidden:
            x = h(x)
            x = F.softplus(x)
        x = self.output(x)
        return x
....

def lossfn(x, y, dx, dy):
    # some loss function involving both sets of training data (y and dy);
    # the network outputs x, and what's needed is an efficient way of
    # calculating dx - the partial derivatives of x wrt the batch inputs.
    pass

def train(net, x_train, y_train, dydx_train, batch_size=256):
    m, n = x_train.shape
    first = 0
    last = min(batch_size, m)
    while first < m:
        xi = x_train[first:last]
        yi = y_train[first:last]
        zi = dydx_train[first:last]
        xi.requires_grad_()
        # Perform forward pass
        outputs = net(xi)
        minimizer.zero_grad()
        outputs.backward(torch.ones_like(outputs), create_graph=True)
        xi_grad = xi.grad
        # Compute loss
        loss = lossfn(outputs, yi, xi_grad, zi)
        minimizer.zero_grad()
        # Perform backward pass
        loss.backward()
        # Perform optimization
        minimizer.step()
        first = last
        last = min(first + batch_size, m)

net = vanilla_net(4, 10, 4)
minimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
...
This seems to work, but is there a more elegant/efficient way to achieve the same thing? Also, I'm not sure where the best place is to put the minimizer.zero_grad().
Thanks
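A hedged sketch of one common alternative (my own suggestion, assuming the same net/lossfn/minimizer names as above): torch.autograd.grad returns the input gradients directly, without routing them through xi.grad, which removes the first zero_grad()/backward pair:

    xi.requires_grad_()
    outputs = net(xi)
    # Differentiable gradients of the outputs w.r.t. the inputs;
    # create_graph=True keeps them in the graph so the loss can backprop through them.
    xi_grad, = torch.autograd.grad(outputs, xi,
                                   grad_outputs=torch.ones_like(outputs),
                                   create_graph=True)
    loss = lossfn(outputs, yi, xi_grad, zi)
    minimizer.zero_grad()
    loss.backward()
    minimizer.step()

With this pattern, the single zero_grad() right before loss.backward() is sufficient.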

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation for reinforcement learning

Why does pytorch give me this error for the following code? I am trying to implement the REINFORCE algorithm.
import torch
import torch.optim as optim
from model_utils import Reinforce
from torch.distributions import Categorical

# train method
def reinforce(environment):
    learning_rate = 1e-3
    gamma = 0.99
    num_steps = 30
    max_episodes = 3000
    num_state_features = 21
    num_actions = 1000
    net = Reinforce(num_state_features, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    net.train()
    for episode in range(max_episodes):
        state = torch.from_numpy(environment.reset())
        log_prob_seq = []
        reward_seq = []
        # generate trajectory
        for step in range(num_steps):
            policy_distro = net.forward(state)
            distro = Categorical(policy_distro)
            action = distro.sample()
            c = action.item()
            log_prob_seq.append(distro.log_prob(action))
            # compute reward, go to next state
            reward, new_state = environment.step(c)
            reward_seq.append(reward)
            new_state = torch.from_numpy(new_state)
            state = new_state
        # compute the return and loss
        loss = []
        returns = reward_seq.copy()
        for step in reversed(range(num_steps)):
            if step != num_steps - 1:
                returns[step] += gamma * returns[step + 1]
            loss.append(- (gamma ** step) * returns[step] * log_prob_seq[step])
        # update policy model parameters
        for step in range(num_steps):
            optimizer.zero_grad()
            loss[step].backward()
            optimizer.step()
and it gives:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [128, 115]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead.
The first thing to try is to replace all the in-place operators in your code. In your case one of those is +=; you can try replacing
returns[step] += gamma * returns[step + 1] with returns[step] = returns[step] + gamma * returns[step + 1]
Also, PyTorch can give you slightly more precise information about where to look for the offending operation; you need to run the code with a specific flag (how to do that should be mentioned in the error message).
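For reference, a minimal sketch of that flag (anomaly detection); this is standard PyTorch usage, not something from the original answer:

    import torch

    # Re-runs the failing backward pass with extra bookkeeping and reports
    # which forward-pass operation produced the tensor that was later
    # modified in place.
    with torch.autograd.set_detect_anomaly(True):
        reinforce(environment)

Note also that in this code each optimizer.step() modifies the network parameters in place between the per-step loss[step].backward() calls, which all share one graph; that pattern alone can trigger exactly this error.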

How to use Tensorflow to compute per-sample gradient and reduce them with an arbitrary function?

Given a (mini-)batch of size M, I want to acquire M gradients (one for each sample) and reduce them to a single gradient with an arbitrary function (instead of the typical mean). Then, I want to use this last gradient to train the network with.
I have the example below which somewhat works. The problem is that not only does it get slower and slower as iterations go by, but the program also keeps using more and more memory. As far as I can tell, this happens because I'm adding new operations to Tensorflow's computational graph on each iteration, but I lack the knowledge of Tensorflow to avoid doing this while still achieving my objective.
import sys
import numpy as np
import tensorflow as tf

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

# Parameters
learning_rate = 0.0001
n_iterations = 1000000
batch_size = 2
display_step = 1000

# Network Parameters
n_hidden_1 = 256
n_hidden_2 = 256
n_input = 784
n_output = 10

# Create model
def build_network(x, weights, biases):
    assert sys.version_info[:2] >= (3, 6)  # otherwise iteration on dict is ill-defined
    layer = x
    for (w_str, w), (b_str, b) in zip(weights.items(), biases.items()):
        print(w_str, b_str)
        layer = tf.add(tf.matmul(layer, w), b)
    return layer

def max_gradient(gradients):
    assert len(gradients) >= 1
    # compute the resulting gradient, by choosing the (abs) max component-wise
    tgv = gradients[0]
    for gv in gradients[1:]:
        for (tg, tv), (g, v) in zip(tgv, gv):
            assert (tv == v).all()
            np.copyto(tg, g, where=abs(g) > abs(tg))
    return tgv

def main(sess):
    # tf Graph input/output
    X = tf.placeholder('float', [None, n_input])
    Y = tf.placeholder('float', [None, n_output])
    # Store layers weight & bias
    weights = {
        'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
        'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
        'out': tf.Variable(tf.random_normal([n_hidden_2, n_output]))
    }
    biases = {
        'b1': tf.Variable(tf.random_normal([n_hidden_1])),
        'b2': tf.Variable(tf.random_normal([n_hidden_2])),
        'out': tf.Variable(tf.random_normal([n_output]))
    }
    # Construct model
    model = build_network(X, weights, biases)
    # Loss
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model, labels=Y))
    # Optimizer
    opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    # Gradient
    gv_op = opt.compute_gradients(loss_op)
    # Trainable variables
    t_vars = tf.trainable_variables()
    # Initializing the variables
    sess.run(tf.global_variables_initializer())
    # Training cycle
    for i in range(n_iterations):
        # Get new training data
        X_train, Y_train = mnist.train.next_batch(batch_size)
        # Run the cost operation (to get the loss value)
        c = sess.run(loss_op, feed_dict={X: X_train, Y: Y_train})
        print(i, c)
        # get the gradients for each batch sample
        gradients = [sess.run(gv_op,
                              feed_dict={X: X_train[[j]], Y: Y_train[[j]]})
                     for j in range(batch_size)]
        # compute the resulting gradient
        tgv = max_gradient(gradients)
        # assert that all the variables match
        for (_, v1), v2 in zip(tgv, t_vars):
            assert (v1 == sess.run(v2)).all()
        # place the actual variables for the variable slots
        tgv = [(g, v) for (g, _), v in zip(tgv, t_vars)]
        # apply the transformation
        sess.run(opt.apply_gradients(tgv))

if __name__ == '__main__':
    with tf.Session() as sess:
        main(sess)
How can I compute per-sample gradients and reduce them with an arbitrary function, without constantly adding new operations to Tensorflow's graph?
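A hedged sketch of the usual fix (my own suggestion; no answer is posted here): the call opt.apply_gradients(tgv) inside the loop creates a fresh set of ops every iteration. Creating gradient placeholders and the apply op once, before the loop, keeps the graph fixed:

    # Build ONCE, before the training loop (opt and t_vars as defined above).
    grad_phs = [tf.placeholder(tf.float32, shape=v.get_shape()) for v in t_vars]
    apply_op = opt.apply_gradients(list(zip(grad_phs, t_vars)))
    sess.run(tf.global_variables_initializer())

    # Inside the loop: reduce the per-sample gradients in NumPy as before,
    # then feed the result into the fixed op instead of creating new ones.
    tgv = max_gradient(gradients)
    feed = {ph: g for ph, (g, _) in zip(grad_phs, tgv)}
    sess.run(apply_op, feed_dict=feed)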

How do I obtain predictions and probabilities from new data input to a CNN in Tensorflow

I'll preface this by saying this is my first posted question on SO. I've just recently started working with Tensorflow, and have been attempting to apply a convolutional-neural-network model approach for classification of .csv records in a file representing images from scans of microarray data. (FYI: microarrays are a grid of spotted DNA on a glass slide, representing specific DNA target sequences for determining the presence of those DNA targets in a sample. The individual pixels represent a fluorescence intensity value from 0-1.)

The file has ~200,000 records in total. Each record (image) has 10816 pixels that represent DNA sequences from known viruses, and one index label which identifies the virus species. The pixels create a pattern which is unique to each of the different viruses. There are 2165 different viruses in total represented within the 200,000 records. I have trained the network on images of labeled microarray datasets, but when I try to pass a new dataset through to classify it as one of the 2165 different viruses and determine the predicted values and probabilities, I don't seem to be having much luck. This is the code that I am currently using for this:
import tensorflow as tf
import numpy as np
import csv

def extract_data(filename):
    print("extracting data...")
    NUM_LABELS = 2165
    NUM_FEATURES = 10816
    labels = []
    fvecs = []
    rowCount = 0
    # iterate over the rows, split the label from the features
    # convert the labels to integers and features to floats
    for line in open(filename):
        rowCount = rowCount + 1
        row = line.split(',')
        labels.append(row[3])  # (int(row[7])) #<<<IT ALWAYS PREDICTS THIS VALUE!
        for x in row[4:10820]:
            fvecs.append(float(x))
    # convert the array of float arrays into a numpy float matrix
    fvecs_np = np.matrix(fvecs).astype(np.float32)
    # convert the array of int labels into a numpy array
    labels_np = np.array(labels).astype(dtype=np.uint8)
    # convert the int numpy array into a one-hot matrix
    labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
    print("arrays converted")
    return fvecs_np, labels_onehot

def TestModels():
    fvecs_np, labels_onehot = extract_data("MicroarrayTestData.csv")
    print('RESTORING NN MODEL')
    weights = {}
    biases = {}
    sess = tf.Session()
    init = tf.global_variables_initializer()
    # Load meta graph and restore weights
    ModelID = "MicroarrayCNN_Data-1000.meta"
    print("RESTORING:::", ModelID)
    saver = tf.train.import_meta_graph(ModelID)
    saver.restore(sess, tf.train.latest_checkpoint('./'))
    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("x:0")
    y = graph.get_tensor_by_name("y:0")
    keep_prob = tf.placeholder(tf.float32)
    y_ = tf.placeholder("float", shape=[None, 2165])
    wc1 = graph.get_tensor_by_name("wc1:0")
    wc2 = graph.get_tensor_by_name("wc2:0")
    wd1 = graph.get_tensor_by_name("wd1:0")
    Wout = graph.get_tensor_by_name("Wout:0")
    bc1 = graph.get_tensor_by_name("bc1:0")
    bc2 = graph.get_tensor_by_name("bc2:0")
    bd1 = graph.get_tensor_by_name("bd1:0")
    Bout = graph.get_tensor_by_name("Bout:0")
    weights = {wc1, wc2, wd1, Wout}
    biases = {bc1, bc2, bd1, Bout}
    print("NEXTArgmax")
    prediction = tf.argmax(y, 1)
    probabilities = y
    predY = prediction.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
    probY = probabilities.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
    accuracy = tf.reduce_mean(tf.cast(prediction, "float"))
    print(sess.run(accuracy, feed_dict={x: fvecs_np, y: labels_onehot}))
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
    print("Predicted::: ", predY, accuracy)
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
    feed_dictTEST = {y: labels_onehot}
    probabilities = probY
    print("probabilities", probabilities.eval(feed_dict={x: fvecs_np}, session=sess))

########## Run Analysis ###########
TestModels()
So, when I run this code I get the correct prediction for the test set, although I am not sure I believe it, because it appears that whatever value I append in line 14 (see below) is the output it predicts:
labels.append(row[3])#<<<IT ALWAYS PREDICTS THIS VALUE!
I don't understand this, and it makes me suspicious that I've set up the CNN incorrectly, as I would have expected it to ignore my input label and determine a best match from the trained network based on the trained patterns. The only thing I can figure is that when I pass the value through for the prediction, it is instead training the model on this data as well, and then predicting itself. Is this a correct assumption, or am I misinterpreting how Tensorflow works?
The other issue is that when I try to use code (based on other tutorials) which is supposed to output the probabilities of all of the 2165 possible outputs, I get the error:
InvalidArgumentError (see above for traceback): Shape [-1,2165] has negative dimensions
[[Node: y = Placeholder[dtype=DT_FLOAT, shape=[?,2165], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
To me, it looks like it is the correct layer based on the 2165 value in the Tensor shape, but I don't understand the -1 value. So, to wrap up the summary, my questions are:
Based on the fact that I get the value that I have in the label of the input data, is this the correct method to make a classification using this model?
Am I missing a layer, or have I configured the model incorrectly for extracting the probabilities of all the possible output classes, or am I using the wrong code to extract the information? I tried printing out the accuracy to see if that would work, but instead it outputs the description of a tensor, so clearly that is incorrect as well.
(ADDITIONAL INFORMATION)
As requested, I'm also including the original code that was used to train the model, which is now below. You can see I do a sort of piecemeal training on a limited number of related records at a time, grouped by their taxonomic relationships, as I iterate through the file. This is mostly because the Mac that I'm training on (Mac Pro w/ 64GB RAM) tends to give me the "Killed: 9" error due to overuse of resources if I don't do it this way. There may be a better way to do it, but this seems to work.
# Original Author: Aymeric Damien
# Project: https://github.com/aymericdamien/TensorFlow-Examples/
from __future__ import print_function
import tensorflow as tf
import numpy as np
import csv
import random

# Parameters
num_epochs = 2
train_size = 1609
learning_rate = 0.001  # (larger > speed, lower > accuracy)
training_iters = 5000  # How much do you want to train (more = better trained)
batch_size = 32        # How many samples to train on, size of the training batch
display_step = 10      # How often to display what is going on during training

# Network Parameters
n_input = 10816   # MNIST data input (img shape: 28*28)... in my case 104x104 = 10816 (rough array size)
n_classes = 2165  # 3280 #2307 #787 # Switched to 100 taxa/training set, dynamic was too wonky.
dropout = 0.75    # Dropout, probability to keep units. Developed by Geoffrey Hinton's group; prevents overfitting by forcing new paths, giving a more generalized model.
# Functions
def extract_data(filename):
print("extracting data...")
# arrays to hold the labels and feature vectors.
NUM_LABELS = 2165
NUM_FEATURES = 10826
taxCount = 0
taxCurrent = 0
labels = []
fvecs = []
rowCount = 0
#iterate over the rows, split the label from the features
#convert the labels to integers and features to floats
print("entering CNN loop")
for line in open(filename):
rowCount = rowCount + 1
row = line.split(',')
taxCurrent = row[3]
print("profile:", row[0:12])
labels.append(int(row[3]))
fvecs.append([float(x) for x in row [4:10820]])
#convert the array of float arrasy into a numpy float matrix
fvecs_np = np.matrix(fvecs).astype(np.float32)
#convert the array of int lables inta a numpy array
labels_np = np.array(labels).astype(dtype=np.uint8)
#convert the int numpy array into a one-hot matrix
labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
print("arrays converted")
return fvecs_np, labels_onehot
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):  # Layer 1: Convolutional layer
    # Conv2D wrapper, with bias and relu activation
    print("conv2d")
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')  # Strides are the tensors... list of integers. Tensors=data
    x = tf.nn.bias_add(x, b)  # bias is the tuning knob
    return tf.nn.relu(x)  # rectified linear unit (activation function)

def maxpool2d(x, k=2):  # Layer 2: Takes samples from the image. (This is a 4D tensor)
    print("maxpool2d")
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding='SAME')

# Create model
def conv_net(x, weights, biases, dropout):
    print("conv_net setup")
    # Reshape input picture
    x = tf.reshape(x, shape=[-1, 104, 104, 1])  # --> 52x52, --> 26x26x64
    # Convolution Layer
    conv1 = conv2d(x, weights['wc1'], biases['bc1'])  # defined above already
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)
    print(conv1.get_shape)
    # Convolution Layer
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])  # wc2 and bc2 are just placeholders... could actually skip this layer... maybe
    # Max Pooling (down-sampling)
    conv2 = maxpool2d(conv2, k=2)
    print(conv2.get_shape)
    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)  # activation function for the NN
    # Apply Dropout
    fc1 = tf.nn.dropout(fc1, dropout)
    # Output, class prediction
    out = tf.add(tf.matmul(fc1, weights['Wout']), biases['Bout'])
    return out
def Train_Network(Txid_IN, Sess_File_Name):
    import tensorflow as tf
    tf.reset_default_graph()
    x, y = 0, 0
    weights = {}
    biases = {}
    # tf Graph input
    print("setting placeholders")
    x = tf.placeholder(tf.float32, [None, n_input], name="x")    # Gateway for data (images)
    y = tf.placeholder(tf.float32, [None, n_classes], name="y")  # Gateway for data (labels)
    keep_prob = tf.placeholder(tf.float32)  # Gateway for dropout (keep probability)
    # Store layers weight & bias
    # CREATE weights
    weights = {
        # 5x5 conv, 1 input, 32 outputs
        'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32]), name="wc1"),
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64]), name="wc2"),
        # fully connected, 26*26*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([26*26*64, 1024]), name="wd1"),
        # 1024 inputs, n_classes outputs (class prediction)
        'Wout': tf.Variable(tf.random_normal([1024, n_classes]), name="Wout")
    }
    biases = {
        'bc1': tf.Variable(tf.random_normal([32]), name="bc1"),
        'bc2': tf.Variable(tf.random_normal([64]), name="bc2"),
        'bd1': tf.Variable(tf.random_normal([1024]), name="bd1"),
        'Bout': tf.Variable(tf.random_normal([n_classes]), name="Bout")
    }
    # Construct model
    print("constructing model")
    pred = conv_net(x, weights, biases, keep_prob)
    print(pred)
    # Define loss (cost) and optimizer
    #cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))  # Deprecated version of the statement
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))  # added reduce_mean 6/27
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    # Evaluate model
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    print("%%%%%%%%%%%%%%%%%%%%")
    print("%% ", correct_pred)
    print("%% ", accuracy)
    print("%%%%%%%%%%%%%%%%%%%%")
    # Initializing the variables
    #init = tf.initialize_all_variables()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    fvecs_np, labels_onehot = extract_data("MicroarrayDataOUT.csv")  # CHANGE TO PICORNAVIRUS!!!!!AHHHHHH!!!
    print("starting session")
    # Launch the graph
    FitStep = 0
    with tf.Session() as sess:  # graph is encapsulated by its session
        sess.run(init)
        step = 1
        # Keep training until max iterations reached (training_iters)
        while step * batch_size < training_iters:
            if FitStep >= 5:
                break
            else:
                # iterate and train
                print(step)
                print(fvecs_np, labels_onehot)
                for step in range(num_epochs * train_size // batch_size):
                    sess.run(optimizer, feed_dict={x: fvecs_np, y: labels_onehot, keep_prob: dropout})  # no dropout???... added keep_prob: dropout
                    if FitStep >= 5:
                        break
                    #else:
                    ###batch_x, batch_y = mnist.train.next_batch(batch_size)
                    # Run optimization op (backprop)
                    ###sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
                    ###                               keep_prob: dropout}) <<<<SOMETHING IS WRONG IN HERE?!!!
                    if step % display_step == 0:
                        # Calculate batch loss and accuracy
                        loss, acc = sess.run([cost, accuracy], feed_dict={x: fvecs_np,
                                                                          y: labels_onehot,
                                                                          keep_prob: 1.})
                        print("Iter " + str(step*batch_size) + ", Minibatch Loss= " +
                              "{:.6f}".format(np.mean(loss)) + ", Training Accuracy= " +
                              "{:.5f}".format(acc))
                        TrainAcc = float("{:.5f}".format(acc))
                        #print("******", TrainAcc)
                        if TrainAcc >= .99:  # Changed from .95 temporarily
                            print(FitStep)
                            FitStep = FitStep + 1
                            saver.save(sess, Sess_File_Name, global_step=1000)
                            print("Saved Session:", Sess_File_Name)
            step += 1
        print("Optimization Finished!")
        print("Testing Accuracy:",
              sess.run(accuracy, feed_dict={x: fvecs_np[:256],
                                            y: labels_onehot[:256],
                                            keep_prob: 1.}))
        #feed_dictTEST = {x: fvecs_np[50]}
        #prediction = tf.argmax(y, 1)
        #print(prediction)
        #best = sess.run([prediction], feed_dictTEST)
        #print(best)
        print("DONE")
    sess.close()
def Tax_Iterator(CSV_inFile, CSV_outFile):  # Deprecate
    # Need to copy *.csv file to MySQL for sorting
    resultFileINIT = open(CSV_outFile, 'w')
    resultFileINIT.close()
    TaxCount = 0
    TaxThreshold = 2165
    ThresholdStep = 2165
    PrevTax = 0
    linecounter = 0
    # Open all GenBank profile list
    for line in open(CSV_inFile):
        linecounter = linecounter + 1
        print(linecounter)
        resultFile = open(CSV_outFile, 'a')
        wr = csv.writer(resultFile, dialect='excel')
        # Check for new TXID
        row = line.split(',')
        print(row[7], "===", PrevTax)
        if row[7] != PrevTax:
            print("X1")
            TaxCount = TaxCount + 1
            PrevTax = row[7]
        # Check if current Tax count is < or > threshold
        # < threshold
        print(TaxCount, "=+=", TaxThreshold)
        if TaxCount <= 3300:
            print("X2")
            CurrentTax = row[7]
            CurrTxCount = CurrentTax
            print("TaxCount=", TaxCount)
            print("Add to CSV")
            print("row:", CurrentTax, "***", row[0:15])
            wr.writerow(row[0:-1])
        # is > threshold
        else:
            print("X3")
            # but same TXID....
            print(row[7], "=-=", CurrentTax)
            if row[7] == CurrentTax:
                print("X4")
                CurrentTax = row[7]
                print("TaxCount=", TaxCount)
                print("Add to CSV")
                print("row:", CurrentTax, "***", row[0:15])
                wr.writerow(row[0:-1])
            # but different TXID...
            else:
                print(row[7], "=*=", CurrentTax)
                if row[7] > CurrentTax:
                    print("X5")
                    TaxThreshold = TaxThreshold + ThresholdStep
                    resultFile.close()
                    Sess_File_Name = "CNN_VirusIDvSPECIES_XXALL" + str(TaxThreshold - ThresholdStep)
                    print("<<<< Start Training >>>>")
                    print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
                    Train_Network(CurrTxCount, Sess_File_Name)
                    print("Training complete")
                    resultFileINIT = open(CSV_outFile, 'w')
                    resultFileINIT.close()
                    CurrentTax = row[7]
                    # reset tax count
                    CurrTxCount = 0
                    TaxCount = 0
    resultFile.close()
    Sess_File_Name = "MicroarrayCNN_Data" + str(TaxThreshold + ThresholdStep)
    print("<<<< Start Training >>>>")
    print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
    Train_Network(CurrTxCount, Sess_File_Name)
    resultFileINIT = open(CSV_outFile, 'w')
    resultFileINIT.close()
    CurrentTax = row[7]

Tax_Iterator("MicroarrayInput.csv", "MicroarrayOutput.csv")
You defined prediction as prediction=tf.argmax(y,1), and in both feed_dicts you feed labels_onehot for y. Consequently, your "prediction" is always equal to the labels.
As you didn't post the code you used to train your network, I can't tell you what exactly you need to change.
Edit: I have issues understanding the underlying problem you're trying to solve - based on your code, you're trying to train a neural network with 2165 different classes using 1609 training examples. How is this even possible? Even if each example had a different class, there would still be some classes without any training example. Or does one image belong to many classes? From your statement at the beginning of your question, I had assumed you're trying to output a real-valued number between 0-1.
I'm actually surprised that the code worked at all, as it looks like you're adding only a single number to your labels list, but your model expects a list of length 2165 for each training example.
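For what it's worth, a hedged sketch of what the prediction ought to look like instead (the tensor name "pred:0" is a hypothetical; it depends on what the output layer was called when the graph was saved, and any saved dropout placeholder would likewise have to be fetched by its saved name and fed 1.0):

    # Argmax over the network's OUTPUT logits, not the label placeholder y.
    logits = graph.get_tensor_by_name("pred:0")  # hypothetical saved name
    prediction = tf.argmax(logits, 1)
    probabilities = tf.nn.softmax(logits)
    predY = sess.run(prediction, feed_dict={x: fvecs_np})
    probY = sess.run(probabilities, feed_dict={x: fvecs_np})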

Tensorflow for onehot classification, cost is always 0

This follows on from this post (not mine): TensorFlow for binary classification
I had a similar issue and converted my data to use one-hot encoding. However, I'm still getting a cost of 0. Interestingly, the accuracy is correct (90%) when I feed my training data back into it.
Code below:
# Set parameters
learning_rate = 0.02
training_iteration = 2
batch_size = int(np.size(y_vals)/300)
display_step = 1
numOfFeatures = 20  # 784 if MNIST
numOfClasses = 2    # 10 if MNIST dataset

# TF graph input
x = tf.placeholder("float", [None, numOfFeatures])
y = tf.placeholder("float", [None, numOfClasses])

# Create a model
# Set model weights to random numbers: https://www.tensorflow.org/api_docs/python/tf/random_normal
W = tf.Variable(tf.random_normal(shape=[numOfFeatures, 1]))  # Weight vector
b = tf.Variable(tf.random_normal(shape=[1, 1]))              # Constant

# Construct a linear model
model = tf.nn.softmax(tf.matmul(x, W) + b)  # Softmax

# Minimize error using cross entropy
cost_function = -tf.reduce_sum(y*tf.log(model))

# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for iteration in range(training_iteration):
        avg_cost = 0.
        total_batch = int(len(x_vals)/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs = x_vals[i*batch_size:(i*batch_size)+batch_size]
            batch_ys = y_vals_onehot[i*batch_size:(i*batch_size)+batch_size]
            # Fit training using batch data
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            # Compute average loss
            avg_cost += sess.run(cost_function, feed_dict={x: batch_xs, y: batch_ys})/total_batch
        # Display logs per iteration step
        if iteration % display_step == 0:
            print("Iteration:", '%04d' % (iteration + 1), "cost=", "{:.9f}".format(avg_cost))
    print("Tuning completed!")
    # Evaluation function
    correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
    #correct_prediction = tf.equal(model, y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    # Test the model
    print("Accuracy:", accuracy.eval({x: x_vals_test, y: y_vals_test_onehot}))
Your output for cost is using:
"{:.9f}".format(avg_cost)
Therefore, maybe you can replace the 9 with a bigger number.
OK, here is what I found in the end.
Replace:
b = tf.Variable(tf.random_normal(shape=[1,1]))
with:
b = tf.Variable(tf.zeros([1]))
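A side note of my own (not part of either posted answer): with W of shape [numOfFeatures, 1] the model produces a single output column, so tf.nn.softmax over that axis is always exactly 1.0 and -tf.reduce_sum(y*tf.log(model)) is identically 0, which would explain the zero cost. A sketch of shapes that make the one-hot cross-entropy meaningful:

    # One output column per class, plus the numerically stable built-in
    # cross-entropy instead of a hand-rolled -sum(y*log(softmax)).
    W = tf.Variable(tf.random_normal(shape=[numOfFeatures, numOfClasses]))
    b = tf.Variable(tf.zeros([numOfClasses]))
    logits = tf.matmul(x, W) + b
    cost_function = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))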