batching huge data in tensorflow - python-3.x

I am trying to perform binary classification using the code/tutorial from
https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.py
print("Loading data...")
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")
print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
maxlen=sentence_size,
padding='post',
value=0)
x_test = sequence.pad_sequences(x_test_variable,
maxlen=sentence_size,
padding='post',
value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
def train_input_fn():
dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
dataset = dataset.shuffle(buffer_size=len(x_train_variable))
dataset = dataset.batch(100)
dataset = dataset.map(parser)
dataset = dataset.repeat()
iterator = dataset.make_one_shot_iterator()
return iterator.get_next()
def eval_input_fn():
dataset = tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
dataset = dataset.batch(100)
dataset = dataset.map(parser)
iterator = dataset.make_one_shot_iterator()
return iterator.get_next()
def cnn_model_fn(features, labels, mode, params):
input_layer = tf.contrib.layers.embed_sequence(
features['x'], vocab_size, embedding_size,
initializer=params['embedding_initializer'])
training = mode == tf.estimator.ModeKeys.TRAIN
dropout_emb = tf.layers.dropout(inputs=input_layer,
rate=0.2,
training=training)
conv = tf.layers.conv1d(
inputs=dropout_emb,
filters=32,
kernel_size=3,
padding="same",
activation=tf.nn.relu)
# Global Max Pooling
pool = tf.reduce_max(input_tensor=conv, axis=1)
hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)
dropout_hidden = tf.layers.dropout(inputs=hidden,
rate=0.2,
training=training)
logits = tf.layers.dense(inputs=dropout_hidden, units=1)
# This will be None when predicting
if labels is not None:
labels = tf.reshape(labels, [-1, 1])
optimizer = tf.train.AdamOptimizer()
def _train_op_fn(loss):
return optimizer.minimize(
loss=loss,
global_step=tf.train.get_global_step())
return head.create_estimator_spec(
features=features,
labels=labels,
mode=mode,
logits=logits,
train_op_fn=_train_op_fn)
cnn_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
model_dir=os.path.join(model_dir, 'cnn'),
params=params)
train_and_evaluate(cnn_classifier)
The example here loads data from IMDB movie reviews. I have my own dataset in the form of text which is approx 2GB huge. Now in this example the line
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(num_words=vocab_size) tries to load whole dataset in memory. If I try to do the same I run out of memory. How can I restructure this logic to read data in batches from my disk?

You want to change the dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train)) line. There are lots of ways of creating a dataset - from_tensor_slices is the easiest, but won't work on its own if you can't load the entire dataset to memory.
The best way depends on how you have the data stored, or how you want to store it/manipulate it. The simplest in my opinion with very little down-side (unless running on multiple GPUs) is to have the original dataset just give indices to data, and write a normal numpy function for loading the ith example.
dataset = tf.data.Dataset.from_tensor_slices(tf.range(epoch_size))
def tf_map_fn(i):
def np_map_fn(i):
return load_ith_example(i)
inp1, inp2 = tf.py_func(np_map_fn, (i,), Tout=(tf.float32, tf.float32), stateful=False)
# other preprocessing/data augmentation goes here.
# unbatched sizes
inp1.set_shape(shape1)
inp2.set_shape(shape2)
return inp1, inp2
dataset = dataset.repeat().shuffle(epoch_size).map(tf_map_fn, 8)
dataset = dataset.batch(batch_size)
dataset = dataset.prefetch(1) # start loading data as GPU trains on previous batch
inp1, inp2 = dataset.make_one_shot_iterator().get_next()
Here I assume your outputs are float32 tensors (Tout=...). set_shape calls aren't strictly necessary, but if you know the shape it'll do better error checks.
So long as your preprocessing doesn't take longer than your network to run, this should run just as fast as any other method on a single GPU machine.
The other obvious way is to convert your data to tfrecords, but that'll take up more space on disk and is more of a pain to manage if you ask me.

Related

problem overfitting model VGG16 small dataset

Screenshot of the problem
i have two classes, each of them contains an equal number of pictures
Train
360 train pictures for classe one
360 train pictures for classe two
Test
90 test pictures classe one
90 test pictures classe two
my code
def load_split(basePath, csvPath):
data = []
labels = []
rows = open(csvPath).read().strip().split("\n")[1:]
random.shuffle(rows)
for (i, row) in enumerate(rows):
if i > 0:
print("[INFO] processed {} total images".format(i))
(label, imagePath) = row.strip().split(",")[-2:]
imagePath = os.path.sep.join([basePath, imagePath])
image = io.imread(imagePath)
image = transform.resize(image, (224, 224))
image = exposure.equalize_adapthist(image, clip_limit=0.1)
data.append(image)
labels.append(int(label))
data = np.array(data)
labels = np.array(labels)
return (data, labels)
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
help="path to input GTSRB")
ap.add_argument("-m", "--model", required=True,
help="path to output model")
ap.add_argument("-p", "--plot", type=str, default="plot.png",
help="path to training history plot")
args = vars(ap.parse_args())
NUM_EPOCHS = 30
INIT_LR = 1e-4
BS = 64
labelNames = open("signnames.csv").read().strip().split("\n")[1:]
labelNames = [l.split(",")[1] for l in labelNames]
trainPath = os.path.sep.join([args["dataset"], "Train.csv"])
testPath = os.path.sep.join([args["dataset"], "Test.csv"])
print("[INFO] loading training and testing data...")
(trainX, trainY) = load_split(args["dataset"], trainPath)
(testX, testY) = load_split(args["dataset"], testPath)
trainX = (trainX-np.mean(trainX))/np.std(trainX)
testX = (testX-np.mean(testX))/np.std(testX)
numLabels = len(np.unique(trainY))
trainY = to_categorical(trainY, numLabels)
testY = to_categorical(testY, numLabels)
classTotals = trainY.sum(axis=0)
classWeight = classTotals.max() / classTotals
aug = ImageDataGenerator(
rotation_range=10,
zoom_range=0.15,
width_shift_range=0.1,
height_shift_range=0.1,
shear_range=0.15,
horizontal_flip=False,
vertical_flip=False,
fill_mode="nearest")
print("[INFO] compiling model...")
opt = Adam(lr=INIT_LR)
model = TrafficSignNet.build(width=224, height=224, depth=3,
classes=numLabels)
model.compile(loss="categorical_crossentropy", optimizer=opt,
metrics=["accuracy"])
print("[INFO] training network...")
H = model.fit_generator(
aug.flow(trainX, trainY, batch_size=BS),
validation_data=(testX, testY),
steps_per_epoch=trainX.shape[0] // BS,
epochs=NUM_EPOCHS,
class_weight=classWeight,
verbose=1)
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=BS)
print(classification_report(testY.argmax(axis=1),
predictions.argmax(axis=1), target_names=labelNames))
print("[INFO] serializing network to '{}'...".format(args["model"]))
model_json = model.to_json()
with open("model.json", "w") as json_file:
json_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")
Enddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd
I could not get access to your training script for some reason - can check your screenshot only.
I think your problem is not overfitting - your model is actually not learning, since the loss is not reduced while the accuracy keeps at the same level. Please double check you network, learning rate (most important) and preprocessing. Also, you may consider loading the VGG16 pretrained weight or not to load if you have done.
Or just post your code, I can take a look when I get a chance.
Update:
Based on your codes, I found you did not make any changes internally on the VGG16 - which is easy to debug.
Check you training and test set and make sure the classes are evenly distributed
Print out the label (Y test and train), carefully check if they are correct.
Try to standardize the X train and test instead of dividing by 255. x=(x-mean)/std
Try use learning rate as 0.0001 (I found it's generally good for VGG16 based network)
Stay simple at the first time. Don't use the decayed optimization, just try the standard ADAM firstly
Bests,

NumPy Data w/ TensorFlow "Not on the same graph"

So i'm trying to build a model that takes in an image-like numpy file and builds a model based on the data. Each individual .npy file DOES fit in memory, however I need to assume that all loaded at the same time will not.
When I go to run it & do a test evaluation, it gives me this error:
ValueError: Tensor("IteratorV2:0", shape=(), dtype=resource) must be from the same graph as Tensor("MapDataset:0", shape=(), dtype=variant)
I'm not super experienced with TensorFlow and I'm trying to make everything as "best practice" as possible.
Here's my code:
train_fnames, train_labels, test_fnames, test_labels =\
spec_to_paths_and_labels('count_data/spec.csv')
train_fnames = 'count_data/' + train_fnames
test_fnames = 'count_data/' + test_fnames
def read_npy_file(item):
data = np.load(item.decode())
return data.astype(np.int32)
# gdsii_placeholder = tf.placeholder(tf.float32, shape=(None, 224, 224, 1))
# label_placeholder = tf.placeholder(tf.int32, shape=[1])
def cnn_model(features, labels, mode):
conv1 = tf.layers.conv2d(features, filters=32, kernel_size=(5, 5))
pool1 = tf.layers.max_pooling2d(conv1, pool_size=(2, 2), strides=2)
conv2 = tf.layers.conv2d(pool1, filters=64, kernel_size=(5, 5))
pool2 = tf.layers.max_pooling2d(conv2, pool_size=(2, 2), strides=2)
flat = tf.layers.flatten(pool2)
dense = tf.layers.dense(flat, units=1024, activation='relu')
dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=True)
logits = tf.layers.dense(inputs=dropout, units=10)
predictions = {
"classes": tf.argmax(input=logits, axis=1),
"probabilities": tf.nn.softmax(logits, name="softmax_tensor")
}
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(
loss=loss,
global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
def main(_):
# load the dataset
data = tf.data.Dataset.from_tensor_slices(train_fnames)
data = data.map(lambda item: tuple(tf.py_func(
read_npy_file, [item], [tf.int32, ])))
gdsii_classifier = tf.estimator.Estimator(
model_fn=cnn_model, model_dir="/tmp/gdsii_classifier")
res = gdsii_classifier.evaluate(lambda: data)
print(res)
if __name__ == "__main__":
tf.app.run()
There are several little mistakes in your code:
First, you tried to use Estimator without wrapping your tf.data operations inside an input function. This causes an issue because tf.data operations create their own graph for data loading while Estimator expects to have all the control on the whole graph. When you then call estimator.evaluate(), it fails because the tensor is inside your previously built graph.
Secondly, you tried to call estimator.evaluate() without a label.
Then, after you use a tf.py_func in tf.data, you need to specify the shape of the different tensors as they can't be inferred by Tensorflow.
And finally, you need to batch your data as tf.layers expect ndims=4, and you need to pass the features as either float32 or float16.
You can easily patch these issues by doing something like this:
def input_fn(x, y):
def set_shape(tensor, shape):
tensor.set_shape(shape)
return tensor
data = tf.data.Dataset.from_tensor_slices((x, y))
data = data.map(lambda item, label: (tf.py_func(read_npy_file, [item], [tf.float32]), label))
data = data.map(lambda item, label: (set_shape(item, YOUR_SHAPE), label))
data = data.batch(1)
return data
...
estimator.evaluate(lambda: input_fn(x=test_fnames, y=test_labels))

Tensorflow -- Iterating over training and validation sequencially

I have been going throught the Dataset API of tensorflow to feed different dataset with ease to an RNN model.
I got everything working following the not so many blogs together with the docs in the tensorflow website. My working example did the following:
--- Train on X epochs in a training dataset -> validate after all the training has concluded in a validation dataset.
However, I'm unable to develop the following example:
--- Train on X epochs in a training dataset -> validate in each epoch the training model with a validation dataset (a bit like what Keras does)
The problematic issue comes because of the following piece of code:
train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE, drop_remainder=True).repeat()
val_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE_VAL, drop_remainder=True).repeat()
itr = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = itr.make_initializer(train_dataset)
validation_init_op = itr.make_initializer(val_dataset)
When I create the iterator from_structure, I need to specify an output_shape. Obviously, the output shape of the train dataset and the validation dataset is not the same as they have a different batch_size. However, the validation_init_op is throwing the following error, which it seems counterintuitive because validation sets have always different batch_size:
TypeError: Expected output shapes compatible with (TensorShape([Dimension(256), Dimension(11), Dimension(74)]), TensorShape([Dimension(256), Dimension(3)])) but got dataset with output shapes (TensorShape([Dimension(28), Dimension(11), Dimension(74)]), TensorShape([Dimension(28), Dimension(3)])).
I want to do this second approach to evaluate my model and see the common train and validation plots developed at the same time, to see how can I improve it (stopping the learning early and etc). However, with the first simple approach I don't get all this.
So, the question is: ¿Am I doing something wrong? ¿Does my second approach has to be tackled differently? I can think of creating two iterators, but I don't know if that is the right approach. Also, this answer by #MatthewScarpino points out to a feedable iterator because switching between reinitializable ones makes them to start all over again; however, the above error is not related with that part of the code -- ¿Maybe the reinitializable iterator is not intended to set a different batch size for the validation set and to only iterate it once after training whatever the size it is and without setting it in the .batch() method?
Any help is very much appreciated.
Full code for reference:
N_TIMESTEPS_X = xt.shape[0] ## The stack number
BATCH_SIZE = 256
#N_OBSERVATIONS = xt.shape[1]
N_FEATURES = xt.shape[2]
N_OUTPUTS = yt.shape[1]
N_NEURONS_LSTM = 128 ## Number of units in the LSTMCell
N_EPOCHS = 350
LEARNING_RATE = 0.001
### Define the placeholders anda gather the data.
xt = xt.transpose([1,0,2])
xval = xval.transpose([1,0,2])
train_data = (xt, yt)
validation_data = (xval, yval)
N_BATCHES = train_data[0].shape[0] // BATCH_SIZE
print('The number of batches is: {}'.format(N_BATCHES))
BATCH_SIZE_VAL = validation_data[0].shape[0] // N_BATCHES
print('The validation batch size is: {}'.format(BATCH_SIZE_VAL))
## We define the placeholders as a trick so that we do not break into memory problems, associated with feeding the data directly.
'''As an alternative, you can define the Dataset in terms of tf.placeholder() tensors, and feed the NumPy arrays when you initialize an Iterator over the dataset.'''
batch_size = tf.placeholder(tf.int64)
x = tf.placeholder(tf.float32, shape=[None, N_TIMESTEPS_X, N_FEATURES], name='XPlaceholder')
y = tf.placeholder(tf.float32, shape=[None, N_OUTPUTS], name='YPlaceholder')
# Creating the two different dataset objects.
train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE, drop_remainder=True).repeat()
val_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE_VAL, drop_remainder=True).repeat()
# Creating the Iterator type that permits to switch between datasets.
itr = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = itr.make_initializer(train_dataset)
validation_init_op = itr.make_initializer(val_dataset)
next_features, next_labels = itr.get_next()
After investigating the best way to do this, I came across with this final implementation that works well on my end. Surely not be the best. So as to maintain the state, I used a feedable iterator.
AIM: This code is intented to be used when you want to train and validate at the same time, preserving the state of each iterator (i.e. validate with the newest model parameters). Together with that, the code saves the model and other stuff, like some information about the hyperparameters and summaries to visualize the training and validation in Tensorboard.
Also, don't get confused: you don't need to have a different batch size for the training set and for the validation set. This is a misconception that I have. The batch sizes must be the same AND you have to deal with the different number of batches, just passing when no more batches are left. This is a requirement so that you can create the iterator, regarding having both datasets the same data type and shape.
Hope it helps others. Just ignore the code that does not relate to your objectives. Many thanks for #kvish for all the help and time.
Code:
def RNNmodelTF(xt, yt, xval, yval, xtest, ytest):
N_TIMESTEPS_X = xt.shape[0] ## The stack number
BATCH_SIZE = 256
#N_OBSERVATIONS = xt.shape[1]
N_FEATURES = xt.shape[2]
N_OUTPUTS = yt.shape[1]
N_NEURONS_LSTM = 128 ## Number of units in the LSTMCell
N_EPOCHS = 350
LEARNING_RATE = 0.001
### Define the placeholders anda gather the data.
xt = xt.transpose([1,0,2])
xval = xval.transpose([1,0,2])
train_data = (xt, yt)
validation_data = (xval, yval)
N_BATCHES = train_data[0].shape[0] // BATCH_SIZE
## We define the placeholders as a trick so that we do not break into memory problems, associated with feeding the data directly.
'''As an alternative, you can define the Dataset in terms of tf.placeholder() tensors, and feed the NumPy arrays when you initialize an Iterator over the dataset.'''
batch_size = tf.placeholder(tf.int64)
x = tf.placeholder(tf.float32, shape=[None, N_TIMESTEPS_X, N_FEATURES], name='XPlaceholder')
y = tf.placeholder(tf.float32, shape=[None, N_OUTPUTS], name='YPlaceholder')
# Creating the two different dataset objects.
train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE, drop_remainder=True).repeat()
val_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE, drop_remainder=True).repeat()
#################### Creating the Iterator type that permits to switch between datasets.
handle = tf.placeholder(tf.string, shape = [])
iterator = tf.data.Iterator.from_string_handle(handle, train_dataset.output_types, train_dataset.output_shapes)
next_features, next_labels = iterator.get_next()
train_val_iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_iterator = train_val_iterator.make_initializer(train_dataset)
val_iterator = train_val_iterator.make_initializer(val_dataset)
###########################
### Create the graph
cellType = tf.nn.rnn_cell.LSTMCell(num_units=N_NEURONS_LSTM, name='LSTMCell')
inputs = tf.unstack(next_features, axis=1)
'''inputs: A length T list of inputs, each a Tensor of shape [batch_size, input_size]'''
RNNOutputs, _ = tf.nn.static_rnn(cell=cellType, inputs=inputs, dtype=tf.float32)
out_weights = tf.get_variable("out_weights", shape=[N_NEURONS_LSTM, N_OUTPUTS], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
out_bias = tf.get_variable("out_bias", shape=[N_OUTPUTS], dtype=tf.float32, initializer=tf.zeros_initializer())
predictionsLayer = tf.matmul(RNNOutputs[-1], out_weights) + out_bias
### Define the cost function, that will be optimized by the optimizer.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=predictionsLayer, labels=next_labels, name='Softmax_plus_Cross_Entropy'))
optimizer_type = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE, name='AdamOptimizer')
optimizer = optimizer_type.minimize(cost)
### Model evaluation
correctPrediction = tf.equal(tf.argmax(predictionsLayer,1), tf.argmax(next_labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPrediction,tf.float32))
confusionMatrix1 = tf.confusion_matrix(tf.argmax(next_labels,1), tf.argmax(predictionsLayer,1), num_classes=3, name='ConfMatrix')
## Saving variables so that we can restore them afterwards.
saver = tf.train.Saver()
save_dir = '/media/SecondDiskHDD/8classModels/DLmodels/tfModels/{}_{}'.format(cellType.__class__.__name__, datetime.now().strftime("%Y%m%d%H%M%S"))
#save_dir = '/home/Desktop/tfModels/{}_{}'.format(cellType.__class__.__name__, datetime.now().strftime("%Y%m%d%H%M%S"))
os.mkdir(save_dir)
varDict = {'nTimeSteps': N_TIMESTEPS_X, 'BatchSize': BATCH_SIZE, 'nFeatures': N_FEATURES,
'nNeuronsLSTM': N_NEURONS_LSTM, 'nEpochs': N_EPOCHS,
'learningRate': LEARNING_RATE, 'optimizerType': optimizer_type.__class__.__name__}
varDicSavingTxt = save_dir + '/varDict.txt'
modelFilesDir = save_dir + '/modelFiles'
os.mkdir(modelFilesDir)
logDir = save_dir + '/TBoardLogs'
os.mkdir(logDir)
acc_summary = tf.summary.scalar('Accuracy', accuracy)
loss_summary = tf.summary.scalar('Cost_CrossEntropy', cost)
summary_merged = tf.summary.merge_all()
with open(varDicSavingTxt, 'w') as outfile:
outfile.write(repr(varDict))
with tf.Session() as sess:
tf.set_random_seed(2)
sess.run(tf.global_variables_initializer())
train_writer = tf.summary.FileWriter(logDir + '/train', sess.graph)
validation_writer = tf.summary.FileWriter(logDir + '/validation')
# initialise iterator with data
train_val_string = sess.run(train_val_iterator.string_handle())
cm1Total = None
cm2Total = None
print('¡Training starts!')
for epoch in range(N_EPOCHS):
batchAccList = []
batchAccListVal = []
tot_loss_train = 0
tot_loss_validation = 0
for batch in range(N_BATCHES):
sess.run(train_iterator, feed_dict = {x : train_data[0], y: train_data[1], batch_size: BATCH_SIZE})
optimizer_output, loss_value, summary, accBatch, cm1 = sess.run([optimizer, cost, summary_merged, accuracy, confusionMatrix1], feed_dict = {handle: train_val_string})
npArrayPred = predictionsLayer.eval(feed_dict= {handle: train_val_string})
predLabEnc = np.apply_along_axis(thresholdSet, 1, npArrayPred, value=0.5)
npArrayLab = next_labels.eval(feed_dict= {handle: train_val_string})
labLabEnc = np.argmax(npArrayLab, 1)
cm2 = confusion_matrix(labLabEnc, predLabEnc)
tot_loss_train += loss_value
batchAccList.append(accBatch)
try:
sess.run(val_iterator, feed_dict = {x: validation_data[0], y: validation_data[1], batch_size: BATCH_SIZE})
valLoss, valAcc, summary_val = sess.run([cost, accuracy, summary_merged], feed_dict = {handle: train_val_string})
tot_loss_validation += valLoss
batchAccListVal.append(valAcc)
except tf.errors.OutOfRangeError:
pass
if cm1Total is None and cm2Total is None:
cm1Total = cm1
cm2Total = cm2
else:
cm1Total += cm1
cm2Total += cm2
if batch % 10 == 0:
train_writer.add_summary(summary, batch)
validation_writer.add_summary(summary_val, batch)
epochAcc = tf.reduce_mean(batchAccList)
sess.run(train_iterator, feed_dict = {x : train_data[0], y: train_data[1], batch_size: BATCH_SIZE})
epochAcc_num = sess.run(epochAcc, feed_dict = {handle: train_val_string})
epochAccVal = tf.reduce_mean(batchAccListVal)
sess.run(val_iterator, feed_dict = {x: validation_data[0], y: validation_data[1], batch_size: BATCH_SIZE})
epochAcc_num_Val = sess.run(epochAccVal, feed_dict = {handle: train_val_string})
if epoch%10 == 0:
print("Epoch: {}, Loss: {:.4f}, Accuracy: {:.3f}".format(epoch, tot_loss_train / N_BATCHES, epochAcc_num))
print('Validation Loss: {:.4f}, Validation Accuracy: {:.3f}'.format(tot_loss_validation / N_BATCHES, epochAcc_num_Val))
cmLogFile1 = save_dir + '/cm1File.txt'
with open(cmLogFile1, 'w') as outfile:
outfile.write(repr(cm1Total))
cmLogFile2 = save_dir + '/cm2File.txt'
with open(cmLogFile2, 'w') as outfile:
outfile.write(repr(cm2Total))
saver.save(sess, modelFilesDir + '/model.ckpt')

Avoid feed_dict mechanism in static graph in tensorflow

I am trying to implement a model for generating/reconstructing samples (Variational autoencoder). During test time, I would like to be able to make the model generate new samples by feeding it a latent variable, but that requires changing the inputs to a part of the computational graph.
I could use a feed_dict to "dynamically" do that, since I cannot directly change a static graph, but I want to avoid the overhead of exchanging data between the GPU and the system RAM.
As it stands I feed the data using Iterators.
def make_mnist_dataset(batch_size, shuffle=True, include_labels=True):
"""Loads the MNIST data set and returns the relevant
iterator along with its initialization operations.
"""
# load the data
train, test = tf.keras.datasets.mnist.load_data()
# binarize and reshape the data sets
temp_train = train[0]
temp_train = (temp_train > 0.5).astype(np.float32).reshape(temp_train.shape[0], 784)
train = (temp_train, train[1])
temp_test = test[0]
temp_test = (temp_test > 0.5).astype(np.float32).reshape(temp_test.shape[0], 784)
test = (temp_test, test[1])
# prepare Dataset objects
if include_labels:
train_set = tf.data.Dataset.from_tensor_slices(train).repeat().batch(batch_size)
test_set = tf.data.Dataset.from_tensor_slices(test).repeat(1).batch(batch_size)
else:
train_set = tf.data.Dataset.from_tensor_slices(train[0]).repeat().batch(batch_size)
test_set = tf.data.Dataset.from_tensor_slices(test[0]).repeat(1).batch(batch_size)
if shuffle:
train_set = train_set.shuffle(buffer_size=int(0.5*train[0].shape[0]),
seed=123)
# make the iterator
iter = tf.data.Iterator.from_structure(train_set.output_types,
train_set.output_shapes)
data = iter.get_next()
# create initialization ops
train_init = iter.make_initializer(train_set)
test_init = iter.make_initializer(test_set)
return train_init, test_init, data
And here's the code snippet where the data being iterated over is being fed to the graph:
train_init, test_init, next_batch = make_mnist_dataset(batch_size, include_labels=True)
ops = build_graph(next_batch[0], next_batch[1], learning_rate, is_training,
latent_dim, tau, batch_size, inf_layers, gen_layers)
Is there any way to "switch" from an Iterator object to a different input source during test time, without resorting to feed_dict?

How do I obtain predictions and probabilities from new data input to a CNN in Tensorflow

I'll preface this by saying this is my first posted question on SO. I've just recently started working with Tensorflow, and have been attempting to apply a convolutional-neural network model approach for classification of .csv records in a file representing images from scans of microarray data. (FYI: Microarrays are a grid of spotted DNA on a glass slide, representing specific DNA target sequences for determining the presence of those DNA targets in a sample. The individual pixels represent fluorescence intensity value from 0-1). The file has ~200,000 records in total. Each record (image) has 10816 pixels that represent DNA sequences from known viruses, and one index label which identifies the virus species. The pixels create a pattern which is unique to each of the different viruses. There are 2165 different viruses in total represented within the 200,000 records. I have trained the network on images of labeled microarray datasets, but when I try to pass a new dataset through to classify it/them as one of the 2165 different viruses and determine predicted values and probabilities, I don't seem to be having much luck. This is the code that I am currently using for this:
import tensorflow as tf
import numpy as np
import csv
def extract_data(filename):
print("extracting data...")
NUM_LABELS = 2165
NUM_FEATURES = 10816
labels = []
fvecs = []
rowCount = 0
#iterate over the rows, split the label from the features
#convert the labels to integers and features to floats
for line in open(filename):
rowCount = rowCount + 1
row = line.split(',')
labels.append(row[3])#(int(row[7])) #<<<IT ALWAYS PREDICTS THIS VALUE!
for x in row [4:10820]:
fvecs.append(float(x))
#convert the array of float arrasy into a numpy float matrix
fvecs_np = np.matrix(fvecs).astype(np.float32)
#convert the array of int lables inta a numpy array
labels_np = np.array(labels).astype(dtype=np.uint8)
#convert the int numpy array into a one-hot matrix
labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
print("arrays converted")
return fvecs_np, labels_onehot
def TestModels():
fvecs_np, labels_onehot = extract_data("MicroarrayTestData.csv")
print('RESTORING NN MODEL')
weights = {}
biases = {}
sess=tf.Session()
init = tf.global_variables_initializer()
#Load meta graph and restore weights
ModelID = "MicroarrayCNN_Data-1000.meta"
print("RESTORING:::", ModelID)
saver = tf.train.import_meta_graph(ModelID)
saver.restore(sess,tf.train.latest_checkpoint('./'))
graph = tf.get_default_graph()
x = graph.get_tensor_by_name("x:0")
y = graph.get_tensor_by_name("y:0")
keep_prob = tf.placeholder(tf.float32)
y_ = tf.placeholder("float", shape=[None, 2165])
wc1 = graph.get_tensor_by_name("wc1:0")
wc2 = graph.get_tensor_by_name("wc2:0")
wd1 = graph.get_tensor_by_name("wd1:0")
Wout = graph.get_tensor_by_name("Wout:0")
bc1 = graph.get_tensor_by_name("bc1:0")
bc2 = graph.get_tensor_by_name("bc2:0")
bd1 = graph.get_tensor_by_name("bd1:0")
Bout = graph.get_tensor_by_name("Bout:0")
weights = {wc1, wc2, wd1, Wout}
biases = {bc1, bc2, bd1, Bout}
print("NEXTArgmax")
prediction=tf.argmax(y,1)
probabilities = y
predY = prediction.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
probY = probabilities.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
accuracy = tf.reduce_mean(tf.cast(prediction, "float"))
print(sess.run(accuracy, feed_dict={x: fvecs_np, y: labels_onehot}))
print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
print("Predicted::: ", predY, accuracy)
print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
feed_dictTEST = {y: labels_onehot}
probabilities=probY
print("probabilities", probabilities.eval(feed_dict={x: fvecs_np}, session=sess))
########## Run Analysis ###########
TestModels()
So, when I run this code I get the correct prediction for the test set, although I am not sure I believe it, because it appears that whatever value I append in line 14 (see below) is the output it predicts:
labels.append(row[3])#<<<IT ALWAYS PREDICTS THIS VALUE!
I don't understand this, and it makes me suspicious that I've set up the CNN incorrectly, as I would have expected it to ignore my input label and determine a bast match from the trained network based on the trained patterns. The only thing I can figure is that when I pass the value through for the prediction; it is instead training the model on this data as well, and then predicting itself. Is this a correct assumption, or am I misinterpreting how Tensorflow works?
The other issue is that when I try to use code that (based on other tutorials) which is supposed to output the probabilities of all of the 2165 possible outputs, I get the error:
InvalidArgumentError (see above for traceback): Shape [-1,2165] has negative dimensions
[[Node: y = Placeholder[dtype=DT_FLOAT, shape=[?,2165], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
To me, it looks like it is the correct layer based on the 2165 value in the Tensor shape, but I don't understand the -1 value. So, to wrap up the summary, my questions are:
Based on the fact that I get the value that I have in the label of the input data, is this the correct method to make a classification using this model?
Am I missing a layer or have I configured the model incorrectly in order to extract the probabilities of all of the possible output classes, or am I using the wrong code to extract the information? I try to print out the accuracy to see if that would work, but instead it outputs the description of a tensor, so clearly that is incorrect as well.
(ADDITIONAL INFORMATION)
As requested, I'm also including the original code that was used to train the model, which is now below. You can see I do sort of a piece meal training of a limited number of related records at a time by their taxonomic relationships as I iterate through the file. This is mostly because the Mac that I'm training on (Mac Pro w/ 64GB ram) tends to give me the "Killed -9" error due to overuse of resources if I don't do it this way. There may be a better way to do it, but this seems to work.
Original Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
from __future__ import print_function
import tensorflow as tf
import numpy as np
import csv
import random
# Parameters
num_epochs = 2
train_size = 1609
learning_rate = 0.001 #(larger >speed, lower >accuracy)
training_iters = 5000 # How much do you want to train (more = better trained)
batch_size = 32 #How many samples to train on, size of the training batch
display_step = 10 # How often to diplay what is going on during training
# Network Parameters
n_input = 10816 # MNIST data input (img shape: 28*28)...in my case 104x104 = 10816(rough array size)
n_classes = 2165 #3280 #2307 #787# Switched to 100 taxa/training set, dynamic was too wonky.
dropout = 0.75 # Dropout, probability to keep units. Jeffery Hinton's group developed it, that prevents overfitting to find new paths. More generalized model.
# Functions
def extract_data(filename):
print("extracting data...")
# arrays to hold the labels and feature vectors.
NUM_LABELS = 2165
NUM_FEATURES = 10826
taxCount = 0
taxCurrent = 0
labels = []
fvecs = []
rowCount = 0
#iterate over the rows, split the label from the features
#convert the labels to integers and features to floats
print("entering CNN loop")
for line in open(filename):
rowCount = rowCount + 1
row = line.split(',')
taxCurrent = row[3]
print("profile:", row[0:12])
labels.append(int(row[3]))
fvecs.append([float(x) for x in row [4:10820]])
#convert the array of float arrasy into a numpy float matrix
fvecs_np = np.matrix(fvecs).astype(np.float32)
#convert the array of int lables inta a numpy array
labels_np = np.array(labels).astype(dtype=np.uint8)
#convert the int numpy array into a one-hot matrix
labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
print("arrays converted")
return fvecs_np, labels_onehot
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1): #Layer 1 : Convolutional layer
# Conv2D wrapper, with bias and relu activation
print("conv2d")
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME') # Strides are the tensors...list of integers. Tensors=data
x = tf.nn.bias_add(x, b) #bias is the tuning knob
return tf.nn.relu(x) #rectified linear unit (activation function)
def maxpool2d(x, k=2): #Layer 2 : Takes samples from the image. (This is a 4D tensor)
print("maxpool2d")
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
padding='SAME')
# Create model
def conv_net(x, weights, biases, dropout):
print("conv_net setup")
# Reshape input picture
x = tf.reshape(x, shape=[-1, 104, 104, 1]) #-->52x52 , -->26x26x64
# Convolution Layer
conv1 = conv2d(x, weights['wc1'], biases['bc1']) #defined above already
# Max Pooling (down-sampling)
conv1 = maxpool2d(conv1, k=2)
print(conv1.get_shape)
# Convolution Layer
conv2 = conv2d(conv1, weights['wc2'], biases['bc2']) #wc2 and bc2 are just placeholders...could actually skip this layer...maybe
# Max Pooling (down-sampling)
conv2 = maxpool2d(conv2, k=2)
print(conv2.get_shape)
# Fully connected layer
# Reshape conv2 output to fit fully connected layer input
fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
fc1 = tf.nn.relu(fc1) #activation function for the NN
# Apply Dropout
fc1 = tf.nn.dropout(fc1, dropout)
# Output, class prediction
out = tf.add(tf.matmul(fc1, weights['Wout']), biases['Bout'])
return out
def Train_Network(Txid_IN, Sess_File_Name):
import tensorflow as tf
tf.reset_default_graph()
x,y = 0,0
weights = {}
biases = {}
# tf Graph input
print("setting placeholders")
x = tf.placeholder(tf.float32, [None, n_input], name="x") #Gateway for data (images)
y = tf.placeholder(tf.float32, [None, n_classes], name="y") # Gateway for data (labels)
keep_prob = tf.placeholder(tf.float32) #dropout # Gateway for dropout(keep probability)
# Store layers weight & bias
#CREATE weights
weights = {
# 5x5 conv, 1 input, 32 outputs
'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32]),name="wc1"), #
# 5x5 conv, 32 inputs, 64 outputs
'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64]),name="wc2"),
# fully connected, 7*7*64 inputs, 1024 outputs
'wd1': tf.Variable(tf.random_normal([26*26*64, 1024]),name="wd1"),
# 1024 inputs, 10 outputs (class prediction)
'Wout': tf.Variable(tf.random_normal([1024, n_classes]),name="Wout")
}
biases = {
'bc1': tf.Variable(tf.random_normal([32]), name="bc1"),
'bc2': tf.Variable(tf.random_normal([64]), name="bc2"),
'bd1': tf.Variable(tf.random_normal([1024]), name="bd1"),
'Bout': tf.Variable(tf.random_normal([n_classes]), name="Bout")
}
# Construct model
print("constructing model")
pred = conv_net(x, weights, biases, keep_prob)
print(pred)
# Define loss(cost) and optimizer
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) Deprecated version of the statement
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels=y)) #added reduce_mean 6/27
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
print("%%%%%%%%%%%%%%%%%%%%")
print ("%% ", correct_pred)
print ("%% ", accuracy)
print("%%%%%%%%%%%%%%%%%%%%")
# Initializing the variables
#init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
saver = tf.train.Saver()
fvecs_np, labels_onehot = extract_data("MicroarrayDataOUT.csv") #CHAGE TO PICORNAVIRUS!!!!!AHHHHHH!!!
print("starting session")
# Launch the graph
FitStep = 0
with tf.Session() as sess: #graph is encapsulated by its session
sess.run(init)
step = 1
# Keep training until reach max iterations (training_iters)
while step * batch_size < training_iters:
if FitStep >= 5:
break
else:
#iterate and train
print(step)
print(fvecs_np, labels_onehot)
for step in range(num_epochs * train_size // batch_size):
sess.run(optimizer, feed_dict={x: fvecs_np, y: labels_onehot, keep_prob:dropout}) #no dropout???...added Keep_prob:dropout
if FitStep >= 5:
break
#else:
###batch_x, batch_y = mnist.train.next_batch(batch_size)
# Run optimization op (backprop)
###sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
### keep_prob: dropout}) <<<<SOMETHING IS WRONG IN HERE?!!!
if step % display_step == 0:
# Calculate batch loss and accuracy
loss, acc = sess.run([cost, accuracy], feed_dict={x: fvecs_np,
y: labels_onehot,
keep_prob: 1.})
print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(np.mean(loss)) + ", Training Accuracy= " + \
"{:.5f}".format(acc))
TrainAcc = float("{:.5f}".format(acc))
#print("******", TrainAcc)
if TrainAcc >= .99: #Changed from .95 temporarily
print(FitStep)
FitStep = FitStep+1
saver.save(sess, Sess_File_Name, global_step=1000) #
print("Saved Session:", Sess_File_Name)
step += 1
print("Optimization Finished!")
print("Testing Accuracy:", \
sess.run(accuracy, feed_dict={x: fvecs_np[:256],
y: labels_onehot[:256],
keep_prob: 1.}))
#feed_dictTEST = {x: fvecs_np[50]}
#prediction=tf.argmax(y,1)
#print(prediction)
#best = sess.run([prediction],feed_dictTEST)
#print(best)
print("DONE")
sess.close()
def Tax_Iterator(CSV_inFile, CSV_outFile): #Deprecate
#Need to copy *.csv file to MySQL for sorting
resultFileINIT = open(CSV_outFile,'w')
resultFileINIT.close()
TaxCount = 0
TaxThreshold = 2165
ThresholdStep = 2165
PrevTax = 0
linecounter = 0
#Open all GenBank profile list
for line in open(CSV_inFile):
linecounter = linecounter+1
print(linecounter)
resultFile = open(CSV_outFile,'a')
wr = csv.writer(resultFile, dialect='excel')
# Check for new TXID
row = line.split(',')
print(row[7], "===", PrevTax)
if row[7] != PrevTax:
print("X1")
TaxCount = TaxCount+1
PrevTax = row[7]
#Check it current Tax count is < or > threshold
# < threshold
print(TaxCount,"=+=", TaxThreshold)
if TaxCount<=3300:
print("X2")
CurrentTax= row[7]
CurrTxCount = CurrentTax
print("TaxCount=", TaxCount)
print( "Add to CSV")
print("row:", CurrentTax, "***", row[0:15])
wr.writerow(row[0:-1])
# is > threshold
else:
print("X3")
# but same TXID....
print(row[7], "=-=", CurrentTax)
if row[7]==CurrentTax:
print("X4")
CurrentTax= row[7]
print("TaxCount=", TaxCount)
print( "Add to CSV")
print("row:", CurrentTax, "***", row[0:15])
wr.writerow(row[0:-1])
# but different TXID...
else:
print(row[7], "=*=", CurrentTax)
if row[7]>CurrentTax:
print("X5")
TaxThreshold=TaxThreshold+ThresholdStep
resultFile.close()
Sess_File_Name = "CNN_VirusIDvSPECIES_XXALL"+ str(TaxThreshold-ThresholdStep)
print("<<<< Start Training >>>>"
print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
Train_Network(CurrTxCount, Sess_File_Name)
print("Training complete")
resultFileINIT = open(CSV_outFile,'w')
resultFileINIT.close()
CurrentTax= row[7]
#reset tax count
CurrTxCount = 0
TaxCount = 0
resultFile.close()
Sess_File_Name = "MicroarrayCNN_Data"+ str(TaxThreshold+ThresholdStep)
print("<<<< Start Training >>>>")
print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
Train_Network(CurrTxCount, Sess_File_Name)
resultFileINIT = open(CSV_outFile,'w')
resultFileINIT.close()
CurrentTax= row[7]
Tax_Iterator("MicroarrayInput.csv", "MicroarrayOutput.csv")
You defined prediction as prediction=tf.argmax(y,1). And in both feed_dict, you feed labels_onehot for y. Consequently, your "prediction" is always equal to the labels.
As you didn't post the code you used to train your network, I can't tell you what exactly you need to change.
Edit: I have isses understanding the underlying problem you're trying to solve - based on your code, you're trying to train a neural network with 2165 different classes using 1609 training examples. How is this even possible? If each example had a different class, there would still be some classes without any training example. Or does one image belong to many classes? From your statement at the beginning of your question, I had assumed you're trying to output a real-valued number between 0-1.
I'm actually surprised that the code actually worked as it looks like you're adding only a single number to your labels list, but your model expects a list with length 2165 for each training example.

Resources