Is loading in eager TensorFlow broken right now? - python-3.x

Weights in classes inheriting from tf.keras.Model seem unable to load at the moment. I am unable to load the weights from Example() outside of the class using checkpointing, so I tried to do it within, which by all accounts should work. It's able to save the weights, as it can when just saving Example(); however, it still can't load them. This is my model code:
class Example(tf.keras.Model):
def __init__(self, cfg):
super(Example, self).__init__()
self.model = tf.keras.Sequential([
# Create saver
self.save_path = cfg.save_dir + cfg.extension
self.ckpt_prefix = self.save_path + '/ckpt'
self.saver = tf.train.Checkpoint(model=self.model)
def call(self, x_in):
x_out = self.model(x_in)
return x_out
def save(self):
def load(self):
And this is what I use to check if it loads:
example = Example()
if Path(self.example.save_path).is_dir():
This was tested on both tensorflow 1.3 and 2.0, and I can confirm that the weights are not empty after the first batch, as well as that it is checkpointing/saving.

As it turns out, there are three different ways TensorFlow does checkpointing, depending on what is being checkpointed.
The checkpointed object is just a variable. This is restored immediately upon calling checkpoint.restore(tf.train.latest_checkpoint(checkpoint_path)).
The checkpointed object is a model with input shape defined. This is also restored immediately.
The checkpointed object is a model without input shape defined. This is where the behaviour changes, as TensorFlow does a "delayed" restore, and will NOT restore the model weights until input is passed to the model.
Here is an example:
import os
import tensorflow as tf
import numpy as np
# Disable logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Create model
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(256, 3, padding="same"),
tf.keras.layers.Conv2D(3, 3, padding="same")
print("Are weights empty before training?", model.weights == [])
# Create optim, checkpoint
optimizer = tf.train.AdamOptimizer(0.001)
checkpoint = tf.train.Checkpoint(model=model)
# Make fake data
img = np.random.uniform(0, 255, (1, 32, 32, 3)).astype(np.float32)
truth = np.random.uniform(0, 255, (1, 32, 32, 3)).astype(np.float32)
# Train
with tf.GradientTape() as tape:
logits = model(img)
loss = tf.losses.mean_squared_error(truth, logits)
# Compute/apply gradients
grads = tape.gradient(loss, model.trainable_weights)
grads_and_vars = zip(grads, model.trainable_weights)
# Save model
checkpoint_path = './ckpt/''./ckpt/')
# Check if weights update
print("Are weights empty after training?", model.weights == [])
# Reset model
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(256, 3, padding="same"),
tf.keras.layers.Conv2D(3, 3, padding="same")
print("Are weights empty when resetting model?", model.weights == [])
# Update checkpoint pointer
checkpoint = tf.train.Checkpoint(model=model)
# Restore values from the checkpoint
status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_path))
# This next line is REQUIRED to restore
print("Are weights empty after restoring from checkpoint?", model.weights == [])
With output:
Are weights empty before training? True
Are weights empty after training? False
Are weights empty when resetting model? True
Are weights empty after restoring from checkpoint? True
< object at 0x7f6256b4ddd8>
Traceback (most recent call last):
File "", line 58, in <module>
File "/home/jpatts/.local/lib/python3.6/site-packages/tensorflow/python/training/checkpointable/", line 1013, in assert_consumed
raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
AssertionError: Unresolved object in checkpoint: attributes {
full_name: "sequential/conv2d/kernel"
checkpoint_key: "model/layer-0/kernel/.ATTRIBUTES/VARIABLE_VALUE"
However, uncommenting the line model(img) will produce the following output:
Are weights empty before training? True
Are weights empty after training? False
Are weights empty when resetting model? True
Are weights empty after restoring from checkpoint? False
< object at 0x7ff62320fe48>
So input data needs to be passed to properly restore a shape invariant model.


Torch onnx.export error for instance norm

I try to convert a pytorch model to the onnx format using torch.onnx.export().
However, I get the following error:
File "C:\Users\Markus\miniconda3\envs\ic-move\lib\site-packages\torch\onnx\", line 1395, in instance_norm
raise RuntimeError("Unsupported: ONNX export of instance_norm for unknown "
RuntimeError: Unsupported: ONNX export of instance_norm for unknown channel size.
this is how I call the function:
torch.onnx.export(net, # model being run
x, # model input (or a tuple for multiple inputs)
ONNX_PATH, # where to save the model (can be a file or file-like object)
export_params=True, # store the trained parameter weights inside the model file
opset_version=12, # the ONNX version to export the model to
do_constant_folding=True, # whether to execute constant folding for optimization
input_names = ['input'], # the model's input names
output_names = ['output'], # the model's output names
dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes
'output' : {0 : 'batch_size'}})
and here is an example of how I call the instance norm in my model:
class Residual(nn.Module):
def __init__(self, inp_dim, out_dim):
super(Residual, self).__init__()
self.relu = nn.ReLU()
self.bn1 = nn.InstanceNorm2d(inp_dim)
self.conv1 = Conv(inp_dim, int(out_dim/2), 1, relu=False)
self.bn2 = nn.InstanceNorm2d(int(out_dim/2))
self.conv2 = Conv(int(out_dim/2), int(out_dim/2), 3, relu=False)
self.bn3 = nn.InstanceNorm2d(int(out_dim/2))
self.conv3 = Conv(int(out_dim/2), out_dim, 1, relu=False)
self.skip_layer = Conv(inp_dim, out_dim, 1, relu=False)
if inp_dim == out_dim:
self.need_skip = False
self.need_skip = True
With inp_dim = 64. Aren't these the channels I set?

NumPy Data w/ TensorFlow "Not on the same graph"

So I'm trying to build a model that takes in an image-like numpy file and builds a model based on the data. Each individual .npy file DOES fit in memory; however, I need to assume that all of them loaded at the same time will not.
When I go to run it & do a test evaluation, it gives me this error:
ValueError: Tensor("IteratorV2:0", shape=(), dtype=resource) must be from the same graph as Tensor("MapDataset:0", shape=(), dtype=variant)
I'm not super experienced with TensorFlow and I'm trying to make everything as "best practice" as possible.
Here's my code:
train_fnames, train_labels, test_fnames, test_labels =\
train_fnames = 'count_data/' + train_fnames
test_fnames = 'count_data/' + test_fnames
def read_npy_file(item):
    """Load one ``.npy`` file and return its contents as an int32 array.

    Args:
        item: File path as a ``bytes`` object; it is decoded to ``str``
            before being handed to ``np.load``.

    Returns:
        The loaded array cast to ``np.int32``.
    """
    data = np.load(item.decode())
    return data.astype(np.int32)
# gdsii_placeholder = tf.placeholder(tf.float32, shape=(None, 224, 224, 1))
# label_placeholder = tf.placeholder(tf.int32, shape=[1])
def cnn_model(features, labels, mode):
conv1 = tf.layers.conv2d(features, filters=32, kernel_size=(5, 5))
pool1 = tf.layers.max_pooling2d(conv1, pool_size=(2, 2), strides=2)
conv2 = tf.layers.conv2d(pool1, filters=64, kernel_size=(5, 5))
pool2 = tf.layers.max_pooling2d(conv2, pool_size=(2, 2), strides=2)
flat = tf.layers.flatten(pool2)
dense = tf.layers.dense(flat, units=1024, activation='relu')
dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=True)
logits = tf.layers.dense(inputs=dropout, units=10)
predictions = {
"classes": tf.argmax(input=logits, axis=1),
"probabilities": tf.nn.softmax(logits, name="softmax_tensor")
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(
return tf.estimator.EstimatorSpec(mode=tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
def main(_):
# load the dataset
data =
data = item: tuple(tf.py_func(
read_npy_file, [item], [tf.int32, ])))
gdsii_classifier = tf.estimator.Estimator(
model_fn=cnn_model, model_dir="/tmp/gdsii_classifier")
res = gdsii_classifier.evaluate(lambda: data)
if __name__ == "__main__":
There are several little mistakes in your code:
First, you tried to use Estimator without wrapping your operations inside an input function. This causes an issue because operations create their own graph for data loading while Estimator expects to have all the control on the whole graph. When you then call estimator.evaluate(), it fails because the tensor is inside your previously built graph.
Secondly, you tried to call estimator.evaluate() without a label.
Then, after you use tf.py_func inside the dataset's map function, you need to specify the shape of the resulting tensors, as TensorFlow can't infer them.
And finally, you need to batch your data as tf.layers expect ndims=4, and you need to pass the features as either float32 or float16.
You can easily patch these issues by doing something like this:
def input_fn(x, y):
def set_shape(tensor, shape):
return tensor
data =, y))
data = item, label: (tf.py_func(read_npy_file, [item], [tf.float32]), label))
data = item, label: (set_shape(item, YOUR_SHAPE), label))
data = data.batch(1)
return data
estimator.evaluate(lambda: input_fn(x=test_fnames, y=test_labels))

This Keras model works when created, but fails when loaded. Tensor splitting suspected

I'm experimenting with LSTMs, specifically, inputting a sequence into an LSTM, transferring the states into another LSTM, and decoding the sequence. I added an autoencoder between the two LSTMs, encoding and then decoding the transferred states via a lower dimensional latent space.
This works fine when I create the model and fit it. However, if I save this model, and then either try to continue training it, or even just use it without additional training, the model does not run and I get the following warning:
Traceback (most recent call last):
File "", line 140, in <module>
model.fit_generator(train_generator(),callbacks=[checkpointer], steps_per_epoch=30, epochs=2000, verbose=1,validation_data=val_generator(),validation_steps=30)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\legacy\", line 91, in wrapper
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\", line 2224, in fit_generator
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\", line 1877, in train_on_batch
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\", line 1476, in _standardize_user_data
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\", line 86, in _standardize_input_data
str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 2 arrays: [array([[[ 0.47338937, 0.75865918, 0.37731877, 0.63840222,
[ 0.52119932, 0.78308798, 0.45885839, 0.66738276,
[ 0.5674261 , 0.806364...
My code is as follows:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, TimeDistributed,Lambda, Dropout, Activation ,RepeatVector
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras.layers import Lambda, Concatenate
from keras import backend as K
from keras.models import load_model
import os
encoder_inputs = Input(shape=(seq_length, features_num))
encoder = LSTM(LSTM_latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
merged_encoder_states = Concatenate(axis=-1)([state_h, state_c])
decoded_states=Dense(LSTM_latent_dim*2, activation='relu')(encoded_states)
decoder_inputs=Input(shape=(1, features_num))
decoder_lstm = LSTM(LSTM_latent_dim, return_sequences=True, return_state=True)
decoder_dense = Dense(features_num)
all_outputs = []
inputs = decoder_inputs
for _ in range(seq_length):
# Run the decoder on one timestep
outputs, state_h, state_c = decoder_lstm(inputs, initial_state=states)
outputs = decoder_dense(outputs)
# Store the current prediction (we will concatenate all predictions later)
# Reinject the outputs as inputs for the next loop iteration
# as well as update the states
inputs = outputs
states = [state_h, state_c]
# Concatenate all predictions
decoder_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
#model = load_model('pre_model.h5')
filepath_for_w= 'AE2_p2p_s2s_model.h5'
model = load_model(filepath_for_w) # if model was previouslly run, continue from it
print("loaded model")
except: print("new model")
model.compile(loss='mean_squared_error', optimizer='adam')
def create_wavelength(min_wavelength, max_wavelength, fluxes_in_wavelength, category ) :
#category :: 0 - train ; 2 - validate ; 4- test. 1;3;5 - dead space
k = fluxes_in_wavelength
base= (np.trunc(k*np.random.random()*(max_wavelength-min_wavelength)) +k*min_wavelength) /k
return (answer)
def make_line(length,category):
shift= np.random.random()
wavelength = create_wavelength(30,10,1,category)
return answer
def make_data(seq_num,seq_len,dim,category):
for i in range (seq_num):
for j in range (dim):
line = make_line(seq_len,category)
return (data)
def train_generator():
    """Yield an endless stream of training batches.

    Each batch is freshly produced by ``make_data`` (category 0) and split
    into model inputs and targets.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``
        where the encoder input is every time step after the first, the
        decoder input is the first time step, and the target is the encoder
        input itself.
    """
    while True:
        # One extra step is generated so step 0 can seed the decoder.
        sequence_length = seq_length + 1
        data = make_data(1000, sequence_length, features_num, 0)  # category=0 in train
        encoder_input_data = data[:, 1:, :]  # all steps after the first
        decoder_input_data = data[:, 0, :]  # the first value in the sequence
        decoder_target_data = encoder_input_data
        yield [encoder_input_data, decoder_input_data], decoder_target_data
def val_generator():
    """Yield an endless stream of validation batches.

    Same layout as the training generator, but the data is produced by
    ``make_data`` with category 2.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``
        where the encoder input is every time step after the first, the
        decoder input is the first time step, and the target is the encoder
        input itself.
    """
    while True:
        # One extra step is generated so step 0 can seed the decoder.
        sequence_length = seq_length + 1
        data = make_data(1000, sequence_length, features_num, 2)  # category=2 in val
        encoder_input_data = data[:, 1:, :]  # all steps after the first
        decoder_input_data = data[:, 0, :]  # the first value in the sequence
        # The target is the encoder input: the model reconstructs the sequence.
        decoder_target_data = encoder_input_data
        yield [encoder_input_data, decoder_input_data], decoder_target_data
checkpointer=ModelCheckpoint(filepath_for_w, monitor='val_loss', verbose=0, save_best_only=True, mode='auto', period=1)
model.fit_generator(train_generator(),callbacks=[checkpointer], steps_per_epoch=30, epochs=2000, verbose=1,validation_data=val_generator(),validation_steps=30)
def predict_wave(input_wave, input_for_decoder):
    """Run the module-level ``model`` on one encoder/decoder input pair.

    Args:
        input_wave: Encoder input. Per the original author's note this is
            ``x[n, :, :]``, i.e. the points of one wave, each with
            ``features_num`` features -- TODO confirm against the caller.
        input_for_decoder: Decoder seed input passed alongside the wave.

    Returns:
        Whatever ``model.predict`` returns for ``[input_wave, input_for_decoder]``.
    """
    pred = model.predict([input_wave, input_for_decoder])
    return pred
def predict_many_waves_from_input(x):
x, x2=x # x == encoder_input_data ; x==2 decoder_input_data
instance_num= x.shape[0]
for n in range(instance_num):
return (multi_predict_collection)
def test_maker():
    """Build a single test batch from ``make_data`` (category 4).

    Returns:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``
        where the encoder input is every time step after the first, the
        decoder input is the first time step, and the target is the encoder
        input itself.
    """
    # One extra step is generated so step 0 can seed the decoder.
    sequence_length = seq_length + 1
    data = make_data(470, sequence_length, features_num, 4)  # category=4 in test
    encoder_input_data = data[:, 1:, :]  # all steps after the first
    decoder_input_data = data[:, 0, :]  # the first value
    decoder_target_data = encoder_input_data
    return [encoder_input_data, decoder_input_data], decoder_target_data
x,y= test_maker()
a=predict_many_waves_from_input (x)
x=x[0] # keep the wave (generated data except last seq_length time points)
print (x.shape)
print (y.shape)
print (a.shape) ('a.npy',a) ('y.npy',y) ('x.npy',x)
print (np.mean(np.absolute(y[:,:,0]-a[:,:,0])))
print (np.mean(np.absolute(y[:,:,1]-a[:,:,1])))
print (np.mean(np.absolute(y[:,:,2]-a[:,:,2])))
print (np.mean(np.absolute(y[:,:,3]-a[:,:,3])))
print (np.mean(np.absolute(y[:,:,4]-a[:,:,4])))
The culprit might be this line:
After combining the states of the encoding LSTM and passing them through the autoencoder, I split them back into c and h (the cell state and the hidden state, respectively) and feed them into the decoder LSTM.
It seems reasonable to me that this step occurs correctly when the initial model is used, but is somehow incorrectly saved into the model file (or incorrectly loaded from the model file), resulting in a defective loaded model.
Further supporting my assessment, in my opinion, is the fact that when this line is replaced with
states= [state_h, state_c]
, the loaded model is able to run correctly (fitting and predicting), but of course this does away with the state autoencoder so I cannot use it except for zooming in on the bug.
So, I ask your help regarding two questions:
Why does this problem occur?
How do I solve it?
A possible partial solution is to forgo the saving of the model in its entirety, and just save (and load) the model's weights.
Replacing the lines
model = load_model(filepath_for_w)
checkpointer=ModelCheckpoint(filepath_for_w, monitor='val_loss', verbose=0, save_best_only=True, mode='auto', period=1)
checkpointer=ModelCheckpoint(filepath_for_w, save_weights_only=True, monitor='val_loss', verbose=0, save_best_only=True, mode='auto', period=1)
does the trick. The model can be loaded for further fitting and for prediction.
However, this does not allow saving the entire model; I still need to keep the architecture in the code in order to populate it with the weights. It also does not explain why this problem occurs to begin with.

Unable to load keras model with lambda function using a custom distance function

I'm trying to build a system to check sentence similarities using a siamese LSTM model using Manhattan distance as the distance function while merging two layers.
I'm using the code found in this article
The issue is that after I've built and saved the model in a json file I'm unable to load the model as an error gets thrown saying
name 'exponent_neg_manhattan_distance' is not defined
Here's the code:
# Model variables
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 5
def exponent_neg_manhattan_distance(left, right):
    '''Helper function for the similarity estimate of the LSTMs outputs.

    Computes ``exp(-sum(|left - right|))``, summing over axis 1 and keeping
    that axis (``keepdims=True``).

    Args:
        left: Output tensor of one side of the siamese network.
        right: Output tensor of the other side.

    Returns:
        Tensor holding the exponentiated negative Manhattan distance.
    '''
    return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))
# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)
# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)
# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)
# Calculates the distance as defined by the MaLSTM model
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])
# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
# Start training
training_start_time = time()
malstm_trained =[X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,
validation_data=([X_validation['left'], X_validation['right']], Y_validation))
print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))'malstm.h5')
model_json = malstm.to_json()
with open ('malstm.json', 'w') as file:
Now when I try to load the model I get the following error:
model = model_from_json(open('malstm.json').read(), custom_objects = {"exponent_neg_manhattan_distance":exponent_neg_manhattan_distance})
C:\Users\archi\Miniconda3\lib\site-packages\keras\engine\ UserWarning: The `Merge` layer is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.
return cls(**config)
Traceback (most recent call last):
File "<ipython-input-12-4c72a4db6c29>", line 1, in <module>
model = model_from_json(open('malstm.json').read(), custom_objects = {"exponent_neg_manhattan_distance":exponent_neg_manhattan_distance})
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\", line 349, in model_from_json
return layer_module.deserialize(config, custom_objects=custom_objects)
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\layers\", line 55, in deserialize
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\utils\", line 144, in deserialize_keras_object
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\engine\", line 2524, in from_config
process_node(layer, node_data)
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\engine\", line 2483, in process_node
layer(input_tensors, **kwargs)
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\engine\", line 619, in __call__
output =, **kwargs)
File "C:\Users\archi\Miniconda3\lib\site-packages\keras\legacy\", line 209, in call
return self.mode(inputs, **arguments)
File "<ipython-input-19-913812c640b3>", line 28, in <lambda>
NameError: name 'exponent_neg_manhattan_distance' is not defined
I've searched online and the issue is probably because of the use of the lambda function. Is there any way I could load this model because it took a crazy amount of time to train. Any help would be appreciated!
First save your model with model.save("model_path").
Then load it with custom objects:
from keras.models import load_model
# Returns a compiled model identical to the previous one model =
Converting comment into answer: you can salvage the weights of the network if you just create it yourself in code again. The error is about creating the network from JSON, but let's follow from:
# ...
# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])
# ... you don't need compile for only predict
# ... skip training and model saving
# malstm.save_weights('malst_w.h5')
now, the weights are loaded into the existing model which you created in code.

Tensor Flow queue not closing. Problems with tf.train.start_queue_runners(sess)

While running a test CNN I always get this error when trying to close the session with sess.close(), or when requesting the coordinator to stop and collect the threads. Apparently, the session is trying to close while there are still threads running. I just can't find a way to stop this from happening, or whether there is a better/correct way to use queues and threads in TensorFlow...
Thanks in advance!
There is always a bunch of:
2017-10-24 15:48:02.625448: W C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\kernels\] _20_input_p
roducer/input_producer: Skipping cancelled enqueue attempt with queue not closed
Followed by:
ERROR:tensorflow:Exception in QueueRunner: Enqueue operation was cancelled
[[Node: batch/fifo_queue_enqueue = QueueEnqueueV2[Tcomponents=[DT_FLOAT, DT_FLOAT], timeout_ms=-1, _device="/job:localhost/replica:0
/task:0/cpu:0"](batch/fifo_queue, Squeeze_1/_13, input_producer_1/Gather_1/_15)]]
Traceback (most recent call last):
File "<stdin>", line 30, in <module>
ERROR:tensorflow:Exception in QueueRunner: Enqueue operation was cancelled
[[Node: batch_1/fifo_queue_enqueue = QueueEnqueueV2[Tcomponents=[DT_FLOAT, DT_FLOAT], timeout_ms=-1, _device="/job:localhost/replica
:0/task:0/cpu:0"](batch_1/fifo_queue, Squeeze/_37, input_producer/Gather_1/_39)]]
ERROR:tensorflow:Exception in QueueRunner: Enqueue operation was cancelled
[[Node: batch_1/fifo_queue_enqueue = QueueEnqueueV2[Tcomponents=[DT_FLOAT, DT_FLOAT], timeout_ms=-1, _device="/job:localhost/replica
:0/task:0/cpu:0"](batch_1/fifo_queue, Squeeze/_37, input_producer/Gather_1/_39)]]
Exception in thread Thread-53:
Traceback (most recent call last):
File "C:\Program Files\Anaconda3\lib\", line 916, in _bootstrap_inner
File "C:\Program Files\Anaconda3\lib\", line 864, in run
self._target(*self._args, **self._kwargs)
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\training\", line 238, in _run
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\client\", line 1235, in _single_operation_run
target_list_as_strings, status, None)
File "C:\Program Files\Anaconda3\lib\", line 89, in __exit__
File "C:\Program Files\Anaconda3\lib\site-packages\tensorflow\python\framework\", line 466, in raise_exception_on_not_ok_stat
Below is the code that I wrote based on examples from the TF manual and GitHub:
"""My general framework to construct a tensor flow data set of images for regression.
The genetal idea is to: create a list of image names (i.e the path to each image).
The image list must have also labels. In the case of a regression this can be multiple variables.
import csv
import os
import sys
import plotly as py
import plotly.graph_objs as go
import math
import numpy as np
import tensorflow as tf
#First neet to get image paths and their respective labels
chn = 1
im_h = 424
im_w = 511
#resize image
size = 0.1
#size of test set
p = 0.25
def imtensors(im_path, chn, im_h, im_w, size):
    """Read a PNG file and return it as a resized image tensor.

    Args:
        im_path: Path of the image file, passed to ``tf.read_file``.
        chn: Number of channels requested from ``tf.image.decode_png``.
        im_h: Original image height in pixels.
        im_w: Original image width in pixels.
        size: Scale factor applied to both height and width; the scaled
            dimensions are truncated to ``int``.

    Returns:
        The decoded image resized to ``[int(im_h*size), int(im_w*size)]``.
    """
    scaled_h = int(im_h * size)
    scaled_w = int(im_w * size)
    raw_bytes = tf.read_file(im_path)
    decoded = tf.image.decode_png(raw_bytes, channels=chn)
    return tf.image.resize_images(decoded, [scaled_h, scaled_w])
dbname = 'simpRDB.csv'
imagepaths, y = list(), list()
#read the csv as a dictionary
with open(dbname, newline='') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
n = len(y)
ntest = int(n*p)
#remember that in py the index starts at 0 and x[a:d] -> a,b,c
impath_test = imagepaths[0:ntest]
y_test = y[0:ntest]
impath_train = imagepaths[ntest+1:n]
y_train = y[ntest+1:n]
#now convert to tensors
impath_test = tf.convert_to_tensor(impath_test, dtype=tf.string)
y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)
im_test, y_test = tf.train.slice_input_producer([impath_test, y_test])
im_test = imtensors(im_test, chn, im_h, im_w, size)
impath_train = tf.convert_to_tensor(impath_train, dtype=tf.string)
y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
im_train, y_train = tf.train.slice_input_producer([impath_train, y_train])
im_train = imtensors(im_train, chn, im_h, im_w, size)
# -----------------------------------------------
# This is a classic CNN with some spice.
# The basic change isat the output node. Instead of
# use a softmax or other multiclass we a re using a
# a fully regressor estimator to the last layer of
# nodes.
# -----------------------------------------------
# Parameters
learning_rate = 0.001
num_steps = 10000
b_size = 8
display_step = 100
# Network Parameters
dropout = 0.3 # rate to drop input
#create batched train set
#Use small batchs because CPU/GPU can run out of memory
X, Y = tf.train.batch([im_train, y_train], batch_size=b_size, capacity=b_size*4, num_threads=4,
X_test, Y_test = tf.train.batch([im_test, y_test], batch_size=b_size, capacity=b_size*4,
num_threads=4, allow_smaller_final_batch=True)
#First lets define the weights and bias in a more sistematic fashion.
#The weights are going to be initialized as random weights wit values near zero. This is a good
#practice for neuralnets in general
inp_h = int(im_h*size)
inp_w = int(im_w*size)
#placeholders for model cheking
x = tf.placeholder(tf.float32, shape=[None, inp_h, inp_w, 1])
y_ = tf.placeholder(tf.float32, shape=[None, 1])
def weight_variable(shape):
    """Create a weight variable of the given shape.

    Initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.1.

    Args:
        shape: Shape of the variable to create.

    Returns:
        A ``tf.Variable`` holding the initial weights.
    """
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)
#The bias neurons are normaly initialized slightly positive for a ReLU activation function in order
# to prevent "dead neurons"
def bias_variable(shape):
    """Create a bias variable of the given shape, filled with 0.1.

    Args:
        shape: Shape of the variable to create.

    Returns:
        A ``tf.Variable`` initialised to the constant 0.1.
    """
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)
#Now convolution layers: They are going to have size one with zero pad.
# The arguent strides deifne the size of the window. So the output is of the same size.
def conv2d(x, W):
    """Apply a 2-D convolution with unit strides and 'SAME' padding.

    Args:
        x: Input tensor.
        W: Filter tensor.

    Returns:
        The result of ``tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')``.
    """
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
#And the polling are going to be 2x2 blocks side by side.
def max_pool_2x2(x):
    """Max-pool ``x`` over 2x2 windows with stride 2 and 'SAME' padding.

    Args:
        x: Input tensor.

    Returns:
        The pooled tensor.
    """
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
#Define weights and biases
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
#after 2 max_poling 2x2 the 'image'size is reduce by 4. Need to use ceil (intrinsics of TF)
h_val = math.ceil(math.ceil(inp_h/2)/2)
w_val = math.ceil(math.ceil(inp_w/2)/2)
W_fc1 = weight_variable([h_val * w_val * 64, 1024])
b_fc1 = bias_variable([1024])
W_d1 = weight_variable([1024, 100])
b_d1 = bias_variable([100])
w_out = weight_variable([100, 1])
b_out = bias_variable([1])
# Create model
def conv_net(x, dropout, reuse, is_training):
    """Build the two-convolution regression network on top of ``x``.

    The layers use the module-level weight and bias variables
    (``W_conv1`` ... ``b_out``), so every call shares the same parameters.

    Args:
        x: Input image batch fed to the first convolution.
        dropout: Drop rate passed to ``tf.layers.dropout``.
        reuse: Forwarded to ``tf.variable_scope('ConvNet', reuse=...)``.
        is_training: Forwarded to ``tf.layers.dropout``; when False,
            dropout is not applied.

    Returns:
        Output tensor of the final linear layer (one value per example).
    """
    # Define a scope for reusing the variables
    with tf.variable_scope('ConvNet', reuse=reuse):
        # Convolution layer with 32 filters and a kernel size of 5x5
        conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1)
        # Max pooling (down-sampling) with strides of 2 and kernel size of 2
        pool1 = max_pool_2x2(conv1)

        # Second convolution: 64 features for each 5x5 patch
        conv2 = tf.nn.relu(conv2d(pool1, W_conv2) + b_conv2)
        # Max pooling (down-sampling) with strides of 2 and kernel size of 2
        pool2 = max_pool_2x2(conv2)

        # Flatten the data to a 1-D vector for the fully connected layer
        fc1 = tf.contrib.layers.flatten(pool2)
        # Fully connected layer with 1024 neurons over the whole image
        fc1_flat = tf.nn.relu(tf.matmul(fc1, W_fc1) + b_fc1)
        # Apply dropout (if is_training is False, dropout is not applied)
        fc1_drop = tf.layers.dropout(fc1_flat, rate=dropout, training=is_training)

        # Dense layer with size reduction for input to the final prediction layer
        d1 = tf.nn.relu(tf.matmul(fc1_drop, W_d1) + b_d1)
        # Only one output, i.e. regression
        out = tf.matmul(d1, w_out) + b_out
    return out
#Create graph for trainig and a graph for prediction sharing the same weights
out_train = conv_net(X, dropout, reuse=False, is_training=True)
#no drop out at evaluation
out_test = conv_net(X_test, dropout, reuse=True, is_training=False)
# Define loss and optimizer.
#reduce absulute sum of squares
loss_op = tf.reduce_mean(tf.abs(out_train - Y))
loss_ts = tf.reduce_mean(tf.abs(out_test - Y_test))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
#Save the loss for training and testing for plot latter
losses_op = []
losses_ts = []
steps = []
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
# Saver object
saver = tf.train.Saver()
# Start training
with tf.Session() as sess:
# Run the initializer
# Start the data queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess)
# Training cycle
for step in range(1, num_steps+1):
if step % display_step == 0:
# Run optimization and calculate batch loss and accuracy
_, loss, loss2 =[train_op, loss_op, loss_ts])
print("Step " + str(step) + ", Minibatch Loss = " + \
"{:.4f}".format(loss) + ", Loss Testbatch = " + "{:.4f}".format(loss2))
# Only run the optimization op (backprop)
print("Optimization Finished!")
except Exception as e:
#Something about it is safer to call twice
coord.join(threads), model_path)
#plota of loss over steps
p_loss1 = go.Scatter(
name='Loss training minibatch'
p_loss2 = go.Scatter(
name='Loss evaluation minibatch'
data = [p_loss1, p_loss2]
# Save your model
#Loss of final model on the test set
#no dropout for the test set
m_name = 'bw_model_1'
save_path = os.path.join(os.getcwd(), 'bw_models')
if not os.path.exists(save_path):
model_path = os.path.join(save_path, m_name)
#save a graph of the loss over time
py.offline.plot(data, filename=model_path)
Sorry about my dumbness! lol
Found a big error, the problem was that I was creating the coordinator but not calling it inside the queue runner. It was like this:
# Start the data queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess)
And it should be this:
# Start the data queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess, coord)
Now the code stops the threads and collects them at the end!
