I am trying to create a SavedModel(*.pb) file in tensorflow python. I will use this protobuf file in Java. However running the following code gives me a permission denied error. I am running tensorflow-gpu on a windows machine.
import cv2
import tensorflow as tf
import os
import numpy as np
from imutils import paths
from keras.preprocessing.image import img_to_array
def load_data(data_dir):
imagePaths = sorted(list(paths.list_images(data_dir)))
print("[INFO] loading images...")
data = []
labels = []
for imagePath in imagePaths:
# load the image, pre-process it, and store it in the data list
image = cv2.imread(imagePath)
reimage = cv2.resize(image, (64,16))
reimage=reimage.astype(np.float32)
reimage=np.multiply(reimage,1.0/255.0)
data.append(reimage)
label = imagePath.split(os.path.sep)[-2]
if label=="classa":
label=0
if label=="classb":
label=1
if label=="classc":
label=2
ll=int(label)
labels.append(ll)
data=np.array(data)
print(data.shape)
labels=np.array(labels)
print(labels.shape)
return data, labels
images_a,labels_a=load_data("ttt")
graph=tf.Graph()
with graph.as_default():
images_ph=tf.placeholder(tf.float32,[None,16,64,3],name='input')
labels_ph=tf.placeholder(tf.int32,[None],name='labels')
dataset=tf.data.Dataset.from_tensor_slices((images_a,labels_a)).repeat().batch(100)
iter=dataset.make_one_shot_iterator()
next_element=iter.get_next()
conv1 = tf.layers.conv2d(
inputs=images_ph,
filters=32,
kernel_size=[5, 5],
padding="same",
activation=tf.nn.relu)
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
pool2_flat = tf.reshape(pool1, [-1,32*8*32])
dense = tf.layers.dense(inputs=pool2_flat, units=1000, activation=tf.nn.relu)
dropout = tf.layers.dropout(inputs=dense, rate=0.4)
logits = tf.layers.dense(inputs=dropout, units=3)
predicted_labels=tf.argmax(logits,1)
loss=tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_ph,logits=logits))
train=tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
init=tf.global_variables_initializer()
with tf.Session(graph=graph) as session:
session.run(init)
for i in range(101):
x,y=session.run(next_element)
loss_values=session.run([train,loss],feed_dict={images_ph:x,labels_ph:y})
if i%10==0:
print("Loss: ",loss_values)
builder=tf.saved_model.builder.SavedModelBuilder("/pModel")
builder.add_meta_graph_and_variables(session,[tf.saved_model.tag_constants.SERVING],
signature_def_map={
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:tf.saved_model.signature_def_utils.build_signature_def(
inputs={"input":tf.saved_model.utils.build_tensor_info(images_ph)},
outputs={"output":tf.saved_model.utils.build_tensor_info(logits)})
})
builder.save()
test_images,test_labels=load_data("test")
predicted=session.run(predicted_labels,feed_dict={images_ph:test_images})
match_count=sum([int(y==y_) for y,y_ in zip (test_labels, predicted)])
accuracy=match_count/len(test_labels)
print("Accuracy: {:.3f}".format(accuracy))
print(tf.saved_model.tag_constants.SERVING)
When this is run it gives me the following error:
2018-04-19 13:32:10.043715: W C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\35\tensorflow\core\framework\op_kernel.cc:1202] OP_REQUIRES failed at save_restore_v2_ops.cc:220 : Permission denied: Failed to create a directory: /; Permission denied
Traceback (most recent call last):
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1361, in _do_call
return fn(*args)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _run_fn
target_list, status, run_metadata)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.PermissionDeniedError: Failed to create a directory: /; Permission denied
[[Node: save/MergeV2Checkpoints = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _arg_save/Const_0_0)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "tensorv4.py", line 94, in <module>
outputs={"output":tf.saved_model.utils.build_tensor_info(logits)})
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\saved_model\builder_impl.py", line 392, in add_meta_graph_and_variables
saver.save(sess, variables_path, write_meta_graph=False, write_state=False)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1654, in save
{self.saver_def.filename_tensor_name: checkpoint_file})
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 905, in run
run_metadata_ptr)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1137, in _run
feed_dict_tensor, options, run_metadata)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1355, in _do_run
options, run_metadata)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1374, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.PermissionDeniedError: Failed to create a directory: /; Permission denied
[[Node: save/MergeV2Checkpoints = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _arg_save/Const_0_0)]]
Caused by op 'save/MergeV2Checkpoints', defined at:
File "tensorv4.py", line 94, in <module>
outputs={"output":tf.saved_model.utils.build_tensor_info(logits)})
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\saved_model\builder_impl.py", line 386, in add_meta_graph_and_variables
allow_empty=True)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1293, in __init__
self.build()
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1302, in build
self._build(self._filename, build_save=True, build_restore=True)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1339, in _build
build_save=build_save, build_restore=build_restore)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 787, in _build_internal
save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 411, in _AddShardedSaveOps
return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 393, in _AddShardedSaveOpsForV2
sharded_prefixes, checkpoint_prefix, delete_old_dirs=True)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 379, in merge_v2_checkpoints
delete_old_dirs=delete_old_dirs, name=name)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 3271, in create_op
op_def=op_def)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1650, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
PermissionDeniedError (see above for traceback): Failed to create a directory: /; Permission denied
[[Node: save/MergeV2Checkpoints = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _arg_save/Const_0_0)]]
Any idea on what is going wrong. I tried running with administrator privileges but to no avail
Related
I am using tensorflow v1.14. I have a saved model and I'm trying to restore the model using the following code:
loader = tf.train.import_meta_graph("models/fcnn0/model.ckpt.meta")
graph = tf.get_default_graph()
sess = tf.Session()
loader.restore(sess, "models/fcnn0/model.ckpt")
I used to use the same piece of code in Tensorflow v1.13 and it used to work without errors. But now I'm getting the error
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1356, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1341, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1429, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.DataLossError: file is too short to be an sstable
[[{{node save/RestoreV2}}]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/sandesh/PycharmProjects/fading/finding_code/src/load_32_64.py", line 8, in <module>
loader.restore(sess, "models/fcnn_32_64_aenc_1331_747_3870000/model.ckpt")
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1286, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.DataLossError: file is too short to be an sstable
[[node save/RestoreV2 (defined at home/sandesh/PycharmProjects/fading/finding_code/src/load_32_64.py:5) ]]
Original stack trace for 'save/RestoreV2':
File "home/sandesh/PycharmProjects/fading/finding_code/src/load_32_64.py", line 5, in <module>
loader = tf.train.import_meta_graph("models/fcnn_32_64_aenc0/model.ckpt.meta")
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1449, in import_meta_graph
**kwargs)[0]
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1473, in _import_meta_graph_with_return_elements
**kwargs))
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/meta_graph.py", line 857, in import_scoped_meta_graph_with_return_elements
return_elements=return_elements)
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/importer.py", line 443, in import_graph_def
_ProcessNewOps(graph)
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/importer.py", line 236, in _ProcessNewOps
for new_op in graph._add_new_tf_operations(compute_devices=False): # pylint: disable=protected-access
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3751, in _add_new_tf_operations
for c_op in c_api_util.new_tf_operations(self)
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3751, in <listcomp>
for c_op in c_api_util.new_tf_operations(self)
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3641, in _create_op_from_tf_operation
ret = Operation(c_op, self)
File "usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
self._traceback = tf_stack.extract_stack()
Can someone point me as to what I'm doing wrong? Thanks in advance.
I was looking into the folder where the model files were saved and found that the model.ckpt.meta file had not been written to disk properly. I reran the training and saved the model and then it worked perfectly.
I cannot run my code in Google Colaboratory twice without restarting runtime. Is there a way to run it without restarting runtime.
My code takes TensorD libraries and compute aproximation of a random 2x4x4 tensor using CP ALS algorithm. (this is an example taken from https://github.com/Large-Scale-Tensor-Decomposition/tensorD)
!git clone https://github.com/Large-Scale-Tensor-Decomposition/tensorD.git
import sys
import time
sys.path.append("/content/tensorD")
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.demo.DataGenerator import *
from tensorD.factorization.cp import CP_ALS
# generate a random tensor with shape 3x4x4
t = time.time()
X = synthetic_data_cp([3, 4, 4], 7)
data_provider = Provider()
data_provider.full_tensor = lambda: X
env = Environment(data_provider, summary_path='/tmp/cp_' + '7')
cp = CP_ALS(env)
args = CP_ALS.CP_Args(rank=7, validation_internal=1)
# build CP model with arguments
cp.build_model(args)
# train CP model with the maximum iteration of 100
cp.train(50)
# obtain factor matrices from trained model
factor_matrices = cp.factors
# obtain scaling vector from trained model
lambdas = cp.lambdas
for matrix in factor_matrices:
print(matrix)
elapsed = time.time() - t
print(elapsed)
when I run it first time I have no problem. When I run it again (without restart of runtime) I obtain:
CP model initial finish
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
7 frames
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [3,4,4]
[[{{node Placeholder}}]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [3,4,4]
[[node Placeholder (defined at /content/tensorD/tensorD/factorization/cp.py:69) ]]
Caused by op 'Placeholder', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-4-4d2250ec007b>", line 21, in <module>
cp.build_model(args)
File "/content/tensorD/tensorD/factorization/cp.py", line 69, in build_model
input_data = tf.placeholder(tf.float32, shape=self._env.full_shape())
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py", line 2077, in placeholder
return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 5791, in placeholder
"Placeholder", dtype=dtype, shape=shape, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [3,4,4]
[[node Placeholder (defined at /content/tensorD/tensorD/factorization/cp.py:69) ]]
Any help will be appreciated!
This seems to mostly be a question about TensorFlow. Do you get what you want outside of Colab? I'm not totally clear about what you are expecting, but import tensorflow as tf; tf.reset_default_graph() at the top of your snippet seems sensible and squelches the error.
I'm trying to carry out the tutorial named "Training a classifier" with PyTorch.
WHen trying to debug this part of the code :
import matplotlib.pyplot as plt
import numpy as np
# functions to show an image
def imshow(img):
img = img / 2 + 0.5 # unnormalize
npimg = img.numpy()
plt.imshow(np.transpose(npimg, (1, 2, 0)))
# get some random training images
dataiter = iter(trainloader)
images, labels = dataiter.next()
# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))
I get this error message :
Files already downloaded and verified Files already downloaded and verified
Files already downloaded and verified Files already downloaded and verified Traceback (most recent call last):
File "<string>", line 1, in <module>
File "D:\Anaconda\lib\multiprocessing\spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "D:\Anaconda\lib\multiprocessing\spawn.py", line 114, in _main
prepare(preparation_data)
File "D:\Anaconda\lib\multiprocessing\spawn.py", line 225, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "D:\Anaconda\lib\multiprocessing\spawn.py", line 277, in
_fixup_main_from_path
run_name="__mp_main__")
File "D:\Anaconda\lib\runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "D:\Anaconda\lib\runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "D:\Anaconda\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "d:\Yggdrasil\Programmation\PyTorch\TutorialCIFAR10.py", line 36, in <module>
dataiter = iter(trainloader)
File "D:\Anaconda\lib\site-packages\torch\utils\data\dataloader.py", line 451, in __iter__
return _DataLoaderIter(self)
File "D:\Anaconda\lib\site-packages\torch\utils\data\dataloader.py", line 239, in __init__
w.start()
File "D:\Anaconda\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "D:\Anaconda\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "D:\Anaconda\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "D:\Anaconda\lib\multiprocessing\popen_spawn_win32.py", line 33, in __init__
prep_data = spawn.get_preparation_data(process_obj._name)
File "D:\Anaconda\lib\multiprocessing\spawn.py", line 143, in get_preparation_data
_check_not_importing_main()
File "D:\Anaconda\lib\multiprocessing\spawn.py", line 136, in
_check_not_importing_main
is not going to be frozen to produce an executable.)
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
Traceback (most recent call last):
File "d:\Yggdrasil\Programmation\PyTorch\TutorialCIFAR10.py", line 36, in <module>
dataiter = iter(trainloader)
File "D:\Anaconda\lib\site-packages\torch\utils\data\dataloader.py", line 451, in __iter__
return _DataLoaderIter(self)
File "D:\Anaconda\lib\site-packages\torch\utils\data\dataloader.py", line 239, in __init__
w.start()
File "D:\Anaconda\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "D:\Anaconda\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj) File "D:\Anaconda\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "D:\Anaconda\lib\multiprocessing\popen_spawn_win32.py", line 65, in
__init__
reduction.dump(process_obj, to_child)
File "D:\Anaconda\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
BrokenPipeError: [Errno 32] Broken pipe
All the previous lines in the tutorial are working perfectly.
Does someone know how to solve this, please ?
Thanks a lot in advance
The question happened because Windows cannot run this DataLoader in 'num_workers' more than 0.
You can see where the trainloader come from.we can see
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=2)
We need to change the 'num_workers' to 0.like this:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=0)
Every trainloaders need to change like this.
Got the same error. The following workaround works for me:
def run():
# code goes here
if __name__ == '__main__':
run()
This doesn't look to be a PyTorch problem. Try executing the code in Jupyter notebooks and other environment troubleshooting.
you need to add a if-clause protection as stated in the pytorch docs:
https://pytorch.org/docs/stable/notes/windows.html#usage-multiprocessing
I'm in the middle of training my net on tensorflow and I get this error at some part of the training process, though I can not tell exactly what is causing it because it will always happen at a seemingly random training sample at a more or less random epoch of training. I'm at my wits end with this and would like it if someone could shed light on it below.
Traceback (most recent call last):
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1361, in _do_call
return fn(*args)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1340, in _run_fn
target_list, status, run_metadata)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not set native handle, in file tensorflow/core/kernels/mkl_conv_ops.cc:652
[[Node: SRGAN_g/Conv4_4/Conv2D = _MklConv2D[T=DT_FLOAT, _kernel="MklOp", data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SRGAN_g/pixelshufflerx2/1/4/Relu, SRGAN_g/Conv4/W_conv2d/read, SRGAN_g/pixelshufflerx2/1/4/Relu:1, DMT/_132)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 289, in <module>
train()
File "main.py", line 149, in train
errM, _ = sess.run([mse_loss, g_optim_init], {t_image: b_imgs_96, t_target_image: b_imgs_384})
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 905, in run
run_metadata_ptr)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1137, in _run
feed_dict_tensor, options, run_metadata)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1355, in _do_run
options, run_metadata)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1374, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not set native handle, in file tensorflow/core/kernels/mkl_conv_ops.cc:652
[[Node: SRGAN_g/Conv4_4/Conv2D = _MklConv2D[T=DT_FLOAT, _kernel="MklOp", data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SRGAN_g/pixelshufflerx2/1/4/Relu, SRGAN_g/Conv4/W_conv2d/read, SRGAN_g/pixelshufflerx2/1/4/Relu:1, DMT/_132)]]
Caused by op 'SRGAN_g/Conv4_4/Conv2D', defined at:
File "main.py", line 289, in <module>
train()
File "main.py", line 53, in train
net_g = SRGAN_g(t_image, is_train=True, reuse=False)
File "/global/home/hpc4142/srgan-master_TF/model.py", line 51, in SRGAN_g
recursive_layers[i] = Conv2dReuse(recursive_layers[i], 256, (3, 3), (1, 1), act=None, padding='SAME', W_init=w_init, name='n256s1/2/%s' %i, group = "Conv4")
File "/global/home/hpc4142/srgan-master_TF/nu_layers.py", line 461, in Conv2dReuse
group = group)
File "/global/home/hpc4142/srgan-master_TF/nu_layers.py", line 270, in __init__
self.outputs = act( tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format) + b )
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 631, in conv2d
data_format=data_format, dilations=dilations, name=name)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3271, in create_op
op_def=op_def)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1650, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
AbortedError (see above for traceback): Operation received an exception:Status: 3, message: could not set native handle, in file tensorflow/core/kernels/mkl_conv_ops.cc:652
[[Node: SRGAN_g/Conv4_4/Conv2D = _MklConv2D[T=DT_FLOAT, _kernel="MklOp", data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SRGAN_g/pixelshufflerx2/1/4/Relu, SRGAN_g/Conv4/W_conv2d/read, SRGAN_g/pixelshufflerx2/1/4/Relu:1, DMT/_132)]]
I had the same issue but when I tried to apply the same fix I have run into another error. I am however running on 5 gpus. I have read that you need to make sure that your samples are divisible by both the batch sive and number of gpus but I have done that. I have scoured the internet for days and am unable to find anything that has been able to fix the issue I am having. I am running keras v2.0.9 and tensor flow v1.1.0
VARIABLES:
attributeTables[0] is a numpy array shape (35560, 700)
y is a numpy array shape (35560, ) I have also tried using shape (35560, 1) for y but all that happens is the "Incompatible shapes: [2540] vs. [508]" changes from that to "Incompatible shapes: [2540, 1] vs. [508, 1]"
So this says to me that the issue is only with the targetsand that the expected batch size is getting multiplied somewhere in the middle of the process only for the targets and not for attributes causing a mismatch or at least only durring validation I'm not sure.
Here is the code and error in question.
import numpy as np
from keras.models import Sequential
from keras.utils import multi_gpu_model
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
def baseline_model():
# create model
print("Building Layers")
model = Sequential()
model.add(LSTM(700, batch_input_shape=(batchSize, X.shape[1], X.shape[2]), activation='tanh', return_sequences=False, stateful=True))
model.add(Dense(1))
print("Building Parallel model")
parallel_model = multi_gpu_model(model, gpus=nGPU)
# Compile model
#model.compile(loss='mean_squared_error', optimizer='adam')
print("Compiling Model")
parallel_model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
return parallel_model
def buildModel():
print("Bulding Model")
mlp = baseline_model()
print("Fitting Model")
return mlp.fit(X_train, y_train, epochs=1, batch_size=batchSize, shuffle=False, validation_data=(X_test, y_test))
print("Scaling")
scaler = StandardScaler()
X_Scaled = scaler.fit_transform(attributeTables[0])
print("Finding Batch Size")
nGPU = 5
batchSize = 500
while len(X_Scaled) % (batchSize * nGPU) != 0:
batchSize += 1
print("Filling Arrays")
X = X_Scaled.reshape((X_Scaled.shape[0], X_Scaled.shape[1], 1))
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)
print("Calling buildModel()")
model = buildModel()
print("Ploting History")
plt.plot(model.history['loss'], label='train')
plt.plot(model.history['val_loss'], label='test')
plt.legend()
plt.show()
Here is my complete output.
Beginning OHLC Load
Time took : 7.571000099182129
Making gloabal copies
Time took : 0.0
Using TensorFlow backend.
Scaling
Finding Batch Size
Filling Arrays
Calling buildModel()
Bulding Model
Building Layers
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
FutureWarning)
Building Parallel model
Compiling Model
Fitting Model
Train on 28448 samples, validate on 7112 samples
Epoch 1/1
Traceback (most recent call last):
File "<ipython-input-2-74c49f05bfbc>", line 1, in <module>
runfile('C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py', wdir='C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor')
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 77, in <module>
model = buildModel()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 57, in buildModel
return mlp.fit(X_train, y_train, epochs=1, batch_size=batchSize, shuffle=False, validation_data=(X_test, y_test))
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1631, in fit
validation_steps=validation_steps)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1213, in _fit_loop
outs = f(ins_batch)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2332, in __call__
**self.session_kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 778, in run
run_metadata_ptr)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 982, in _run
feed_dict_string, options, run_metadata)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1032, in _do_run
target_list, options, run_metadata)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1052, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: Incompatible shapes: [2540,1] vs. [508,1]
[[Node: training/Adam/gradients/loss/concatenate_1_loss/sub_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:#loss/concatenate_1_loss/sub"], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape, training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape_1)]]
[[Node: replica_1/sequential_1/dense_1/BiasAdd/_313 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:1", send_device_incarnation=1, tensor_name="edge_1355_replica_1/sequential_1/dense_1/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op 'training/Adam/gradients/loss/concatenate_1_loss/sub_grad/BroadcastGradientArgs', defined at:
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 245, in <module>
main()
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 241, in main
kernel.start()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 832, in start
self._run_callback(self._callbacks.popleft())
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 605, in _run_callback
ret = callback()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 265, in enter_eventloop
self.eventloop(self)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 106, in loop_qt5
return loop_qt4(kernel)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 99, in loop_qt4
_loop_qt(kernel.app)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 83, in _loop_qt
app.exec_()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 39, in process_stream_events
kernel.do_one_iteration()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 298, in do_one_iteration
stream.flush(zmq.POLLIN, 1)
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 352, in flush
self._handle_recv()
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2808, in run_ast_nodes
if self.run_code(code, result):
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-74c49f05bfbc>", line 1, in <module>
runfile('C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py', wdir='C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor')
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 77, in <module>
model = buildModel()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 57, in buildModel
return mlp.fit(X_train, y_train, epochs=1, batch_size=batchSize, shuffle=False, validation_data=(X_test, y_test))
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1608, in fit
self._make_train_function()
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 990, in _make_train_function
loss=self.total_loss)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py", line 415, in get_updates
grads = self.get_gradients(loss, params)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py", line 73, in get_gradients
grads = K.gradients(loss, params)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2369, in gradients
return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in gradients
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 368, in _MaybeCompile
return grad_fn() # Exit early
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in <lambda>
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py", line 609, in _SubGrad
rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 411, in _broadcast_gradient_args
name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
self._traceback = _extract_stack()
...which was originally created as op 'loss/concatenate_1_loss/sub', defined at:
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 245, in <module>
main()
[elided 27 identical lines from previous traceback]
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 77, in <module>
model = buildModel()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 55, in buildModel
mlp = baseline_model()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 29, in baseline_model
parallel_model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 860, in compile
sample_weight, mask)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 460, in weighted
score_array = fn(y_true, y_pred)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 13, in mean_absolute_error
return K.mean(K.abs(y_pred - y_true), axis=-1)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 821, in binary_op_wrapper
return func(x, y, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2627, in _sub
result = _op_def_lib.apply_op("Sub", x=x, y=y, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): Incompatible shapes: [2540,1] vs. [508,1]
[[Node: training/Adam/gradients/loss/concatenate_1_loss/sub_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:#loss/concatenate_1_loss/sub"], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape, training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape_1)]]
[[Node: replica_1/sequential_1/dense_1/BiasAdd/_313 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:1", send_device_incarnation=1, tensor_name="edge_1355_replica_1/sequential_1/dense_1/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Daniel Moller's link was right when i disabled the parallel model and put it on one GPU the stateful worked no ptoblem. Currently waiting on it to train. Will post results.
I just published an experimental utility, stateful_multi_gpu, to handle stateful model training of multiple GPUs. I'm interested to know if it is of use to you.
Please also see my answer for the same question Daniel Möller referred to.