Keras: running 2 independent training processes on 2 GPUs gives ResourceExhaustedError

I have two GTX 1080 GPUs and Keras v2 installed. I run two training processes simultaneously, one on gpu0 and one on gpu1, but I get a ResourceExhaustedError. What am I missing?
python multi-gpu-process.py --gpu_id=1 (ok)
python multi-gpu-process.py --gpu_id=0 (ResourceExhaustedError; help please)
# file: multi-gpu-process.py (2 training processes work on different GPUs)
import numpy as np
import os

import tensorflow as tf
from tensorflow.python.client import device_lib

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('gpu_id', 0, """GPU id. Single gpu case.""")

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

# Which GPUs are installed?
gpu_list = get_available_gpus()

# Is the specified GPU installed on this machine?
if '/gpu:' + str(FLAGS.gpu_id) not in gpu_list:
    raise Exception('This gpu is not installed: /gpu:{}'.format(FLAGS.gpu_id))

# Restrict this process to the chosen GPU.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'  # see issue #152
os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)

import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, Activation
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

# Use one GPU.
import keras.backend.tensorflow_backend as K  # If imported but not used, an error occurs on session release.

# Works around this error:
# https://stackoverflow.com/questions/42969779/keras-error-you-must-feed-a-value-for-placeholder-tensor-bidirectional-1-keras
K.set_learning_phase(1)  # set learning phase

# Train on the specified GPU.
with K.tf.device('/gpu:%d' % FLAGS.gpu_id):
    K.set_session(K.tf.Session(config=K.tf.ConfigProto(
        allow_soft_placement=True,   # Allow another device if the specified one is not available.
        log_device_placement=True)))

    # A small model, just to prove that multiple processes can run on the GPUs.
    model = Sequential()
    model.add(Dense(400, input_dim=800, activation='tanh'))
    model.add(Dense(200, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())

    optimizer = keras.optimizers.Adam(lr=0.0001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    x = np.random.rand(131072, 800)
    y = np.random.randint(0, 2, (131072, 1))
    model.fit(x, y, batch_size=2048*4, epochs=1000000)

K.clear_session()  # Without it, a session error occurs at the end.
================== Does not work when starting the 2nd process on the 2nd GPU (GTX 1080, 8 GB) ==================
# Increased model size compared with working version
#
# file: multi-gpu-process_notwork.py
import numpy as np
import os

import tensorflow as tf
from tensorflow.python.client import device_lib

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('gpu_id', 0, """GPU id. Single gpu case.""")

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

# Which GPUs are installed?
gpu_list = get_available_gpus()

# Is the specified GPU installed on this machine?
if '/gpu:' + str(FLAGS.gpu_id) not in gpu_list:
    raise Exception('This gpu is not installed: /gpu:{}'.format(FLAGS.gpu_id))

# Restrict this process to the chosen GPU.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'  # see issue #152
os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)

import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, Activation
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

# Use one GPU.
import keras.backend.tensorflow_backend as K  # If imported but not used, an error occurs on session release.

# Works around this error:
# https://stackoverflow.com/questions/42969779/keras-error-you-must-feed-a-value-for-placeholder-tensor-bidirectional-1-keras
K.set_learning_phase(1)  # set learning phase

# Train on the specified GPU.
with K.tf.device('/gpu:%d' % FLAGS.gpu_id):
    K.set_session(K.tf.Session(config=K.tf.ConfigProto(
        allow_soft_placement=True,   # Allow another device if the specified one is not available.
        log_device_placement=True)))

    # Larger model than the working version.
    model = Sequential()
    model.add(Dense(4000, input_dim=8000, activation='tanh'))
    model.add(Dense(2000, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())

    optimizer = keras.optimizers.Adam(lr=0.0001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    x = np.random.rand(131072, 8000)
    y = np.random.randint(0, 2, (131072, 1))
    model.fit(x, y, batch_size=2048*4, epochs=1000000)

K.clear_session()  # Without it, a session error occurs at the end.
Partial error:
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[8192,400]
[[Node: gradients/dense_1/Tanh_grad/TanhGrad = TanhGrad[T=DT_FLOAT, _class=["loc:@dense_1/Tanh"], _device="/job:localhost/replica:0/task:0/gpu:0"](dense_1/Tanh, gradients/dense_2/MatMul_grad/MatMul)]]
Since the GPU has 8 GB of memory, this should have nothing to do with the model, which is pretty small.

Your model may be small, but your batch size of 8192 is probably way too big for your memory; start with a value of 64 or 128, and then you can try increasing it until you get an out-of-memory (OOM) error again...
EDIT: Although reducing your batch size will probably eliminate the error, there is still the issue of discrepancy between your 2 GPUs - as Yu-Yang correctly points out in the comments, you should set CUDA_VISIBLE_DEVICES before importing Keras, otherwise your Tensorflow backend will occupy the memory of both GPUs. Doing so will most probably eliminate the discrepancy (it might also let you keep your high batch size without an OOM error).
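For concreteness, here is a minimal sketch of the suggested fix (not the asker's exact script): pin the process to a single GPU before any Keras/TensorFlow import, then start with a modest batch size and grow it.
import os
import numpy as np

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'   # use '1' for the process on the second GPU

import keras                               # imported only after CUDA_VISIBLE_DEVICES is set
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(400, input_dim=800, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

x = np.random.rand(131072, 800)
y = np.random.randint(0, 2, (131072, 1))
model.fit(x, y, batch_size=128, epochs=1)  # start around 64-128 and increase until OOM reappears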

Related

Tensorflow custom loss function - can't get samples of y_pred and y_true in loss function

I'm running an LSTM network that works fine (TF 2.0). My problem starts when trying to modify the loss function.
I planned to apply some data manipulation to 'y_true' and 'y_pred', but since TF forces the data to stay as tensors (rather than letting me convert it to Pandas or NumPy), this is challenging.
To get better control of the data inside the loss function, I've replicated the tf.keras.losses.mae function.
My goal was to be able to see the data ('y_true' and 'y_pred') so I can make my desired adjustments.
The original function:
def mean_absolute_error(y_true, y_pred):
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
And after adjustments for debugging:
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
import tensorflow.keras.backend as K

def mean_absolute_error_test(y_true, y_pred):
    global temp_true
    temp_true = y_true
    print(y_true)
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
When I run model.compile and print y_true, I get:
Tensor("dense_target:0", shape=(None, None), dtype=float32)
type=tensorflow.python.framework.ops.Tensor
Does anyone know how I can see 'y_pred' and 'y_true', or what am I missing?
It seems I can't see samples of y_true, or the data is empty.
The main code part:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dropout,Dense
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.python.keras.layers.recurrent import LSTM
from tensorflow.keras.callbacks import EarlyStopping
K.clear_session()
model = Sequential()
model.add(LSTM(20, activation='relu', input_shape=(look_back, len(training_columns)), recurrent_dropout=0.4))
model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss=test2, experimental_run_tf_function=False)  # mse, mean_squared_logarithmic_error
num_epochs = 20
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
history = model.fit(X_train_lstm, y_train_lstm, epochs=num_epochs, batch_size=128, shuffle=False, verbose=1,
                    validation_data=[X_test_lstm, y_test_lstm], callbacks=[es])
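One way to actually see batch values inside a loss, sketched under the assumption that the rest of the model above stays unchanged: Python's print only shows the symbolic tensor in graph mode, whereas tf.print executes with the graph and prints real values; the mae_with_logging name below is a hypothetical stand-in for the custom loss.
import tensorflow as tf
import tensorflow.keras.backend as K

def mae_with_logging(y_true, y_pred):
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    # tf.print runs at execution time, so it shows actual batch values during training
    tf.print("y_true batch:", y_true[:2], "y_pred batch:", y_pred[:2])
    return K.mean(tf.abs(y_pred - y_true), axis=-1)

# model.compile(optimizer='adam', loss=mae_with_logging)
# model.run_eagerly = True   # optional: run eagerly so y_true.numpy() also works inside the loss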

Wrapper for Keras Model in Spark

I have a Keras neural network and I want to deploy this model using a wrapper in the Spark environment, so I tried the following tutorial here:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Input, Dense, Conv1D, Conv2D, MaxPooling2D, Dropout,Flatten
from keras import backend as K
from keras.models import Model
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Expect to see a numpy n-dimensional array of shape (60000, 28, 28)
type(X_train), X_train.shape, type(X_train)

# This time, however, we flatten each of our 28 x 28 images to a vector of length 784
X_train = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)

# Expect to see numpy n-dimensional arrays: shape (60000, 784) for the training data and (10000, 784) for the test data
type(X_train), X_train.shape, X_test.shape

# We also use sklearn's MinMaxScaler for normalizing
from sklearn.preprocessing import MinMaxScaler

def scaleData(data):
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(data)
X_train = scaleData(X_train)
X_test = scaleData(X_test)
# We define the same Keras model as earlier
input_shape = (1,28,28) if K.image_data_format() == 'channels_first' else (28,28, 1)
keras_model = Sequential()
keras_model.add(Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=input_shape, padding='same'))
keras_model.add(MaxPooling2D(pool_size=(2, 2)))
keras_model.add(Conv2D(64, (5, 5), activation='relu', padding='same'))
keras_model.add(MaxPooling2D(pool_size=(2, 2)))
keras_model.add(Flatten())
keras_model.add(Dense(512, activation='relu'))
keras_model.add(Dropout(0.5))
keras_model.add(Dense(10, activation='softmax'))
keras_model.summary()
# Import the Keras-to-DML wrapper and define some basic variables
from systemml.mllearn import Keras2DML
import math

epochs = 5
batch_size = 100
samples = 60000
max_iter = int(epochs * math.ceil(samples / batch_size))

# Now create a SystemML model by calling the Keras2DML method and feeding it your Spark session, Keras model,
# its input shape, and the predefined variables. We also ask to be shown the training results every 10 iterations.
sysml_model = Keras2DML(spark, keras_model, input_shape=(1,28,28), weights='weights_dir', batch_size=batch_size, max_iter=max_iter, test_interval=0, display=10)

# Initiate training. More Spark workers and a better machine configuration mean faster training!
sysml_model.fit(X_train, y_train)

# Test your model's performance on the held-out test set, and re-iterate if required
sysml_model.score(X_test, y_test)
At the line
from systemml.mllearn import Keras2DML
the error I get is:
Traceback (most recent call last):
  File "d:/SparkJarDirectory/./NNSpark.py", line 58, in <module>
    from systemml.mllearn import Keras2DML
  File "C:\Users\xyz\AppData\Local\Continuum\anaconda3\lib\site-packages\systemml\mllearn\__init__.py", line 45, in <module>
    from .estimators import *
  File "C:\Users\xyz\AppData\Local\Continuum\anaconda3\lib\site-packages\systemml\mllearn\estimators.py", line 917
    def __init__(self, sparkSession, keras_model, input_shape, transferUsingDF=False, load_keras_weights=True, weights=None,
                 labels=None, batch_size=64, max_iter=2000, test_iter=10,
                 test_interval=500, display=100, lr_policy="step", weight_decay=5e-4,
                 regularization_type="L2"):
    ^
SyntaxError: import * only allowed at module level
2019-03-12 20:25:48 INFO ShutdownHookManager:54 - Shutdown hook called
2019-03-12 20:25:48 INFO ShutdownHookManager:54 - Deleting directory C:\Users\xyz\AppData\Local\Temp\spark-2e1736f8-1798-42da-a157-cdf0ade1bf36
From my understanding, there is an issue in the library I am using, where they use
from .estimators import *
__all__ = estimators.__all__
I am not sure why the wrapper is not working or what fix is required. Any help is appreciated.
I think the systemml 1.2.0 release is missing some fixes for Python 3.5 (https://github.com/apache/systemml/commit/9e7ee19a45102f7cbb37507da25b1ba0641868fd), so you will need to install systemml from source (for my setup, which is different from yours, that would be a git clone followed by "cd src/main/python; sudo python3.4 setup.py install").

Colab+Keras+TensorBoard FailedPreconditionError

I'm trying to run a simple Keras script and use Google Colab with TensorBoard. Here's my code:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
# Settings
num_classes = 10
batch_size = 16
epochs = 1
# Data setup
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
# Select model
model = MobileNet(weights=None, input_shape=x_train.shape[1:], classes=num_classes)
# Select loss, optimizer, metric
model.compile(loss='categorical_crossentropy',
              optimizer=tf.train.AdamOptimizer(0.001),
              metrics=['accuracy'])
# Train
tbc = TensorBoardColab()
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test),
          callbacks=[TensorBoardColabCallback(tbc)])
This is a suggestion I saw to use TensorBoard with Colab as referenced here: Can I use Tensorboard with Google Colab?
However, when adding the callback I get the error:
FailedPreconditionError: Error while reading resource variable conv_dw_8_2/depthwise_kernel from Container: localhost. This could mean that the variable was uninitialized. Not found: Resource localhost/conv_dw_8_2/depthwise_kernel/N10tensorflow3VarE does not exist.
[[Node: conv_dw_8_2/depthwise/ReadVariableOp = ReadVariableOp[dtype=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]]
[[Node: loss_2/mul/_147 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_6752_loss_2/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]]]
Does anybody know what I'm doing wrong? This seems like a very useful way to run TensorBoard on Colab if I could get it working.
This is caused by conflicting versions of Keras. Tensorboardcolab uses the full keras library while you import the tf.keras implementation of the Keras API. So when you fit the model you end up using two different versions of keras.
You have a few options:
Use the Keras library and change your imports
import tensorflow as tf
import keras
from keras.datasets import cifar10
from keras.applications.mobilenet import MobileNet
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
Although the code runs fine with these changes, you might consider using Keras's version of the Adam optimizer, so you don't need to import tensorflow explicitly anymore.
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(lr=0.001),
              metrics=['accuracy'])
Use tf.keras and patch TensorBoardColab
Your code runs fine if you patch callbacks.py and core.py and fix the imports there, replacing
from keras.callbacks import TensorBoard
with
from tensorflow.keras.callbacks import TensorBoard
You could also use this fork where I made these changes.

TensorFlow/Keras multi-threaded model fitting

I'm attempting to train multiple keras models with different parameter values using multiple threads (and the tensorflow backend). I've seen a few examples of using the same model within multiple threads, but in this particular case, I run into various errors regarding conflicting graphs, etc. Here's a simple example of what I'd like to be able to do:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
from keras.models import Sequential
sess = tf.Session()
def example_model(size):
    model = Sequential()
    model.add(Dense(size, input_shape=(5,)))
    model.add(Dense(1))
    model.compile(optimizer='sgd', loss='mse')
    return model

if __name__ == '__main__':
    K.set_session(sess)
    X = np.random.random((10, 5))
    y = np.random.random((10, 1))
    models = [example_model(i) for i in range(5, 10)]
    e = ThreadPoolExecutor(4)
    res_list = [e.submit(model.fit, X, y) for model in models]
    for res in res_list:
        print(res.result())
The resulting error is ValueError: Tensor("Variable:0", shape=(5, 5), dtype=float32_ref) must be from the same graph as Tensor("Variable_2/read:0", shape=(), dtype=float32).. I've also tried initializing the models within the threads which gives a similar failure.
Any thoughts on the best way to go about this? I'm not at all attached to this exact structure, but I'd prefer to be able to use multiple threads rather than processes so all the models are trained within the same GPU memory allocation.
Tensorflow Graphs are not threadsafe (see https://www.tensorflow.org/api_docs/python/tf/Graph) and when you create a new Tensorflow Session, it by default uses the default graph.
You can get around this by creating a new session with a new graph in your parallelized function and constructing your keras model there.
Here is some code that creates and fits a model on each available gpu in parallel:
import concurrent.futures
import numpy as np
import keras.backend as K
from keras.layers import Dense
from keras.models import Sequential
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

xdata = np.random.randn(100, 8)
ytrue = np.random.randint(0, 2, 100)

def fit(gpu):
    with tf.Session(graph=tf.Graph()) as sess:
        K.set_session(sess)
        with tf.device(gpu):
            model = Sequential()
            model.add(Dense(12, input_dim=8, activation='relu'))
            model.add(Dense(8, activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam')
            model.fit(xdata, ytrue, verbose=0)
            return model.evaluate(xdata, ytrue, verbose=0)

gpus = get_available_gpus()
with concurrent.futures.ThreadPoolExecutor(len(gpus)) as executor:
    results = [x for x in executor.map(fit, gpus)]
print('results: ', results)

Using model.pop() changes the model's summary but does not affect the output

I am trying to remove the top layers from a model I have previously trained.
This is the code I use:
import os
import h5py
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers import Activation, Dropout, Flatten, Dense
# KERAS_BACKEND=theano python
import keras
keras.backend.set_image_dim_ordering("th")
img_width, img_height = 150, 150
data_dir = '//shared_directory/projects/try_CD/data/validation'
nb_train_samples = 2000
nb_validation_samples = 800
nb_epoch = 50
def make_bottleneck_features(model):
    datagen = ImageDataGenerator(rescale=1./255)
    generator = datagen.flow_from_directory(
        data_dir,
        target_size=(img_width, img_height),
        batch_size=32,
        class_mode=None,
        shuffle=False)
    bottleneck_features = model.predict_generator(generator, nb_validation_samples)
    return bottleneck_features

model = keras.models.load_model('/shared_directory/projects/think_exp/CD_M1.h5')
A = make_bottleneck_features(model)
model.summary()

for i in range(6):
    model.pop()

B = make_bottleneck_features(model)
model.summary()
Comparing the results of the two calls to model.summary(), I can see that the 6 topmost layers were indeed removed.
However, the model's output (saved to A and B) does not change after discarding these layers.
What is the source of that discrepancy?
How can I retrieve the output of the desired layer instead of that of the entire model?
Thanks in advance!
You can't drop layers like that; for the change to have an effect, you need to recompile the model (i.e. call model.compile again).
But that's not the best way to obtain outputs from intermediate layers anyway: you can just use K.function (where K is keras.backend) to build a function from the input to one of the layers and then call that function. More details are available in this answer.
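For illustration, a minimal sketch of the K.function approach (the layer index -7 and the batch x_batch are placeholders, not taken from the question):
from keras import backend as K

# Build a backend function from the model's input to the output of an intermediate layer,
# then call it on a batch of data; the second input, 0, selects the test phase.
get_intermediate = K.function([model.input, K.learning_phase()],
                              [model.layers[-7].output])
intermediate_output = get_intermediate([x_batch, 0])[0]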
