I'm attempting to train multiple keras models with different parameter values using multiple threads (and the tensorflow backend). I've seen a few examples of using the same model within multiple threads, but in this particular case, I run into various errors regarding conflicting graphs, etc. Here's a simple example of what I'd like to be able to do:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
from keras.models import Sequential
sess = tf.Session()
def example_model(size):
model = Sequential()
model.add(Dense(size, input_shape=(5,)))
model.add(Dense(1))
model.compile(optimizer='sgd', loss='mse')
return model
if __name__ == '__main__':
K.set_session(sess)
X = np.random.random((10, 5))
y = np.random.random((10, 1))
models = [example_model(i) for i in range(5, 10)]
e = ThreadPoolExecutor(4)
res_list = [e.submit(model.fit, X, y) for model in models]
for res in res_list:
print(res.result())
The resulting error is ValueError: Tensor("Variable:0", shape=(5, 5), dtype=float32_ref) must be from the same graph as Tensor("Variable_2/read:0", shape=(), dtype=float32).. I've also tried initializing the models within the threads which gives a similar failure.
Any thoughts on the best way to go about this? I'm not at all attached to this exact structure, but I'd prefer to be able to use multiple threads rather than processes so all the models are trained within the same GPU memory allocation.
Tensorflow Graphs are not threadsafe (see https://www.tensorflow.org/api_docs/python/tf/Graph) and when you create a new Tensorflow Session, it by default uses the default graph.
You can get around this by creating a new session with a new graph in your parallelized function and constructing your keras model there.
Here is some code that creates and fits a model on each available gpu in parallel:
import concurrent.futures
import numpy as np
import keras.backend as K
from keras.layers import Dense
from keras.models import Sequential
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
xdata = np.random.randn(100, 8)
ytrue = np.random.randint(0, 2, 100)
def fit(gpu):
with tf.Session(graph=tf.Graph()) as sess:
K.set_session(sess)
with tf.device(gpu):
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
model.fit(xdata, ytrue, verbose=0)
return model.evaluate(xdata, ytrue, verbose=0)
gpus = get_available_gpus()
with concurrent.futures.ThreadPoolExecutor(len(gpus)) as executor:
results = [x for x in executor.map(fit, gpus)]
print('results: ', results)
Related
I'm trying to develop a multitask deep neural network (MTDNN) to make prediction on small molecule bioactivity against kinase targets and something is definitely wrong with my model structure but I can't figure out what.
For my training data (highly imbalanced data with 0 as inactive and 1 as active), I have 423 unique kinase targets (tasks) and over 400k unique compounds. I first calculate the ECFP fingerprint using smiles, and then I randomly split the input data into train, test, and valid sets based on 8:1:1 ratio using RandomStratifiedSplitter from deepchem package. After training my model using the train set and I want to make prediction on the test set to check model performance.
Here's what my data looks like (screenshot example):
(https://i.stack.imgur.com/8Hp36.png)
Here's my code:
# Import Packages
import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers, regularizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Dropout, Reshape
from tensorflow.keras.optimizers import SGD
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
# Build Model
inputs = keras.Input(shape = (1024, ))
x = keras.layers.Dense(2000, activation='relu', name="dense2000",
kernel_initializer=initializers.RandomNormal(stddev=0.02),
bias_initializer=initializers.Ones(),
kernel_regularizer=regularizers.L2(l2=.0001))(inputs)
x = keras.layers.Dropout(rate=0.25)(x)
x = keras.layers.Dense(500, activation='relu', name='dense500')(x)
x = keras.layers.Dropout(rate=0.25)(x)
x = keras.layers.Dense(846, activation='relu', name='output1')(x)
logits = Reshape([423, 2])(x)
outputs = keras.layers.Softmax(axis=2)(logits)
Model1 = keras.Model(inputs=inputs, outputs=outputs, name='MTDNN')
Model1.summary()
opt = keras.optimizers.SGD(learning_rate=.0003, momentum=0.9)
def loss_function (output, labels):
loss = tf.nn.softmax_cross_entropy_with_logits(output,labels)
return loss
loss_fn = loss_function
Model1.compile(loss=loss_fn, optimizer=opt,
metrics=[keras.metrics.Accuracy(),
keras.metrics.AUC(),
keras.metrics.Precision(),
keras.metrics.Recall()])
for train, test, valid in split2:
trainX = pd.DataFrame(train.X)
trainy = pd.DataFrame(train.y)
trainy2 = tf.one_hot(trainy,2)
testX = pd.DataFrame(test.X)
testy = pd.DataFrame(test.y)
testy2 = tf.one_hot(testy,2)
validX = pd.DataFrame(valid.X)
validy = pd.DataFrame(valid.y)
validy2 = tf.one_hot(validy,2)
history = Model1.fit(x=trainX, y=trainy2,
shuffle=True,
epochs=10,
verbose=1,
batch_size=100,
validation_data=(validX, validy2))
y_pred = Model1.predict(testX)
y_pred2 = y_pred[:, :, 1]
y_pred3 = np.round(y_pred2)
# Check the # of nonzero in assay
(y_pred3!=0).sum () #all 0s
My questions are:
The roc and precision recall are all extremely high (>0.99), but the prediction result of test set contains all 0s, no actives at all. I also use the randomized dataset with same active:inactive ratio for each task to test if those values are too good to be true, and turns out all values are still above 0.99, including roc which is expected to be 0.5.
Can anyone help me to identify what is wrong with my model and how should I fix it please?
Can I use built-in functions in sklearn to calculate roc/accuracy/precision-recall? Or should I manually calculate the metrics based on confusion matrix on my own for multitasking purpose. Why and why not?
I'm running an LSTM network that works fine (TF 2.0). My problem starts when trying to modify the loss function.
I planed to adjust some data manipulation over 'y_true' and 'y_pred' but since TF force to maintain the data as tensors (and not convert it to Pandas or NumPy) it is challenging.
To get better control of the data inside the loss function I've simulated tf.keras.losses.mae function.
My goal was to be able to see the data ('y_true' and 'y_pred') so I can make my desire adjustments.
The original function:
def mean_absolute_error(y_true, y_pred):
y_pred = ops.convert_to_tensor(y_pred)
y_true = math_ops.cast(y_true, y_pred.dtype)
return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
And after adjustments for debugging:
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
import tensorflow.keras.backend as K
def mean_absolute_error_test(y_true, y_pred):
global temp_true
temp_true=y_true
print(y_true)
y_pred = ops.convert_to_tensor(y_pred)
y_true = math_ops.cast(y_true, y_pred.dtype)
return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
when I run model.compile and print y_true I get:
Tensor("dense_target:0", shape=(None, None), dtype=float32)
type=tensorflow.python.framework.ops.Tensor
Does anyone know how can I see 'y_pred' and 'y_true' or what am I missing?
Seems like I can't see samples of y_true or the data is empty.
The main code part:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dropout,Dense
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.python.keras.layers.recurrent import LSTM
from tensorflow.keras.callbacks import EarlyStopping
K.clear_session()
model = Sequential()
model.add(LSTM(20,activation='relu',input_shape=(look_back,len(training_columns)),recurrent_dropout=0.4))
model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss=test2,experimental_run_tf_function=False)# mse,mean_squared_logarithmic_error
num_epochs = 20
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
history=model.fit(X_train_lstm, y_train_lstm, epochs = num_epochs, batch_size = 128,shuffle=False,verbose=1,validation_data=[X_test_lstm,y_test_lstm],callbacks=[es])
I am loading my pre-trained keras model and then trying to parallelize a large number of input data using dask? Unfortunately, I'm running into some issues with this relating to how I'm creating my dask array. Any guidance would be greatly appreciated!
Setup:
First I cloned from this repo https://github.com/sanchit2843/dlworkshop.git
Reproducible Code Example:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from keras.models import load_model
import keras
from keras.models import Sequential
from keras.layers import Dense
from dask.distributed import Client
import warnings
import dask.array as DaskArray
warnings.filterwarnings('ignore')
dataset = pd.read_csv('data/train.csv')
X = dataset.drop(['price_range'], axis=1).values
y = dataset[['price_range']].values
# scale data
sc = StandardScaler()
X = sc.fit_transform(X)
ohe = OneHotEncoder()
y = ohe.fit_transform(y).toarray()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
# Neural network
model = Sequential()
model.add(Dense(16, input_dim=20, activation="relu"))
model.add(Dense(12, activation="relu"))
model.add(Dense(4, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=64)
# Use dask
client = Client()
def load_and_predict(input_data_chunk):
def contrastive_loss(y_true, y_pred):
margin = 1
square_pred = K.square(y_pred)
margin_square = K.square(K.maximum(margin - y_pred, 0))
return K.mean(y_true * square_pred + (1 - y_true) * margin_square)
mlflow.set_tracking_uri('<uri>')
mlflow.set_experiment('clean_parties_ml')
runs = mlflow.search_runs()
artifact_uri = runs.loc[runs['start_time'].idxmax()]['artifact_uri']
model = mlflow.keras.load_model(artifact_uri + '/model', custom_objects={'contrastive_loss': contrastive_loss})
y_pred = model.predict(input_data_chunk)
return y_pred
da_input_data = da.from_array(X_test, chunks=(100, None))
prediction_results = da_input_data.map_blocks(load_and_predict, dtype=X_test.dtype).compute()
The Error I'm receiving:
AttributeError: '_thread._local' object has no attribute 'value'
Keras/Tensorflow don't play nicely with other threaded systems. There is an ongoing issue on this topic here: https://github.com/dask/dask-examples/issues/35
I am using Tensorflow 2.x, The below is the custom learning rate scheduler which i have written
def scheduler(epoch):
if epoch == 1:
return 3e-5
else:
return 3e-5 * (1/(1 + 0.01 * epoch ))
and i am calling it like this
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
model.fit(inputs_train,tags_train,epochs=30,batch_size=32,validation_data=(inputs_val,tags_val),shuffle=False,callbacks=[callback])
But instead of calling it on epochs, i want to call it on each batch. I couldn't find anything below documentation regarding batches
https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/LearningRateScheduler
Is it possible to call it on batches, if yes how to do that?
Write a custom callback and use backend set_value method
from keras.models import Sequential
from keras.layers import Dense
import keras
import numpy as np
import tensorflow.keras.backend as K
model = Sequential()
model.add(Dense(8, input_dim=2, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
x = np.random.randn(10,2)
y = np.random.randint(0,2,(10,2))
class lr_callback(keras.callbacks.Callback):
def on_batch_end(self, batch, logs=None):
K.set_value(self.model.optimizer.lr, 0.54321)
model.fit(x,y,epochs=2,batch_size=4,shuffle=False, callbacks=[lr_callback()])
print (K.get_value(model.optimizer.lr))
I have 2 GPU GTX1080 with Keras v2 installed. I run 2 training processes on gpu0 and gpu1 simultaneously. But, I got ResourceExhaustedError.
What is missing?
python multi-gpu-process.py --gpu_id=1 (ok)
python multi-gpu-process.py --gpu_id=0 (ResourceExhaustedError. Help pls.)
# file: multi-gpu-process.py (2 training processes work on different GPUs)
import numpy as np
import os
def get_available_gpus():
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
# What GPU is installed.
gpu_list = get_available_gpus()
# Specified gpu installed on machine?
if not '/gpu:' + str(FLAGS.gpu_id) in gpu_list:
raise Exception('This gpu is not installed: /gpu:{}'.format(FLAGS.gpu_id))
# Set GPU in environment.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' # see issue #152
os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)
import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, Activation
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
from tensorflow.python.client import device_lib
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('gpu_id', 0, """GPU id. Single gpu case.""")
# Use one GPU.
import keras.backend.tensorflow_backend as K # If import and not used, error on session release.
# Solve this error
# https://stackoverflow.com/questions/42969779/keras-error-you-must-feed-a-value-for-placeholder-tensor-bidirectional-1-keras
K.set_learning_phase(1) # set learning phase
# train on specified gpu
with K.tf.device('/gpu:%d' % FLAGS.gpu_id):
K.set_session(K.tf.Session(config=K.tf.ConfigProto(allow_soft_placement=True, # True. Allow to find other device if specified is not available.
log_device_placement=True)))
# To prove running multi process on gpu. Make small model.
model = Sequential()
model.add(Dense(400, input_dim=800, activation='tanh'))
model.add(Dense(200, input_dim=800, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print (model.summary())
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
x = np.random.rand(131072, 800)
y = np.random.randint(0, 2, (131072, 1))
model.fit(x, y, batch_size=2048*4, epochs=1000000)
K.clear_session() # Without it, session error at the end.
==================Not work when starting 2nd process on 2nd GPU (GTX 1080, 8GB)=========================
# Increased model size compared with working version
#
# file: multi-gpu-process_notwork.py
import numpy as np
import os
def get_available_gpus():
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
# What GPU is installed.
gpu_list = get_available_gpus()
# Specified gpu installed on machine?
if not '/gpu:' + str(FLAGS.gpu_id) in gpu_list:
raise Exception('This gpu is not installed: /gpu:{}'.format(FLAGS.gpu_id))
# Set GPU in environment.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' # see issue #152
os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)
import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, Activation
from keras.layers.convolutional import Conv2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
from tensorflow.python.client import device_lib
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('gpu_id', 0, """GPU id. Single gpu case.""")
# Use one GPU.
import keras.backend.tensorflow_backend as K # If import and not used, error on session release.
# Solve this error
# https://stackoverflow.com/questions/42969779/keras-error-you-must-feed-a-value-for-placeholder-tensor-bidirectional-1-keras
K.set_learning_phase(1) # set learning phase
# train on specified gpu
with K.tf.device('/gpu:%d' % FLAGS.gpu_id):
K.set_session(K.tf.Session(config=K.tf.ConfigProto(allow_soft_placement=True, # True. Allow to find other device if specified is not available.
log_device_placement=True)))
# To prove running multi process on gpu. Make small model.
model = Sequential()
model.add(Dense(4000, input_dim=8000, activation='tanh'))
model.add(Dense(2000, input_dim=8000, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print (model.summary())
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
x = np.random.rand(131072, 8000)
y = np.random.randint(0, 2, (131072, 1))
model.fit(x, y, batch_size=2048*4, epochs=1000000)
K.clear_session() # Without it, session error at the end.
Partial error:
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[8192,400]
[[Node: gradients/dense_1/Tanh_grad/TanhGrad = TanhGrad[T=DT_FLOAT, _class=["loc:#dense_1/Tanh"], _device="/job:localhost/replica:0/task:0/gpu:0"](dense_1/Tanh, gradients/dense_2/MatMul_grad/MatMul)]]
Since the GPU has 8GB, it has nothing to do with model, which is pretty small size.
Your model may be small, but your batch size of 8192 is probably way too big for your memory; start with a value of 64 or 128, and then you can try increasing it until you get an out-of-memory (OOM) error again...
EDIT: Although reducing your batch size will probably eliminate the error, there is still the issue of discrepancy between your 2 GPUs - as Yu-Yang correctly points out in the comments, you should set CUDA_VISIBLE_DEVICES before importing Keras, otherwise your Tensorflow backend will occupy the memory of both GPUs. Doing so will most probably eliminate the discrepancy (it might also let you keep your high batch size without an OOM error).