Parallelization of multiples independent models in tensorflow-gpu / keras - python-3.x

I need to train a set of models but do not benefit from GPU acceleration using tensorflow-gpu / keras as time augments linearly with the number of models trained.
In
class Models(tf.keras.Model):
def __init__(self,N_MODELS=1):
super(Models, self).__init__()
self.block_i = [estimate_affine()
for node in range(N_MODELS)]
def call(self, inputs):
x = [self.block_i[i](input_i) for i,input_i in enumerate(inputs)]
return x
a list of N_MODELS layers are built and as are idenpendant should be parallelized. As it is not the case, even though output is what I expect, I guess my implementation is not optimal. Any idea how to make it parallelizable ?
Best
Paul
Here is a toynet of N_MODELS of linear regression
import tensorflow as tf
tf.enable_eager_execution()
from tensorflow.keras import layers
import numpy as np
from numpy import random
import time
class estimate_affine(layers.Layer):
def __init__(self):
'''
'''
super(estimate_affine, self).__init__()
self.a = tf.Variable(initial_value=[0.], dtype='float32',trainable=True,name='par1')
self.b = tf.Variable(initial_value=[0.], dtype='float32',trainable=True,name='par2')
def call(self, inputs):
return (self.a,self.b)
class Models(tf.keras.Model):
def __init__(self,N_MODELS=1):
super(Models, self).__init__()
self.block_i = [estimate_affine()
for node in range(N_MODELS)]
def call(self, inputs):
x = [self.block_i[i](input_i) for i,input_i in enumerate(inputs)]
return x
N_ITERATIONS=100
N_POINTS=100
ls_t=[]
for N_MODELS in [5,10,50,100,1000]:
t=time.time()
### Aim is to fit N_MODELS on N_POINTS which are basically N_MODELS of ax+b
a=np.random.randint(0,10,N_MODELS)
b=np.random.randint(0,10,N_MODELS)
noise=np.random.rand(N_POINTS) * 1
x=np.linspace(0,1,N_POINTS)
dataset=np.array([a_i *( x + noise) + b_i for a_i,b_i in zip(a,b)])
model=Models(N_MODELS=N_MODELS)
optimizer=tf.keras.optimizers.SGD(learning_rate=5e-3)
for i in range(N_ITERATIONS):
with tf.GradientTape() as tape:
outputs=model(dataset)
L=tf.reduce_sum([((outputs[idx][0]*x+outputs[idx][1])
- dataset[idx,:])**2 for idx in range(N_MODELS)])
grads = tape.gradient(L, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
t_diff=time.time()-t
print('N_MODEL : {}, time : {}'.format(N_MODELS,t_diff))
ls_t.append(t_diff)

Related

Multi-label Logistic regression Pytorch, Are these two sets equivalent?

Says I import these libraries
import torch.nn as nn
import torch.optim as optim # step
import torch.nn.functional as F # all activation f. like ReLU
Suppose I need to construct Logistic regression model with output unit > 1 (as I want to do multi-label classification)
So, I found two options (a pair of nn.module and criterion) to construct them, which cause me so confused which set to choose
# SET 1
class LogisticReg(nn.Module):
def __init__(self, input_size, num_classes): # sequence of layer (768,50,10)
super(LogisticReg, self).__init__()
self.fc1 = nn.Linear(input_size, num_classes) # 768 * 1024
def forward(self, x):
x = self.fc1(x)
x = F.sigmoid(x)
return x
criterion = nn.BCELoss()
# SET 2
class LogisticReg(nn.Module):
def __init__(self, input_size, num_classes): # sequence of layer (768,50,10)
super(LogisticReg, self).__init__()
self.fc1 = nn.Linear(input_size, num_classes) # 768 * 1024
def forward(self, x):
x = self.fc1(x)
return x
criterion = nn.BCEWithLogitsLoss()
The only difference is that when nn.module of Logistic regression does not have sigmoid activation at .forward then it must pair nn.BCEwithLogitLoss instead of nn.BCELoss, since it does not got activated by Sigmoid yet
Hence, I have 2 questions here
Are these 2 sets exactly equivalent?
If yes, then what is the point of Pytorch having both criterion BCELoss and BCEwithLogitLoss

Custom Trainable Layers in Keras

In keras, we can use a Lambda layer to create a custom layer, like this:
def f(x):
return x**2
model.add(Lambda(f))
Now my question is, how to make such custom function trainable? How to make this function such that it raises an input to the power w, where w is trainable. Like this:
def f(x):
return x**w
The problem can be solved by making a new layer via subclassing,
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.optimizers import *
import numpy as np
class ScaleLayer(tf.keras.layers.Layer):
def __init__(self):
super(ScaleLayer, self).__init__()
self.scale = tf.Variable(1., trainable=True)
def call(self, inputs):
return inputs ** self.scale
x = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13]).reshape(-1,1)
y = x**3.25
l = ScaleLayer()
a1 = tf.keras.layers.Input(shape=1)
a2 = l(a1)
model = tf.keras.models.Model(a1,a2)
model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')
model.fit(x,y, epochs=500, verbose=0)
print(l.weights) # This prints 3.25
More about this can be found here

What can be the cause of the validation loss increasing and the accuracy remaining constant to zero while the train loss decreases?

I am trying to solve a multiclass text classification problem. Due to specific requirements from my project I am trying to use skorch (https://skorch.readthedocs.io/en/stable/index.html) to wrap pytorch for the sklearn pipeline. What I am trying to do is fine-tune a pretrained version of BERT from Huggingface (https://huggingface.co) with my dataset. I have tried, in the best of my knowledge, to follow the instructions from skorch on how I should input my data, structure the model etc. Still during the training the train loss decreases until the 8th epoch where it starts fluctuating, all while the validation loss increases from the beginning and the validation accuracy remains constant to zero. My pipeline setup is
from sklearn.pipeline import Pipeline
pipeline = Pipeline(
[
("tokenizer", Tokenizer()),
("classifier", _get_new_transformer())
]
in which I am using a tokenizer class to preprocess my dataset, tokenizing it for BERT and creating the attention masks. It looks like this
import torch
from transformers import AutoTokenizer, AutoModel
from torch import nn
import torch.nn.functional as F
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np
class Tokenizer(BaseEstimator, TransformerMixin):
def __init__(self):
super(Tokenizer, self).__init__()
self.tokenizer = AutoTokenizer.from_pretrained(/path/to/model)
def _tokenize(self, X, y=None):
tokenized = self.tokenizer.encode_plus(X, max_length=20, add_special_tokens=True, pad_to_max_length=True)
tokenized_text = tokenized['input_ids']
attention_mask = tokenized['attention_mask']
return np.array(tokenized_text), np.array(attention_mask)
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
word_tokens, attention_tokens = np.array([self._tokenize(string)[0] for string in tqdm(X)]), \
np.array([self._tokenize(string)[1] for string in tqdm(X)])
X = word_tokens, attention_tokens
return X
def fit_transform(self, X, y=None, **fit_params):
self = self.fit(X, y)
return self.transform(X, y)
then I initialize the model I want to fine-tune as
class Transformer(nn.Module):
def __init__(self, num_labels=213, dropout_proba=.1):
super(Transformer, self).__init__()
self.num_labels = num_labels
self.model = AutoModel.from_pretrained(/path/to/model)
self.dropout = torch.nn.Dropout(dropout_proba)
self.classifier = torch.nn.Linear(768, num_labels)
def forward(self, X, **kwargs):
X_tokenized, attention_mask = torch.stack([x.unsqueeze(0) for x in X[0]]),\
torch.stack([x.unsqueeze(0) for x in X[1]])
_, X = self.model(X_tokenized.squeeze(), attention_mask.squeeze())
X = F.relu(X)
X = self.dropout(X)
X = self.classifier(X)
return X
I initialize the model and create the classifier with skorch as follows
from skorch import NeuralNetClassifier
from skorch.dataset import CVSplit
from skorch.callbacks import ProgressBar
import torch
from transformers import AdamW
def _get_new_transformer() -> NeuralNetClassifier:
transformer = Transformer()
net = NeuralNetClassifier(
transformer,
lr=2e-5,
max_epochs=10,
criterion=torch.nn.CrossEntropyLoss,
optimizer=AdamW,
callbacks=[ProgressBar(postfix_keys=['train_loss', 'valid_loss'])],
train_split=CVSplit(cv=2, random_state=0)
)
return net
and I use fit like that
pipeline.fit(X=dataset.training_samples, y=dataset.training_labels)
in which my training samples are lists of strings and my labels are the an array containing the indexes of each class, as pytorch requires.
This is a sample of what happens
training history
I have tried to keep train only the fully connected layer and not BERT but I have the same issue again. I also tested the train accuracy after the training process and it was only 0,16%. I would be grateful for any advice or insight on how to solve my problem! I am pretty new with skorch and not so comfortable with pytorch yet and I believe that I am missing something really simple. Thank you very much in advance!

Python 3: How to evaluate the Adam Gradient in Tensor Flow 2.0? I would like to replace my implementation

I have the following code that is working well. However, I strongly believe that the Tensorflow 2.0 implementation of the Adam Gradient is more efficient than my naive implementation.
How can I replace the evaluation of the Adam Gradient by the Tensorflow 2.0 implementation?
import tensorflow as tf
import numpy as np
def linearModelGenerator(numberSamples):
x = tf.random.normal(shape=(numberSamples,))
y = 3*tf.ones(shape=(numberSamples,)) + tf.constant(5.0) * x + tf.random.normal(shape=(numberSamples,),stddev=0.01)
return x,y
class Adam:
def __init__(self,shapes,lr=0.001,beta1=0.9,beta2=0.999,epsilon=1e-07):
self.lr=lr
self.beta1=beta1
self.beta2=beta2
self.epsilon=epsilon
self.shapes=shapes
self.m=np.shape(shapes)[0]
self.listM=[]
self.listV=[]
self.t=0
for i in range(self.m):
if(np.isscalar(shapes[i])):
self.listM.append(0)#append(tf.zeros(shapes[i]))
self.listV.append(0)#append(tf.zeros(shapes[i]))
else:
self.append(tf.zeros(shapes[i]))
self.append(tf.zeros(shapes[i]))
def evalGradient(self,*args):
adamGrad=[]
self.t=self.t+1
for i in range(self.m):
grad=args[i]
self.listM[i]=self.beta1*self.listM[i]+(1-self.beta1)*grad
self.listV[i]=self.beta2*self.listV[i]+(1-self.beta2)*(grad*grad)
hatM=self.listM[i]/(1-(self.beta1)**self.t)
hatV=self.listV[i]/(1-(self.beta2)**self.t)
adamGrad.append(hatM/(tf.math.sqrt(hatV)+(tf.ones(np.shape(hatV))*self.epsilon)))
return adamGrad
class LinearModel:
def __init__(self):
self.weight = tf.Variable(-1.0)
self.bias = tf.Variable(-1.0)
def __call__(self, x):
return self.weight * x + self.bias
def loss(y, pred):
return tf.reduce_mean(tf.square(y - pred))
def trainAdam(linear_model,adam, x, y):
with tf.GradientTape() as t:
current_loss = loss(y, linear_model(x))
gradWeight, gradBias = t.gradient(current_loss, [linear_model.weight, linear_model.bias])
gradAdamList=adam.evalGradient(gradWeight,gradBias)
gradAdamWeight=gradAdamList[0]
gradAdamBias=gradAdamList[1]
linear_model.weight.assign_sub(adam.lr * gradAdamWeight)
linear_model.bias.assign_sub(adam.lr * gradAdamBias)
if __name__=="__main__":
numberSamples=100
x,y=linearModelGenerator(numberSamples)
linear_model = LinearModel()
epochs = 1000
shapes=[]
shapes.append(1)
shapes.append(1)
adam=Adam(shapes,lr=0.1)
for epoch_count in range(epochs):
real_loss = loss(y, linear_model(x))
trainAdam(linear_model,adam, x, y)
print('w',linear_model.weight.numpy())
print('bias',linear_model.bias.numpy())
print('real_loss',real_loss.numpy())
I would like to keep the general structure of the code, but to replace the Adam Gradient Implementation.
The built-in optimizers in TensorFlow 2 can not only be used with tf.keras.Model.fit(), but also with tf.GradientTape(). With the latter, you can just call its apply_gradients() method directly. The optimizer object will keep track of the accumulators and running moments internally. Roughly, your code can be modified as follows:
adam = tf.optimizers.Adam(learning_rate)
def trainAdam(linear_model,adam, x, y):
with tf.GradientTape() as t:
current_loss = loss(y, linear_model(x))
gradWeight, gradBias = t.gradient(current_loss, [linear_model.weight, linear_model.bias])
adam.apply_gradients(zip([gradWeight, gradBias], [linear_model.weight, linear_model.bias]))

how to get the real shape of batch_size which is none in keras

When implementing a custom layer in Keras, I need to know the real size of batch_size. my shape is (?,20).
questions:
1. What is the best way to change (?,20) to (batch_size,20).
I have looked into this but it can not adjust to my problem.
I can pass the batch_size to this layer. In that case, I need to reshape (?,20) to (batch_size,20), how can I do that?
2. Is it the best way to that, or is there any builtin function that can get the real batch_size while building and running the model?
This is my layer:
from scipy.stats import entropy
from keras.engine import Layer
import keras.backend as K
import numpy as np
class measure(Layer):
def __init__(self, beta, **kwargs):
self.beta = beta
self.uses_learning_phase = True
self.supports_masking = True
super(measure, self).__init__(**kwargs)
def call(self, x):
return K.in_train_phase(self.rev_entropy(x, self.beta), x)
def get_config(self):
config = {'beta': self.beta}
base_config = super(measure, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def rev_entropy(self, x, beta):
entropy_p_t_w = np.apply_along_axis(entropy, 1, x)
con = (beta / (1 + entropy_p_t_w)) ** 1.5
new_f_w_t = x * (con.reshape(con.shape[0], 1))
norm_const = 1e-30 + np.sum(new_f_w_t, axis=0)
for t in range(norm_const.shape[0]):
new_f_w_t[:, t] /= norm_const[t]
return new_f_w_t
And here is where I call this layer:
encoded = measure(beta=0.08)(encoded)
I am also using fit_generator if it can help at all:
autoencoder.fit_generator(train_gen, steps_per_epoch=num_train_steps, epochs=NUM_EPOCHS,
validation_data=test_gen, validation_steps=num_test_steps, callbacks=[checkpoint])
The dimension of the x passed to the layer is (?,20) and that's why I can not do my calculation.
Thanks:)

Resources