Adapting the PyTorch softmax function

I am currently looking into the softmax function and I would like to adapt the original implementation for some small tests.
I have been through the docs, but there wasn't much useful information about the function. This is the PyTorch Python implementation:
class Softmax(Module):
    def __init__(self, dim=None):
        super(Softmax, self).__init__()
        self.dim = dim

    def __setstate__(self, state):
        self.__dict__.update(state)
        if not hasattr(self, 'dim'):
            self.dim = None

    def forward(self, input):
        # F here is torch.nn.functional
        return F.softmax(input, self.dim, _stacklevel=5)
Where can I find the F.softmax implementation?
One of the things I want to try, for instance, is the soft-margin softmax described here: Soft-Margin Softmax for Deep Classification.
Where would be the best place to start?
Thanks in advance!

Softmax Implementation in PyTorch and Numpy
The softmax function is defined as follows:
softmax(x_i) = exp(x_i) / sum_j exp(x_j)
A direct implementation of the above formula is:
def softmax(x):
    return np.exp(x) / np.exp(x).sum(axis=0)
The above implementation can run into arithmetic overflow because of np.exp(x).
To avoid the overflow, we can multiply both the numerator and the denominator of the softmax equation by a constant C; the value does not change, but the exponentials can be kept in a safe range:
softmax(x_i) = C * exp(x_i) / (C * sum_j exp(x_j)) = exp(x_i + log(C)) / sum_j exp(x_j + log(C))
This is the approach used in PyTorch, taking log(C) = -max(x). Below is the PyTorch implementation:
def softmax_torch(x):  # assuming x has at least 2 dimensions
    maxes = torch.max(x, 1, keepdim=True)[0]
    x_exp = torch.exp(x - maxes)
    x_exp_sum = torch.sum(x_exp, 1, keepdim=True)
    probs = x_exp / x_exp_sum
    return probs
A corresponding Numpy equivalent is as follows:
def softmax_np(x):
    maxes = np.max(x, axis=1, keepdims=True)  # np.max returns the values directly (no indices, unlike torch.max)
    x_exp = np.exp(x - maxes)
    x_exp_sum = np.sum(x_exp, axis=1, keepdims=True)
    probs = x_exp / x_exp_sum
    return probs
We can compare the results with the built-in torch.nn.functional.softmax using the snippet below:
import torch
import numpy as np

if __name__ == "__main__":
    x = torch.randn(1, 3, 5, 10)
    std_pytorch_softmax = torch.nn.functional.softmax(x, dim=1)
    pytorch_impl = softmax_torch(x)
    numpy_impl = softmax_np(x.detach().cpu().numpy())
    print("Shapes: x --> {}, std --> {}, pytorch impl --> {}, numpy impl --> {}".format(x.shape, std_pytorch_softmax.shape, pytorch_impl.shape, numpy_impl.shape))
    print("Std and torch implementation are same?", torch.allclose(std_pytorch_softmax, pytorch_impl))
    print("Std and numpy implementation are same?", torch.allclose(std_pytorch_softmax, torch.from_numpy(numpy_impl)))
References:
Softmax discussion
Discussion on Softmax implementation at the PyTorch forum
An SO thread on implementation of Softmax in Python

Related

Python 3: How to evaluate the Adam gradient in TensorFlow 2.0? I would like to replace my implementation

I have the following code, which is working well. However, I strongly believe that the TensorFlow 2.0 implementation of the Adam gradient is more efficient than my naive implementation.
How can I replace the evaluation of the Adam gradient with the TensorFlow 2.0 implementation?
import tensorflow as tf
import numpy as np

def linearModelGenerator(numberSamples):
    x = tf.random.normal(shape=(numberSamples,))
    y = 3*tf.ones(shape=(numberSamples,)) + tf.constant(5.0) * x + tf.random.normal(shape=(numberSamples,), stddev=0.01)
    return x, y

class Adam:
    def __init__(self, shapes, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-07):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.shapes = shapes
        self.m = np.shape(shapes)[0]
        self.listM = []
        self.listV = []
        self.t = 0
        for i in range(self.m):
            if np.isscalar(shapes[i]):
                self.listM.append(0)
                self.listV.append(0)
            else:
                self.listM.append(tf.zeros(shapes[i]))
                self.listV.append(tf.zeros(shapes[i]))

    def evalGradient(self, *args):
        adamGrad = []
        self.t = self.t + 1
        for i in range(self.m):
            grad = args[i]
            self.listM[i] = self.beta1*self.listM[i] + (1-self.beta1)*grad
            self.listV[i] = self.beta2*self.listV[i] + (1-self.beta2)*(grad*grad)
            hatM = self.listM[i] / (1-(self.beta1)**self.t)
            hatV = self.listV[i] / (1-(self.beta2)**self.t)
            adamGrad.append(hatM / (tf.math.sqrt(hatV) + (tf.ones(np.shape(hatV))*self.epsilon)))
        return adamGrad

class LinearModel:
    def __init__(self):
        self.weight = tf.Variable(-1.0)
        self.bias = tf.Variable(-1.0)

    def __call__(self, x):
        return self.weight * x + self.bias

def loss(y, pred):
    return tf.reduce_mean(tf.square(y - pred))

def trainAdam(linear_model, adam, x, y):
    with tf.GradientTape() as t:
        current_loss = loss(y, linear_model(x))
    gradWeight, gradBias = t.gradient(current_loss, [linear_model.weight, linear_model.bias])
    gradAdamList = adam.evalGradient(gradWeight, gradBias)
    gradAdamWeight = gradAdamList[0]
    gradAdamBias = gradAdamList[1]
    linear_model.weight.assign_sub(adam.lr * gradAdamWeight)
    linear_model.bias.assign_sub(adam.lr * gradAdamBias)

if __name__ == "__main__":
    numberSamples = 100
    x, y = linearModelGenerator(numberSamples)
    linear_model = LinearModel()
    epochs = 1000
    shapes = []
    shapes.append(1)
    shapes.append(1)
    adam = Adam(shapes, lr=0.1)
    for epoch_count in range(epochs):
        real_loss = loss(y, linear_model(x))
        trainAdam(linear_model, adam, x, y)
        print('w', linear_model.weight.numpy())
        print('bias', linear_model.bias.numpy())
        print('real_loss', real_loss.numpy())
I would like to keep the general structure of the code, but to replace the Adam Gradient Implementation.
The built-in optimizers in TensorFlow 2 can not only be used with tf.keras.Model.fit(), but also with tf.GradientTape(). With the latter, you can just call its apply_gradients() method directly. The optimizer object will keep track of the accumulators and running moments internally. Roughly, your code can be modified as follows:
adam = tf.optimizers.Adam(learning_rate)

def trainAdam(linear_model, adam, x, y):
    with tf.GradientTape() as t:
        current_loss = loss(y, linear_model(x))
    gradWeight, gradBias = t.gradient(current_loss, [linear_model.weight, linear_model.bias])
    adam.apply_gradients(zip([gradWeight, gradBias], [linear_model.weight, linear_model.bias]))
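For completeness, a minimal sketch of what the rest of the original script might look like with the built-in optimizer; variable names follow the question, and the learning-rate value is an assumption carried over from the question's lr=0.1:
adam = tf.optimizers.Adam(learning_rate=0.1)  # assumed value, matching the question
linear_model = LinearModel()
x, y = linearModelGenerator(100)

for epoch_count in range(1000):
    with tf.GradientTape() as t:
        current_loss = loss(y, linear_model(x))
    grads = t.gradient(current_loss, [linear_model.weight, linear_model.bias])
    # the optimizer keeps its own first/second-moment accumulators internally
    adam.apply_gradients(zip(grads, [linear_model.weight, linear_model.bias]))

print('w', linear_model.weight.numpy())
print('bias', linear_model.bias.numpy())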

pytorch simple custom recurrent layer extremely slow

I implemented a very simple custom recurrent layer in PyTorch using PackedSequence. The layer slows down my network by roughly a factor of 20. I have read about slowdowns for custom layers that don't use the JIT, but on the order of 1.7x, which is something I could live with.
I am simply indexing the packed sequence per sequence and performing a recursion.
I suspect that some of the code is not being executed on the GPU?
I would also be grateful for any other tips on how to implement this type of RNN (essentially one without a dense layer and without any mixing between features).
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence

def getPackedSequenceIndices(batch_sizes):
    """input: batch_sizes from PackedSequence object
    requires length-sorted sequences!
    """
    nBatches = batch_sizes[0]
    seqIdx = []
    for ii in range(nBatches):
        seqLen = torch.sum((batch_sizes - ii) > 0).item()
        idx = torch.LongTensor(seqLen)
        idx[0] = ii
        idx[1:] = batch_sizes[0:seqLen-1]
        seqIdx.append(torch.cumsum(idx, dim=0))
    return seqIdx

class LinearRecursionLayer(nn.Module):
    """Linear recursive smoothing layer with trainable smoothing constants."""
    def __init__(self, feat_dim, alpha_smooth=0.5):
        super(LinearRecursionLayer, self).__init__()
        self.feat_dim = feat_dim
        # trainable parameters
        self.alpha_smooth = nn.Parameter(alpha_smooth*torch.ones(self.feat_dim))
        self.wx = nn.Parameter(torch.ones(self.feat_dim))
        self.activ = nn.Tanh()

    def forward(self, x):
        if isinstance(x, PackedSequence):
            seqIdx = getPackedSequenceIndices(x.batch_sizes)
            ydata = torch.zeros_like(x.data)
            for idx in seqIdx:
                y_frame = x.data[idx[0]]  # init with first frame
                # iterate over the sequence
                for n in idx:
                    x_frame = x.data[n]
                    y_frame = self.alpha_smooth*y_frame + (1-self.alpha_smooth)*x_frame  # smoothing recurrence
                    ydata[n, :] = self.activ(self.wx*y_frame)
            y = PackedSequence(ydata, x.batch_sizes)  # pack
        else:  # tensor
            raise ValueError('not implemented')
        return y
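One possible way to reduce the per-element Python overhead, sketched under the assumption that the padded form of the sequences fits in memory: the recurrence only couples consecutive time steps, so the batch dimension can be handled with one tensor operation per step. The class below is illustrative only (names are mine) and has not been benchmarked against the original layer:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

class PaddedRecursionLayer(nn.Module):
    """Same smoothing recurrence, vectorized over the batch dimension."""
    def __init__(self, feat_dim, alpha_smooth=0.5):
        super(PaddedRecursionLayer, self).__init__()
        self.alpha_smooth = nn.Parameter(alpha_smooth * torch.ones(feat_dim))
        self.wx = nn.Parameter(torch.ones(feat_dim))
        self.activ = nn.Tanh()

    def forward(self, packed):
        # unpack to (batch, time, feat); padded positions are zeros
        x, lengths = pad_packed_sequence(packed, batch_first=True)
        y_frame = x[:, 0, :]               # init with the first frame of every sequence
        outputs = []
        for t in range(x.size(1)):         # loop over time only
            y_frame = self.alpha_smooth * y_frame + (1 - self.alpha_smooth) * x[:, t, :]
            outputs.append(self.activ(self.wx * y_frame))
        y = torch.stack(outputs, dim=1)
        return pack_padded_sequence(y, lengths, batch_first=True)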

How to get the real value of batch_size, which is None, in Keras

When implementing a custom layer in Keras, I need to know the real value of batch_size. My shape is (?, 20).
Questions:
1. What is the best way to change (?, 20) to (batch_size, 20)?
I have looked into this, but it cannot be adapted to my problem.
I can pass batch_size to this layer; in that case, I need to reshape (?, 20) to (batch_size, 20). How can I do that?
2. Is that the best way to do it, or is there any built-in function that can get the real batch_size while building and running the model?
This is my layer:
from scipy.stats import entropy
from keras.engine import Layer
import keras.backend as K
import numpy as np

class measure(Layer):
    def __init__(self, beta, **kwargs):
        self.beta = beta
        self.uses_learning_phase = True
        self.supports_masking = True
        super(measure, self).__init__(**kwargs)

    def call(self, x):
        return K.in_train_phase(self.rev_entropy(x, self.beta), x)

    def get_config(self):
        config = {'beta': self.beta}
        base_config = super(measure, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def rev_entropy(self, x, beta):
        entropy_p_t_w = np.apply_along_axis(entropy, 1, x)
        con = (beta / (1 + entropy_p_t_w)) ** 1.5
        new_f_w_t = x * (con.reshape(con.shape[0], 1))
        norm_const = 1e-30 + np.sum(new_f_w_t, axis=0)
        for t in range(norm_const.shape[0]):
            new_f_w_t[:, t] /= norm_const[t]
        return new_f_w_t
And here is where I call this layer:
encoded = measure(beta=0.08)(encoded)
I am also using fit_generator, if that helps at all:
autoencoder.fit_generator(train_gen, steps_per_epoch=num_train_steps, epochs=NUM_EPOCHS,
                          validation_data=test_gen, validation_steps=num_test_steps, callbacks=[checkpoint])
The dimension of the x passed to the layer is (?, 20), and that's why I cannot do my calculation.
Thanks:)
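For what it's worth, a hedged note on question 2: inside call() the static shape K.int_shape(x) is (None, 20) while the model is being built, but the dynamic batch size is available as a tensor via K.shape(x)[0]. A minimal sketch, assuming the TensorFlow backend; the layer name and the reshape are illustrative only, not the original layer's logic:
import keras.backend as K
from keras.engine import Layer

class batch_size_demo(Layer):
    """Illustrative only: static vs. dynamic batch size inside call()."""
    def call(self, x):
        static_batch = K.int_shape(x)[0]   # None while the model is being built
        dynamic_batch = K.shape(x)[0]      # scalar tensor, resolved at run time per batch
        # any op that needs the batch size must consume the tensor, e.g.:
        return K.reshape(x, [dynamic_batch, K.int_shape(x)[1]])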

pyspark and pytorch: unable to retrieve gradients on the driver

I'm new to PyTorch and I'm trying to explore the feasibility of using it with Spark (I'm working with Spark standalone for now).
At the moment I'm stuck on a very specific issue.
Let's start with a very simple model:
# linmodel.py
import torch
import torch.nn as nn
import numpy as np

def standardize(x):
    return (x - np.mean(x)) / np.std(x)

def add_noise(y):
    rnd = np.random.randn(y.shape[0])
    return y + rnd

def cost(target, predicted):
    cost = torch.sum((torch.t(target) - predicted) ** 2)
    return cost

class LinModel(nn.Module):
    def __init__(self, in_size, out_size):
        super(LinModel, self).__init__()  # always call parent's init
        self.linear = nn.Linear(in_size, out_size, bias=False)  # layer parameters

    def forward(self, x):
        return self.linear(x)
This instantiates a basic linear model, along with some utility functions.
The goal is to approximate a target matrix and to keep track of how the gradients behave.
I'm trying to achieve the following:
1. create my target matrix
2. split the inputs across the workers
3. instantiate models and optimizers on the workers
4. compute the approximation on the subsets of the input
5. retrieve the gradients for further analysis
And everything works fine until point 5.
Here's the code:
# test.py
import torch
import torch.nn as nn
import numpy as np
import torch.optim
from torch.autograd import Variable
from pyspark import SparkContext

import linmodel

def prepare_input(nsamples=400):
    Xold = np.linspace(0, 1000, nsamples).reshape([nsamples, 1])
    X = linmodel.standardize(Xold)
    W = np.random.randint(1, 10, size=(5, 1))
    Y = W.dot(X.T)  # target
    for i in range(Y.shape[1]):
        Y[:, i] = linmodel.add_noise(Y[:, i])
    x = Variable(torch.from_numpy(X), requires_grad=False).type(torch.FloatTensor)
    y = Variable(torch.from_numpy(Y), requires_grad=False).type(torch.FloatTensor)
    print("created torch variables {} {}".format(x.size(), y.size()))
    return x, y, W

def initialize(tup):
    x, y = tup[0]  # data
    m, o = tup[1]  # model and optimizer
    model, optimizer = torch_step(x, y, m, o)
    # here we have the gradients
    print('gradient: {}'.format([param.grad.data for param in model.parameters()]))
    return (x, y), (model, optimizer)

def create_model():
    model = linmodel.LinModel(1, 5)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    return model, optimizer

def torch_step(x, y, model, optimizer):
    prediction = model(x)
    loss = linmodel.cost(y, prediction)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return model, optimizer

def main(sc, num_partitions=4):
    x, y, W = prepare_input()
    parts_x = list(torch.split(x, int(x.size()[0] / num_partitions)))
    parts_y = list(torch.split(y, int(x.size()[0] / num_partitions), 1))
    rdd_models = sc.parallelize([create_model() for _ in range(num_partitions)]).repartition(num_partitions)
    rdd_x = sc.parallelize(parts_x).repartition(num_partitions)
    rdd_y = sc.parallelize(parts_y).repartition(num_partitions)
    parts = rdd_x.zip(rdd_y)  # [((100x1), (5x100)), ...]
    full = parts.zip(rdd_models).map(initialize).cache()
    models_out = full.map(lambda x: x[1][0]).collect()
    test_model = models_out[0]
    print(type(test_model))
    print('gradient: {}'.format([param.grad.data for param in test_model.parameters()]))

if __name__ == '__main__':
    sc = SparkContext(appName='test')
    main(sc)
As you can see from the comments, when the function initialize is mapped over the full RDD, the executors' logs show that the gradients are computed.
When I collect the result and try to access the very same attribute on the driver, I receive AttributeError: 'NoneType' object has no attribute 'data',
meaning that all the model.grad attributes are set to None.
I'm sure I'm missing something big here, but I cannot see it.
Any hint is appreciated.
Thanks a lot.
There are two major mistakes in your approach (in my opinion):
Since you want distributed training, instantiating a separate model in each executor is the wrong approach. You should instantiate the model on the head node (the node where the Spark driver is located) and then distribute it to all the executors. Each executor then independently does a forward pass, computes the gradients on its portion of the data, and passes the gradients back to the head node for the weight update (the weight update has to be serialized). The updated network is then scattered to the executors again for the next iteration.
A much bigger concern is that I am not sure the gradient buffers are copied from the executors to the head node when you perform .collect(), which is why model.grad can end up as None. To start debugging, I suggest using only one executor (and one partition) and then performing a .collect() to see whether the gradient buffers are being copied; a sketch of an alternative that sidesteps the issue follows below. Or, if you are comfortable with Java or Scala, you can look at the implementation of the collect() method.
Hope this helps.
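As a hedged sketch of that last suggestion: instead of relying on the .grad buffers surviving serialization, the executors can return the gradients explicitly as NumPy arrays, so the driver receives them as plain data. The function below is illustrative, builds on the question's code, and has not been tested against the original setup:
def initialize_with_grads(tup):
    # same as initialize(), but also returns the gradients as plain NumPy arrays
    (x, y), (m, o) = tup
    model, optimizer = torch_step(x, y, m, o)
    grads = [p.grad.data.cpu().numpy() for p in model.parameters()]
    return model, grads

# on the driver:
# results = parts.zip(rdd_models).map(initialize_with_grads).collect()
# for model, grads in results:
#     print('gradients from one partition:', grads)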

Calibration with xgboost

I'm wondering whether I can do calibration in xgboost. To be more specific, does xgboost come with an existing calibration implementation like scikit-learn does, or is there a way to put a model from xgboost into scikit-learn's CalibratedClassifierCV?
As far as I know, in sklearn this is the common procedure:
# Train random forest classifier, calibrate on validation data and evaluate
# on test data
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_valid, y_valid)
sig_clf_probs = sig_clf.predict_proba(X_test)
sig_score = log_loss(y_test, sig_clf_probs)
print "Calibrated score is ",sig_score
If I put an xgboost tree model into CalibratedClassifierCV, an error will be thrown (of course):
RuntimeError: classifier has no decision_function or predict_proba method.
Is there a way to integrate scikit-learn's excellent calibration module with xgboost?
I appreciate any insightful ideas!
Answering my own question: an xgboost GBT can be integrated with scikit-learn by writing a wrapper class like the one below.
import numpy as np
import xgboost as xgb

class XGBoostClassifier():
    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})

    def fit(self, X, y, num_boost_round=None):
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)

    def predict(self, X):
        num2label = dict((i, label) for label, i in self.label2num.items())
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])

    def predict_proba(self, X):
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)

    def score(self, X, y):
        Y = self.predict_proba(X)
        return 1 / logloss(y, Y)  # logloss: multi-class log-loss helper (not shown in this snippet)

    def get_params(self, deep=True):
        return self.params

    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
See full example here.
Please don't hesitate to provide a smarter way of doing this!
A note from the hellscape that is July 2020:
You no longer need a wrapper class. The predict_proba method is built into the xgboost scikit-learn Python API; I'm not sure exactly when it was added, but it is certainly there from v1.0.0 onward.
Note: this is of course only true for classes that actually have a predict_proba method. For example, XGBRegressor doesn't, but XGBClassifier does.
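So the recipe from the question should now carry over more or less directly. A minimal sketch, assuming a recent xgboost with the sklearn wrapper (X_train, y_train, X_valid, y_valid and X_test are placeholders):
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV

clf = XGBClassifier(n_estimators=100)
clf.fit(X_train, y_train)                              # fit on the training split

# calibrate the already-fitted model on a held-out validation split
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_valid, y_valid)
sig_clf_probs = sig_clf.predict_proba(X_test)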
