What is the "correct" or best way in jax to implement a Dense layer where each layer might or might not have a bias? - jax

For example, in jax.experimental.stax there is a Dense layer implemented like this:
def Dense(out_dim, W_init=glorot_normal(), b_init=normal()):
    """Layer constructor function for a dense (fully-connected) layer."""
    def init_fun(rng, input_shape):
        output_shape = input_shape[:-1] + (out_dim,)
        k1, k2 = random.split(rng)
        W, b = W_init(k1, (input_shape[-1], out_dim)), b_init(k2, (out_dim,))
        return output_shape, (W, b)
    def apply_fun(params, inputs, **kwargs):
        W, b = params
        return jnp.dot(inputs, W) + b
    return init_fun, apply_fun
If we allow the bias to be None, for example, or allow params to have length 1, there are implications for how grad works.
What is the pattern one should aim for here? jax.jit has static_argnums, which I suppose could be used with some has_bias param, but the book-keeping that involves is considerable, and I am sure there must be examples somewhere.

Wouldn't this work?
def Dense(out_dim, W_init=glorot_normal(), b_init=normal()):
    """Layer constructor function for a dense (fully-connected) layer."""
    def init_fun(rng, input_shape):
        output_shape = input_shape[:-1] + (out_dim,)
        k1, k2 = random.split(rng)
        W = W_init(k1, (input_shape[-1], out_dim))
        if b_init:
            b = b_init(k2, (out_dim,))
            return output_shape, (W, b)
        return output_shape, (W,)  # keep params a tuple either way
    def apply_fun(params, inputs, **kwargs):
        if len(params) == 1:
            W, = params
            return jnp.dot(inputs, W)
        else:
            W, b = params
            return jnp.dot(inputs, W) + b
    return init_fun, apply_fun
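For illustration, here is a minimal sketch of why this should work (my own check, not from the thread; it assumes the Dense above is in scope, with random, jnp, glorot_normal, and normal imported as in stax). The branch in apply_fun depends only on the Python structure of params, which is fixed at trace time, so grad and jit handle it without any static_argnums book-keeping:
import jax
import jax.numpy as jnp
from jax import random
from jax.nn.initializers import glorot_normal, normal

# Bias-free layer: pass a falsy b_init (hypothetical usage of the Dense above)
init_fun, apply_fun = Dense(4, b_init=None)
out_shape, params = init_fun(random.PRNGKey(0), (-1, 3))

x = jnp.ones((2, 3))
loss = lambda p: jnp.sum(apply_fun(p, x))

# grad follows the pytree structure of params: a 1-tuple here, a 2-tuple
# when the bias is present, so no static_argnums are needed
grads = jax.grad(loss)(params)
print(jax.tree_util.tree_map(jnp.shape, grads))  # ((3, 4),)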

Related

Normalizing multivariate time-series data with different sequence lengths

I have a multivariate time-series dataset with different sequence lengths. I filled the missing values in the sequences with zeros. I am trying to use a recurrent neural network model for time-series forecasting. I noticed that the results of the model degrade when the range of the data is outside -1 and 1, so I wrote the following normalization class using MinMaxScaler. However, I don't know how to exclude the missing values in the sequences from the computation of MinMaxScaler. Here is my code:
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_mask_from_sequence_lengths(
        sequence_lengths: torch.Tensor, max_length: int
) -> torch.BoolTensor:
    # (batch_size, max_length)
    ones = sequence_lengths.new_ones(sequence_lengths.size(0), max_length)
    range_tensor = ones.cumsum(dim=1)
    return sequence_lengths.unsqueeze(1) >= range_tensor

class Normalizer1D(nn.Module):
    # Data size of (batch_size, seq_len, input_size)
    def __init__(self, input_dim, inputs):
        super(Normalizer1D, self).__init__()
        self.input_dim = input_dim
        self.to(device)
        self._norm = self.build_normalizers(inputs)
        max_len = inputs.shape[-1]
        data = torch.from_numpy(inputs)
        # sequence length inferred from channel 0 (last nonzero time step + 1)
        length = torch.LongTensor([torch.max((data[i, 0, :] != 0).nonzero()).item() + 1
                                   for i in range(data.shape[0])])
        # mask marks the real (non-padded) steps; it is computed here but not
        # yet used by the scalers -- that is the open question below
        mask = get_mask_from_sequence_lengths(length, max_len)

    def build_normalizers(self, x):
        normalizers = OrderedDict()
        for i in range(self.input_dim):
            if np.min(x[:, i, :]) < 0:
                scaler = MinMaxScaler(feature_range=(-1, 1))
            else:
                scaler = MinMaxScaler(feature_range=(0, 1))
            scaler = scaler.fit(x[:, i, :])
            normalizers[str(i)] = scaler
        return normalizers

    def normalize(self, x):
        # (B, D, T)
        d = x.cpu().detach().numpy()
        n_x = []
        for i in range(x.shape[1]):
            # transform (not fit_transform): reuse the scalers fitted in __init__
            n_x.append(self._norm[str(i)].transform(d[:, i, :]))
        x = np.stack(n_x, axis=1)
        return torch.from_numpy(x).to(device)

    def unnormalize(self, x):
        # (T, B, D) ==> (B, T, D)
        d = x.cpu().detach().numpy()
        n_x = []
        for i in range(x.shape[1]):
            n_x.append(self._norm[str(i)].inverse_transform(d[:, i, :]))
        x = np.stack(n_x, axis=1)
        return torch.from_numpy(x).to(device)

    @property
    def min_(self):
        # (T, B, D)
        min_ = []
        for i in range(len(self._norm)):
            min_.append(self._norm[str(i)].min_)
        return torch.from_numpy(np.stack(min_, axis=1))

    @property
    def scale_(self):
        # (T, B, D)
        scale_ = []
        for i in range(len(self._norm)):
            scale_.append(self._norm[str(i)].scale_)
        return torch.from_numpy(np.stack(scale_, axis=1))

    def unnormalize_mean(self, x_mu):
        Xscale = self.scale_
        Xmin = self.min_
        normX = x_mu.mul_(Xscale)
        return normX.add_(Xmin)

    def unnormalize_sigma(self, x_sigma):
        Xscale = self.scale_
        return x_sigma.mul_(Xscale)

# compute the normalizers
def compute_normalizer(loader_train):
    # batch_size, input_dim, seq_len
    for i, (u, y) in enumerate(loader_train):
        if i == 0:
            # input u: torch.Size([B, D, T])
            inputs = u
            outputs = y
        else:
            inputs = torch.cat([inputs, u], dim=0)
            outputs = torch.cat([outputs, y], dim=0)
    inputs = inputs.cpu().detach().numpy()
    outputs = outputs.cpu().detach().numpy()
    # initialization
    u_normalizer = Normalizer1D(inputs.shape[1], inputs)
    y_normalizer = Normalizer1D(outputs.shape[1], outputs)
    return u_normalizer, y_normalizer
I would appreciate it if someone could suggest a way to exclude the missing values from the normalization process.
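One possible direction, as a hedged sketch (my assumption, not a tested solution): fit each per-feature scaler only on the entries selected by the mask, rather than on the whole zero-padded array. Names follow the code above; the mask would have to be computed before build_normalizers is called, and note this fits one global min/max per feature instead of one per time step, which is usually what you want when sequence lengths differ.
import numpy as np
from collections import OrderedDict
from sklearn.preprocessing import MinMaxScaler

# Hypothetical replacement for build_normalizers that skips padded steps
def build_normalizers(self, x, mask):
    # x: (B, D, T) numpy array; mask: (B, T) bool tensor, True = real step
    m = mask.cpu().numpy()
    normalizers = OrderedDict()
    for i in range(self.input_dim):
        valid = x[:, i, :][m]                # 1-D array of non-padded values
        low = -1 if valid.min() < 0 else 0
        scaler = MinMaxScaler(feature_range=(low, 1))
        scaler.fit(valid.reshape(-1, 1))     # fit on valid entries only
        normalizers[str(i)] = scaler
    return normalizers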

Using self in init part of a class in Python

Is there any difference between the following two ways of initializing a class in Python?
class summation:
    def __init__(self, f, s):
        self.first = f
        self.second = s
        self.summ = self.first + self.second
.
.
.
class summation:
    def __init__(self, f, s):
        self.first = f
        self.second = s
        self.summ = f + s
.
.
.
If there is any difference, what is it, and which version is preferable?
Edit: I am going to write an artificial neural network with Python (and PyTorch). In fact, the above two snippets are just examples. In practice, I have seen in various resources that when self.input = input appears in the initialization of a class, other parts of the class use self.input rather than input.
My questions: What are the differences between these two approaches? Why is the use of self.input preferable in my case?
Example: (from https://docs.dgl.ai/en/latest/tutorials/models/1_gnn/4_rgcn.html#sphx-glr-tutorials-models-1-gnn-4-rgcn-py)
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial

class RGCNLayer(nn.Module):
    def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
                 activation=None, is_input_layer=False):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.bias = bias
        self.activation = activation
        self.is_input_layer = is_input_layer
        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels
        # weight bases in equation (3)
        self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
                                                self.out_feat))
        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(torch.Tensor(self.num_rels, self.num_bases))
        # add bias
        if self.bias:
            self.bias = nn.Parameter(torch.Tensor(out_feat))
        # init trainable parameters
        nn.init.xavier_uniform_(self.weight,
                                gain=nn.init.calculate_gain('relu'))
        if self.num_bases < self.num_rels:
            nn.init.xavier_uniform_(self.w_comp,
                                    gain=nn.init.calculate_gain('relu'))
        if self.bias:
            nn.init.xavier_uniform_(self.bias,
                                    gain=nn.init.calculate_gain('relu'))

    def forward(self, g):
        if self.num_bases < self.num_rels:
            # generate all weights from bases (equation (3))
            weight = self.weight.view(self.in_feat, self.num_bases, self.out_feat)
            weight = torch.matmul(self.w_comp, weight).view(self.num_rels,
                                                            self.in_feat, self.out_feat)
        else:
            weight = self.weight
        if self.is_input_layer:
            def message_func(edges):
                # for the input layer, the matrix multiply can be converted to
                # an embedding lookup using the source node id
                embed = weight.view(-1, self.out_feat)
                index = edges.data['rel_type'] * self.in_feat + edges.src['id']
                return {'msg': embed[index] * edges.data['norm']}
        else:
            def message_func(edges):
                w = weight[edges.data['rel_type']]
                msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze()
                msg = msg * edges.data['norm']
                return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            if self.bias:
                h = h + self.bias
            if self.activation:
                h = self.activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)
No, there is no difference between these two approaches in your case, with this level of information. But could there be? Yes, there could, if the classes did some modification in their setters or getters. Later in my answer I'll show you how.
First of all, I prefer using this one:
class summation:
    def __init__(self, f, s):
        self.first = f
        self.second = s

    @property
    def summ(self):
        return self.first + self.second
The above implementation calculates the sum on demand, so when you change self.first or self.second, summ is recalculated automatically. You can access the sum just as you did before.
s = summation(1,9)
print(s.summ)
# 10
s.first = 2
s.second = 3
print(s.summ)
# 5
So, how could they be different?
Let's implement them as follows. In the setters I double the inputs, to show how setters can affect the results; it's just an illustrative example and not exactly what you wrote.
class summation1:
    def __init__(self, f, s):
        self.first = f
        self.second = s
        self.summ = self.first + self.second

    @property
    def first(self):
        return self.__first

    @first.setter
    def first(self, f):
        self.__first = f * 2

    @property
    def second(self):
        return self.__second

    @second.setter
    def second(self, s):
        self.__second = s * 2

class summation2:
    def __init__(self, f, s):
        self.first = f
        self.second = s
        self.summ = f + s

    @property
    def first(self):
        return self.__first

    @first.setter
    def first(self, f):
        self.__first = f * 2

    @property
    def second(self):
        return self.__second

    @second.setter
    def second(self, s):
        self.__second = s * 2
Now let's take a look at the outputs:
a = 3
b = 2
s1 = summation1(a,b)
s2 = summation2(a,b)
print(s1.summ)
# 10
print(s2.summ)
# 5
So, if you are not sure which of the two to choose, the first approach may be what you need.

Pytorch autograd: Make gradient of a parameter a function of another parameter

In Pytorch, how can I make the gradient of a parameter a function itself?
Here is a simple code snippet:
import torch

def fun(q):
    def result(w):
        l = w * q
        l.backward()
        return w.grad
    return result

w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
print(f(w))
In the code above, how can I make f(w) have a gradient with respect to q?
EDIT: Based on the accepted answer I was able to write code that works. Essentially I am alternating between two optimization steps. For dim == 1 it works, but for dim == 2 it does not: I get the error "RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling backward the first time."
import torch

class f_class():
    def __init__(self, dim):
        self.dim = dim
        if self.dim == 1:
            self.w = torch.tensor((3.), requires_grad=True)
        elif self.dim == 2:
            self.w = [torch.tensor((3.), requires_grad=True),
                      torch.tensor((5.), requires_grad=True)]
        else:
            raise ValueError("dim 1 or 2")

    def forward(self, x):
        if self.dim == 1:
            return torch.mul(self.w, x)
        elif self.dim == 2:
            return torch.mul(torch.mul(self.w[0], self.w[1]), x)

    def set_w(self, w):
        self.w = w

    def get_w(self):
        return self.w

class g_class():
    def __init__(self):
        self.q = torch.tensor((4.), requires_grad=True)

    def forward(self, f):
        return torch.mul(self.q, f)

    def set_q(self, q):
        self.q = q

    def get_q(self):
        return self.q

def w_new(f, g, dim):
    loss_g = g.forward(f.forward(xd))
    if dim == 1:
        grads = torch.autograd.grad(loss_g, f.get_w(),
                                    create_graph=True, only_inputs=True)[0]
        temp = f.get_w().detach() + grads
    else:
        grads = torch.autograd.grad(loss_g, f.get_w(),
                                    create_graph=True, only_inputs=True)
        temp = [wi.detach() + gi for wi, gi in zip(f.get_w(), grads)]
    return temp

def q_new(f, g):
    loss_f = 2 * f.forward(xd)
    loss_f.backward()
    temp = g.get_q().detach() + g.get_q().grad
    temp.requires_grad = True
    return temp

dim = 1
xd = torch.tensor((2.))
f = f_class(dim)
g = g_class()

for _ in range(3):
    print(f.get_w(), g.get_q())
    wnew = w_new(f, g, dim)
    f.set_w(wnew)
    print(f.get_w(), g.get_q())
    qnew = q_new(f, g)
    g.set_q(qnew)
    print(f.get_w(), g.get_q())
When computing gradients, if you want to construct a computation graph for the gradient itself you need to specify create_graph=True to autograd.
A potential source of error in your code is using Tensor.backward within f. The problem here is that w.grad and q.grad will be populated with the gradient of l. This means that when you call f(w).backward(), the gradients of both f and l will be added to w.grad and q.grad. In effect you will end up with w.grad being equal to dl/dw + df/dw and similarly for q.grad. One way to get around this is to zero the gradients after f(w) but before .backward(). A better way is to use torch.autograd.grad within f. Using the latter approach, the grad attribute of w and q will not be populated when calling f, only when calling .backward(). This leaves room for things like gradient accumulation during training.
import torch

def fun(q):
    def result(w):
        l = w * q
        # create_graph=True so the returned gradient is itself differentiable
        return torch.autograd.grad(l, w, only_inputs=True, create_graph=True)[0]
    return result

w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
f(w).backward()
print('w.grad:', w.grad)
print('q.grad:', q.grad)
which results in
w.grad: None
q.grad: tensor(1.)
Note that w.grad was not populated. This is because f(w) = dl/dw = q is not a function of w, and therefore w is not part of the computation graph. If you're using a standard pytorch optimizer this is fine since None gradients are implicitly assumed to be zero.
If l were instead a non-linear function of w, then w.grad would have been populated after f(w).backward(). For example
import torch

def fun(q):
    def result(w):
        # now dl/dw = 2 * w * q
        l = w**2 * q
        return torch.autograd.grad(l, w, only_inputs=True, create_graph=True)[0]
    return result

w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
f(w).backward()
print('w.grad:', w.grad)
print('q.grad:', q.grad)
which results in
w.grad: tensor(6.)
q.grad: tensor(4.)

Constraint on the sum of parameters in Keras Layer

I want to add a custom constraint on the parameters of a layer.
I wrote a custom activation layer with two trainable parameters a and b such that:
activation_fct = a*fct() + b*fct()
I need the sum of the parameters (a + b) to be equal to 1, but I don't know how to write such a constraint.
Can you give me some advice?
Thanks in advance.
Two approaches come to mind.
The first is to lock one of the parameters, say b, and make only the other one (a in this case) trainable. Then you can compute b as follows:
b = 1 - a
The second approach is to make both a and b trainable and transform them via the softmax function, which guarantees that their sum is always 1.
from scipy.special import softmax
a = 0.12
b = 0.3
w1, w2 = softmax([a, b])
print(f'w1: {w1}, w2: {w2}, w1 + w2: {w1 + w2}')
This will produce
w1: 0.45512110762641994, w2: 0.5448788923735801, w1 + w2: 1.0
And once you have w1 and w2, you can use them in the mentioned formula instead of a and b.
activation_fct = w1 * fct() + w2 * fct()
You can have a single weight instead of two, and use this custom constraint:
import keras
import keras.backend as K

class Between_0_1(keras.constraints.Constraint):
    def __call__(self, w):
        return K.clip(w, 0, 1)
Then, when building the weights, build only a and apply the constraint:
def build(self, input_shape):
    self.a = self.add_weight(name='weight_a',
                             shape=(1,),
                             initializer='uniform',
                             constraint=Between_0_1(),
                             trainable=True)
    # if you want to start at 0.5
    K.set_value(self.a, [0.5])
    self.built = True
In call, b = 1-a:
def call(self, inputs, **kwargs):
    # do stuff
    ....
    return (self.a * something) + ((1 - self.a) * another_thing)
You can also try @MatusDubrava's softmax approach, but in this case your weight needs to have shape (2,), and no constraint:
def build(self, input_shape):
    self.w = self.add_weight(name='weights',
                             shape=(2,),
                             initializer='zeros',
                             trainable=True)
    self.built = True  # note: built, not build

def call(self, inputs, **kwargs):
    w = K.softmax(self.w)
    # do stuff
    ....
    return (w[0] * something) + (w[1] * another_thing)
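For completeness, here is a self-contained sketch of the softmax variant as a full layer. This is my own illustration under assumptions: tf.keras rather than standalone keras, and relu/tanh standing in for the question's unspecified fct().
import tensorflow as tf

class BlendedActivation(tf.keras.layers.Layer):
    """Learns w = softmax(theta), so w[0] + w[1] == 1 by construction."""
    def build(self, input_shape):
        self.theta = self.add_weight(name='theta', shape=(2,),
                                     initializer='zeros', trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        w = tf.nn.softmax(self.theta)
        # relu/tanh are placeholders for the post's fct()
        return w[0] * tf.nn.relu(inputs) + w[1] * tf.nn.tanh(inputs)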

How can I find correct parameters in cvxopt.solvers.qp() for SVM?

I have some data and I want to do SVM-style classification using cvxopt.
The documentation of cvxopt.solvers.qp defines the problem in terms of several matrices in a specific standard form.
How can I find the correct parameters (P, q, G, h, A, b) when I know n_samples and n_features?
solution = cvxopt.solvers.qp(P, q, G, h, A, b)
Solved it. I got a hint on how to set the parameters of cvxopt from here. The code is as below.
I build a Gram matrix of shape (n_samples, n_samples) and construct the other parameters with cvxopt.matrix.
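For reference (standard facts about the solver and the SVM dual, not from the original answer): cvxopt.solvers.qp(P, q, G, h, A, b) solves minimize (1/2) x^T P x + q^T x subject to G x <= h and A x = b. The SVM dual over the Lagrange multipliers alpha maximizes sum_i alpha_i - (1/2) sum_ij alpha_i alpha_j y_i y_j K(x_i, x_j), so in qp's minimization form: P[i,j] = y_i y_j K(x_i, x_j), q is a vector of -1s, A = y^T with b = 0 (encoding sum_i alpha_i y_i = 0), and G, h encode 0 <= alpha_i, plus alpha_i <= C in the soft-margin case. The code below constructs exactly these matrices.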
import numpy as np
import cvxopt

def linear_kernel(x1, x2):
    # assumed helper: referenced as the default kernel but not defined in the post
    return np.dot(x1, x2)

class SVM(object):
    def __init__(self, kernel=linear_kernel, C=None):
        self.kernel = kernel
        self.C = C
        if self.C is not None:
            self.C = float(self.C)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Gram matrix
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])
        P = cvxopt.matrix(np.outer(y, y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1, n_samples))  # y must hold floats (+/-1.0)
        b = cvxopt.matrix(0.0)
        if self.C is None:
            # hard margin: alpha_i >= 0, i.e. -alpha_i <= 0
            G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            # soft margin: 0 <= alpha_i <= C
            tmp1 = np.diag(np.ones(n_samples) * -1)
            tmp2 = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((tmp1, tmp2)))
            tmp1 = np.zeros(n_samples)
            tmp2 = np.ones(n_samples) * self.C
            h = cvxopt.matrix(np.hstack((tmp1, tmp2)))
        # solve the dual QP with the parameters built above
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)
        return solution
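A hypothetical usage sketch (not from the original answer); note that cvxopt.matrix needs the labels as floats, so y must contain +/-1.0 rather than integers:
import numpy as np

X = np.array([[0., 0.], [1., 1.], [3., 3.], [4., 4.]])
y = np.array([-1., -1., 1., 1.])     # labels as +/-1.0 floats

model = SVM(C=1.0)
solution = model.fit(X, y)
alphas = np.ravel(solution['x'])     # the dual variables alpha
print(alphas)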
