In Pytorch, how can I make the gradient of a parameter a function itself?
Here is a simple code snippet:
import torch
def fun(q):
def result(w):
l = w * q
l.backward()
return w.grad
return result
w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
print(f(w))
In the code above, how can I make f(w) have gradient with respect to q?
EDIT: based on the accepted answer I was able to write a code that works. Essentially I am alternating between 2 optimization steps. For dim == 1 it works and for dim == 2 it does not. I get the error "RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling backward the first time."
import torch
class f_class():
def __init__(self, dim):
self.dim = dim
if self.dim == 1:
self.w = torch.tensor((3.), requires_grad=True)
elif self.dim == 2:
self.w = [torch.tensor((3.), requires_grad=True), torch.tensor((5.), requires_grad=True)]
else:
raise ValueError("dim 1 or 2")
def forward(self, x):
if self.dim == 1:
return torch.mul(self.w, x)
elif self.dim == 2:
return torch.mul(torch.mul(self.w[0], self.w[1]), x)
def set_w(self, w):
self.w = w
def get_w(self):
return self.w
class g_class():
def __init__(self):
self.q = torch.tensor((4.), requires_grad=True)
def forward(self, f):
return torch.mul(self.q, f)
def set_q(self, q):
self.q = q
def get_q(self):
return self.q
def w_new(f, g, dim):
loss_g = g.forward(f.forward(xd))
if dim == 1:
grads = torch.autograd.grad(loss_g, f.get_w(), create_graph=True, only_inputs=True)[0]
temp = f.get_w().detach() + grads
else:
grads = torch.autograd.grad(loss_g, f.get_w(), create_graph=True, only_inputs=True)
temp = [wi.detach() + gi for wi, gi in zip(f.get_w(), grads)]
return temp
def q_new(f, g):
loss_f = 2 * f.forward(xd)
loss_f.backward()
temp = g.get_q().detach() + g.get_q().grad
temp.requires_grad = True
return temp
dim = 1
xd = torch.tensor((2.))
f = f_class(dim)
g = g_class()
for _ in range(3):
print(f.get_w(), g.get_q())
wnew = w_new(f, g, dim)
f.set_w(wnew)
print(f.get_w(), g.get_q())
qnew = q_new(f, g)
g.set_q(qnew)
print(f.get_w(), g.get_q())
When computing gradients, if you want to construct a computation graph for the gradient itself you need to specify create_graph=True to autograd.
A potential source of error in your code is using Tensor.backward within f. The problem here is that w.grad and q.grad will be populated with the gradient of l. This means that when you call f(w).backward(), the gradients of both f and l will be added to w.grad and q.grad. In effect you will end up with w.grad being equal to dl/dw + df/dw and similarly for q.grad. One way to get around this is to zero the gradients after f(w) but before .backward(). A better way is to use torch.autograd.grad within f. Using the latter approach, the grad attribute of w and q will not be populated when calling f, only when calling .backward(). This leaves room for things like gradient accumulation during training.
import torch
def fun(q):
def result(w):
l = w * q
return torch.autograd.grad(l, w, only_inputs=True, retain_graph=True)[0]
return result
w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
f(w).backward()
print('w.grad:', w.grad)
print('q.grad:', q.grad)
which results in
w.grad: None
q.grad: tensor(1.)
Note that w.grad was not populated. This is because f(w) = dl/dw = q is not a function of w, and therefore w is not part of the computation graph. If you're using a standard pytorch optimizer this is fine since None gradients are implicitly assumed to be zero.
If l were instead a non-linear function of w, then w.grad would have been populated after f(w).backward(). For example
import torch
def fun(q):
def result(w):
# now dl/dw = 2 * w * q
l = w**2 * q
return torch.autograd.grad(l, w, only_inputs=True, create_graph=True)[0]
return result
w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
f(w).backward()
print('w.grad:', w.grad)
print('q.grad:', q.grad)
which results in
w.grad: tensor(6.)
q.grad: tensor(4.)
Why am I getting this error
sigmoid() takes 1 positional argument but 2 were given
while using function yHat = NN.forward(X)??
class Neural_Networks(object):
def __init__(self):
self.inputLayerSize =2
self.outputLayerSize =1
self.hiddenLayerSize = 3
#weights
self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
def forward(self,X):
#propogates input through network
self.z2 = np.dot(X, self.W1)
self.a2 = self.sigmoid( self.z2 )
self.Z3 = np.dot(self.a2,self.W2)
yHat = self.sigmoid(self.z3)
return yHat
def sigmoid(z):
return 1/(1+np.exp(-z))
You are using it as an instance method so you must include self as the first argument
class Neural_Networks(object):
def __init__(self):
self.inputLayerSize =2
self.outputLayerSize =1
self.hiddenLayerSize = 3
#weights
self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
def forward(self,X):
#propogates input through network
self.z2 = np.dot(X, self.W1)
self.a2 = self.sigmoid( self.z2 )
self.Z3 = np.dot(self.a2,self.W2)
yHat = self.sigmoid(self.z3)
return yHat
def sigmoid(self, z):
return 1/(1+np.exp(-z))
Conversely if you want to use sigmoid as a class method than you'll need to add a #staticmethod decorator to it eg:
#staticmethod
def sigmoid(z):
return 1/(1+np.exp(-z))
Making it a static method is likely the right option since you don't use self in the method.
I am trying to learn how to group functions by class. As an example, I tried to code a generalized least squares method to find the equation of a best-fitting line between a set of (x,y) coordinates. For my particular case, I chose a simple line y = x + 5, so slope should be close to 1 and y-intercept should be close to 5. Running my attempt at a coded solution below produces the error TypeError: set_x() takes 1 positional argument but 2 were given, though I am trying to pass an array of x-points. How can I circumvent this error?
import numpy as np
from scipy.optimize import minimize
class GeneralizedLeastSquares:
def __init__(self, residuals=None, parameters=None, x=None, y_true=None, y_fit=None, weights=None, method=None):
self.residuals = residuals
self.parameters = parameters
self.x = x
self.y_true = y_true
self.y_fit = y_fit
self.weights = weights
self.method = method
def set_residuals(self, residuals):
self.residuals = residuals
def set_parameters(self, parameters):
self.parameters = parameters
def set_x(self, x):
self.x = x
def set_y_true(self, y_true):
self.y_true = y_true
def set_y_fit(self, y_fit):
self.y_fit = y_fit
def set_weights(self, weights):
self.weights = weights
def set_method(self, method):
self.method = method
def get_residuals(self):
return [(self.y_true[idx] - self.y_fit[idx])**2 for idx in range(len(self.y_true)) if len(self.y_true) == len(self.y_fit) ]
def get_parameters(self):
return self.parameters
def get_x(self):
return self.x
def get_y_true(self):
return self.y_true
def get_y_fit(self):
return [self.parameters[0] * self.x[idx] + self.parameters[1] for idx in range(len(self.x))]
def get_weights(self):
return self.weights
def update_weights(self):
inverse_residuals = [1/self.residuals[idx] for idx in range(len(residuals))]
inverse_residuals_abs = [abs(inverse_residual) for inverse_residual in inverse_residuals]
residual_abs_total = sum(inverse_residuals_abs)
return [inverse_residuals_abs[idx]/residual_abs_total for idx in range(len(inverse_residuals_abs))]
def get_method(self):
return self.method
def get_error_by_residuals(self):
return sum([self.weights[idx] * self.residuals[idx] for idx in range(len(self.residuals))])
def get_error_by_std_mean(self):
return np.std(self.y_true)/np.sqrt(len(self.y_true))
def get_linear_fit(self):
"""
"""
if self.parameters == 'estimate':
slope_init = (self.y_true[-1] - self.y_true[0]) / (self.x[-1] - self.x[0])
b_init = np.mean([self.y_true[-1] - slope_init * self.x[-1], self.y_true[0] - slope_init * self.x[0]])
self.parameters = [slope_init, b_init]
elif not isinstance(self.parameters, (list, np.ndarray)):
raise ValueError("parameters = 'estimate' or [slope, y-intercept]")
meths = ['residuals', 'std of mean']
funcs = [get_error_by_residuals, get_error_by_std_mean]
func = dict(zip(meths, funcs))[self.method]
res = minimize(func, x0=self.parameters, args=(self,), method='Nelder-Mead')
self.parameters = [res.x[0], res.x[1]]
self.y_fit = get_y_fit(self)
self.residuals = get_residuals(self)
self.weights = update_weights(self)
return self.parameters, self.y_fit, self.residuals, self.weights
x = np.linspace(0, 4, 5)
y_true = np.linspace(5, 9, 5) ## using slope=1, y-intercept=5
y_actual = np.array([4.8, 6.2, 7, 8.1, 8.9]) ## test data
GLS = GeneralizedLeastSquares()
GLS.set_x(x)
GLS.set_y_true(y_actual)
GLS.set_weights(np.ones(len(x)))
GLS.set_parameters('estimate')
# GLS.set_parameters([1.2, 4.9])
GLS.set_method('residuals')
results = GLS.get_linear_fit()
print(results)
Your method is not taking an argument. It should be:
def set_x(self, x):
self.x = x
Wrapping properties in get/set methods is a very Java / outdated way of doing things. It is much easier to access the underlying property outside of your class. I.e. rather than: GLS.set_x(12), consider the more Pythonic: GLS.x = 12. This way you don't have to write a get and set method for each property.
Also, it might make more sense for the heavy lifting method of your object, get_linear_fit to be put in the __call__ method. This way, you can run the regression using by just typing GLS() rather than GLS.get_linear_fit()
I designed a variable net, but it occurred some problems with theano. The general idea is that different input will get different net with same parameters, something like a recursive neural network with auto-encoder.
There are two cases in my code, one case is run combine_feat_gt1_1() if c > 1, the other case is run combine_feat_gt1_0().
It is weird that the code can run without bugs if I comment updates=updates, which is not my expected (train_test theano function in code). However, if I uncomment updates=updates, an error occurred (train_test_bug theano function in code). The later one is that I'd like to implement.
I have been already spend some days on this bug. Who can help me? I will appreciate that.
import os
import sys
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from theano.ifelse import ifelse
class Test(object):
def __init__(
self,
numpy_rng,
input=None,
output=None,
n_output=6,
n_input=3,
n_group=2,
W_r=None,
b_r=None
):
self.n_output = n_output
self.n_input = n_input
self.n_group = n_group
if not W_r:
initial_W_r = numpy.asarray(
numpy_rng.uniform(
low=-4 * numpy.sqrt(6. / (n_input + n_input)),
high=4 * numpy.sqrt(6. / (n_input + n_input)),
size=(n_input, n_input)
),
dtype=theano.config.floatX
)
W_r = theano.shared(value=initial_W_r, name='W_r', borrow=True)
if not b_r:
b_r = theano.shared(
value=numpy.zeros(
n_input,
dtype=theano.config.floatX
),
borrow=True
)
self.W_r = W_r
self.b_r = b_r
if input is None:
self.x = T.tensor4(name='input', dtype=theano.config.floatX)
else:
self.x = input
if output is None:
self.y = T.matrix(name='output', dtype=theano.config.floatX)
else:
self.y = output
self.params = [self.W_r, self.b_r]
def get_output_values(self, input):
a, b, c, d = input.shape
def recusive(x_t, h_tm1, wr, hr):
h_t = T.dot(h_tm1, wr) + T.dot(x_t, wr) + hr
return h_t
def combine_recusive(data):
hidden, _ = theano.scan(fn=recusive,
sequences=data[1:],
outputs_info=data[0],
non_sequences=[self.W_r, self.b_r],
n_steps=data[1:].shape[0],
strict=True)
return hidden[-1]
def combine_feat_gt1_1(input):
feats, _ = theano.scan(fn=combine_recusive,
sequences=input[0],
outputs_info=None,
n_steps=input[0].shape[0])
recusive_flag = T.ones(1)
return T.reshape(feats, (1,-1)) # concatenation
def combine_feat_gt1_0(input):
feats = input[0]
recusive_flag = T.zeros(1)
return T.reshape(feats, (1,-1)) # concatenation
feat = ifelse(T.gt(c, 1), combine_feat_gt1_1(input), combine_feat_gt1_0(input))
# debug code snippet
self.debug_ifelse = theano.function([input], T.gt(c, 1))
self.debug_1_0 = theano.function([input], ifelse(T.gt(c, 1), 1, 0))
return feat
def get_cost_updates(self):
learning_rate = 0.1
self.y_given_x = self.get_output_values(self.x)
cost = T.sum(( self.y_given_x - self.y) ** 2)
gparams = T.grad(cost, self.params)
updates = [
(param, param - learning_rate * gparam)
for param, gparam in zip(self.params, gparams)
]
return (cost, updates)
if __name__ == "__main__":
toy_data = numpy.array([[[[1,1,1],[2,2,2]], [[3, 4,5],[4,5,6]]]],dtype=theano.config.floatX)
lable = numpy.array([[1,2,3,4,5,6]],dtype=theano.config.floatX)
toy_data2 = numpy.array([[[[1,1,1]], [[3,4,5]]]],dtype=theano.config.floatX)
lable2 = numpy.array([[6,5,4,3,2,1]],dtype=theano.config.floatX)
x = T.tensor4('x', dtype=theano.config.floatX)
y = T.matrix('y', dtype=theano.config.floatX)
newX = T.tensor4(dtype=x.dtype)
newY = T.matrix(dtype=y.dtype)
rng = numpy.random.RandomState(123)
test = Test(
numpy_rng=rng,
input=x,
output=y,
n_group=2,
n_input=3,
n_output=6
)
cost, updates= test.get_cost_updates()
train_test = theano.function(
[newX, newY],
cost,
# updates=updates,
givens={
x : newX,
y : newY
}
)
train_test_bug = theano.function(
[newX, newY],
cost,
updates=updates,
givens={
x : newX,
y : newY
}
)
print train_test(toy_data, lable)
print train_test(toy_data2, lable2)
# code with bug
# print train_test_bug(toy_data, lable)
# print train_test_bug(toy_data2, lable2)
EDIT (by #danielrenshaw)
I've cut the code down to a simpler demonstration of the problem.
The cause is in the gradient computation of a double-nested scan expression. The problem disappears when a modified inner-most recursive expression is used (see comments in first function below).
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t = tt.dot(h_tm1, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 + tt.dot(x_t_t, w)
return h_t
def outer_scan_step(x_t, w):
h, _ = theano.scan(inner_scan_step,
sequences=[x_t[1:]],
outputs_info=[x_t[0]],
non_sequences=[w],
strict=True)
return h[-1]
def get_outputs(x, w):
features, _ = theano.scan(outer_scan_step,
sequences=[x],
non_sequences=[w],
strict=True)
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w))
print f(x_value)
if __name__ == "__main__":
main()
I solved this problem edited by danielrenshaw. When I add h0 as outputs_info, it work. Before that I used first element of sequence as outputs_info, I think it caused the error. But I still cannot solve my original problem.
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t = tt.dot(h_tm1, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 + tt.dot(x_t_t, w)
return h_t
def outer_scan_step(x_t, w, h0):
h, _ = theano.scan(inner_scan_step,
sequences=[x_t],
outputs_info=[h0],
non_sequences=[w],
strict=True)
return h[-1]
def get_outputs(x, w, h0):
features, _ = theano.scan(outer_scan_step,
sequences=[x],
non_sequences=[w, h0],
strict=True)
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
h0 = theano.shared(value=numpy.zeros(3, dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w, h0))
print f(x_value)
if __name__ == "__main__":
main()
I've encountered the same issue and I fixed it by letting optimizer=fast_compile in theano_flags. Guess that is a bug of theano.