A weird error with updates in theano - theano

I designed a variable net, but it occurred some problems with theano. The general idea is that different input will get different net with same parameters, something like a recursive neural network with auto-encoder.
There are two cases in my code, one case is run combine_feat_gt1_1() if c > 1, the other case is run combine_feat_gt1_0().
It is weird that the code can run without bugs if I comment updates=updates, which is not my expected (train_test theano function in code). However, if I uncomment updates=updates, an error occurred (train_test_bug theano function in code). The later one is that I'd like to implement.
I have been already spend some days on this bug. Who can help me? I will appreciate that.
import os
import sys
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from theano.ifelse import ifelse
class Test(object):
def __init__(
self,
numpy_rng,
input=None,
output=None,
n_output=6,
n_input=3,
n_group=2,
W_r=None,
b_r=None
):
self.n_output = n_output
self.n_input = n_input
self.n_group = n_group
if not W_r:
initial_W_r = numpy.asarray(
numpy_rng.uniform(
low=-4 * numpy.sqrt(6. / (n_input + n_input)),
high=4 * numpy.sqrt(6. / (n_input + n_input)),
size=(n_input, n_input)
),
dtype=theano.config.floatX
)
W_r = theano.shared(value=initial_W_r, name='W_r', borrow=True)
if not b_r:
b_r = theano.shared(
value=numpy.zeros(
n_input,
dtype=theano.config.floatX
),
borrow=True
)
self.W_r = W_r
self.b_r = b_r
if input is None:
self.x = T.tensor4(name='input', dtype=theano.config.floatX)
else:
self.x = input
if output is None:
self.y = T.matrix(name='output', dtype=theano.config.floatX)
else:
self.y = output
self.params = [self.W_r, self.b_r]
def get_output_values(self, input):
a, b, c, d = input.shape
def recusive(x_t, h_tm1, wr, hr):
h_t = T.dot(h_tm1, wr) + T.dot(x_t, wr) + hr
return h_t
def combine_recusive(data):
hidden, _ = theano.scan(fn=recusive,
sequences=data[1:],
outputs_info=data[0],
non_sequences=[self.W_r, self.b_r],
n_steps=data[1:].shape[0],
strict=True)
return hidden[-1]
def combine_feat_gt1_1(input):
feats, _ = theano.scan(fn=combine_recusive,
sequences=input[0],
outputs_info=None,
n_steps=input[0].shape[0])
recusive_flag = T.ones(1)
return T.reshape(feats, (1,-1)) # concatenation
def combine_feat_gt1_0(input):
feats = input[0]
recusive_flag = T.zeros(1)
return T.reshape(feats, (1,-1)) # concatenation
feat = ifelse(T.gt(c, 1), combine_feat_gt1_1(input), combine_feat_gt1_0(input))
# debug code snippet
self.debug_ifelse = theano.function([input], T.gt(c, 1))
self.debug_1_0 = theano.function([input], ifelse(T.gt(c, 1), 1, 0))
return feat
def get_cost_updates(self):
learning_rate = 0.1
self.y_given_x = self.get_output_values(self.x)
cost = T.sum(( self.y_given_x - self.y) ** 2)
gparams = T.grad(cost, self.params)
updates = [
(param, param - learning_rate * gparam)
for param, gparam in zip(self.params, gparams)
]
return (cost, updates)
if __name__ == "__main__":
toy_data = numpy.array([[[[1,1,1],[2,2,2]], [[3, 4,5],[4,5,6]]]],dtype=theano.config.floatX)
lable = numpy.array([[1,2,3,4,5,6]],dtype=theano.config.floatX)
toy_data2 = numpy.array([[[[1,1,1]], [[3,4,5]]]],dtype=theano.config.floatX)
lable2 = numpy.array([[6,5,4,3,2,1]],dtype=theano.config.floatX)
x = T.tensor4('x', dtype=theano.config.floatX)
y = T.matrix('y', dtype=theano.config.floatX)
newX = T.tensor4(dtype=x.dtype)
newY = T.matrix(dtype=y.dtype)
rng = numpy.random.RandomState(123)
test = Test(
numpy_rng=rng,
input=x,
output=y,
n_group=2,
n_input=3,
n_output=6
)
cost, updates= test.get_cost_updates()
train_test = theano.function(
[newX, newY],
cost,
# updates=updates,
givens={
x : newX,
y : newY
}
)
train_test_bug = theano.function(
[newX, newY],
cost,
updates=updates,
givens={
x : newX,
y : newY
}
)
print train_test(toy_data, lable)
print train_test(toy_data2, lable2)
# code with bug
# print train_test_bug(toy_data, lable)
# print train_test_bug(toy_data2, lable2)
EDIT (by #danielrenshaw)
I've cut the code down to a simpler demonstration of the problem.
The cause is in the gradient computation of a double-nested scan expression. The problem disappears when a modified inner-most recursive expression is used (see comments in first function below).
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t = tt.dot(h_tm1, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 + tt.dot(x_t_t, w)
return h_t
def outer_scan_step(x_t, w):
h, _ = theano.scan(inner_scan_step,
sequences=[x_t[1:]],
outputs_info=[x_t[0]],
non_sequences=[w],
strict=True)
return h[-1]
def get_outputs(x, w):
features, _ = theano.scan(outer_scan_step,
sequences=[x],
non_sequences=[w],
strict=True)
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w))
print f(x_value)
if __name__ == "__main__":
main()

I solved this problem edited by danielrenshaw. When I add h0 as outputs_info, it work. Before that I used first element of sequence as outputs_info, I think it caused the error. But I still cannot solve my original problem.
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t = tt.dot(h_tm1, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 + tt.dot(x_t_t, w)
return h_t
def outer_scan_step(x_t, w, h0):
h, _ = theano.scan(inner_scan_step,
sequences=[x_t],
outputs_info=[h0],
non_sequences=[w],
strict=True)
return h[-1]
def get_outputs(x, w, h0):
features, _ = theano.scan(outer_scan_step,
sequences=[x],
non_sequences=[w, h0],
strict=True)
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
h0 = theano.shared(value=numpy.zeros(3, dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w, h0))
print f(x_value)
if __name__ == "__main__":
main()

I've encountered the same issue and I fixed it by letting optimizer=fast_compile in theano_flags. Guess that is a bug of theano.

Related

Why does the error say: class has no attribute in my case

I have the following code. I'm running a "fit" function and I'm getting the following error: 'P' object has no attribute 'iterations'. I do not really understand why its happening, I declare it in the "init" method.
import numpy as np
class P:
def __init__(self, iterations: int = 100):
self.w = None
self.iterations = iterations
self.classes_map = None
def fit(self, X: np.ndarray, y: np.ndarray) -> NoReturn:
X = np.hstack((np.ones((X.shape[0], 1)), X))
self.w = np.zeros(X.shape[1])
classes = np.unique(y)
self.classes_map = {-1: classes[0], 1: classes[1]}
y_ = np.where(y == classes[0], -1, 1)
for _ in range(self.iterations):
predictions = np.sign(X.dot(self.w))
incorrect_indices = predictions != y_
if np.any(incorrect_indices):
self.w += np.dot(y_[incorrect_indices], X[incorrect_indices])
def predict(self, X: np.ndarray) -> np.ndarray:
X = np.hstack((np.ones((X.shape[0], 1)), X))
predictions = np.sign(X.dot(self.w))
predictions[predictions == -1] = self.classes_map[-1]
predictions[predictions == 1] = self.classes_map[1]
return predictions
So I try to call it:
X, true_labels = make_blobs(400, 2, centers=[[0, 0], [2.5, 2.5]])
c = P()
c.fit(X, true_labels) #AttributeError: 'P' object has no attribute 'iterations'

How to remove inplace operation error in Pytorch?

I get this error from the following Pytorch code:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [3]] is at version 10; expected version 9 instead.
As it is seen the code does not have inplace operations.
import torch
device = torch.device('cpu')
class MesNet(torch.nn.Module):
def __init__(self):
super(MesNet, self).__init__()
self.cov_lin = torch.nn.Sequential(torch.nn.Linear(6, 5)).double()
def forward(self, u):
z_cov = self.cov_lin(u.transpose(0, 2).squeeze(-1))
return z_cov
class UpdateModel(torch.nn.Module):
def __init__(self):
torch.nn.Module.__init__(self)
self.P_dim = 18
self.Id3 = torch.eye(3).double()
def run_KF(self):
N = 10
u = torch.randn(N, 6).double()
v = torch.zeros(N, 3).double()
model = MesNet()
measurements_covs_l = model(u.t().unsqueeze(0))
# remember to remove this afterwards
torch.autograd.set_detect_anomaly(True)
for i in range(1, N):
v[i] = self.update_pos(v[i].detach(), measurements_covs_l[i-1])
criterion = torch.nn.MSELoss(reduction="sum")
targ = torch.rand(10, 3).double()
loss = criterion(v, targ)
loss = torch.mean(loss)
loss.backward()
return v, p
def update_pos(self, v, measurement_cov):
Omega = torch.eye(3).double()
H = torch.ones((5, self.P_dim)).double()
R = torch.diag(measurement_cov)
Kt = H.t().mm(torch.inverse(R))
# it is indicating inplace error even with this:
# Kt = H.t().mm(R)
dx = Kt.mv(torch.ones(5).double())
dR = self.trans(dx[:9].clone())
v_up = dR.mv(v)
return v_up
def trans(self, xi):
phi = xi[:3].clone()
angle = torch.norm(phi.clone())
if angle.abs().lt(1e-10):
skew_phi = torch.eye(3).double()
J = self.Id3 + 0.5 * skew_phi
Rot = self.Id3 + skew_phi
else:
axis = phi / angle
skew_axis = torch.eye(3).double()
s = torch.sin(angle)
c = torch.cos(angle)
Rot = c * self.Id3
return Rot
net = UpdateModel()
net.run_KF()
I think the issue is that you are overwriting v[i] elements.
You could instead construct an auxiliary list v_ from the loop, then convert it tensor:
v_ = [v[0]]
for i in range(1, N):
v_.append(self.update_pos(v[i].detach(), measurements_covs_l[i-1]))
v = torch.stack(v_)

Pytorch autograd: Make gradient of a parameter a function of another parameter

In Pytorch, how can I make the gradient of a parameter a function itself?
Here is a simple code snippet:
import torch
def fun(q):
def result(w):
l = w * q
l.backward()
return w.grad
return result
w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
print(f(w))
In the code above, how can I make f(w) have gradient with respect to q?
EDIT: based on the accepted answer I was able to write a code that works. Essentially I am alternating between 2 optimization steps. For dim == 1 it works and for dim == 2 it does not. I get the error "RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling backward the first time."
import torch
class f_class():
def __init__(self, dim):
self.dim = dim
if self.dim == 1:
self.w = torch.tensor((3.), requires_grad=True)
elif self.dim == 2:
self.w = [torch.tensor((3.), requires_grad=True), torch.tensor((5.), requires_grad=True)]
else:
raise ValueError("dim 1 or 2")
def forward(self, x):
if self.dim == 1:
return torch.mul(self.w, x)
elif self.dim == 2:
return torch.mul(torch.mul(self.w[0], self.w[1]), x)
def set_w(self, w):
self.w = w
def get_w(self):
return self.w
class g_class():
def __init__(self):
self.q = torch.tensor((4.), requires_grad=True)
def forward(self, f):
return torch.mul(self.q, f)
def set_q(self, q):
self.q = q
def get_q(self):
return self.q
def w_new(f, g, dim):
loss_g = g.forward(f.forward(xd))
if dim == 1:
grads = torch.autograd.grad(loss_g, f.get_w(), create_graph=True, only_inputs=True)[0]
temp = f.get_w().detach() + grads
else:
grads = torch.autograd.grad(loss_g, f.get_w(), create_graph=True, only_inputs=True)
temp = [wi.detach() + gi for wi, gi in zip(f.get_w(), grads)]
return temp
def q_new(f, g):
loss_f = 2 * f.forward(xd)
loss_f.backward()
temp = g.get_q().detach() + g.get_q().grad
temp.requires_grad = True
return temp
dim = 1
xd = torch.tensor((2.))
f = f_class(dim)
g = g_class()
for _ in range(3):
print(f.get_w(), g.get_q())
wnew = w_new(f, g, dim)
f.set_w(wnew)
print(f.get_w(), g.get_q())
qnew = q_new(f, g)
g.set_q(qnew)
print(f.get_w(), g.get_q())
When computing gradients, if you want to construct a computation graph for the gradient itself you need to specify create_graph=True to autograd.
A potential source of error in your code is using Tensor.backward within f. The problem here is that w.grad and q.grad will be populated with the gradient of l. This means that when you call f(w).backward(), the gradients of both f and l will be added to w.grad and q.grad. In effect you will end up with w.grad being equal to dl/dw + df/dw and similarly for q.grad. One way to get around this is to zero the gradients after f(w) but before .backward(). A better way is to use torch.autograd.grad within f. Using the latter approach, the grad attribute of w and q will not be populated when calling f, only when calling .backward(). This leaves room for things like gradient accumulation during training.
import torch
def fun(q):
def result(w):
l = w * q
return torch.autograd.grad(l, w, only_inputs=True, retain_graph=True)[0]
return result
w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
f(w).backward()
print('w.grad:', w.grad)
print('q.grad:', q.grad)
which results in
w.grad: None
q.grad: tensor(1.)
Note that w.grad was not populated. This is because f(w) = dl/dw = q is not a function of w, and therefore w is not part of the computation graph. If you're using a standard pytorch optimizer this is fine since None gradients are implicitly assumed to be zero.
If l were instead a non-linear function of w, then w.grad would have been populated after f(w).backward(). For example
import torch
def fun(q):
def result(w):
# now dl/dw = 2 * w * q
l = w**2 * q
return torch.autograd.grad(l, w, only_inputs=True, create_graph=True)[0]
return result
w = torch.tensor((2.), requires_grad=True)
q = torch.tensor((3.), requires_grad=True)
f = fun(q)
f(w).backward()
print('w.grad:', w.grad)
print('q.grad:', q.grad)
which results in
w.grad: tensor(6.)
q.grad: tensor(4.)

Having Problems With Implementing the Perceptron Algorithm in Python

I'm having a problem debugging the following code, for some reason The perceptron stops updating itself after a couple of steps with random values as the weights. I have tried not using a class for my work and edited everything to the bare minimum, but still had the same problem. I have also checked the Perceptron.train(), and it works just fine. So, I'm guessing the main problem is with the train function itself. I am kind of new to python programming so any help would be apreciated guys.
import random
import Plot as plt
import numpy as np
#-----Function Of the line that seperates the two different Data Types-----$
def f(x):
return x
#-----Activation Function-----#
def act(x):
if x >= 0:
return 1.0
return 0.0
class Point:
def __init__(self, x, y):
self.X = x
self.Y = y
if y > f(x):
self.Target = 1.0
else:
self.Target = 0.0
class Perceptron:
def __init__(self, n, actFunc = act, lr = 0.2):
self.Weights = [0 for i in range(n)]
self.ActFunc = actFunc
self.LR = lr
def guess(self, inputs):
valSum = 0
for i in range(len(inputs)):
valSum += self.Weights[i] * inputs[i]
return self.ActFunc(valSum)
def train(self, inputs, target):
cal = self.guess(inputs)
err = target - cal
for i in range(0, len(self.Weights)):
self.Weights[i] += self.LR * err * inputs[i]
def printWeights(self):
for i in range(len(self.Weights)):
print("WEIGHT[" + str(i) + "] = " + str(self.Weights[i]))
print("")
def lineFunc(self):
# y = w0 + w1x + w2y
# (1 - w2)y = w0 + w1x
# y = w0/(1-w2) + w1/(1 - w2)x
w0 = self.Weights[0]
w1 = self.Weights[1]
w2 = self.Weights[2]
return (str(w0/(1 - w2)) + " + " + str(w1/(1 - w2)) + " * x")
#-----INITIALISING DATA------#
brain = Perceptron(3)
n = 20
points = [Point(random.uniform(-10, 10), random.uniform(-10, 10)) for x in range(n)]
t = 1000
#-----Training-----#
for i in range(t):
point = points[random.randrange(0, n)]
brain.train([1, point.X, point.Y], point.Target)
brain.printWeights()
print(brain.lineFunc())
I did find the problem myself. There was a bug in the LineFunc() method. The return value was wrong and it should have been:
return (str(-w0/w2) + " + " + str(-w1/w2) + " * x")

a simple tensorflow test from book

I copy a simple tensorflow test from the book written by Sam Abrahams. In chapter 4,when I test the softmax.py about the iris.data,the program has no errors but does not have any results. I debug the program for several days but don't know how to debug it.The code is as follows. This problem puzzles me almost one week and thanks to anyone answers this question. Thank you very much!
import tensorflow as tf
import os
W = tf.Variable(tf.zeros([4, 3]), name="weights")
b = tf.Variable(tf.zeros([3]), name="bias")
def combine_inputs(X):
return tf.matmul(X, W) + b
def inference(X):
return tf.nn.softmax(combine_inputs(X))
def loss(X, Y):
return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=combine_inputs(X), labels=Y))
def read_csv(batch_size, file_name, record_defaults):
filename_queue = tf.train.string_input_producer([os.path.dirname(os.path.abspath(__file__)) + "/" + file_name])
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
decoded = tf.decode_csv(value, record_defaults=record_defaults)
return tf.train.shuffle_batch(decoded,
batch_size=batch_size,
capacity=batch_size * 50,
min_after_dequeue=batch_size)
def inputs():
sepal_length, sepal_width, petal_length, petal_width, label =\
read_csv(100, "iris.data", [[0.0], [0.0], [0.0], [0.0], [""]])
label_number = tf.to_int32(tf.argmax(tf.to_int32(tf.stack([
tf.equal(label, ["Iris-setosa"]),
tf.equal(label, ["Iris-versicolor"]),
tf.equal(label, ["Iris-virginica"])
])), 0))
features = tf.transpose(tf.stack([sepal_length, sepal_width, petal_length, petal_width]))
return features, label_number
def train(total_loss):
learning_rate = 0.01
return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)
def evaluate(sess, X, Y):
predicted = tf.cast(tf.argmax(inference(X), 1), tf.int32)
print (sess.run(tf.reduce_mean(tf.cast(tf.equal(predicted, Y), tf.float32))))
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
tf.global_variables_initializer().run()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
X, Y = inputs()
total_loss = loss(X, Y)
train_op = train(total_loss)
training_steps = 1000
for step in range(training_steps):
sess.run([train_op])
evaluate(sess, X, Y)
coord.request_stop()
coord.join(threads)
sess.close()

Resources