How to handle JAX reshape with JIT

I am trying to implement entmax-alpha as described here.
Here is the code.
import jax
import jax.numpy as jnp
from jax import custom_jvp
from jax import jit
from jax import lax
from jax import vmap
#jax.partial(jit, static_argnums=(2,))
def p_tau(z, tau, alpha=1.5):
    """Evaluate the thresholded power map used by the entmax bisection.

    Computes ``max((alpha - 1) * z - tau, 0) ** (1 / (alpha - 1))``
    element-wise.

    Args:
        z: Array of scores.
        tau: Threshold subtracted from the scaled scores.
        alpha: Entmax exponent. Must differ from 1, otherwise the exponent
            ``1 / (alpha - 1)`` is undefined.

    Returns:
        Array with the same shape as ``z``.
    """
    # jnp.maximum replaces jnp.clip(..., a_min=0): the ``a_min`` keyword is
    # deprecated in recent JAX releases, and the lower clamp is identical.
    shifted = (alpha - 1) * z - tau
    return jnp.maximum(shifted, 0) ** (1 / (alpha - 1))
def get_tau(tau, tau_max, tau_min, z_value):
    """Choose the next bisection bracket around the threshold.

    Args:
        tau: Current midpoint of the bracket.
        tau_max: Current upper end of the bracket.
        tau_min: Current lower end of the bracket.
        z_value: Total mass obtained with threshold ``tau``.

    Returns:
        Tuple ``(new_tau_max, new_tau_min)``. When ``z_value < 1`` the upper
        end moves down to ``tau``; otherwise the lower end moves up to ``tau``.
    """
    # The operand is unused by both branches; None is passed only because
    # lax.cond needs something to hand to the branch functions.
    return lax.cond(z_value < 1,
                    lambda _: (tau, tau_min),
                    lambda _: (tau_max, tau),
                    None)
def body(kwargs, x):
    """Perform one bisection step; shaped as a ``lax.scan`` body.

    Args:
        kwargs: Carry dict with keys ``'tau_min'``, ``'tau_max'``, ``'z'``
            and ``'alpha'``.
        x: Per-step scan input; not used.

    Returns:
        ``(carry, None)`` where ``carry`` has the same keys as ``kwargs`` and
        an updated ``tau_min`` / ``tau_max`` bracket.
    """
    z, alpha = kwargs['z'], kwargs['alpha']
    lower, upper = kwargs['tau_min'], kwargs['tau_max']

    midpoint = (lower + upper) / 2
    mass = p_tau(z, midpoint, alpha).sum()

    # get_tau returns the bracket as (tau_max, tau_min), in that order.
    upper, lower = get_tau(midpoint, upper, lower, mass)

    carry = {'tau_min': lower, 'tau_max': upper, 'z': z, 'alpha': alpha}
    return carry, None
#jax.partial(jit, static_argnums=(1, 2,))
def map_row(z_input, alpha, T):
    """Apply the bisection-based entmax mapping to a single 1-D row.

    Args:
        z_input: 1-D array of scores.
        alpha: Entmax exponent.
        T: Number of bisection iterations (must be a static Python int).

    Returns:
        1-D array of the same length as ``z_input`` that sums to one.
    """
    z = (alpha - 1) * z_input
    # NOTE(review): p_tau multiplies its input by (alpha - 1) again, so the
    # scores end up scaled twice -- confirm this is intended.
    tau_min = jnp.min(z) - 1
    tau_max = jnp.max(z) - z.shape[0] ** (1 - alpha)

    init = {'tau_min': tau_min, 'tau_max': tau_max, 'z': z, 'alpha': alpha}
    # With xs=None the number of steps has to be given via ``length``.
    result, _ = lax.scan(body, init, xs=None, length=T)

    tau = (result['tau_max'] + result['tau_min']) / 2
    probs = p_tau(z, tau, alpha)
    # Renormalise because a finite number of bisection steps leaves a
    # residual error in the total mass.
    return probs / probs.sum()
#jax.partial(custom_jvp, nondiff_argnums=(1, 2, 3,))
def entmax(input, axis=-1, alpha=1.5, T=10):
    """Apply entmax along ``axis`` of an array of any rank.

    Args:
        input: Array of scores.
        axis: Axis to normalise over. Must be a concrete Python int (not a
            traced value) because it is used to index ``shape``.
        alpha: Entmax exponent.
        T: Number of bisection iterations.

    Returns:
        Array with the same shape as ``input``.
    """
    # jax.partial is no longer available in JAX; functools.partial is the
    # drop-in equivalent.
    from functools import partial

    moved = jnp.swapaxes(input, -1, axis)
    reduce_length = moved.shape[-1]
    # -1 lets reshape infer the row count; the original float division
    # (size / reduce_length) is not a valid shape entry.
    rows = moved.reshape(-1, reduce_length)
    result = vmap(partial(map_row, alpha=alpha, T=T), 0)(rows)
    # Restore the leading dimensions before swapping the axis back, so
    # inputs with more than two dimensions keep their shape.
    return jnp.swapaxes(result.reshape(moved.shape), -1, axis)
#jax.partial(jit, static_argnums=(1, 2,))
def _entmax_jvp_impl(axis, alpha, T, primals, tangents):
    """Compute the entmax output and its tangent for a custom JVP rule.

    Args:
        axis: Axis entmax is applied over.
        alpha: Entmax exponent.
        T: Number of bisection iterations.
        primals: Sequence whose first element is the input array.
        tangents: Sequence whose first element is the input tangent.

    Returns:
        ``(Y, dX)``: the entmax output and the corresponding output tangent.
    """
    Y = entmax(primals[0], axis, alpha, T)
    gppr = Y ** (2 - alpha)

    weighted = tangents[0] * gppr
    # keepdims=True leaves q broadcastable against gppr along ``axis``.
    q = weighted.sum(axis=axis, keepdims=True) / gppr.sum(axis=axis, keepdims=True)
    return Y, weighted - q * gppr
def entmax_jvp(axis, alpha, T, primals, tangents):
    """Forward to ``_entmax_jvp_impl`` and return its ``(Y, dX)`` pair."""
    outputs = _entmax_jvp_impl(axis, alpha, T, primals, tangents)
    return outputs
When I call it with the following code:
import numpy as np
from jax import value_and_grad
input = jnp.array(np.random.randn(64, 10))
weight = jnp.array(np.random.randn(64, 10))
def toy(input, weight):
return (weight*entmax(input, axis=-1, alpha=1.5, T=20)).sum()
value_and_grad(toy)(input, weight)
I got the following error.
TypeError Traceback (most recent call last)
<ipython-input-3-3a62e54c67d2> in <module>()
7 return (weight*entmax(input, axis=-1, alpha=1.5, T=20)).sum()
----> 9 value_and_grad(toy)(input, weight)
35 frames
<ipython-input-1-d85b1daec668> in entmax(input, axis, alpha, T)
49 #jax.partial(custom_jvp, nondiff_argnums=(1, 2, 3,))
50 def entmax(input, axis=-1, alpha=1.5, T=10):
---> 51 reduce_length = input.shape[axis]
52 input = jnp.swapaxes(input, -1, axis)
53 input = input.reshape(input.size / reduce_length, reduce_length)
TypeError: tuple indices must be integers or slices, not DynamicJaxprTracer
It seems to be always connected to the reshape operations. I am not sure why this happens, and any help will be really appreciated.
To recreate the problem, here is the colab notebook
Thanks a lot.

The error comes from the fact that you are attempting to index a Python tuple with a traced quantity, axis. You can fix this error by making axis a static argument:
#jax.partial(jit, static_argnums=(0, 1, 2,))
def _entmax_jvp_impl(axis, alpha, T, primals, tangents):
Unfortunately, this uncovers another problem: p_tau declares that the alpha parameter is static, but body() calls this with a traced quantity. This quantity cannot be easily marked static in body because it is passed within a dictionary of parameters that contains the input that is being traced.
To fix this, you'll have to rewrite your function signatures, carefully marking in each one which inputs are static and which are not, and making sure the two do not mix across the layers of function calls.


Determine the rating for text quality in an Image

I want to determine the quality of the text in an image by giving it a score or rating (something like "image-text is 90% bad; the text is not readable").
What I am doing now is I am using the Blind/referenceless image spatial quality evaluator (BRISQUE) model to assess the quality.
It gives scores from 0 to 100. 0 score for good quality and 100 for bad quality.
The problem I am having with this code is that it is giving bad scores to even good quality "images-texts".
Also, the score exceeds 100 sometimes but according to the reference I am taking, the score should be between 0 to 100 only.
Can someone please suggest how I can get reliable results when assessing the quality of text-based images?
import collections
from itertools import chain
# import urllib.request as request
import pickle
import numpy as np
import scipy.signal as signal
import scipy.special as special
import scipy.optimize as optimize
# import matplotlib.pyplot as plt
import skimage.transform
import cv2
from libsvm import svmutil
from os import listdir
# Calculating Local Mean
def normalize_kernel(kernel):
    """Scale ``kernel`` so that its entries sum to one."""
    total = np.sum(kernel)
    return kernel / total
def gaussian_kernel2d(n, sigma):
    """Build an ``n`` x ``n`` Gaussian kernel whose entries sum to one.

    Args:
        n: Side length of the kernel.
        sigma: Standard deviation of the Gaussian.

    Returns:
        2-D array of shape ``(n, n)``.
    """
    half = int(n / 2)
    rows, cols = np.indices((n, n)) - half
    squared_radius = cols ** 2 + rows ** 2
    weights = 1 / (2 * np.pi * sigma ** 2) * np.exp(-squared_radius / (2 * sigma ** 2))
    # Normalise inline so the discrete kernel sums exactly to one.
    return weights / np.sum(weights)
def local_mean(image, kernel):
    """Convolve ``image`` with ``kernel``, keeping the image's shape."""
    return signal.convolve2d(image, kernel, mode='same')
# Calculating the local deviation
def local_deviation(image, local_mean, kernel):
    """Vectorized approximation of the local standard deviation.

    Computes ``sqrt(|local_mean**2 - conv(image**2, kernel)|)``.

    Args:
        image: 2-D image array of any numeric dtype.
        local_mean: Array holding the kernel-weighted local mean of ``image``.
        kernel: 2-D weighting kernel.

    Returns:
        2-D float array with the same shape as ``image``.
    """
    # Square in float64: integer images (e.g. the uint8 arrays returned by
    # cv2.imread / cv2.cvtColor) would wrap around in ``image ** 2``.
    pixels = np.asarray(image, dtype=np.float64)
    mean_of_squares = signal.convolve2d(pixels ** 2, kernel, 'same')
    return np.sqrt(np.abs(local_mean ** 2 - mean_of_squares))
# Calculate the MSCN coefficients
def calculate_mscn_coefficients(image, kernel_size=6, sigma=7 / 6):
    """Compute mean-subtracted, contrast-normalised (MSCN) coefficients.

    Args:
        image: 2-D grayscale image.
        kernel_size: Side length of the Gaussian weighting window.
        sigma: Standard deviation of the Gaussian weighting window.

    Returns:
        2-D float array with the same shape as ``image``.
    """
    # Work in float64 so integer images (e.g. uint8 from OpenCV) cannot
    # overflow when they are squared in local_deviation.
    # NOTE(review): C = 1 / 255 suggests intensities are expected in [0, 1];
    # confirm the value range of the images passed in.
    pixels = np.asarray(image, dtype=np.float64)
    C = 1 / 255
    kernel = gaussian_kernel2d(kernel_size, sigma=sigma)
    mean_map = signal.convolve2d(pixels, kernel, 'same')
    deviation_map = local_deviation(pixels, mean_map, kernel)
    return (pixels - mean_map) / (deviation_map + C)
# It is found that the MSCN coefficients are distributed as a Generalized Gaussian Distribution (GGD) for a broader spectrum of distorted image.
# Calculate GGD
def generalized_gaussian_dist(x, alpha, sigma):
    """Evaluate the generalized Gaussian density at ``x``.

    Args:
        x: Point or array of points.
        alpha: Shape parameter (``alpha == 2`` gives the normal density).
        sigma: Standard deviation.

    Returns:
        Density value(s) with the same shape as ``x``.
    """
    beta = sigma * np.sqrt(special.gamma(1 / alpha) / special.gamma(3 / alpha))
    # ``beta`` is a number, not a callable: the original ``beta()`` raised
    # a TypeError.
    coefficient = alpha / (2 * beta * special.gamma(1 / alpha))
    return coefficient * np.exp(-(np.abs(x) / beta) ** alpha)
# Pairwise products of neighboring MSCN coefficients
def calculate_pair_product_coefficients(mscn_coefficients):
    """Return the MSCN map together with its four neighbour products.

    Args:
        mscn_coefficients: 2-D array of MSCN coefficients.

    Returns:
        ``OrderedDict`` with keys ``'mscn'``, ``'horizontal'``,
        ``'vertical'``, ``'main_diagonal'`` and ``'secondary_diagonal'``,
        in that order.
    """
    return collections.OrderedDict({
        'mscn': mscn_coefficients,
        'horizontal': mscn_coefficients[:, :-1] * mscn_coefficients[:, 1:],
        'vertical': mscn_coefficients[:-1, :] * mscn_coefficients[1:, :],
        'main_diagonal': mscn_coefficients[:-1, :-1] * mscn_coefficients[1:, 1:],
        'secondary_diagonal': mscn_coefficients[1:, :-1] * mscn_coefficients[:-1, 1:],
    })
# Asymmetric Generalized Gaussian Distribution (AGGD) model
def asymmetric_generalized_gaussian(x, nu, sigma_l, sigma_r):
    """Evaluate the asymmetric generalized Gaussian density at ``x``.

    Args:
        x: Array of points.
        nu: Shape parameter.
        sigma_l: Scale used for the negative side (``x < 0``).
        sigma_r: Scale used for the non-negative side.

    Returns:
        Array of density values with the same shape as ``x``.
    """
    gamma_ratio = np.sqrt(special.gamma(1 / nu) / special.gamma(3 / nu))
    beta_l = sigma_l * gamma_ratio
    beta_r = sigma_r * gamma_ratio
    coefficient = nu / ((beta_l + beta_r) * special.gamma(1 / nu))

    # Both sides are evaluated everywhere; np.where then picks per element.
    left = coefficient * np.exp(-(-x / beta_l) ** nu)
    right = coefficient * np.exp(-(x / beta_r) ** nu)
    return np.where(x < 0, left, right)
# Fitting Asymmetric Generalized Gaussian Distribution
def asymmetric_generalized_gaussian_fit(x):
    """Estimate asymmetric generalized Gaussian parameters by moment matching.

    Args:
        x: Array of samples (any shape).

    Returns:
        Tuple ``(alpha, mean, sigma_l, sigma_r)`` of scalars: the shape
        parameter, the distribution mean, and the scales of the negative
        and non-negative sides.
    """
    def estimate_phi(alpha):
        numerator = special.gamma(2 / alpha) ** 2
        denominator = special.gamma(1 / alpha) * special.gamma(3 / alpha)
        return numerator / denominator

    def estimate_r_hat(values):
        size = values.size
        return (np.sum(np.abs(values)) / size) ** 2 / (np.sum(values ** 2) / size)

    def estimate_R_hat(r_hat, gamma):
        numerator = (gamma ** 3 + 1) * (gamma + 1)
        denominator = (gamma ** 2 + 1) ** 2
        return r_hat * numerator / denominator

    def mean_squares_sum(values, mask_fn):
        selected = values[mask_fn(values)]
        # Divide by the element count, not by the ``.shape`` tuple: the
        # latter turns every downstream estimate into a 1-element array.
        return np.sum(selected ** 2) / selected.size

    def estimate_gamma(values):
        left_squares = mean_squares_sum(values, lambda z: z < 0)
        right_squares = mean_squares_sum(values, lambda z: z >= 0)
        return np.sqrt(left_squares) / np.sqrt(right_squares)

    def estimate_alpha(values):
        r_hat = estimate_r_hat(values)
        gamma = estimate_gamma(values)
        R_hat = estimate_R_hat(r_hat, gamma)
        # Solve estimate_phi(alpha) == R_hat, starting from alpha = 0.2.
        solution = optimize.root(lambda z: estimate_phi(z) - R_hat, [0.2]).x
        return solution[0]

    def estimate_sigma(values, mask_fn):
        return np.sqrt(mean_squares_sum(values, mask_fn))

    alpha = estimate_alpha(x)
    sigma_l = estimate_sigma(x, lambda z: z < 0)
    sigma_r = estimate_sigma(x, lambda z: z >= 0)

    constant = np.sqrt(special.gamma(1 / alpha) / special.gamma(3 / alpha))
    mean = (sigma_r - sigma_l) * constant * (special.gamma(2 / alpha) / special.gamma(1 / alpha))

    return alpha, mean, sigma_l, sigma_r
# Calculate BRISQUE features
def calculate_brisque_features(image, kernel_size=7, sigma=7 / 6):
    """Compute the 18 BRISQUE features of one image scale.

    The MSCN map contributes ``[alpha, variance]``; each of the four
    pairwise-product maps contributes
    ``[alpha, mean, sigma_l ** 2, sigma_r ** 2]``.

    Args:
        image: 2-D grayscale image.
        kernel_size: Side length of the Gaussian weighting window.
        sigma: Standard deviation of the Gaussian weighting window.

    Returns:
        1-D object-dtype array of features.
    """
    def features_for(name, values):
        alpha, mean, sigma_l, sigma_r = asymmetric_generalized_gaussian_fit(values)
        left_var, right_var = sigma_l ** 2, sigma_r ** 2
        if name == 'mscn':
            return [alpha, (left_var + right_var) / 2]
        return [alpha, mean, left_var, right_var]

    mscn = calculate_mscn_coefficients(image, kernel_size, sigma)
    products = calculate_pair_product_coefficients(mscn)

    collected = []
    for name, values in products.items():
        collected.extend(features_for(name, values))
    return np.array(collected, dtype=object)
# Loading image from local machine
def load_image(file):
    """Read the image at path ``file`` with OpenCV and return the result."""
    loaded = cv2.imread(file)
    return loaded
# return"img.png", plugin='pil')
path = "C:\\Users\\Krishna\\PycharmProjects\\ImageScore\\images2\\"
image_list = listdir(path)
for file in image_list:
image = load_image(path+file)
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# image = load_image()
# gray_image = skimage.color.rgb2gray(image)
# _ =
# Calculate Coefficients
mscn_coefficients = calculate_mscn_coefficients(gray_image, 7, 7/6)
coefficients = calculate_pair_product_coefficients(mscn_coefficients)
# Fit Coefficients to Generalized Gaussian Distributions
brisque_features = calculate_brisque_features(gray_image, kernel_size=7, sigma=7/6)
# Resize Image and Calculate BRISQUE Features
downscaled_image = cv2.resize(gray_image, None, fx=1/2, fy=1/2, interpolation = cv2.INTER_CUBIC)
downscale_brisque_features = calculate_brisque_features(downscaled_image, kernel_size=7, sigma=7/6)
brisque_features = np.concatenate((brisque_features, downscale_brisque_features))
# a pretrained SVR model to calculate the quality assessment. However, in order to have good results, we need to scale the features to [-1, 1]
def scale_features(features):
    """Linearly rescale ``features`` using the ranges in ``normalize.pickle``.

    The pickle file must hold a mapping with ``'min_'`` and ``'max_'``
    entries; a feature equal to its ``min_`` maps to -1 and one equal to its
    ``max_`` maps to +1.

    Args:
        features: Array of raw feature values.

    Returns:
        Array of rescaled features.
    """
    # NOTE: pickle.load can execute arbitrary code; only use a trusted file.
    with open('normalize.pickle', 'rb') as handle:
        scale_params = pickle.load(handle)

    lower = np.array(scale_params['min_'])
    upper = np.array(scale_params['max_'])
    span = upper - lower
    return -1 + (2.0 / span * (features - lower))
def calculate_image_quality_score(brisque_features):
    """Predict a quality score from BRISQUE features with a saved SVM model.

    Loads the model from ``brisque_svm.txt``, rescales the features with
    ``scale_features`` and returns the libsvm prediction.

    Args:
        brisque_features: Array of raw BRISQUE features.

    Returns:
        The value returned by ``svm_predict_probability`` for the features.
    """
    model = svmutil.svm_load_model('brisque_svm.txt')
    scaled_brisque_features = scale_features(brisque_features)

    # The feature vector is the first argument of gen_svm_nodearray; it was
    # missing from the call, which left nothing to convert.
    x, idx = svmutil.gen_svm_nodearray(
        scaled_brisque_features,
        isKernel=(model.param.kernel_type == svmutil.PRECOMPUTED))

    nr_classifier = 1
    prob_estimates = (svmutil.c_double * nr_classifier)()
    return svmutil.libsvm.svm_predict_probability(model, x, prob_estimates)
Here is one output for the quality score I am getting for one of the "text-based image"

How to resolve ValueError in SciPy function fmin_tnc?

I am trying to implement the Coursera assignments in Python. While using SciPy's optimizer for logistic regression, I get the error below.
Can any one help!
Note: cost, gradient functions are working fine.
#Sigmoid function
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    Args:
        z: Scalar or array.

    Returns:
        Value(s) in (0, 1) with the same shape as ``z``.
    """
    # The former np.zeros([z.shape[0]]) pre-allocation was immediately
    # overwritten and made plain scalars fail, so it is dropped.
    return np.divide(1, 1 + np.exp(-z))
def cost(x, y, theta):
    """Logistic-regression cross-entropy cost.

    NOTE: ``scipy.optimize.fmin_tnc`` calls ``func(x0, *args)``, i.e. it
    passes the optimised variable first. With this parameter order the
    function cannot be handed to ``fmin_tnc`` directly.

    Args:
        x: Design matrix of shape (m, n).
        y: Labels of shape (m, 1).
        theta: Parameters of shape (n, 1).

    Returns:
        The mean cross-entropy as a scalar.
    """
    m = y.shape[0]
    h_of_x = sigmoid(np.matmul(x, theta))
    # ``@`` is matrix multiplication; np.sum collapses the (1, 1) result.
    term1 = np.sum(-1 * y.T @ np.log(h_of_x) - (1 - y.T) @ np.log(1 - h_of_x))
    return 1 / m * term1
def grad(x, y, theta):
    """Gradient of the logistic-regression cost with respect to ``theta``.

    Args:
        x: Design matrix of shape (m, n).
        y: Labels of shape (m, 1).
        theta: Parameters of shape (n, 1).

    Returns:
        Array of shape (n, 1).
    """
    m = y.shape[0]
    h_of_x = sigmoid(x @ theta)
    return (x.T @ (h_of_x - y)) * (1 / m)
#add intercept term for X
x = np.hstack([np.ones_like(y),X[:,0:2]])
#initialise theta
[m,n] = np.shape(x)
initial_theta = np.zeros([n,1])
#optimising theta from given theta and gradient
result = opt.fmin_tnc(func=cost, x0=initial_theta, args=(x, y))
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 99 is different from 3)
I got it!
The problem is that fmin_tnc is written so that the parameter being optimised ('theta') must be the first argument of the function, before the extra arguments x and y.
Since my 'cost' function took x and y first, the values were interpreted in the wrong order, which caused the ValueError.
Below is the corrected code.
def sigmoid(x):
    """Return the logistic function of ``x`` element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def cost(theta, x, y):
    """Logistic-regression cross-entropy cost, in optimiser argument order.

    Args:
        theta: Parameters, shape (n,) or (n, 1).
        x: Design matrix of shape (m, n).
        y: Labels, shape (m,) or (m, 1).

    Returns:
        The mean cross-entropy as a scalar.
    """
    # Flatten both vectors: optimisers pass theta as 1-D, and mixing (m,)
    # with (m, 1) would silently broadcast to an (m, m) matrix.
    labels = np.ravel(y)
    m = labels.shape[0]  # taken from the data instead of a global ``m``
    h_of_x = sigmoid(x @ np.ravel(theta))
    return (-1 / m) * np.sum(labels * np.log(h_of_x)
                             + (1 - labels) * np.log(1 - h_of_x))
def gradient(theta, x, y):
    """Gradient of the logistic-regression cost with respect to ``theta``.

    Args:
        theta: Parameters, shape (n,) or (n, 1).
        x: Design matrix of shape (m, n).
        y: Labels, shape (m,) or (m, 1).

    Returns:
        Gradient array with the same shape as ``theta``.
    """
    # Flatten both vectors so (m,) and (m, 1) inputs cannot broadcast into
    # an (m, m) matrix; the result is reshaped back to theta's shape.
    labels = np.ravel(y)
    m = labels.shape[0]  # taken from the data instead of a global ``m``
    residual = sigmoid(x @ np.ravel(theta)) - labels
    return (1 / m * (x.T @ residual)).reshape(np.shape(theta))
#initialise theta
init_theta = np.zeros([n+1,1])
#optimise theta
from scipy import optimize as op
result = op.fmin_tnc(func=cost,

Tensor("pow:0", ...) must be from the same graph as Tensor("Cast_2:0", ...)

I am trying to model something which needs to do the definite integration. The code is showing as below:
import tensorflow as tf
from numpy import pi, inf
from tensorflow import log, sqrt, exp, pow
from scipy.integrate import quad # for integration
def risk_neutral_pdf(phi, a, S, K, r, sigma, Mt, p_dict):
    """Evaluate the real part of the integrand built from a recursion on A and B.

    Side effect: ``p_dict`` is modified in place -- ``'gamma'`` becomes
    ``gamma + lamda + 0.5`` and ``'lamda'`` is set to ``-0.5``.

    Args:
        phi: Integration variable; becomes the imaginary part of ``phii``.
        a: Real offset added to ``phii``.
        S: First price-like input (used as ``S**a`` and ``S/K``).
        K: Second price-like input (used in ``S/K``).
        r: Rate multiplying ``(phii + a)`` in the A recursion.
        sigma: Value squared and multiplied with ``B`` in the exponent.
        Mt: Number of recursion steps.
        p_dict: Dict with keys ``'alpha'``, ``'beta'``, ``'gamma'``,
            ``'lamda'`` and ``'omega'``.

    Returns:
        Real tensor with the integrand value.
    """
    phii = tf.complex(0., phi)
    A = tf.cast(0., tf.complex64)
    B = tf.cast(0., tf.complex64)

    p_dict['gamma'] = p_dict['gamma'] + p_dict['lamda'] + .5
    p_dict['lamda'] = -.5

    shifted = phii + a
    # Mt steps; the step index itself is never used.
    for _ in range(Mt):
        temp = 1. - 2. * p_dict['alpha'] * B
        # A must be updated before B: it uses the previous B.
        A = A + shifted * r + p_dict['omega'] * B - .5 * log(temp)
        B = B * p_dict['beta'] + shifted * (p_dict['lamda'] + p_dict['gamma']) - \
            .5 * p_dict['gamma']**2. + (.5*(shifted - p_dict['gamma'])**2. / temp)

    return tf.real(S**a * (S/K)**phii * exp(A + B * sigma**2.) / phii)
p_dict={'lamda': 0.205, 'omega': 5.02e-6, 'beta': 0.589, 'gamma': 421.39, 'alpha': 1.32e-6}
S = 100.
K = 100.
r = 0.
Mt = 0
sq_ht = sqrt(.15**2/252.)
sigma = sq_ht
P1 = tf.py_func(lambda z: quad(risk_neutral_pdf, z, inf, args=(1., S, K, r, sigma, Mt, p_dict))[0],
[0.], tf.float64)
with tf.Session() as sess:
res =
Running it fails with "InvalidArgumentError (see above for traceback): ValueError: Tensor("pow:0", shape=(), dtype=float32) must be from the same graph as Tensor("Cast_2:0", shape=(), dtype=complex64)." No matter how I change the code, or follow the solution in "ValueError: Tensor A must be from the same graph as Tensor B", it does not work. I am wondering whether I was wrong to put tf.reset_default_graph() at the top of the script, or whether the code needs other changes.
Thank you. (Tensroflow version: 1.6.0)
I found that sigma is square-rooted before being passed into risk_neutral_pdf and then squared again in the return expression, which is unnecessary. So I changed the return statement to return tf.real(S**a * (S/K)**phii * exp(A + B * sigma) / phii) and sq_ht to .15**2/252.. The error then changes to "TypeError: a float is required", which I think is caused by mixing quad with Tensors. Any ideas on how to solve this?
Many thanks.

IndexError: too many indices for array in Scipy.Optimize

I'm trying to debug some code that uses scipy.optimize.
The bug comes from the constraint: the optimisation works fine without it. The constraint function itself seems to work fine outside scipy.optimize (the variable testconst is computed normally). The code is the following:
from scipy.optimize import minimize
import numpy as np
def totaldist(dy, *args):
    """Objective: minus half the sum of squares of ``dy``.

    Args:
        dy: 1-D array of step sizes.
        *args: Ignored. Accepted because ``scipy.optimize.minimize`` calls
            the objective as ``fun(x, *args)`` whenever ``args`` is given.

    Returns:
        ``-0.5 * sum(dy[i] ** 2)``.
    """
    return -0.5 * sum(value ** 2 for value in dy)
def create_bond(dy_max):
    """Build ``(0, dy_max[i])`` bound pairs for every variable.

    Args:
        dy_max: 1-D sequence of upper bounds.

    Returns:
        Tuple of ``(lower, upper)`` tuples, one per entry of ``dy_max``.
    """
    count = np.shape(dy_max)[0]
    bounds = np.zeros((count, 2))
    # Column 0 stays at the zero lower bound; column 1 takes the upper bound.
    bounds[:, 1] = dy_max
    return tuple(map(tuple, bounds))
# def create_const(type_x, dx, gamma, P):
def create_const(dy, *args):
arg = np.asarray(args)
n = np.shape(dy)[0]
dx = np.zeros((n, 2))
bnd = np.zeros((n, 2))
# from args to numpy array
type_x = np.zeros(n)
dP = 0
delta1 = np.zeros(n)
delta2 = np.zeros(n)
gamma = np.zeros((n, n))
for i in range(n):
a, b = bndr(arg[0, i])
delta1[i] = arg[0, i + n + 1]
delta2[i] = arg[0, i + 2*n + 1]
dx[i, 0] = (b - a) * dy[i]
gamma = GammaApprox(delta1, delta2, dx[:, 1], dx[:, 0])
d =, dx[:, 0])
g =[:, 0], gamma)
g =, dx[:, 0])
dP = float(arg[0, n])
return d + 0.5 * g - dP
def GammaApprox(delta1, delta2, x1, x2):
    """Approximate a diagonal gamma matrix by finite differences.

    Entry ``(i, i)`` is ``(delta2[i] - delta1[i]) / (x2[i] - x1[i])``, or 0
    when ``x2[i] == x1[i]``. Off-diagonal entries are 0.

    Args:
        delta1: 1-D array of deltas at ``x1``.
        delta2: 1-D array of deltas at ``x2``.
        x1: 1-D array of first points.
        x2: 1-D array of second points.

    Returns:
        Array of shape ``(n, n)`` with ``n = len(delta1)``.
    """
    n = np.shape(delta1)[0]
    gamma = np.zeros((n, n))
    for i in range(n):
        # Skip coincident points: the quotient is undefined there and the
        # entry keeps its zero initial value.
        if x2[i] != x1[i]:
            gamma[i, i] = (delta2[i] - delta1[i]) / (x2[i] - x1[i])
    return gamma
def GetNewPoint(x1, x2, delta1, delta2, type_x, P):
    """Solve the constrained step problem with SLSQP and return the solution.

    Args:
        x1: 1-D array of first points.
        x2: 1-D array of second points.
        delta1: 1-D array of deltas at ``x1``.
        delta2: 1-D array of deltas at ``x2``.
        type_x: 1-D array of distribution names understood by ``bndr``.
        P: Pair of values; only ``abs(P[0] - P[1])`` is used.

    Returns:
        The ``x`` attribute of the ``minimize`` result.
    """
    n = np.shape(delta1)[0]
    dmax = np.zeros(n)
    dy0 = np.zeros(n)

    # Upper bounds and starting point: the distance between x1 and x2,
    # normalised by the range of each variable; the start is half of it.
    for i in range(n):
        a, b = bndr(type_x[i])
        gap = abs(x2[i] - x1[i])
        dmax[i] = gap / (b - a)
        dy0[i] = 1 / (b - a) * gap / 2
    bond = create_bond(dmax)

    # Flat argument tuple: n types, dP, n values of delta1, n of delta2.
    arg = tuple(type_x) + (abs(P[0] - P[1]),) + tuple(delta1) + tuple(delta2)

    # minimize() does not forward its own ``args`` to constraint functions;
    # each constraint carries its own 'args'. create_const indexes its
    # varargs as arg[0, i], so the tuple is passed as one argument.
    con1 = {'type': 'eq', 'fun': create_const, 'args': (arg,)}

    # The objective needs no extra arguments, so none are passed to it.
    solution = minimize(totaldist, dy0, method='SLSQP', bounds=bond,
                        constraints=[con1], options={'disp': True})
    return solution.x
def bndr(type_x):
    """Return the ``(x_0, x_f)`` range for a distribution name.

    Args:
        type_x: One of ``'normal'``, ``'lognorm'`` or ``'chisquare'``.

    Returns:
        Tuple ``(x_0, x_f)``.

    Raises:
        ValueError: If ``type_x`` is not a known name (previously this
            surfaced as an UnboundLocalError).
    """
    ranges = {
        'normal': (-5, 1.5),
        'lognorm': (0.0001, 5),
        'chisquare': (0.0001, (0.8 * (10 ** .5))),
    }
    try:
        return ranges[type_x]
    except KeyError:
        raise ValueError("unknown distribution type: %r" % (type_x,))
def test():
    """Run ``GetNewPoint`` on a fixed three-variable example; return its result."""
    type_x = np.array(['lognorm', 'chisquare', 'normal'])
    x1 = np.array([0.0001, 0.0001, -5])
    x2 = np.array([1.6673, 0.84334, -5])
    delta1 = np.array([0, 0, 0])
    delta2 = np.array([2.44E-7, 2.41E-6, 4.07E-7])
    P = (0, 6.54E-8)
    return GetNewPoint(x1, x2, delta1, delta2, type_x, P)
the error message is the following:
Traceback (most recent call last):
File "D:/Anaconda Project/TestQP - Simplified/", line 134, in <module>
File "D:/Anaconda Project/TestQP - Simplified/", line 130, in test
f = GetNewPoint(x1, x2, delta1, delta2, type_x, P)
File "D:/Anaconda Project/TestQP - Simplified/", line 103, in GetNewPoint
solution = minimize(totaldist, dy0, args=arg, method='SLSQP', bounds=bond, constraints=cons, options={'disp': True})
File "C:\Program Files\Anaconda\lib\site-packages\scipy\optimize\", line 458, in minimize
constraints, callback=callback, **options)
File "C:\Program Files\Anaconda\lib\site-packages\scipy\optimize\", line 311, in _minimize_slsqp
meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
File "C:\Program Files\Anaconda\lib\site-packages\scipy\optimize\", line 311, in <listcomp>
meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
File "D:/Anaconda Project/TestQP - Simplified/", line 40, in create_const
a, b = bndr(arg[0, i])
IndexError: too many indices for array
I find roughly similar error in the website like: IndexError: index 1 is out of bounds for axis 0 with size 1/ForwardEuler
...but I failed to see it's really the same problem.
args is not passed to constraint-functions (automatically)!
This is indicated in the docs:
args : tuple, optional
Extra arguments passed to the objective function and its derivatives (Jacobian, Hessian).
You can see the problem easily by adding a print:
def create_const(dy, *args):
arg = np.asarray(args)
which will output something like:
(('lognorm', 'chisquare', 'normal', 6.54e-08, 0, 0, 0, 2.4400000000000001e-07, 2.4099999999999998e-06, 4.0699999999999998e-07),)
If you remove your test (which is manually passing args; which works) testconst = create_const(dy0, arg), you will see only the non-working output:
Constraints have their own mechanism of passing args as described in the docs:
constraints : dict or sequence of dict, optional
Constraints definition (only for COBYLA and SLSQP). Each constraint is defined in a dictionary with fields:
type : str
Constraint type: ‘eq’ for equality, ‘ineq’ for inequality.
fun : callable
The function defining the constraint.
jac : callable, optional
The Jacobian of fun (only for SLSQP).
args : sequence, optional
Extra arguments to be passed to the function and Jacobian.
Equality constraint means that the constraint function result is to be zero whereas inequality means that it is to be non-negative. Note that COBYLA only supports inequality constraints.
In your case:
con1 = {'type': 'eq', 'fun': create_const} # incomplete!
con1 = {'type': 'eq', 'fun': create_const, 'args': (arg,)} # (,)
# to make it behave as needed
# for your code!
This will make it run until some other problem occurs!

A weird error with updates in theano

I designed a variable network, but I ran into some problems with Theano. The general idea is that different inputs produce different networks that share the same parameters, something like a recursive neural network with an auto-encoder.
There are two cases in my code: one runs combine_feat_gt1_1() if c > 1, the other runs combine_feat_gt1_0().
Strangely, the code runs without errors if I comment out updates=updates, which is not what I want (the train_test Theano function in the code). However, if I uncomment updates=updates, an error occurs (the train_test_bug Theano function in the code). The latter is what I would like to implement.
I have already spent several days on this bug. Can anyone help me? I would appreciate it.
import os
import sys
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from theano.ifelse import ifelse
class Test(object):
def __init__(
self.n_output = n_output
self.n_input = n_input
self.n_group = n_group
if not W_r:
initial_W_r = numpy.asarray(
low=-4 * numpy.sqrt(6. / (n_input + n_input)),
high=4 * numpy.sqrt(6. / (n_input + n_input)),
size=(n_input, n_input)
W_r = theano.shared(value=initial_W_r, name='W_r', borrow=True)
if not b_r:
b_r = theano.shared(
self.W_r = W_r
self.b_r = b_r
if input is None:
self.x = T.tensor4(name='input', dtype=theano.config.floatX)
self.x = input
if output is None:
self.y = T.matrix(name='output', dtype=theano.config.floatX)
self.y = output
self.params = [self.W_r, self.b_r]
def get_output_values(self, input):
a, b, c, d = input.shape
def recusive(x_t, h_tm1, wr, hr):
h_t =, wr) +, wr) + hr
return h_t
def combine_recusive(data):
hidden, _ = theano.scan(fn=recusive,
non_sequences=[self.W_r, self.b_r],
return hidden[-1]
def combine_feat_gt1_1(input):
feats, _ = theano.scan(fn=combine_recusive,
recusive_flag = T.ones(1)
return T.reshape(feats, (1,-1)) # concatenation
def combine_feat_gt1_0(input):
feats = input[0]
recusive_flag = T.zeros(1)
return T.reshape(feats, (1,-1)) # concatenation
feat = ifelse(, 1), combine_feat_gt1_1(input), combine_feat_gt1_0(input))
# debug code snippet
self.debug_ifelse = theano.function([input],, 1))
self.debug_1_0 = theano.function([input], ifelse(, 1), 1, 0))
return feat
def get_cost_updates(self):
learning_rate = 0.1
self.y_given_x = self.get_output_values(self.x)
cost = T.sum(( self.y_given_x - self.y) ** 2)
gparams = T.grad(cost, self.params)
updates = [
(param, param - learning_rate * gparam)
for param, gparam in zip(self.params, gparams)
return (cost, updates)
if __name__ == "__main__":
toy_data = numpy.array([[[[1,1,1],[2,2,2]], [[3, 4,5],[4,5,6]]]],dtype=theano.config.floatX)
lable = numpy.array([[1,2,3,4,5,6]],dtype=theano.config.floatX)
toy_data2 = numpy.array([[[[1,1,1]], [[3,4,5]]]],dtype=theano.config.floatX)
lable2 = numpy.array([[6,5,4,3,2,1]],dtype=theano.config.floatX)
x = T.tensor4('x', dtype=theano.config.floatX)
y = T.matrix('y', dtype=theano.config.floatX)
newX = T.tensor4(dtype=x.dtype)
newY = T.matrix(dtype=y.dtype)
rng = numpy.random.RandomState(123)
test = Test(
cost, updates= test.get_cost_updates()
train_test = theano.function(
[newX, newY],
# updates=updates,
x : newX,
y : newY
train_test_bug = theano.function(
[newX, newY],
x : newX,
y : newY
print train_test(toy_data, lable)
print train_test(toy_data2, lable2)
# code with bug
# print train_test_bug(toy_data, lable)
# print train_test_bug(toy_data2, lable2)
EDIT (by @danielrenshaw)
I've cut the code down to a simpler demonstration of the problem.
The cause is in the gradient computation of a double-nested scan expression. The problem disappears when a modified inner-most recursive expression is used (see comments in first function below).
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t =, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 +, w)
return h_t
def outer_scan_step(x_t, w):
h, _ = theano.scan(inner_scan_step,
return h[-1]
def get_outputs(x, w):
features, _ = theano.scan(outer_scan_step,
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w))
print f(x_value)
if __name__ == "__main__":
I solved the reduced problem posted by danielrenshaw. When I pass h0 as outputs_info, it works. Before that I used the first element of the sequence as outputs_info, which I think caused the error. But I still cannot solve my original problem.
import numpy
import theano
import theano.tensor as tt
import theano.ifelse
def inner_scan_step(x_t_t, h_tm1, w):
# Fails when using this recursive expression
h_t =, w) + x_t_t
# No failure when using this recursive expression
# h_t = h_tm1 +, w)
return h_t
def outer_scan_step(x_t, w, h0):
h, _ = theano.scan(inner_scan_step,
return h[-1]
def get_outputs(x, w, h0):
features, _ = theano.scan(outer_scan_step,
non_sequences=[w, h0],
return tt.grad(features.sum(), w)
def main():
theano.config.compute_test_value = 'raise'
x_value = numpy.arange(12, dtype=theano.config.floatX).reshape((2, 2, 3))
x = tt.tensor3()
x.tag.test_value = x_value
w = theano.shared(value=numpy.ones((3, 3), dtype=theano.config.floatX), borrow=True)
h0 = theano.shared(value=numpy.zeros(3, dtype=theano.config.floatX), borrow=True)
f = theano.function(inputs=[x], outputs=get_outputs(x, w, h0))
print f(x_value)
if __name__ == "__main__":
I've encountered the same issue and fixed it by setting optimizer=fast_compile in THEANO_FLAGS. I guess it is a bug in Theano.
