class RNNSLU(object):
    ''' elman neural net model '''

    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        '''
        float_x = theano.config.floatX

        def uniform_shared(name, shape):
            # weights drawn uniformly from [-1, 1) and scaled by 0.2
            return theano.shared(
                name=name,
                value=0.2 * numpy.random.uniform(-1.0, 1.0, shape)
                .astype(float_x))

        def zeros_shared(name, size):
            return theano.shared(name=name,
                                 value=numpy.zeros(size, dtype=float_x))

        # parameters of the model
        # (creation order is kept so the random draws happen in the same
        # sequence as before)
        # add one for padding at the end
        self.emb = uniform_shared('embeddings', (ne + 1, de))
        self.wx = uniform_shared('wx', (de * cs, nh))
        self.wh = uniform_shared('wh', (nh, nh))
        self.w = uniform_shared('w', (nh, nc))
        self.bh = zeros_shared('bh', nh)
        self.b = zeros_shared('b', nc)
        self.h0 = zeros_shared('h0', nh)

        # bundle
        self.params = [self.emb, self.wx, self.wh, self.w,
                       self.bh, self.b, self.h0]

        def recurrence(x_t, h_tm1):
            # one time step: new hidden state from the current input and the
            # previous hidden state, then class scores from the hidden state
            h_t = T.nnet.sigmoid(T.dot(x_t, self.wx)
                                 + T.dot(h_tm1, self.wh) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
            return [h_t, s_t]

        # NOTE(review): `x` is not defined anywhere in this excerpt; it is
        # presumably the symbolic input sequence built from self.emb --
        # confirm against the full file.
        # theano.scan returns a pair (outputs, updates); the previous
        # `[h, s], = ...` tried to unpack that pair into a single target and
        # raised ValueError.
        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[self.h0, None],
                                n_steps=x.shape[0])
I am following this Theano tutorial about RNNs (http://deeplearning.net/tutorial/rnnslu.html), but I have two questions about it.
First, in this tutorial the recurrence function looks like this:
def recurrence(x_t, h_tm1):
    """One recurrent step: return [hidden state, softmax scores]."""
    # input contribution + recurrent contribution + hidden bias
    hidden_input = T.dot(x_t, self.wx) + T.dot(h_tm1, self.wh) + self.bh
    h_t = T.nnet.sigmoid(hidden_input)
    # class scores computed from the new hidden state
    output_input = T.dot(h_t, self.w) + self.b
    s_t = T.nnet.softmax(output_input)
    return [h_t, s_t]
I wonder why h0 is not added to h_t? (i.e. h_t = T.nnet.sigmoid(T.dot(x_t, self.wx) + T.dot(h_tm1, self.wh) + self.bh + self.h0))
Second, why is outputs_info=[self.h0, None]? I know outputs_info holds the initial values of the results, so I would expect outputs_info=[self.bh+self.h0, T.nnet.softmax(T.dot(self.bh+self.h0, self.w_h2y) + self.b_h2y)]
def recurrence(x_t, h_tm1):
    """Compute the hidden state and the softmax output for one time step."""
    from_input = T.dot(x_t, self.wx)
    from_previous = T.dot(h_tm1, self.wh)
    h_t = T.nnet.sigmoid(from_input + from_previous + self.bh)
    scores = T.dot(h_t, self.w) + self.b
    s_t = T.nnet.softmax(scores)
    return [h_t, s_t]
So, first you ask why we don't use h0 in the recurrence function. Let's breakdown this part,
h_t = T.nnet.sigmoid(T.dot(x_t, self.wx)+ T.dot(h_tm1, self.wh) + self.bh)
What we expect is 3 terms.
The first term is the input layer multiplied by the weighting matrix T.dot(x_t, self.wx).
The second term is the hidden layer multiplied by another weighting matrix (this is what makes it recurrent), T.dot(h_tm1, self.wh). Note that you must have a weighting matrix; what you proposed is basically adding self.h0 as a bias.
The third term is the bias of the hidden layer, self.bh.
Now, after every iteration we want to keep track of the hidden layer activations, contained in self.h0. However, self.h0 is meant to contain the CURRENT activations and what we need is the previous activations.
# Apply `recurrence` along the first axis of x (n_steps = x.shape[0]).
# outputs_info pairs up with the values returned by `recurrence`: self.h0 is
# supplied for the first one (h_t) and None for the second one (s_t).
# The second value returned by scan is discarded here.
[h, s], _ = theano.scan(fn=recurrence,
                        sequences=x,
                        outputs_info=[self.h0, None],
                        n_steps=x.shape[0])
So, look at the scan function again. You are right that outputs_info=[self.h0, None] initializes the values, but the values are also linked to the outputs. There are two outputs from recurrence(), namely [h_t, s_t].
So what the outputs_info does as well is that after every iteration, the value of self.h0 is overwritten with the value of h_t (the first returned value). The second element of outputs_info is None, because we do not save or initialize the value for s_t anywhere (the second argument of outputs_info is linked to the returned values of the recurrence function this way.)
In the next iteration, the first argument of outputs_info is used again as input, such that h_tm1 is the same value as self.h0. But since we must have an argument for h_tm1, we must initialize this value. Since we don't need to initialize a second argument in outputs_info, we leave the second term as None.
Granted, the theano.scan() function is very confusing at times and I'm new at it too. But, this is what I understood from doing this same tutorial.
Related
I am trying to convert a nested loop over a numpy array into a numpy-optimized implementation.
The function being called inside the loop takes a 4D vector and a separate parameter, and outputs a 4D vector which is supposed to replace the old 4D vector based on operations with the new value. If relevant, the function is a Welford online update, which updates the mean and standard deviation based on a new value, with the 4D vector being [old_mean, old_std, old_s, num_values]. For each pixel channel, I am saving these values in the history_array for updating the distribution based on future pixel values.
My present code looks like this:
def welford_next(arr: np.ndarray, new_point: np.float32) -> np.ndarray:
    """Apply one Welford online update to a single statistics record.

    :param arr: record ``[mean, std, s, count]``; the ``std`` entry is
        ignored on input and recomputed
    :param new_point: new observation to fold into the statistics
    :return: updated ``[mean, std, s, count]`` as an array (the function
        previously returned a plain list despite its annotation)
    """
    old_mean, _, old_s, num_points = arr
    num_points = num_points + 1
    delta = new_point - old_mean
    new_mean = old_mean + delta / num_points
    new_s = old_s + delta * (new_point - new_mean)
    # std divides s by the number of points; for the first point the s value
    # itself is stored, exactly as before
    new_std = np.sqrt(new_s / num_points) if num_points > 1 else new_s
    return np.array([new_mean, new_std, new_s, num_points])
# Each value is used to fill a whole frame and is then folded into the
# running statistics.
updates = [10., 20., 30., 40., 90., 80.]
# One [mean, std, s, count] record per pixel channel.
# `b` is not defined in this excerpt.
history_array = np.zeros(shape = b.shape + (4,)) # shape: [6,3,3,4]
print(f'History Shape: {history_array.shape}')
history_array_2 = np.zeros_like(history_array)
for update in updates:
    image = np.empty(shape = b.shape) # shape: [6,3,3] (h x w x c)
    image.fill(update)
    # Reference implementation: one welford_next call per pixel channel.
    for i, row in enumerate(image): # Prohibitively expensive
        for j, col in enumerate(row):
            for k, channel in enumerate(col):
                history_array[i][j][k] = welford_next(history_array[i][j][k], channel)
    # NOTE(review): attempted vectorised equivalent; the new pixel value is
    # never passed to welford_next here. Nesting of this line inside the
    # `for update` loop is reconstructed -- confirm against the original.
    history_array_2 = np.apply_along_axis(welford_next, axis=2, arr=history_array_2)
print(history_array == history_array_2)
However, np.apply_along_axis() does not seem to be viable because it does not allow additional parameters to be passed alongside the array itself. I also came across np.ufunc, which the welford_next() function can be converted to using np.frompyfunc(), but it is unclear how it could help me reach the desired target.
How do I achieve this looped operation using numpy?
The numpy optimized way to do this would be to change the way we use the welford_next() function. As mentioned in the comments, repeated calls to a function cannot be optimized, thus the function call needs to be limited to once per frame and optimization needs to be done inside the function itself. The following implementation works ~ 50x faster.
def welford(history: np.ndarray, frame: np.ndarray) -> np.ndarray:
    """Vectorised Welford online update of per-element running statistics.

    :param history: array of shape ``frame.shape + (4,)`` whose last axis
        holds ``[mean, std, s, count]``; it is NOT modified
    :param frame: new observations, one per statistics record
    :return: new array of shape ``frame.shape + (4,)`` with updated records
    """
    old_mean = history[..., 0]
    old_s = history[..., 2]
    # `+` allocates a new array; the former in-place `+=` wrote through a
    # transpose view and silently changed the caller's history
    num_points = history[..., 3] + 1.
    delta = frame - old_mean
    new_mean = old_mean + delta / num_points
    new_s = old_s + delta * (frame - new_mean)
    # decide per element instead of looking only at element [0][0][0], so
    # records with different counts are each handled correctly
    new_std = np.where(num_points > 1, np.sqrt(new_s / num_points), new_s)
    # stacking on the last axis works for any frame rank, not only 3-D
    return np.stack([new_mean, new_std, new_s, num_points], axis=-1)
# Feed a sequence of constant-valued frames through the vectorised update.
updates = [10., 20., 30., 40., 90., 80.]
history_array = np.zeros(shape = b.shape + (4,)) # shape: [6,3,3,4]
for update in updates:
    # frame of shape [6,3,3] (h x w x c) where every entry equals `update`
    image = np.full(b.shape, update)
    history_array = welford(history_array, image)
I have trained a Lightgbm model on learning to rank dataset. The model predicts relevance score of a sample. So higher the prediction the better it is. Now that the model has learned I would like to find the best values of some features that gives me the highest prediction score.
So, lets say I have features u,v,w,x,y,z and the features I would like to optimize over are x,y,z.
maximize f(u,v,w,x,y,z) w.r.t features x,y,z where f is a lightgbm model
subject to constraints :
y = Ax + b
z = 4 if y < thresh_a else 4-0.5 if y >= thresh_b else 4-0.3
thresh_m < x <= thresh_n
The numbers are randomly made up but constraints are linear.
Objective function with respect to x looks like the following :
So the function is very spiky, non-smooth. I also don't have the gradient information as f is a lightgbm model.
Using Nathan's answer I wrote down the following class :
class ProductOptimization:
    """Search for feature values of a single row using SLSQP.

    The estimator's prediction for the modified row is used directly as the
    objective handed to scipy's ``minimize``.
    """

    def __init__(self, estimator, features_to_change, row_fixed_values,
                 bnds=None):
        # model exposing .predict()
        self.estimator = estimator
        # column names whose values the optimiser may vary
        self.features_to_change = features_to_change
        # baseline row; get_sample() reads it via [...] / .iloc / .replace
        self.row_fixed_values = row_fixed_values
        # forwarded unchanged as `bounds=` to minimize
        self.bounds = bnds

    def get_sample(self, x):
        """Return the baseline row with the changeable features set to x."""
        replacements = {}
        for feature, candidate in zip(self.features_to_change, x):
            current = self.row_fixed_values[feature].iloc[0]
            # replace the current first-row value of this column
            replacements[feature] = {current: candidate}
        return self.row_fixed_values.replace(replacements)

    def _call_model(self, x):
        """Objective: first prediction for the row built from x."""
        sample = self.get_sample(x)
        return self.estimator.predict(sample)[0]

    def constraint1(self, vector):
        """Placeholder equality constraint; currently returns None."""
        x = vector[0]
        y = vector[2]
        return # some float value

    def constraint2(self, vector):
        """Placeholder equality constraint; currently returns None."""
        x = vector[0]
        y = vector[3]
        return #some float value

    def optimize_slsqp(self, initial_values):
        """Run SLSQP from initial_values and return the scipy result."""
        # NOTE(review): minimize() minimises the prediction; if the goal is
        # the highest score the objective presumably needs negating --
        # confirm.
        cons = [
            {'type': 'eq', 'fun': self.constraint1},
            {'type': 'eq', 'fun': self.constraint2},
        ]
        start = np.array(initial_values)
        return minimize(fun=self._call_model,
                        x0=start,
                        method='SLSQP',
                        bounds=self.bounds,
                        constraints=cons)
The results that I get are always around the initial guess. I think it's because of the non-smoothness of the function and the absence of any gradient information, which is important for the SLSQP optimizer. Any advice on how I should deal with this kind of problem?
It's been a good minute since I last wrote some serious code, so I apologize if it's not entirely clear what everything does; please feel free to ask for more explanations.
The imports:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
from scipy.optimize import minimize
from copy import copy
First I define a new class that allows me to easily redefine values. This class has 5 inputs:
value: this is the 'base' value. In your equation y=Ax + b it's the b part
minimum: this is the minimum value this type will evaluate as
maximum: this is the maximum value this type will evaluate as
multipliers: the first tricky one. It's a list of other InputType objects. The first is the input type and the second the multiplier. In your example y=Ax +b you would have [[x, A]], if the equation was y=Ax + Bz + Cd it would be [[x, A], [z, B], [d, C]]
relations: the most tricky one. It's also a list of other InputType objects, it has four items: the first is the input type, the second defines if it's an upper boundary you use min, if it's a lower boundary you use max. The third item in the list is the value of the boundary, and the fourth the output value connected to it
Watch out: if you define your input values too strangely, I'm sure there will be weird behaviour.
class InputType:
    """A model input whose value may depend on other InputType objects."""

    def __init__(self, value=0, minimum=-1e99, maximum=1e99, multipliers=None, relations=None):
        """
        :param float value: base value
        :param float minimum: value can never be lower than x
        :param float maximum: value can never be higher than y
        :param multipliers: [[InputType, multiplier], [InputType, multiplier]]
        :param relations: [[InputType, min, threshold, output_value], [InputType, max, threshold, output_value]]

        Omitting ``multipliers`` / ``relations`` gives each instance its own
        empty list (a literal ``[]`` default would be shared by all instances).
        """
        self.val = value
        self.min = minimum
        self.max = maximum
        self.multipliers = [] if multipliers is None else multipliers
        self.relations = [] if relations is None else relations

    def reset_val(self, value):
        """Overwrite the base value."""
        self.val = value

    def evaluate(self):
        """
        - relations to other variables are done first if there are none then the rest is evaluated
        - at most self.max
        - at least self.min
        - self.val + i_x * w_x
        i_x is input i, w_x is multiplier (weight) of i
        """
        for term, min_max, value, output_value in self.relations:
            # check for each term if it falls outside of the expected terms
            term_value = term.evaluate()  # evaluated once per relation
            if min_max(term_value, value) != term_value:
                return self.return_value(output_value)
        output_value = self.val + sum(
            term.evaluate() * weight for term, weight in self.multipliers)
        return self.return_value(output_value)

    def return_value(self, output_value):
        """Clamp output_value to the range [self.min, self.max]."""
        return min(self.max, max(self.min, output_value))
Using this, you can fix the input types sent from the optimizer, as shown in _call_model:
class Example:
    """Fit a regressor on a sum-of-squares target and minimise its prediction.

    ``lst_args`` is a list of objects offering ``reset_val(value)`` and
    ``evaluate()``; one random training column is generated per entry.
    """

    def __init__(self, lst_args):
        self.lst_args = lst_args
        self.X = np.random.random((10000, len(lst_args)))
        self.y = self.get_y()
        self.clf = GradientBoostingRegressor()
        self.fit()

    def get_y(self):
        """Return the training targets as a 1-D array, one per row of X."""
        # sum of squares, is minimum at x = [0, 0, 0, 0, 0 ... ]
        # 1-D on purpose: the former (n, 1) column vector is not the target
        # shape a regressor's fit() expects
        return np.array([self._func(row) for row in self.X])

    def _func(self, i):
        """Target function for one sample: sum of squared entries."""
        return sum(i * i)

    def fit(self):
        """Train the regressor on the generated data."""
        self.clf.fit(self.X, self.y)

    def optimize(self):
        """Minimise the model prediction with Nelder-Mead from 0.5 everywhere."""
        x0 = [0.5] * len(self.lst_args)
        initial_simplex = self._get_simplex(x0, 0.1)
        result = minimize(fun=self._call_model,
                          x0=np.array(x0),
                          method='Nelder-Mead',
                          options={'xatol': 0.1,
                                   'initial_simplex': np.array(initial_simplex)})
        return result

    def _get_simplex(self, x0, step):
        """Build len(x0) + 1 starting points around x0.

        One point per coordinate, lowered by ``step``, plus a final point
        with the last coordinate raised by ``step``.
        """
        simplex = []
        for i in range(len(x0)):
            point = copy(x0)
            point[i] -= step
            simplex.append(point)
        point2 = copy(x0)
        point2[-1] += step
        simplex.append(point2)
        return simplex

    def _call_model(self, x):
        """Objective: push x into the inputs, evaluate them, predict."""
        print(x, type(x))
        for i, value in enumerate(x):
            self.lst_args[i].reset_val(value)
        # evaluate() applies each input's own bounds/relations, so the model
        # is queried with the derived values rather than the raw x
        input_x = np.array([i.evaluate() for i in self.lst_args])
        prediction = self.clf.predict([input_x])
        return prediction[0]
I can define your problem as shown below (be sure to define the inputs in the same order as the final list, otherwise not all the values will get updated correctly in the optimizer!):
# Constants of the example problem: y = A*x + b, thresholds for z and the
# bounds on x.
A = 5
b = 2
thresh_a = 5
thresh_b = 10
thresh_c = 10.1
thresh_m = 4
thresh_n = 6
# u, v, w use the InputType defaults (no multipliers or relations).
u = InputType()
v = InputType()
w = InputType()
# x is clamped to [thresh_m, thresh_n] when evaluated.
x = InputType(minimum=thresh_m, maximum=thresh_n)
# y evaluates to b + A * x (base value plus weighted input).
y = InputType(value = b, multipliers=([[x, A]]))
# z: relations are checked in order and the first one that fires decides:
#   max/thresh_a fires when y < thresh_a  -> 4
#   min/thresh_b fires when y > thresh_b  -> 3.5
#   max/thresh_c fires when y < thresh_c  -> 3.7
# NOTE(review): y == thresh_b does not fire the second relation (strict
# comparison) and falls through to 3.7 -- confirm this is intended.
z = InputType(relations=[[y, max, thresh_a, 4], [y, min, thresh_b, 3.5], [y, max, thresh_c, 3.7]])
# The order of this list must match the order in which the optimiser's
# values are written back with reset_val.
example = Example([u, v, w, x, y, z])
Calling the results:
# Run the optimiser, then write the optimum back into the inputs so that
# evaluate() reports the derived (clamped / related) values.
result = example.optimize()
for i, value in enumerate(result.x):
    example.lst_args[i].reset_val(value)
# The original f-string contained `{result.fun)}`, whose stray `)` is a
# syntax error.
print(f"final values are at: {[i.evaluate() for i in example.lst_args]}: {result.fun}")
I want to add custom constraints on the parameters of a layer.
I write a custom activation layer with two trainable parameters a and b s.t:
activation_fct = a*fct() + b*fct().
I need the sum of the parameters (a+b) to be equal to 1, but I don't know how to write such a constraint.
Can you give me some advice?
Thanks in advance.
Two approaches come to my mind.
First one is to lock one of the parameters, let's say b and make only the other one (a in this case) trainable. Then you can compute b as follows
b = 1 - a
The second approach could be making both a and b trainable and transform them via softmax function. Softmax function will make sure that their sum is always 1.
from scipy.special import softmax
# Two unconstrained parameters ...
a, b = 0.12, 0.3
# ... are mapped through softmax, so the resulting weights sum to 1.
weights = softmax([a, b])
w1, w2 = weights
print(f'w1: {w1}, w2: {w2}, w1 + w2: {w1 + w2}')
This will produce
w1: 0.45512110762641994, w2: 0.5448788923735801, w1 + w2: 1.0
And once you have w1 and w2, you can use them in the mentioned formula instead of a and b.
activation_fct = w1 * fct() + w2 * fct()
You can have a single weight instead of two, and use this custom constraint:
import keras.backend as K
class Between_0_1(keras.constraints.Constraint):
    """Weight constraint that clips the weight into the range [0, 1]."""

    def __call__(self, w):
        clipped = K.clip(w, 0, 1)
        return clipped
Then when building the weights, build only a and use the constraints.
def build(self, input_shape):
    """Create the single trainable weight `a`, constrained by Between_0_1."""
    weight_options = dict(name='weight_a',
                          shape=(1,),
                          initializer='uniform',
                          constraint=Between_0_1(),
                          trainable=True)
    self.a = self.add_weight(**weight_options)

    #if you want to start as 0.5
    K.set_value(self.a, [0.5])

    self.built = True
In call, b = 1-a:
def call(self, inputs, **kwargs):
    # Sketch only: `something` and `another_thing` stand for values computed
    # in the elided part. They are blended with weights a and (1 - a), so the
    # two weights always sum to 1.
    #do stuff
    ....
    return (self.a * something) + ((1-self.a)*another_thing)
You can also try @MatusDubrava's softmax approach, but in this case your weights need to have shape (2,), and no constraint:
def build(self, input_shape):
    """Create the two unconstrained weights that call() passes to softmax."""
    self.w = self.add_weight(name='weights',
                             shape=(2,),
                             initializer='zeros',
                             trainable=True)
    # Set the `built` flag. The original assigned `self.build = True`, which
    # replaced this method with a boolean and never set the flag.
    self.built = True
def call(self, inputs, **kwargs):
    # Softmax turns the two raw weights into coefficients that sum to 1.
    w = K.softmax(self.w)
    # Sketch only: `something` and `another_thing` stand for values computed
    # in the elided part.
    #do stuff
    ....
    return (w[0] * something ) + (w[1] * another_thing)
I'm trying to write a hook that will allow me to compute some global metrics (rather than batch-wise metrics). To prototype, I thought I'd get a simple hook up and running that would capture and remember true positives. It looks like this:
class TPHook(tf.train.SessionRunHook):
    """Session hook that requests the 'metrics/f1_macro/TP' graph element on
    every run call and prints what comes back."""

    def after_create_session(self, session, coord):
        """Look up the graph operation to fetch once the session exists."""
        print("Starting Hook")
        self.tp = []
        tp_name = 'metrics/f1_macro/TP'
        # NOTE(review): this fetches the *operation*, not its output tensor
        graph = session.graph
        self.args = graph.get_operation_by_name(tp_name)
        print(f"Got Args: {self.args}")

    def before_run(self, run_context):
        """Ask the session to evaluate self.args alongside the normal run."""
        print("Starting Before Run")
        requested = tf.train.SessionRunArgs(self.args)
        return requested

    def after_run(self, run_context, run_values):
        """Print the results returned for the requested fetches."""
        print("After Run")
        fetched = run_values.results
        print(f"Got Values: {fetched}")
However, the values returned in the "after_run" part of the hook are always None. I tested this in both the train and evaluation phase. Am I misunderstanding something about how the SessionRunHooks are supposed to work?
Maybe relevant information:
The model was built in keras and converted to an estimator with the keras.estimator.model_to_estimator() function. The model has been tested and works fine, and the op that I'm trying to retrieve in the hook is defined in this code block:
def _f1_macro_vector(y_true, y_pred):
    """Compute F1 from labels and predictions, reducing over axis 0.

    Arguments:
        y_true {tf.Tensor} -- Ground-truth labels
        y_pred {tf.Tensor} -- Predicted labels (rounded with K.round before
            counting)

    Returns:
        tf.Tensor -- The computed F1 values, with NaN entries set to zero
    """
    def _nan_to_zero(tensor):
        # 0/0 divisions give NaN; map those entries to zero
        return tf.where(tf.is_nan(tensor), tf.zeros_like(tensor), tensor)

    truth = K.cast(y_true, tf.float64)
    scores = K.cast(y_pred, tf.float64)

    # confusion counts; the op names are looked up elsewhere by name
    true_pos = tf.reduce_sum(truth * K.round(scores), axis=0, name='TP')
    false_neg = tf.reduce_sum(truth * (1 - K.round(scores)), axis=0, name='FN')
    false_pos = tf.reduce_sum((1 - truth) * K.round(scores), axis=0, name='FP')

    raw_precision = true_pos / (true_pos + false_pos)
    raw_recall = true_pos / (true_pos + false_neg)

    # Convert NaNs to Zero
    precision = _nan_to_zero(raw_precision)
    recall = _nan_to_zero(raw_recall)

    raw_f1 = 2 * (precision * recall) / (precision + recall)

    # Convert NaN to Zero
    return _nan_to_zero(raw_f1)
In case anyone runs into the same problem, I found out how to restructure the program so that it worked. Although the documentation makes it sound like I can pass raw ops into the SessionRunArgs, it seems like it requires actual tensors (maybe this is a misreading on my part).
This is pretty easy to accomplish - I just changed the after_create_session code to what's shown below.
def after_create_session(self, session, coord):
    """Resolve the TP output tensor and store it as the hook's fetch list."""
    self.tp = []
    tp_name = 'metrics/f1_macro/TP'
    # ':0' selects the first output tensor of the op with that name
    tensor_name = tp_name + ':0'
    self.args = [session.graph.get_tensor_by_name(tensor_name)]
And this successfully runs.
I am doing word-level language modelling with a vanilla rnn, I am able to train the model but for some weird reasons I am not able to get any samples/predictions from the model; here is the relevant part of the code:
train_set_x, train_set_y, voc = load_data(dataset, vocab, vocab_enc) # just load all data as shared variables
# Symbolic inputs: an integer index into the data, the input matrix and the
# target vector.
index = T.lscalar('index')
x = T.fmatrix('x')
y = T.ivector('y')
# Input and output sizes both equal the vocabulary size.
n_x = len(vocab)
n_h = 100
n_y = len(vocab)
rnn = Rnn(input=x, input_dim=n_x, hidden_dim=n_h, output_dim=n_y)
cost = rnn.negative_log_likelihood(y)
updates = get_optimizer(optimizer, cost, rnn.params, learning_rate)
# One training step on example `index`: returns the cost and applies updates.
train_model = theano.function(
    inputs=[index],
    outputs=cost,
    givens={
        x: train_set_x[index],
        y: train_set_y[index]
    },
    updates=updates
)
# Prediction for vocabulary entry `index`; `index` is a scalar input.
predict_model = theano.function(
    inputs=[index],
    outputs=rnn.y,
    givens={
        x: voc[index]
    }
)
sampling_freq = 2
sample_length = 10
n_train_examples = train_set_x.get_value(borrow=True).shape[0]
train_cost = 0.
for i in xrange(n_train_examples):
    train_cost += train_model(i)
    # NOTE(review): loop nesting is reconstructed from flattened text; as
    # written the running cost is divided on every iteration -- confirm.
    train_cost /= n_train_examples
    if i % sampling_freq == 0:
        # sample from the model
        seed = randint(0, len(vocab)-1)
        idxes = []
        for j in xrange(sample_length):
            p = predict_model(seed)
            # NOTE(review): the output of predict_model is fed back as the
            # next `index`, which predict_model declares as a scalar.
            seed = p
            idxes.append(p)
        # sample = ''.join(ix_to_words[ix] for ix in idxes)
        # print(sample)
I get the error: "TypeError: ('Bad input argument to theano function with name "train.py:94" at index 0(0-based)', 'Wrong number of dimensions: expected 0, got 1 with shape (1,).')"
Now this corresponds to the following line (in the predict_model):
givens={ x: voc[index] }
Even after spending hours I am not able to comprehend how could there be a dimension mis-match when:
train_set_x has shape: (42, 4, 109)
voc has shape: (109, 1, 109)
And when I do train_set_x[index], I get (4, 109), which the 'x' tensor of type fmatrix can hold (this is what happens in train_model). But when I do voc[index], I get (1, 109), which is also a matrix, yet 'x' cannot hold it. Why?
Any help will be much appreciated.
Thanks !
The error message refers to the definition of the whole Theano function named predict_model, not the specific line where the substitution with givens occurs.
The issue seems to be that predict_model gets called with an argument that is a vector of length 1 instead of a scalar. The initial seed sampled from randint is actually a scalar, but I would guess that the output p of predict_model(seed) is a vector and not a scalar.
In that case, you could either return rnn.y[0] in predict_model, or replace seed = p with seed = p[0] in the loop over j.