optimize a function with two bounded matrices as inputs - python-3.x

I am trying to optimise a loss function that takes two inputs, "m" and "d". Both are (32, 32, 1) matrices, and I cannot figure out how to bound/constrain their values between 0 and 1. "m" and "d" are filters that I apply to the input being fed into a trained ML model.
I have looked at these documentations
https://scipy-lectures.org/advanced/mathematical_optimization/index.html#id54
(See Box-Bounds; hyperlink in Chapter Contents)
https://docs.scipy.org/doc/scipy/reference/tutorial/optimize.html
scipy minimize with constraints
def lossfunction(MD):
    """Return the summed batch loss of the filtered inputs plus a penalty on the mask.

    MD packs the mask ``m`` (channel 0) and the pattern ``d`` (channel 1).
    It may be given either as a (32, 32, 2) array or as the equivalent flat
    vector, which is what scipy.optimize.minimize passes to its objective.

    Reads the module-level names ``data``, ``label``, ``targetlabel``,
    ``BATCH_SIZE``, ``lam``, ``loss_operation`` and the placeholders
    ``x``, ``y`` and ``prob``.
    """
    # minimize() works on a 1-D parameter vector, so restore the layout the
    # slicing below relies on (a no-op when MD already has this shape).
    MD = np.reshape(MD, (32, 32, 2))
    m = MD[:, :, 0]
    d = MD[:, :, 1]

    # Kept under its own name: assigning to `x` here would make `x` local and
    # shadow the placeholder that feed_dict is keyed on below.
    # NOTE(review): indexing with np.argwhere inserts a length-1 axis after
    # the sample axis -- confirm this matches the placeholder's shape.
    samples = data[np.argwhere(label != 6)]
    xt = np.multiply((1 - m), samples) + np.multiply(m, d)  # Todo: Apply Filter

    num_examples = xt.shape[0]
    sess = tf.get_default_session()
    totalloss = 0
    for offset in range(0, num_examples, BATCH_SIZE):
        batchx = xt[offset:offset + BATCH_SIZE]
        # Size the labels from the actual batch so the final, shorter batch
        # still gets one label per example.
        batchy = np.ones(len(batchx)) * targetlabel
        loss = sess.run(loss_operation, feed_dict={x: batchx, y: batchy, prob: 0.8})
        totalloss = totalloss + loss

    # NOTE(review): for a 2-D array np.linalg.norm(m, 1) is the maximum
    # absolute column sum, not the sum of all absolute entries -- confirm
    # which penalty is intended.
    finalloss = totalloss + lam * np.linalg.norm(m, 1)
    return finalloss
# NOTE: this call raises "ValueError: length of x0 != length of bounds":
# x0 holds 32 * 32 * 2 = 2048 values, but bounds supplies only two (min, max) pairs.
optimize.minimize(lossfunction, np.zeros((32, 32, 2)), bounds=((0, 1), (0, 1)))
I get this error message: ValueError: length of x0 != length of bounds
I understand that the bounds and inputs should be of the same dimensions.
Is there a convenient way of inputting the bounds?

Related

Applying a 2D function to a 4D tensor PyTorch

I got a 2D function that takes a matrix - 2D tensor with shape (28, 28)
and I got a tensor, lets say (64, 10, 28, 28) - it's a tensor that contains a batch of 64 images that passed through a (10 kernels) conv2d layer.
Now, I want to apply a 2D function to the last two dimensions of the tensor, the (28, 28) part.
Now I did that in a very inefficient way:
def activation_func(input):
for batch_idx in range(input.shape[0]):
for channel_inx in range(input.shape[1]):
input[batch_idx][channel_inx] = 2D_function(input[batch_idx][channel_inx])
return input
which is highly inefficient as I noticed.
is there any way of doing this efficiently?
I can write the entire code If necessary
EDIT:
def 2D_function(input):
global indices # yes I know, I will remove this global stuff later
# indices = [(i, j) for i in range(1, 28, 4) for j in range(1, 28, 4)]
for x, y in indices:
relu_decision = relu(input[x, y]) # standard relu - relu(x)=(x>1)*x
if not relu_decision:
# zero out the patch
input[x - 1: x + 3, y - 1: y + 3] = 0
return input
In such cases, I use a Kronecker product trick:
import torch
# Wide print lines make the block structure of the mask easier to see.
torch.set_printoptions(linewidth=200)

# Simulated input, shifted so that values fall on both sides of zero.
input = torch.rand(1, 1, 28, 28) - 0.5

# Anchor coordinates: indices 1, 5, 9, ... along each of the last two axes.
anchor_steps = torch.arange(1, 28, 4)
rows, cols = torch.meshgrid((anchor_steps, torch.arange(1, 28, 4)))

# True where the relu of the anchor value is non-zero.
# Note that relu(x) = (x > 0.) * x, so adjust it to your needs.
anchor_values = input[:, :, rows, cols]
relus = torch.nn.functional.relu(anchor_values).bool()

# The Kronecker product expands every anchor decision into a 4x4 block,
# so each block of the mask is all ones or all zeros.
A = torch.ones(4, 4)
mask = torch.kron(relus, A)
print(mask.shape)

# Blocks whose anchor decision was False are zeroed in the output.
output = input * mask
print(mask[0, 0])
print(output[0, 0])

How to increase batch size in GPT2 training for translation task?

I am developing a code to use the pre-trained GPT2 model for a machine translation task. The length of my data's word-to-id is 91, and I developed the following code for my model:
import torch
from torch.utils.data import DataLoader
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
# data preparation code
def _pad_with_eos(sequences, max_length, env):
    """Stack sequences column-wise as <eos> tokens <eos>, padded to max_length rows."""
    batch = torch.full(
        (max_length, len(sequences)), env.pad_index, dtype=torch.long)
    batch[0] = env.eos_index
    for col, seq in enumerate(sequences):
        end = len(seq) + 1  # row that receives the closing <eos>
        batch[1:end, col].copy_(seq)
        batch[end, col] = env.eos_index
    return batch


def batch_sequences(x, y, env):
    """
    Pad two lists of sequences (torch.LongTensor vectors) to a common length.

    Every sequence is wrapped in env.eos_index on both sides and padded with
    env.pad_index. Both sides share one length: the longest wrapped sequence
    found in either x or y.

    Returns (sent_x, sent_y, max_length), where sent_x has shape
    (max_length, len(x)) and sent_y has shape (max_length, len(y)); each
    column holds one sequence.

    Raises AssertionError if any sequence is empty.
    """
    lengths_x = torch.LongTensor([len(s) + 2 for s in x])
    lengths_y = torch.LongTensor([len(s) + 2 for s in y])
    max_length = max(lengths_x.max().item(), lengths_y.max().item())
    # A wrapped length of 2 would mean a sequence with no tokens at all.
    assert lengths_x.min().item() > 2
    assert lengths_y.min().item() > 2
    sent_x = _pad_with_eos(x, max_length, env)
    sent_y = _pad_with_eos(y, max_length, env)
    return sent_x, sent_y, max_length
def collate_fn(elements):
    """
    Turn a list of (source, target) samples into one padded batch.

    Words missing from env.word2id are dropped before padding. Relies on the
    module-level names env and nb_ops.
    """
    def to_ids(seq):
        # Keep only the words that have an id.
        return torch.LongTensor([env.word2id[w] for w in seq if w in env.word2id])

    sources, targets = zip(*elements)
    source_ids = [to_ids(seq) for seq in sources]
    target_ids = [to_ids(seq) for seq in targets]
    x, y, length = batch_sequences(source_ids, target_ids, env)
    return (x, length), (y, length), torch.LongTensor(nb_ops)
# One sample per step; collate_fn converts the words to ids and pads them.
loader = DataLoader(data, batch_size=1, shuffle=False, collate_fn=collate_fn)

# Pre-trained GPT-2 body between a fresh input embedding and output projection.
# 768 is presumably GPT-2's hidden size -- confirm against the model config.
gpt2 = GPT2Model.from_pretrained('gpt2')
vocab_size = len(env.word2id)
in_layer = nn.Embedding(vocab_size, 768)
out_layer = nn.Linear(768, vocab_size)

# All three modules are trained together by one optimizer.
modules = (gpt2, in_layer, out_layer)
parameters = [p for module in modules for p in module.parameters()]
optimizer = torch.optim.Adam(parameters)
loss_fn = nn.CrossEntropyLoss()
for layer in modules:
    layer.train()

accuracies = list()
n_epochs = 5
for i in range(n_epochs):
    for (x, x_len), (y, y_len) in loader:
        x = x.to(device=device)
        y = y.to(device=device)
        targets = y.reshape(-1)

        # Flatten the sample into a single row before embedding it.
        embeddings = in_layer(x.reshape(1, -1))
        hidden_state = gpt2(inputs_embeds=embeddings).last_hidden_state
        logits = out_layer(hidden_state)[0]
        loss = loss_fn(logits, targets)

        # Fraction of positions where the arg-max prediction equals the target.
        step_accuracy = (logits.argmax(dim=-1) == targets).float().mean().item()
        accuracies.append(step_accuracy)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Every 500 steps, report the mean accuracy over the last 50 steps.
        if len(accuracies) % 500 == 0:
            recent = accuracies[-50:]
            accuracy = sum(recent) / len(recent)
            print(f'Samples: {len(accuracies)}, Accuracy: {accuracy}')
This code works pretty well when the batch size is 1. But it is so slow. I wanted to increase the batch size from 1 to 32, but I get some dimension compatibility problems. How can I increase the batch size without errors?
My data consists of pair of sentences, the first one is a sentence in the first language and the second one is its translation in the second language.
For example, assume that x.shape is (batch_size, 12), meaning we have 'batch_size' input sentences of length 12, and that y.shape is also (batch_size, 12) (the translations). We also have a word-to-id dictionary of length 90 that maps each word in a sentence to its index.
This problem can be solved using padding. We need two special symbols:
code 0 in inputs (x) will denote "blank" tokens that should not be translated.
code -100 in outputs (y) will denote "blank" tokens that should not participate in the calculation of loss. nn.CrossEntropyLoss() is programmed to ignore this value (by the argument ignore_index).
The batch of size 3 could look like this:
x:
[[1, 2, 3, 0, 0],
[ 4, 5, 6, 7, 8],
[ 9, 8, 0, 0, 0]]
y:
[[1, 2, 3, -100, -100],
[ 4, 5, 6, 7, 8],
[ 9, 8, -100, -100, -100]]
You could generate it with code such as:
def pad_sequences(batch, pad_value=0):
    """Right-pad every sequence in ``batch`` to the longest length and stack them.

    Args:
        batch: iterable of sequences (lists or tuples) of numbers.
        pad_value: value appended to the shorter sequences.

    Returns:
        A tensor with one row per sequence. An empty batch gives an empty
        tensor instead of raising.
    """
    # Materialise once so tuples and one-shot iterables are handled too.
    rows = [list(v) for v in batch]
    n = max((len(row) for row in rows), default=0)
    return torch.tensor([row + [pad_value] * (n - len(row)) for row in rows])
However, I feel there is an issue with your problem statement. If you perform machine translation, then your inputs and outputs can have different lengths, but your architecture only allows x and y to have the same lengths. If you want to support x and y of different lengths, I would suggest to use a seq2seq architecture such as T5 instead.
Another issue is that GPT is autoregressive, so if y is completely aligned with x, then we cannot use the suffix of x while generating the left part of y. So if you wish your x and y to be perfectly aligned, but still would like to use the full information about x when generating y, I would recommend using a bidirectional encoder such as BERT.

What is causing tensorflow retracing?

Here is my code for factorization
I am trying to run part of my iterate function on TPUs and I am getting a retracing warning. If I am making an obvious mistake, I am sorry. update_xu and update_yi are mostly matrix multiplication and matrix inversion, hence I am trying to run them inside strategy.scope(). Eventually I run into some kind of memory leak or something similar.
def loss(R, X, Y, C, lmda_x, lmda_y):
    '''
    returns the weighted squared reconstruction error of R by (X^T Y),
    averaged over the entries of R, plus the L2 penalties on X and Y
    '''
    residual = R - tf.matmul(X, Y, transpose_a = True)
    weighted_sq = C * (residual * residual)
    n_entries = weighted_sq.shape[0] * weighted_sq.shape[1]
    penalty = 0 + tf.math.reduce_sum(X * X * lmda_x) + tf.math.reduce_sum(Y * Y * lmda_y)
    return penalty + tf.math.reduce_sum(weighted_sq) * 1 / n_entries
def update_xu(Ru, Y, Cuser, lmda):
    '''
    input
    column vector Ru,
    k x m matrix Y,
    m x m matrix Cuser and
    lmda_x
    output
    the updated user row vector (xu) by making Y matrix constant

    Computes (Y Cuser Y^T + lmda * I)^-1 Y Cuser Ru and returns it as a
    vector of length k. tf.linalg.inv fails with "Input is not invertible"
    when the regularised matrix is singular.
    '''
    Ru = tf.reshape(Ru, shape = [Y.shape[1], 1])
    # The "@" matrix-multiplication operators had been garbled into "#",
    # which turned the rest of each line into a comment.
    C = Cuser @ Ru
    inverse = tf.linalg.inv(Y @ Cuser @ tf.transpose(Y) + lmda * tf.eye(Y.shape[0]))
    ans = (inverse @ Y @ C)
    return tf.reshape(ans, [Y.shape[0]])
def update_yi(Ri, X, Citem, lmda):
    '''
    input
    column vector Ri,
    k x n matrix X,
    n x n matrix Citem and
    lmda_y
    output
    the updated item row vector (yi) by making X matrix constant

    Computes (X Citem X^T + lmda * I)^-1 X Citem Ri and returns it as a
    vector of length k. tf.linalg.inv fails with "Input is not invertible"
    when the regularised matrix is singular.
    '''
    Ri = tf.reshape(Ri, shape = [X.shape[1], 1])
    # The "@" matrix-multiplication operators had been garbled into "#",
    # which turned the rest of each line into a comment.
    C = Citem @ Ri
    inverse = tf.linalg.inv(X @ Citem @ tf.transpose(X) + lmda * tf.eye(X.shape[0]))
    ans = inverse @ X @ C
    return tf.reshape(ans, [X.shape[0]])
def iterate(R, X, Y, C, lmda_x, lmda_y, epochs):
    '''
    runs `epochs` rounds of alternating updates (WALS) and returns the
    updated X and Y, so that R is approximated by (X^T Y)
    '''
    with strategy.scope():
        for _ in range(epochs):
            # Solve for every user vector with Y held fixed: one row of R
            # and the matching row of C per call.
            def solve_user(pair):
                return update_xu(pair[0], Y, tf.linalg.diag(pair[1]), lmda_x)

            Xtt = tf.vectorized_map(solve_user, (R, C))
            #Xtt = tf.map_fn(lambda x: update_xu(x[0], Y, tf.linalg.diag(x[1]), lmda_x), (R, C), dtype = tf.TensorSpec([Y.shape[0]], dtype = tf.float32), parallel_iterations=6)
            print(Xtt.shape)
            X = tf.transpose(Xtt)

            # Solve for every item vector with X held fixed: the item update
            # walks the columns of R and C, so map over their transposes.
            R_items, C_items = tf.transpose(R), tf.transpose(C)

            def solve_item(pair):
                return update_yi(pair[0], X, tf.linalg.diag(pair[1]), lmda_y)

            Ytt = tf.vectorized_map(solve_item, (R_items, C_items))
            #Ytt = tf.map_fn(lambda x: update_yi(x[0], X, tf.linalg.diag(x[1]), lmda_y), (R, C), dtype = tf.TensorSpec([X.shape[0]], dtype = tf.float32), parallel_iterations=6)
            print(Ytt.shape)
            Y = tf.transpose(Ytt)
    return X, Y
def init_weights(R):
    """Build a random weight matrix with the same shape as R.

    Entries where |R| > eps get a standard-normal draw scaled by the number
    of such entries in their column; all other entries get zero. One further
    standard-normal draw is then added to every entry.
    """
    n_rows, n_cols = R.shape[0], R.shape[1]

    # 1. where R holds a value above the eps threshold, 0. elsewhere.
    observed = tf.where(tf.math.abs(R) > eps, 1., 0.)

    # Per-column count of observed entries, copied to every row.
    per_column = tf.constant(
        tf.math.reduce_sum(observed, axis = 0), shape = [1, n_cols]
    )
    tiled = tf.repeat(per_column, repeats = n_rows, axis = 0)
    counts = tf.reshape(tiled, [n_rows, n_cols]) * observed

    noisy = tf.random.normal( R.shape, mean=0.0, stddev=1.0) * counts
    return noisy + tf.random.normal([1], mean = 0.0, stddev = 1.0)
def train( R, lmda_x, lmda_y, epochs, embd):
    """Fit X and Y to R in stages and plot the loss after each stage.

    Args:
        R: training matrix to factorise.
        lmda_x, lmda_y: L2 regularisation weights for X and Y.
        epochs: iterable of stage lengths; each entry is the number of
            iterate() rounds to run before the losses are recorded.
        embd: number of rows of X and Y.

    Returns:
        The final (X, Y). If ``epochs`` is empty these are all zeros.

    The test loss is computed on the module-level ``test_data``.
    """
    total = 0
    loss_train_list, loss_test_list, total_epochs = [], [], []
    X, Y = tf.zeros([embd, R.shape[0]]), tf.zeros([embd, R.shape[1]])
    # Uses the R that was passed in rather than the global train_data, so
    # the function agrees with its own argument.
    C = init_weights(R)
    for stage, epoch in enumerate(epochs):
        if stage == 0:
            # Random start, drawn once before the first stage.
            X = tf.random.normal( [embd, R.shape[0]], mean=0.0, stddev=1.0)
            Y = tf.random.normal( [embd, R.shape[1]], mean=0.0, stddev=1.0)
        X, Y = iterate(R, X, Y, C, lmda_x, lmda_y, epoch)
        total += epoch
        loss_train = loss(R, X, Y, C, lmda_x, lmda_y)
        loss_test = loss(test_data, X, Y, C, lmda_x, lmda_y)
        loss_train_list.append(loss_train)
        loss_test_list.append(loss_test)
        total_epochs.append(total)
        print(loss_train, loss_test)
    plt.plot(total_epochs, loss_train_list, label = "Training", linewidth = 5)
    plt.plot(total_epochs, loss_test_list, label = "Test", linewidth = 1)
    plt.xticks(fontsize = 10)
    plt.title(str(embd)+ ', ' +str(lmda_x) + ',' + str(lmda_y))
    plt.yticks(fontsize = 10)
    plt.xlim(0, 200)
    plt.ylim(0, 1000000000)
    plt.xlabel('iterations', fontsize=30)
    plt.ylabel('MSE', fontsize=30)
    plt.legend(loc='best', fontsize=20)
    return X, Y
def grid_search(epochs, embds, lmdas_x, lmdas_y, train_data, test_data):
    """Call train() once for every (lmda_x, lmda_y, embd) combination.

    The hyper-parameters and the epoch schedule are converted to tensors
    before each call. ``test_data`` is not passed on to train().
    """
    for raw_lmda_x in lmdas_x:
        for raw_lmda_y in lmdas_y:
            for raw_embd in embds:
                lmda_x_t = tf.convert_to_tensor(raw_lmda_x, dtype = tf.float32)
                lmda_y_t = tf.convert_to_tensor(raw_lmda_y, dtype =tf.float32)
                epochs_t = tf.convert_to_tensor(epochs, dtype = tf.int64)
                embd_t = tf.convert_to_tensor(raw_embd, dtype =tf.int64)
                X, Y = train(train_data, lmda_x_t, lmda_y_t, epochs_t, embd_t)
Some errors on calling grid search
WARNING:tensorflow:11 out of the last 11 calls to <function pfor.<locals>.f at 0x7f4dd38f7b90> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
WARNING:tensorflow:11 out of the last 11 calls to <function pfor.<locals>.f at 0x7f4dd38f7b90> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
(943, 1)
WARNING:tensorflow:11 out of the last 11 calls to <function pfor.<locals>.f at 0x7f4dd38f7560> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
WARNING:tensorflow:11 out of the last 11 calls to <function pfor.<locals>.f at 0x7f4dd38f7560> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.
(1682, 1)
(943, 1)
ERROR:tensorflow:==================================
Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7f4e018c73d0>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2778, in while_loop
return result File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2726, in <lambda>
body = lambda i, lv: (i + 1, orig_body(*lv)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/map_fn.py", line 507, in compute
return (i + 1, tas) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/map_fn.py", line 505, in <listcomp>
ta.write(i, value) for (ta, value) in zip(tas, result_value_batchable) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 249, in wrapped
error_in_function=error_in_function)
==================================
InvalidArgumentError Traceback (most recent call last)
<ipython-input-58-a2922620d8dd> in <module>()
7 epochs = [5, 100] #2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]#, 588, 299, 300, 200]
8 #X, Y, C = train(train_data.shape[0], train_data.shape[1], train_data, 0, 0, 2, 2)
----> 9 grid_search(epochs, embds, lmdas, lmdas, train_data, test_data)
3 frames
<ipython-input-57-8cc0dfdb85de> in grid_search(epochs, embds, lmdas_x, lmdas_y, train_data, test_data)
5 plt.figure(figsize = (10,10))
6 lmda_x , lmda_y , epochs, embd = tf.convert_to_tensor(lmda_x, dtype = tf.float32), tf.convert_to_tensor(lmda_y, dtype =tf.float32), tf.convert_to_tensor(epochs, dtype = tf.int64), tf.convert_to_tensor(embd, dtype =tf.int64)
----> 7 X, Y = train(train_data, lmda_x, lmda_y, epochs, embd)
8
9
<ipython-input-56-d70632663530> in train(R, lmda_x, lmda_y, epochs, embd)
88 flag = True
89 else:
---> 90 X, Y = iterate(train_data, X, Y, C, lmda_x, lmda_y, epoch)
91 total += epoch
92 loss_train = loss(train_data, X, Y, C, lmda_x, lmda_y)
<ipython-input-56-d70632663530> in iterate(R, X, Y, C, lmda_x, lmda_y, epochs)
50 Xtt = tf.vectorized_map(lambda x: update_xu(x[0], Y, tf.linalg.diag(x[1]), lmda_x), (R, C))
51 #Xtt = tf.map_fn(lambda x: update_xu(x[0], Y, tf.linalg.diag(x[1]), lmda_x), (R, C), dtype = tf.TensorSpec([Y.shape[0]], dtype = tf.float32), parallel_iterations=6)
---> 52 print(Xtt.shape)
53 X = tf.transpose(Xtt)
54 R, C = tf.transpose(R), tf.transpose(C)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in shape(self)
1173 # `_tensor_shape` is declared and defined in the definition of
1174 # `EagerTensor`, in C.
-> 1175 self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
1176 except core._NotOkStatusException as e:
1177 six.raise_from(core._status_to_exception(e.code, e.message), None)
InvalidArgumentError: {{function_node __inference_f_5094764}} Input is not invertible.
[[{{node loop_body/MatrixInverse/pfor/MatrixInverse}}]]

Matrix calculations in SIMPLERNN KERAS

I'm trying to understand the Matrix Calculations involved in SIMPLERNN.
I have understood from some blog posts and Stack Overflow that SimpleRNN(units) creates a layer that contains `units` RNN units.
SimpleRNN involves the following calculations
W = kernel #shape-(1,units)
U = recurrent_kernel #shape-(units,units)
B = bias #shape -(units,)
output = new_state = act(W * input + U * state + B)
Help me understand what are the input and output dimensions of the following below code snippet.
Function for generating X and y, where y is the cumulative sum of X.
def generate_batch(n=256):
    """Return n random 0/1 sequences and their running sums.

    X is drawn with P(1) = 0.1 and reshaped to (n, seq_len, 1); y is the
    cumulative sum of X along the sequence axis, with the same shape.
    Reads the module-level ``seq_len``.
    """
    X = np.random.choice(a=[0, 1], size = n*seq_len, p=[0.9, 0.1]).reshape(n, -1)
    y = np.cumsum(X, axis=1)
    # Add a trailing feature axis of size 1.
    X = X.reshape(n, -1, 1)
    y = y.reshape(n, -1, 1)
    return(X, y) #returns shape X,y-(256,60,1)


# A 10-unit SimpleRNN that returns its output at every time step, followed
# by a Dense layer with a single output.
model = Sequential()
model.add(SimpleRNN(10, input_shape=(60, 1), return_sequences=True))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
X, y = generate_batch()#(256,60,1)
model.fit(X, y, verbose=0, epochs=1)
Please help me figure out the dimensions of the input to the RNN, of the state, and of the RNN's output. How do the matrix calculations, i.e. (W * input + U * state + B), occur?

Keras: IoU backend implementation where the inputs are the box corners?

I've seen a few implementations of Intersection over Union in Keras Tensorflow, but they all use inputs that represent the contents of the box regions. The network I'm working with passes the four corners of the boxes themselves. How can I implement the algorithm this way with Keras backend?
My attempt:
def IoU():
    """Return a Keras metric that computes the mean IoU of the positive anchors.

    y_true: Tensor from the generator of shape (B, N, 5). The last value for
        each box is the state of the anchor (ignore, negative, positive).
    y_pred: Tensor from the network of shape (B, N, 4).
    Box coordinates (in axis 2): [x1, y1, x2, y2].
    """
    def _IoU(y_true, y_pred):
        anchor_state = y_true[:,:,-1]
        regression = y_pred
        regression_target = y_true[:,:,:-1]

        # Keep only the anchors whose state is 1 (positive); gather_nd
        # collapses (B, N, 4) into (num_positive, 4).
        indices = backend.where(keras.backend.equal(anchor_state, 1))
        regression = backend.gather_nd(regression, indices)
        regression_target = backend.gather_nd(regression_target, indices)

        x1_pred = regression[:,0]
        y1_pred = regression[:,1]
        x2_pred = regression[:,2]
        y2_pred = regression[:,3]
        x1_true = regression_target[:,0]
        y1_true = regression_target[:,1]
        x2_true = regression_target[:,2]
        y2_true = regression_target[:,3]

        # Intersection rectangle: its top-left corner is the larger of the
        # two (x1, y1) corners and its bottom-right corner is the SMALLER of
        # the two (x2, y2) corners. Taking the maximum for the bottom-right
        # corner overstates the intersection and lets the ratio leave [0, 1].
        # The coordinate vectors are 1-D and paired element-wise, so no
        # transposes are needed.
        xA = keras.backend.maximum(x1_pred, x1_true)
        yA = keras.backend.maximum(y1_pred, y1_true)
        xB = keras.backend.minimum(x2_pred, x2_true)
        yB = keras.backend.minimum(y2_pred, y2_true)

        # Clamp at 0 so boxes that do not overlap contribute no intersection.
        interArea = keras.backend.maximum((xB-xA+1),0)*keras.backend.maximum((yB-yA+1),0)
        pred_area = (x2_pred-x1_pred+1)*(y2_pred-y1_pred+1)
        truth_area = (x2_true-x1_true+1)*(y2_true-y1_true+1)
        iou_arr = interArea/(pred_area + truth_area - interArea)
        iou = keras.backend.mean(iou_arr)
        return iou
    return _IoU
After some debugging I found that backend.gather_nd reduced regression and regression_target from shapes (?, ?, 4) to shapes (?, 4), hence the indexing for x1_pred.
However, when I try to use this as a metric for a compiled model, I get the following error during runtime:
ValueError: slice index 2 of dimension 1 out of bounds. for 'metrics/_IoU_1/strided_slice_4' (op: 'StridedSlice') with input shapes: [?,2], [2], [2], [2] and with computed input tensors: input[1] = <0 2>, input[2] = <0 3>, input[3] = <1 1>.
If anybody knows a way that I can implement this in Keras, or sees where I've gone wrong, I'd be forever grateful.
EDIT: I figured out that this was because the method was being called twice, once where (y_true, y_pred) were the box prediction/truth (and the last dimension was of shape 4), and then again where (y_true, y_pred) were the class prediction/truth (and I have only 2 classes).
I did get some negative values and some values > 1 for the returned iou values, so maybe I'm still doing something incorrectly?

Resources