I am trying to implement a custom loss function in TF/Keras. The loss function works if it is run in a session and passed constants; however, it stops working when compiled into a Keras model.
The cost function (thanks to Lior for converting it to TF):
def ginicTF(actual, pred):
    n = int(actual.get_shape()[-1])
    inds = K.reverse(tf.nn.top_k(pred, n)[1], axes=[0])
    a_s = K.gather(actual, inds)
    a_c = K.cumsum(a_s)
    giniSum = K.sum(a_c) / K.sum(a_s) - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedTF(a, p):
    return -ginicTF(a, p) / ginicTF(a, a)
# Test the cost function
sess = tf.InteractiveSession()
p = [0.9, 0.3, 0.8, 0.75, 0.65, 0.6, 0.78, 0.7, 0.05, 0.4, 0.4, 0.05, 0.5, 0.1, 0.1]
a = [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
ac = tf.placeholder(shape=(len(a),), dtype=K.floatx())
pr = tf.placeholder(shape=(len(p),), dtype=K.floatx())
print(gini_normalizedTF(ac, pr).eval(feed_dict={ac: a, pr: p}))
This prints -0.62962962963, which is the correct value.
Now let's put this into a Keras MLP:
def makeModel(n_feat):
    model = Sequential()
    # hidden layer #1
    model.add(layers.Dense(12, input_shape=(n_feat,)))
    model.add(layers.Activation('selu'))
    model.add(layers.Dropout(0.2))
    # output layer
    model.add(layers.Dense(1))
    model.add(layers.Activation('softmax'))
    model.compile(loss=gini_normalizedTF, optimizer='sgd', metrics=['binary_accuracy'])
    return model

model = makeModel(n_feats)
model.fit(x=Mout, y=targets, epochs=n_epochs, validation_split=0.2, batch_size=batch_size)
This generates the error:
<ipython-input-62-6ade7307336f> in ginicTF(actual, pred)
      9 def ginicTF(actual,pred):
     10
---> 11     n = int(actual.get_shape()[-1])
     12
     13     inds = K.reverse(tf.nn.top_k(pred,n)[1],axes=[0])

TypeError: __int__ returned non-int (type NoneType)
I tried working around it by giving n a default value, etc., but this didn't go anywhere.
Can someone explain the nature of this problem and how I can remedy it?
Thank you!
Edit:
Updated things to keep n as a tensor and then cast it as needed:
def ginicTF(actual, pred):
    nT = K.shape(actual)[-1]
    n = K.cast(nT, dtype='int32')
    inds = K.reverse(tf.nn.top_k(pred, n)[1], axes=[0])
    a_s = K.gather(actual, inds)
    a_c = K.cumsum(a_s)
    n = K.cast(nT, dtype=K.floatx())
    giniSum = K.cast(K.sum(a_c) / K.sum(a_s), dtype=K.floatx()) - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedTF(a, p):
    return ginicTF(a, p) / ginicTF(a, a)
It still has the issue of producing "None" when used as a cost function.
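For reference, a minimal sketch of what seems to be going on (assuming TF 1.x graph mode, as in the session test above): when Keras compiles the loss, y_true and y_pred are symbolic tensors whose batch dimension is not fixed yet, so the static shape is None and only the dynamic shape is defined:
yt = tf.placeholder(shape=(None,), dtype=K.floatx())
yt.get_shape()[-1].value  # None: the static size is unknown at graph-construction time
K.shape(yt)[-1]           # a scalar int32 tensor: the dynamic size, known only at run time
int(yt.get_shape()[-1])   # raises the TypeError above: __int__ returned non-int (NoneType)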
I would like to match the results of the self_attention() function on page 339 of Chollet's book, Deep Learning with Python, second edition, with those of the MultiHeadAttention() example just below it on the same page.
I wrote an example with the same input, and I get different results. Can somebody explain why? I have inserted the self_attention() function for clarity.
import numpy as np
from scipy.special import softmax
from tensorflow.keras.layers import MultiHeadAttention

def self_attention(input_sequence):
    output = np.zeros(shape=input_sequence.shape)
    # The output will consist of contextual embeddings of the same shape
    for i, pivot_vector in enumerate(input_sequence):
        scores = np.zeros(shape=(len(input_sequence),))
        for j, vector in enumerate(input_sequence):
            scores[j] = np.dot(pivot_vector, vector.T)  # Q K^T
        scores /= np.sqrt(input_sequence.shape[1])      # sqrt(d_k)
        scores = softmax(scores)                        # softmax(Q K^T / sqrt(d_k))
        print(i, scores)
        new_pivot_representation = np.zeros(shape=pivot_vector.shape)
        for j, vector in enumerate(input_sequence):
            new_pivot_representation += vector * scores[j]
        output[i] = new_pivot_representation
    return output
test_input_sequence = np.array([[[1.0, 0.0, 0.0, 1.0],
                                 [0.0, 1.0, 0.0, 0.0],
                                 [0.0, 1.0, 1.0, 1.0]]])
test_input_sequence.shape
# (1, 3, 4)
self_attention(test_input_sequence[0])
"""
returns
[[0.50648039 0.49351961 0.30719589 0.81367628]
[0.23269654 0.76730346 0.38365173 0.61634827]
[0.21194156 0.78805844 0.57611688 0.78805844]]
the attention scores being:
[0.50648039 0.18632372 0.30719589]
[0.23269654 0.38365173 0.38365173]
[0.21194156 0.21194156 0.57611688]
"""
att_layer = MultiHeadAttention(num_heads=1,
                               key_dim=4,
                               use_bias=False,
                               attention_axes=(1,))
att_layer(test_input_sequence,
          test_input_sequence,
          test_input_sequence,
          return_attention_scores=True)
"""
returns
array([[[-0.46123487, 0.36683324, -0.47130704, -0.00722525],
[-0.49571565, 0.37488416, -0.52883905, -0.02713571],
[-0.4566634 , 0.38055322, -0.45884743, -0.00156384]]],
dtype=float32)
and the attention scores
array([[[[0.31446996, 0.36904442, 0.3164856 ],
[0.34567958, 0.2852166 , 0.36910382],
[0.2934979 , 0.3996053 , 0.30689687]]]], dtype=float32)>)
"""
I found the answer. The difference is due to the three dense layers applied to the query, key, and value, and the one after the attention module (this last dense layer is missing from Fig. 11.8 in the book).
To reproduce the results of self_attention(), we just need to make those dense layers pass-throughs:
i_4 = np.identity(4)
# identity weights for the query/key/value projections (shape (4, 1, 4) each)
# and for the output projection (shape (1, 4, 4))
w_pt_4 = [i_4.reshape(4, 1, 4) for _ in range(3)] + [i_4.reshape(1, 4, 4)]
att_layer.set_weights(w_pt_4)
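With the pass-through weights set, calling the layer again exactly as above should now reproduce the self_attention() numbers:
out, scores = att_layer(test_input_sequence,
                        test_input_sequence,
                        test_input_sequence,
                        return_attention_scores=True)
# out[0] now matches self_attention(test_input_sequence[0]),
# and scores[0, 0] matches the hand-computed attention scores above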
Say I have an input tensor of size (64, 100):
import torch

t = torch.randn((64, 100))
Now say I want to multiply each of the 6400 elements of t with 6400 separate vectors, each of size 256, to produce a tensor of size [64, 100, 256]. This is what I am doing currently:
import copy
import torch.nn as nn

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

linears = clones(nn.Linear(1, 256, bias=False), 6400)

idx = 0
t_final = []
for i in range(64):
    t_bs = []
    for j in range(100):
        t1 = t[i, j] * linears[idx].weight.view(-1)
        idx += 1
        t_bs.append(t1)
    t_bs = torch.cat(t_bs).view(1, 100, 256)
    t_final.append(t_bs)
t_final = torch.cat(t_final)
print(t_final.shape)
Output: torch.Size([64, 100, 256])
Is there a faster and cleaner way of doing the same thing? I tried torch.matmul and torch.dot but couldn't do any better.
It seems broadcasting is what you are looking for:
t = torch.randn((64, 100)).view(6400, 1)   # one scalar per element of t
weights = torch.randn((6400, 256))         # one 256-vector per element
output = (t * weights).view(64, 100, 256)  # broadcasted elementwise product
You don't actually need to clone your linear layer if you really want to multiply tensor t with the same weight of the linear layer 6400 times; rather, you can do the following:
t = torch.randn((64, 100)).unsqueeze(-1)
w = torch.rand(256).view(1, 1, 256).repeat(64, 100, 1)
# or
w = torch.stack(6400 * [torch.rand(256)]).view(64, 100, 256)
result = t * w  # shape: [64, 100, 256]
However, if you want to keep the structure you currently have, you can do the following:
t = torch.randn((64,100)).unsqueeze(-1)
w = torch.stack([linears[i].weight for i in range(len(linears))]).view(64,100,256)
result = t*w # shape: [64, 100, 256]
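As a quick sanity check (assuming result here was built from the same linears as the loop version above), the two tensors should agree:
print(torch.allclose(result, t_final))  # True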
I have a tensor of shape [h, w], which consists of a normalized, 2-dimensional activation map. Considering this to be some distribution, I want to find the mean and the covariance within this activation map in pytorch. Is there an efficient way to do that?
You can use the following code, where activation_map is a tensor of shape (h,w), with non-negative elements, and is normalised (activation_map.sum() is 1):
import torch

activation_map = torch.tensor(
    [[0.2, 0.1, 0.0],
     [0.1, 0.2, 0.4]])

h, w = activation_map.shape
range_h = torch.arange(h)
range_w = torch.arange(w)
idxs = torch.stack([
    range_w[None].repeat(h, 1),
    range_h[:, None].repeat(1, w)
])
map_flat = activation_map.view(-1)
idxs_flat = idxs.reshape(2, -1).T

mean = (map_flat[:, None] * idxs_flat).sum(0)
mats = idxs_flat[:, :, None] @ idxs_flat[:, None, :]
second_moments = (map_flat[:, None, None] * mats).sum(0)
covariance = second_moments - mean[:, None] @ mean[None]
# mean:
# tensor([1.1000, 0.7000])
# covariance:
# tensor([[0.6900, 0.2300],
#         [0.2300, 0.2100]])
One way for the covariance matrix:
import numpy as np
import torch

h, w = 3, 5

def cov(X):
    # note: this assumes X is already centered (zero mean per column)
    X = X / np.sqrt(X.size(0) - 1)
    return X.T @ X

x = torch.randn(h, w)
print(x)
c = cov(x)
print(c)
Out:
tensor([[-1.5029e-01, -2.0626e-01, -7.7845e-01, -1.6811e+00, 5.0312e-01],
[ 4.4658e-01, -1.8570e+00, -6.2250e-01, -1.0989e+00, 1.6159e+00],
[ 6.8612e-01, -4.2650e-02, -9.5685e-01, -1.7947e-03, 2.1187e-01]])
tensor([[ 0.3464, -0.4138, -0.4088, -0.1197, 0.3957],
[-0.4138, 1.7464, 0.6787, 1.1938, -1.5568],
[-0.4088, 0.6787, 0.9545, 0.9972, -0.8001],
[-0.1197, 1.1938, 0.9972, 2.0169, -1.3110],
[ 0.3957, -1.5568, -0.8001, -1.3110, 1.4546]])
Computing the mean should be trivial; just refer to the documentation.
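For instance (a small sketch), the per-column mean is a one-liner, and subtracting it before calling cov gives a properly centered covariance matrix:
m = x.mean(dim=0)  # column means
c = cov(x - m)     # covariance of the centered data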
I have a synthetic dataset consisting of features (X) and labels (y), which is used for KMeans clustering with Python 3.8, sklearn 0.22.2, and numpy 1.19.
X.shape, y.shape
# ((100, 2), (100,))
kmeans = KMeans(n_clusters = 3, init = 'random', n_init = 10, max_iter = 300)
# Train model on scaled features-
kmeans.fit(X)
After training KMeans on X, I want to replace the unique (continuous) values of X with the cluster centers (discrete) obtained using KMeans.
for i in range(3):
    print("cluster number {0} has center = {1}".format(i + 1, kmeans.cluster_centers_[i, :]))
'''
cluster number 1 has center = [-0.7869159  1.14173859]
cluster number 2 has center = [ 1.28010442 -1.04663318]
cluster number 3 has center = [-0.54654735  0.0054752 ]
'''
set(kmeans.labels_)
# {0, 1, 2}
One way I have of doing it is:
X[np.where(clustered_labels == 0)] = val[0,:]
X[np.where(clustered_labels == 1)] = val[1,:]
X[np.where(clustered_labels == 2)] = val[2,:]
Can I do it using np.select()?
cond = [clustered_labels == i for i in range(3)]
val = kmeans.cluster_centers_[:,:]
But on executing the code:
np.select(cond, val)
I get the following error:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
----> 1 np.select(cond, val)

<__array_function__ internals> in select(*args, **kwargs)

~/.local/lib/python3.8/site-packages/numpy/lib/function_base.py in select(condlist, choicelist, default)
    693         result_shape = condlist[0].shape
    694     else:
--> 695         result_shape = np.broadcast_arrays(condlist[0], choicelist[0])[0].shape
    696
    697     result = np.full(result_shape, choicelist[-1], dtype)

<__array_function__ internals> in broadcast_arrays(*args, **kwargs)

~/.local/lib/python3.8/site-packages/numpy/lib/stride_tricks.py in broadcast_arrays(subok, *args)
    256     args = [np.array(_m, copy=False, subok=subok) for _m in args]
    257
--> 258     shape = _broadcast_shape(*args)
    259
    260     if all(array.shape == shape for array in args):

~/.local/lib/python3.8/site-packages/numpy/lib/stride_tricks.py in _broadcast_shape(*args)
    187     # use the old-iterator because np.nditer does not handle size 0 arrays
    188     # consistently
--> 189     b = np.broadcast(*args[:32])
    190     # unfortunately, it cannot handle 32 or more arguments directly
    191     for pos in range(32, len(args), 31):

ValueError: shape mismatch: objects cannot be broadcast to a single shape
Suggestions?
Thanks!
A somewhat cleaner way to do it (though very similar to yours) is the following. Here's a simple example:
from sklearn.cluster import KMeans
import numpy as np

# build a synthetic dataset with three clusters
x1 = np.random.normal(0, 2, 100)
y1 = np.random.normal(0, 1, 100)
label1 = np.ones(100)
d1 = np.column_stack([x1, y1, label1])

x2 = np.random.normal(3, 1, 100)
y2 = np.random.normal(1, 2, 100)
label2 = np.ones(100) * 2
d2 = np.column_stack([x2, y2, label2])

x3 = np.random.normal(-3, 0.5, 100)
y3 = np.random.normal(0.5, 0.25, 100)
label3 = np.ones(100) * 3
d3 = np.column_stack([x3, y3, label3])

D = np.row_stack([d1, d2, d3])
np.random.shuffle(D)
X = D[:, :2]
y = D[:, 2]
print(f'X.shape = {X.shape}, y.shape = {y.shape}')
# X.shape = (300, 2), y.shape = (300,)

kmeans = KMeans(n_clusters = 3, init = 'random', n_init = 10, max_iter = 300)
# Train model on the features
kmeans.fit(X)
preds = kmeans.predict(X)

X[preds == 0] = kmeans.cluster_centers_[0]
X[preds == 1] = kmeans.cluster_centers_[1]
X[preds == 2] = kmeans.cluster_centers_[2]
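For completeness, since preds already holds each sample's cluster index, the three assignments can also be collapsed into a single fancy-indexing step:
# replace every sample by the center of its assigned cluster
X = kmeans.cluster_centers_[preds]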
Yet another way to accomplish the task is to use np.putmask instead of the assignment (note that np.put will not work here, since it expects flat indices rather than a boolean mask), as follows:
np.putmask(X, np.broadcast_to((preds == 0)[:, None], X.shape), kmeans.cluster_centers_[0])
np.putmask(X, np.broadcast_to((preds == 1)[:, None], X.shape), kmeans.cluster_centers_[1])
np.putmask(X, np.broadcast_to((preds == 2)[:, None], X.shape), kmeans.cluster_centers_[2])
Frankly, I don't see a way to accomplish the task by means of the np.select function, and I guess the way you do it is the best way, based on this answer.
Cheers.
I am trying to understand how pytorch autograd works. If I have functions y = 2x and z = y**2, if I do normal differentiation, I get dz/dx at x = 1 as 8 (dz/dx = dz/dy * dy/dx = 2y*2 = 2(2x)*2 = 8x). Or, z = (2x)**2 = 4x^2 and dz/dx = 8x, so at x = 1, it is 8.
If I do the same with PyTorch autograd, I get 4:
x = torch.ones(1,requires_grad=True)
y = 2*x
z = y**2
x.backward(z)
print(x.grad)
which prints
tensor([4.])
Where am I going wrong?
You're using Tensor.backward wrong. To get the result you asked for, you should use:
x = torch.ones(1,requires_grad=True)
y = 2*x
z = y**2
z.backward() # <-- fixed
print(x.grad)
The call to z.backward() invokes the back-propagation algorithm, starting at z and working back to each leaf node in the computation graph. In this case x is the only leaf node. After calling z.backward() the computation graph is reset and the .grad member of each leaf node is updated with the gradient of z with respect to the leaf node (in this case dz/dx).
What's actually happening in your original code? Well, what you've done is apply back-propagation starting at x. With no arguments x.backward() would simply result in x.grad being set to 1 since dx/dx = 1. The additional argument (gradient) is effectively a scale to apply to the resulting gradient. In this case z=4 so you get x.grad = z * dx/dx = 4 * 1 = 4. If interested, you can check out this for more information on what the gradient argument does.
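If it helps, here is a quick sketch of that scaling behavior in isolation (hypothetical values, purely illustrative):
x = torch.ones(1, requires_grad=True)
x.backward(torch.tensor([3.0]))  # the gradient argument scales dx/dx = 1
print(x.grad)                    # tensor([3.])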
If you still have some confusion about autograd in PyTorch, refer to this basic XOR gate representation:
import numpy as np
import torch
import torch.nn.functional as F

inputs = torch.tensor(
    [
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1]
    ],
    dtype=torch.float32  # float inputs so F.linear matches the float weights
)
outputs = torch.tensor(
    [
        0,
        1,
        1,
        0
    ],
    dtype=torch.float32
)
weights = torch.randn(1, 2)
weights.requires_grad = True  # set it as True for gradient computation
bias = torch.randn(1, requires_grad=True)  # set it as True for gradient computation

preds = F.linear(inputs, weights, bias)  # create a basic linear model
loss = (outputs - preds.squeeze()).mean()  # squeeze (4, 1) -> (4,) so shapes align
loss.backward()
print(weights.grad)  # this will print the gradient of the loss w.r.t. the weights
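A typical next step (a sketch, not part of the original snippet) is a manual gradient-descent update using the gradients autograd just computed:
with torch.no_grad():             # update parameters outside the autograd graph
    weights -= 0.1 * weights.grad
    bias -= 0.1 * bias.grad
    weights.grad.zero_()          # clear gradients before the next backward pass
    bias.grad.zero_()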