I want to create a CNN for object localization (it is given that there is only one object per image). I am using some standard layers, and at the end I want the network to output the corner nearest to the origin and the corner farthest from it. I am also using a self-defined loss function, which is (100 - intersection over union in %). The loss is not converging. What may be the problem? Will backpropagation even work with this network, or is it something else? Below is the code:
Network:
import torch
import torch.nn as nn
import torch.nn.functional as F

class convnet(nn.Module):
    def __init__(self):
        super(convnet, self).__init__()
        self.conv1 = nn.Conv2d(1, 4, kernel_size=5)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv3 = nn.Conv2d(8, 16, kernel_size=5)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.fc1 = nn.Linear(5040, 1000)
        self.fc2 = nn.Linear(1000, 84)
        self.fc3 = nn.Linear(84, 4)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        x = x.view(-1, 5040)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        a = self.sigmoid(self.fc3(x))
        # For each sample, pick the predicted corner pair closest to the origin
        c = torch.zeros(a.shape[0], 2)
        for idx, x in enumerate(a):
            d1 = x[0] ** 2 + x[1] ** 2
            d2 = x[2] ** 2 + x[3] ** 2
            d3 = x[0] ** 2 + x[3] ** 2
            d4 = x[2] ** 2 + x[1] ** 2
            dmin = min(d1, d2, d3, d4)
            if d1 == dmin:
                c[idx] = torch.tensor([x[0], x[1]])
            elif d2 == dmin:
                c[idx] = torch.tensor([x[2], x[3]])
            elif d3 == dmin:
                c[idx] = torch.tensor([x[0], x[3]])
            elif d4 == dmin:
                c[idx] = torch.tensor([x[2], x[1]])
        # Scale the normalized (0-1) coordinates back to pixel units
        m = torch.tensor([[640, 480, 640, 480]]).type(torch.DoubleTensor).cuda()
        return c * m

    def sigmoid(self, z):
        return 1 / (1 + torch.exp(-z))
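For reference, the hard-coded 5040 in fc1 is consistent with a single-channel 480 x 640 input (an assumption based on the 640/480 scaling at the end of forward). A quick shape check:

net = convnet()
x = torch.rand(1, 1, 480, 640)            # assumed grayscale 480 x 640 input
x = net.pool1(torch.relu(net.conv1(x)))   # -> (1, 4, 158, 212)
x = net.pool2(torch.relu(net.conv2(x)))   # -> (1, 8, 51, 69)
x = net.pool3(torch.relu(net.conv3(x)))   # -> (1, 16, 15, 21)
print(x.flatten(1).shape)                 # 16 * 15 * 21 = 5040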
Loss function:
def iou(box_a, box_b):
    # Pairwise IoU (in %) between two sets of boxes given as (x1, y1, x2, y2)
    A = box_a.size(0)
    B = box_b.size(0)
    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))
    inter = torch.clamp((max_xy - min_xy), min=0)
    inter = inter[:, :, 0] * inter[:, :, 1]
    area_a = ((box_a[:, 2] - box_a[:, 0]) *
              (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)
    area_b = ((box_b[:, 2] - box_b[:, 0]) *
              (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)
    union = area_a + area_b - inter
    return ((inter / union) * 100 / float(A * A)).sum()

def criterion(output, labels):
    return 100 - iou(output, labels)
You can check full code here: link
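As a quick, self-contained check of the backpropagation concern: wrapping values in a new torch.tensor(...) inside forward copies the numbers but drops their autograd history, whereas torch.stack keeps the graph intact. A minimal sketch:

import torch

a = torch.rand(4, requires_grad=True)

rewrapped = torch.tensor([a[0], a[1]])   # new leaf tensor, no grad_fn: the graph is cut here
stacked = torch.stack([a[0], a[1]])      # keeps the autograd history

print(rewrapped.requires_grad, rewrapped.grad_fn)   # False None
print(stacked.requires_grad, stacked.grad_fn)       # True <StackBackward0 ...>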
Related
I am trying to run Video Vision Transformer (ViViT) code with my own dataset, but I am getting an error when using CrossEntropyLoss from PyTorch as the loss function.
I have 6 classes:
['Run', 'Sit', 'Walk', 'Wave', 'Sit', 'Stand']
Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-9, momentum=0.9)
Class Weights
tensor([0.0045, 0.0042, 0.0048, 0.0038, 0.0070, 0.0065])
Loss Function
loss_func = nn.CrossEntropyLoss(weight=class_weights.to(device))
Code Throwing Error
train_epoch(model, optimizer, train_loader, train_loss_history, loss_func)
Error
RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15
Code Calling the transformer
model = ViViT(224, 16, 100, 16).cuda()
Getting Video Frames
import cv2
import numpy as np

def get_frames(filename, n_frames=1):
    frames = []
    v_cap = cv2.VideoCapture(filename)
    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Indices of the frames to keep, spread evenly across the video
    frame_list = np.linspace(0, v_len - 1, n_frames + 1, dtype=np.int16)
    frame_dims = np.array([224, 224, 3])
    for fn in range(v_len):
        success, frame = v_cap.read()
        if success is False:
            continue
        if fn in frame_list:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (frame_dims[0], frame_dims[1]))
            frames.append(frame)
    v_cap.release()
    return frames, v_len
Dataset Preprocessing
class DatasetProcessing(data.Dataset):
    def __init__(self, df, root_dir):
        super(DatasetProcessing, self).__init__()
        # List of all video paths
        video_list = df["Video"].apply(lambda x: root_dir + '/' + x)
        self.video_list = np.asarray(video_list)
        self.df = df

    def __getitem__(self, index):
        # Ensure that the raw videos are in their respective folders and the folder name matches the output class label
        video_label = self.video_list[index].split('/')[-2]
        video_name = self.video_list[index].split('/')[-1]
        video_frames, len_ = get_frames(self.video_list[index], n_frames=15)
        video_frames = np.asarray(video_frames)
        video_frames = video_frames / 255
        class_list = ['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
        class_id_loc = np.where(class_list == video_label)
        label = class_id_loc
        d = torch.as_tensor(np.array(video_frames).astype('float'))
        l = torch.as_tensor(np.array(label).astype('float'))
        return (d, l)

    def __len__(self):
        return self.video_list.shape[0]
Training Epochs
def train_epoch(model, optimizer, data_loader, loss_history, loss_func):
    total_samples = len(data_loader.dataset)
    model.train()
    for i, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        x = data.cuda()
        data = rearrange(x, 'b p h w c -> b p c h w').cuda()
        target = target.type(torch.LongTensor).cuda()
        pred = model(data.float())
        output = F.log_softmax(pred, dim=1)
        loss = loss_func(output, target.squeeze(1))
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('[' + '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
                  ' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)] Loss: ' +
                  '{:6.4f}'.format(loss.item()))
            loss_history.append(loss.item())
Evaluate Model
def evaluate(model, data_loader, loss_history, loss_func):
    model.eval()
    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0
    with torch.no_grad():
        for data, target in data_loader:
            x = data.cuda()
            data = rearrange(x, 'b p h w c -> b p c h w').cuda()
            target = target.type(torch.LongTensor).cuda()
            output = F.log_softmax(model(data.float()), dim=1)
            loss = loss_func(output, target)
            _, pred = torch.max(output, dim=1)
            total_loss += loss.item()
            correct_samples += pred.eq(target).sum()
    avg_loss = total_loss / total_samples
    loss_history.append(avg_loss)
    print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
          ' Accuracy:' + '{:5}'.format(correct_samples) + '/' +
          '{:5}'.format(total_samples) + ' (' +
          '{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')
Transformer
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.norm = nn.LayerNorm(dim)
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)
ViViT Code
class ViViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, num_frames, dim=192, depth=4, heads=3,
                 pool='cls', in_channels=3, dim_head=64, dropout=0., emb_dropout=0., scale_dim=4):
        super().__init__()
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_size // patch_size) ** 2
        patch_dim = in_channels * patch_size ** 2
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b t c (h p1) (w p2) -> b t (h w) (p1 p2 c)', p1=patch_size, p2=patch_size),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, num_patches + 1, dim))
        self.space_token = nn.Parameter(torch.randn(1, 1, dim))
        self.space_transformer = Transformer(dim, depth, heads, dim_head, dim * scale_dim, dropout)

        self.temporal_token = nn.Parameter(torch.randn(1, 1, dim))
        self.temporal_transformer = Transformer(dim, depth, heads, dim_head, dim * scale_dim, dropout)

        self.dropout = nn.Dropout(emb_dropout)
        self.pool = pool

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.to_patch_embedding(x)
        b, t, n, _ = x.shape

        cls_space_tokens = repeat(self.space_token, '() n d -> b t n d', b=b, t=t)
        x = torch.cat((cls_space_tokens, x), dim=2)
        x += self.pos_embedding[:, :, :(n + 1)]
        x = self.dropout(x)

        x = rearrange(x, 'b t n d -> (b t) n d')
        x = self.space_transformer(x)
        x = rearrange(x[:, 0], '(b t) ... -> b t ...', b=b)

        cls_temporal_tokens = repeat(self.temporal_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_temporal_tokens, x), dim=1)
        x = self.temporal_transformer(x)

        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]

        return self.mlp_head(x)
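For reference, a shape sketch of how this model is fed (assuming the PreNorm, Attention and FeedForward helpers from the ViViT reference implementation are available): ViViT(224, 16, 100, 16) expects clips shaped (batch, frames, channels, height, width), which is exactly what the rearrange in train_epoch produces.

model = ViViT(image_size=224, patch_size=16, num_classes=100, num_frames=16)
clip = torch.randn(2, 16, 3, 224, 224)   # b t c h w
print(model(clip).shape)                 # torch.Size([2, 100])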
Multi-target support appears to be a feature added in version 1.10.0.
https://discuss.pytorch.org/t/crossentropyloss-vs-per-class-probabilities-target/138331
Please check your PyTorch version.
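In other words, on older versions CrossEntropyLoss only accepts a 1-D tensor of class indices as the target; probability (multi-target) targets need 1.10.0 or later. A small sketch of the two target formats (the logits and labels below are made up):

import torch
import torch.nn as nn

print(torch.__version__)

loss_func = nn.CrossEntropyLoss()
logits = torch.randn(4, 6)                      # (batch, num_classes)

class_indices = torch.tensor([0, 2, 5, 1])      # shape (batch,): works on all versions
print(loss_func(logits, class_indices))

probs = torch.softmax(torch.randn(4, 6), dim=1)  # shape (batch, num_classes): needs >= 1.10.0
print(loss_func(logits, probs))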
Please refer to the example using the UCF101 top-5 dataset, which is available on my Colab. The PyTorch version there is 1.12.0+cu113, and the code you listed was able to run training almost exactly as written.
I wrote this code and when I run it I get the following error: forward() takes 1 positional argument but 2 were given. As far as I know, I am passing only one argument to forward().
ResNet is a basic residual block
class ResNet(nn.Module):
    def __init__(self, in_channels, mid_channels, mid2_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1)
        self.conv1_bn = nn.BatchNorm2d(mid_channels)
        self.conv2 = nn.Conv2d(mid_channels, mid2_channels, kernel_size=3, stride=1, padding=1)
        self.conv2_bn = nn.BatchNorm2d(mid2_channels)
        self.conv3 = nn.Conv2d(mid2_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.conv3_bn = nn.BatchNorm2d(out_channels)
        if in_channels != out_channels:
            self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, X):
        X_shortcut = X
        X = F.relu(self.conv1(X))
        X = self.conv1_bn(X)
        X = F.relu(self.conv2(X))
        X = self.conv2_bn(X)
        X = F.relu(self.conv2(X))
        X = self.conv2_bn(X)
        if (in_channels == out_channels):
            X = self.conv3(X) + X_shortcut
        else:
            X = self.conv3(X) + self.conv_shortcut(X_shortcut)
        X = self.conv3_bn(F.relu(x))
        return X
This is the method for generating a model from the given layers.
class TotalNet(nn.Module):
    def __init__(self, Layers):
        super().__init__()
        self.hidden = nn.ModuleList()
        self.hidden.append(nn.BatchNorm2d(1))
        for i in range(0, len(Layers) - 1, 3):
            in_channels, mid_channels, mid2_channels, out_channels = Layers[i:(i + 4)]
            self.hidden.append(ResNet(in_channels, mid_channels, mid2_channels, out_channels))
        self.hidden.append(nn.Flatten())

    def forward(self, X):
        X = self.hidden(X)
        return X
The following is how I am calling the model:
test = TotalNet([9,2,9,9,9,9,9,9,9,9])
a = torch.rand((1,9,9), dtype = torch.float32)
test(a)
I realized that I was passing X directly to the nn.ModuleList. That is incorrect: an nn.ModuleList is only a container and is not callable, so the right way is to apply X to the elements of the nn.ModuleList one by one, updating X as you go.
In other words, the forward function of TotalNet should be the following:
def forward(self, X):
    for operation in self.hidden:
        X = operation(X)
    return X
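Since the modules are applied strictly in order, an alternative sketch (not the original design) is to build the container as an nn.Sequential, which is callable directly:

import torch.nn as nn

class TotalNet(nn.Module):
    def __init__(self, Layers):
        super().__init__()
        blocks = [nn.BatchNorm2d(1)]
        for i in range(0, len(Layers) - 1, 3):
            in_c, mid_c, mid2_c, out_c = Layers[i:(i + 4)]
            blocks.append(ResNet(in_c, mid_c, mid2_c, out_c))
        blocks.append(nn.Flatten())
        self.hidden = nn.Sequential(*blocks)   # nn.Sequential is callable, nn.ModuleList is not

    def forward(self, X):
        return self.hidden(X)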
I created a neural network in PyTorch. My loss function is a weighted negative log-likelihood. The weights are determined by the output of my neural network, but they must be treated as fixed: they depend on the network's output, yet the network should only compute the gradient of the log part and not of the weights. Here is my code:
import torch
import torch.nn as nn

def extended_cumsum(x):
    return torch.cat([torch.zeros(x.size()[1:3]).unsqueeze(0), torch.cumsum(x, 0)], 0)

def is_positive(x):
    x_sign = torch.sign(x)
    return torch.where(x_sign < 0, torch.zeros_like(x_sign), x_sign)

def coupling_transform(x1, x2, nn_output, k):
    nn_output_normalized = torch.softmax(nn_output, 0)
    bins_weights = torch.ones_like(nn_output_normalized) / k
    knots_xs = extended_cumsum(bins_weights)
    prev_bins = is_positive(x2.unsqueeze(0) - knots_xs)
    current_bins = prev_bins[:-1, :, :] - prev_bins[1:, :, :]
    q_sum = torch.sum(prev_bins[1:, :, :] * nn_output_normalized, 0)
    q_current = torch.sum(current_bins * nn_output_normalized, 0)
    w_sum = torch.sum(prev_bins[1:, :, :] * bins_weights, 0)
    c_values = q_sum + k * (x2 - w_sum) * q_current
    log_det = torch.log(torch.prod(k * q_current, 1))
    return x1, c_values, log_det

def flipping_dims(n, d1, d2):
    dims = []
    for i in range(n):
        if i % 2 == 0:
            dims.append(d1)
        else:
            dims.append(d2)
    return dims
class Linear(nn.Module):
    def __init__(self, d1, d2, k, hidden):
        super().__init__()
        self.d1, self.d2, self.k = d1, d2, k
        self.net = nn.Sequential(nn.Linear(d1, hidden),
                                 nn.ReLU(),
                                 nn.Linear(hidden, hidden),
                                 nn.ReLU(),
                                 nn.Linear(hidden, hidden),
                                 nn.ReLU(),
                                 nn.Linear(hidden, k * d2))

    def forward(self, x, log_prob, flip=False):
        x1, x2 = x[:, :self.d1], x[:, self.d1:]
        if flip:
            x1, x2 = x2, x1
        z1, z2, log_det = coupling_transform(x1, x2, torch.reshape(self.net(x1), (self.k, -1, self.d2)), self.k)
        if flip:
            z1, z2 = z2, z1
        z = torch.cat([z1, z2], 1)
        return z, log_prob - log_det

class stacked_Linear(nn.Module):
    def __init__(self, d1, d2, k, hidden, n):
        super().__init__()
        self.layers = nn.ModuleList([
            Linear(_d1, _d2, k, hidden=hidden)
            for _, _d1, _d2 in zip(range(n), flipping_dims(n, d1, d2), flipping_dims(n, d1, d2)[::-1])
        ])
        self.flips = [True if i % 2 else False for i in range(n)]

    def forward(self, x, log_prev_prob):
        for layer, f in zip(self.layers, self.flips):
            x, log_prob = layer(x, log_prev_prob, flip=f)
            log_prev_prob = log_prob
        return x, log_prob

def f(x):
    return torch.prod(torch.exp(x), 1)

def my_loss(weights, log_prob):
    loss = -torch.mean(weights * log_prob)
    return loss
model = stacked_Linear(3, 3, 32, 16, 4)
optim = torch.optim.Adam(model.parameters(), lr=1e-4)
losses = []
for _ in range(100):
    x = torch.rand(1000, 6)
    optim.zero_grad()
    z, log_prob = model(x, torch.zeros(1000))
    f_values = f(z)
    weights = f_values / torch.exp(log_prob)
    loss = my_loss(weights, log_prob)
    losses.append(loss)
    loss.backward()
But the loss value doesn't decrease, and it doesn't change even if I keep x fixed:
losses = []
x = torch.rand(1000, 6)
for _ in range(100):
    optim.zero_grad()
    z, log_prob = model(x, torch.zeros(1000))
    f_values = f(z)
    weights = f_values / torch.exp(log_prob)
    loss = my_loss(weights, log_prob)
    losses.append(loss)
    loss.backward()
[tensor(-0.1160, grad_fn=<NegBackward>),
tensor(-0.1160, grad_fn=<NegBackward>),
tensor(-0.1160, grad_fn=<NegBackward>),
tensor(-0.1160, grad_fn=<NegBackward>),
tensor(-0.1160, grad_fn=<NegBackward>), ...]
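The "fixed weights" requirement described above is usually expressed by detaching the weight factor from the computation graph, so that backward() differentiates only the log_prob term. A sketch of one training step written that way, reusing the model, optim, f and my_loss defined above (note that the parameters only change if optim.step() is actually called):

for _ in range(100):
    x = torch.rand(1000, 6)
    optim.zero_grad()
    z, log_prob = model(x, torch.zeros(1000))
    weights = (f(z) / torch.exp(log_prob)).detach()   # treated as constants by autograd
    loss = my_loss(weights, log_prob)
    losses.append(loss.item())
    loss.backward()
    optim.step()   # updates the parameters; without it the loss cannot change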
I am implementing my own neural network model for regression using only NumPy, and I'm getting really weird results when I test the model on m > 1 samples (for m = 1 it works fine). It seems like the model collapses and predicts only a single value for the whole batch:
Input:
X [[ 7.62316802 -6.12433912]
[ 1.11048966 4.97509421]]
Expected Output:
Y [[16.47952332 12.50288412]]
Model Output
y_hat [[10.42446234 10.42446234]]
Any idea what might cause this issue?
My code:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# np.seterr(all=None, divide=None, over=None, under=None, invalid=None)

data_x = np.random.uniform(0, 10, size=(2, 1))
data_y = (2 * data_x).sum(axis=0, keepdims=True)
# data_y = data_x[0, :] ** 2 + data_x[1, :] ** 2
# data_y = data_y.reshape((1, -1))

# # fig = plt.figure()
# # ax = fig.add_subplot(111, projection='3d')
# # ax.scatter(data_x[0, :], data_x[1, :], data_y)
# # plt.show()

memory = dict()
nn_architecture = [
    {"input_dim": 2, "output_dim": 6, "activation": "sigmoid", "bias": True},
    {"input_dim": 6, "output_dim": 4, "activation": "sigmoid", "bias": True},
    {"input_dim": 4, "output_dim": 1, "activation": "relu", "bias": True}
]

def init_network_parameters(nn_architecture):
    parameters = []
    for idx, layer in enumerate(nn_architecture):
        layer_params = {}
        input_dim, output_dim, activation, bias = layer.values()
        W = np.random.uniform(0, 1, (output_dim, input_dim))
        B = np.zeros((output_dim, 1))
        if bias:
            B = np.ones((output_dim, 1))
        activation_func = identity
        backward_activation_func = identity_backward
        if activation is 'sigmoid':
            activation_func = sigmoid
            backward_activation_func = sigmoid_backward
        elif activation is 'relu':
            activation_func = relu
            backward_activation_func = relu_backward
        else:
            print(f"Activation function set to identity for layer {idx}")
        layer_params[f"W"] = W
        layer_params[f"B"] = B
        layer_params[f"activation"] = activation_func
        layer_params[f"backward_activation"] = backward_activation_func
        layer_params[f"bias"] = bias
        parameters.append(layer_params)
    return parameters
def identity(z):
    return z

def sigmoid(z):
    return np.clip(1 / (1 + np.exp(-z)), -100, 100)

def relu(z):
    output = np.array(z, copy=True)
    output[z <= 0] = 0
    return output

def identity_backward(z, dA):
    return dA

def sigmoid_backward(z, dA):
    return np.clip(z * (1 - z) * dA, -100, 100)

def relu_backward(z, dA):
    output = np.ones(z.shape)
    output[z <= 0] = 0
    return output * dA
def forward_single_layer(prev_A, parameters, idx):
    W = parameters[f"W"]
    B = parameters[f"B"]
    activation = parameters[f"activation"]
    if parameters["bias"]:
        curr_Z = W.dot(prev_A) + B
    else:
        curr_Z = W.dot(prev_A)
    curr_A = activation(curr_Z)
    memory[f"Z{idx+1}"] = curr_Z
    memory[f"A{idx+1}"] = curr_A
    return curr_Z, curr_A

def forward(X, parameters):
    prev_A = X
    memory["A0"] = prev_A
    for idx, layer_params in enumerate(parameters):
        curr_Z, prev_A = forward_single_layer(prev_A=prev_A, parameters=layer_params, idx=idx)
    return prev_A

def criteria(y_hat, y):
    assert y_hat.shape == y.shape
    n = y_hat.shape[0]
    m = y_hat.shape[1]
    loss = np.sum(y_hat - y, axis=1) / m
    dA = (y_hat - y) / m
    return loss, dA
def backward_single_layer(prev_A, dA, curr_W, curr_Z, backward_activation, idx):
    m = prev_A.shape[1]
    dZ = backward_activation(z=curr_Z, dA=dA)
    dW = np.dot(dZ, prev_A.T) / m
    dB = np.sum(dZ, axis=1, keepdims=True) / m
    dA = np.dot(curr_W.T, dZ)
    return dA, dW, dB

def backpropagation(parameters, dA):
    grads = {}
    for idx in reversed(range(len(parameters))):
        layer = parameters[idx]
        prev_A = memory[f"A{idx}"]
        curr_Z = memory[f"Z{idx+1}"]
        curr_W = layer["W"]
        backward_activation = layer["backward_activation"]
        dA, dW, dB = backward_single_layer(prev_A, dA, curr_W, curr_Z, backward_activation, idx)
        grads[f"W{idx}"] = dW
        grads[f"B{idx}"] = dB
    return grads

def update_params(parameters, grads, lr=0.001):
    new_params = []
    for idx, layer in enumerate(parameters):
        layer["W"] -= lr * grads[f"W{idx}"]
        layer["B"] -= lr * grads[f"B{idx}"]
        new_params.append(layer)
    return new_params
X = np.random.uniform(-10, 10, (2, 2))
Y = 2 * X[0, :] + X[1, :] ** 2
Y = Y.reshape((1, X.shape[1]))

parameters = init_network_parameters(nn_architecture)
n_epochs = 1000
lr = 0.01
loss_history = []
for i in range(n_epochs):
    y_hat = forward(X, parameters)
    loss, dA = criteria(y_hat, Y)
    loss_history.append(loss)
    grads = backpropagation(parameters, dA)
    parameters = update_params(parameters, grads, lr)
    if not i % 10:
        print(f"Epoch {i}/{n_epochs} loss={loss}")

print("X", X)
print("Y", Y)
print("y_hat", y_hat)
There wasn't a problem with my implementation, just overfitting.
More information can be found here.
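One way to see the overfitting described above is to evaluate the trained parameters on freshly drawn inputs from the same distribution and compare the error with the training error; a small sketch reusing the forward and parameters from the code above (the sample count of 20 is arbitrary):

X_test = np.random.uniform(-10, 10, (2, 20))
Y_test = (2 * X_test[0, :] + X_test[1, :] ** 2).reshape((1, -1))
y_hat_test = forward(X_test, parameters)

train_mse = np.mean((forward(X, parameters) - Y) ** 2)
test_mse = np.mean((y_hat_test - Y_test) ** 2)
print("train MSE:", train_mse, "test MSE:", test_mse)   # a large gap indicates overfitting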
I am trying to implement self-attention in PyTorch.
I need to calculate the following expressions, where S and P are 2-dimensional, C' is 1-dimensional, and inp (called x1 in the code below) is the input vector:
S[i][j] = W1 * inp[i] + W2 * inp[j] + W3 * inp[i] * inp[j]
P[i][j] = e^(S[i][j]) / sum over j of e^(S[i][j])
(basically, P is a row-wise softmax of S)
C'[i] = sum over j of P[i][j] * inp[j]
I tried the following code using for loops:
for i in range(self.dim):
    for j in range(self.dim):
        S[i][j] = self.W1 * x1[i] + self.W2 * x1[j] + self.W3 * x1[i] * x1[j]

for i in range(self.dim):
    for j in range(self.dim):
        P[i][j] = torch.exp(S[i][j]) / torch.sum(torch.exp(S[i]))

# attend
for i in range(self.dim):
    out[i] = 0
    for j in range(self.dim):
        out[i] += P[i][j] * x1[j]
Is there a faster way to implement this in PyTorch?
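For reference, the three loops can be collapsed with broadcasting; a sketch that assumes x1 is a 1-D tensor of length self.dim and W1, W2, W3 are scalars:

import torch

def fast_attend(x1, W1, W2, W3):
    # S[i][j] = W1 * x1[i] + W2 * x1[j] + W3 * x1[i] * x1[j]
    S = W1 * x1[:, None] + W2 * x1[None, :] + W3 * x1[:, None] * x1[None, :]
    P = torch.softmax(S, dim=1)   # row-wise softmax, equivalent to the explicit exp / sum
    out = P.matmul(x1)            # out[i] = sum_j P[i][j] * x1[j]
    return out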
Here is an example of self-attention that I implemented in Dual Attention for HSI Imagery:
import torch
from torch.nn import Module, Conv2d, Parameter, Softmax

class PAM_Module(Module):
    """ Position attention module, https://github.com/junfu1115/DANet/blob/master/encoding/nn/attention.py """
    # Ref from SAGAN
    def __init__(self, in_dim):
        super(PAM_Module, self).__init__()
        self.chanel_in = in_dim

        self.query_conv = Conv2d(in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.key_conv = Conv2d(in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.value_conv = Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.gamma = Parameter(torch.zeros(1))

        self.softmax = Softmax(dim=-1)

    def forward(self, x):
        """
            inputs :
                x : input feature maps (B X C X H X W)
            returns :
                out : attention value + input feature
                attention : B X (HxW) X (HxW)
        """
        m_batchsize, C, height, width = x.size()
        proj_query = self.query_conv(x).view(m_batchsize, -1, width * height).permute(0, 2, 1)
        proj_key = self.key_conv(x).view(m_batchsize, -1, width * height)
        energy = torch.bmm(proj_query, proj_key)
        attention = self.softmax(energy)
        proj_value = self.value_conv(x).view(m_batchsize, -1, width * height)

        out = torch.bmm(proj_value, attention.permute(0, 2, 1))
        out = out.view(m_batchsize, C, height, width)

        out = self.gamma * out + x
        # out = F.avg_pool2d(out, out.size()[2:4])
        return out
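A quick usage sketch with made-up sizes (a batch of 2 feature maps with 64 channels):

pam = PAM_Module(in_dim=64)
feats = torch.randn(2, 64, 32, 32)   # B x C x H x W
out = pam(feats)
print(out.shape)                     # torch.Size([2, 64, 32, 32])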