NotImplementedError: You must implement the backward function for custom autograd.Function - pytorch

class Layer(nn.Module):
def forward(self,pxo):
p, x, o = pxo # (n,3),(n,in_planes),(b)
n, c = x.shape
npoints = self.npoints
v = self.linear_v(x) # (n, c)
sampling_offsets = self.sampling_offsets(x) # (n,npoints * 3)
sampling_offsets = sampling_offsets.reshape(n * npoints, 3) # (n * npoints, 3)
sampling_offsets /= self.resolution # (n * npoints,3)
sampling_offsets = sampling_offsets.reshape(n, npoints, 3) # (n, npoints, 3)
sampling_positions = (sampling_offsets + p.unsqueeze(dim=1)).reshape(-1, 3) # (n * npoints, 3)
attention_weights = self.attention_weights(x) # (n,npoints)
attention_weights = self.softmax(attention_weights) # (n, npoints)
new_o = o * npoints
v = pointops.interpolation(p, sampling_positions, v, o, new_o).reshape(n,npoints,c)
v = v.transpose(1,2).contiguous() #(n, c, npoints)
h_v = torch.matmul(v, attention_weights.unsqueeze(-1)).reshape(n, c) # (n, nheads * c)
x = self.linear_output(h_v)
return x
Once I use this layer, there will be an error. If I switch to another network, there will be not. I am curious about how this model causes this error?
def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3):
input: xyz: (m, 3), new_xyz: (n, 3), feat: (m, c), offset: (b), new_offset: (b)
output: (n, c)
assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous()
idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, 3), (n, 3)
dist_recip = 1.0 / (dist + 1e-8) # (n, 3)
norm = torch.sum(dist_recip, dim=1, keepdim=True)
weight = dist_recip / norm # (n, 3)
new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_()
for i in range(k):
new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1)
return new_feat
What I use is nn.CrossEntropyLoss().
I can get the loss, but loss.backward() will report error.


DFS vs. Kruskal runtime (maze generation)

I have written two algorithms for creating unique mazes, one of them using depth-first-search (DFS) and the other using Kruskal's. The DFS algorithm performs as expected, however Kruskal's algorithm runs marginally slower than DFS and I do not know why.
I had written Kruskal's algorithm in Python.
I suspect the random.choice() function seems to be the underlying problem. The difference in runtime becomes noticeable when (r, c) > 30.
Here is the code for Kruskal's algorithm:
# Create a list of all possible edges
def create_edges(r, c):
edges = []
for y in range(r):
for x in range(c):
i = (y, x)
for d in ((0, 1), (0, -1), (1, 0), (-1, 0)):
p = tuple(map(sum, zip(d, i)))
py = p[0]
px = p[1]
if px in range(c) and py in range(r):
edges.append([i, p])
return edges
def kruskal(r, c, sz):
path = []
# Create a list of parent root nodes
roots = {(y, x) : [(y, x)] for y in range(r) for x in range(c)}
edges = create_edges(r, c)
while edges:
# Choose a random edge
edge = random.choice(edges)
parent = edge[0]
child = edge[1]
parent_set = get_set(roots, parent)
child_set = get_set(roots, child)
# Check if the parent / child are already in the same set
if parent_set == child_set:
rev_edge = edge.reverse()
if rev_edge in edges:
roots[parent_set] += roots[child_set]
path.extend((parent, child))
rev_edge = edge.reverse()
if rev_edge in edges:
return path
def get_set(roots, member):
s = None
for parent, children in roots.items():
if member in children:
s = parent
return s
def create_maze(t, r, c, sz):
maze = [['|_' for _ in range(c)] for _ in range(r)]
for cell in maze: cell.append('| ')
wd = {'DOWN' : ( 1, 0),
'UP' : (-1, 0),
'LEFT' : ( 0, -1),
'RIGHT': ( 0, 1)}
for n in range(len(t) - 1):
a = n
b = n + 1
p1 = t[a]
p2 = t[b]
ay, ax = p1[0], p1[1]
by, bx = p2[0], p2[1]
w = tuple(numpy.array(p2) - numpy.array(p1))
if w in wd.values():
k = list(wd.keys())[list(wd.values()).index(w)]
if k == 'DOWN': maze[ay][ax] = maze[ay][ax].replace('_', ' ')
if k == 'UP': maze[by][bx] = maze[by][bx].replace('_', ' ')
if k == 'LEFT': maze[ay][ax] = maze[ay][ax].replace('|', ' ')
if k == 'RIGHT': maze[by][bx] = maze[by][bx].replace('|', ' ')
return maze
def print_maze(maze, r, c, delay = 0):
s, l = min((r, c)), max((r, c))
a = 1 / (4 * r * c)
e = (1 / (s * l)) ** 2
delay = (a * 2.718 ** (-1 * e)) ** 0.5
print(' _' * c)
for iy in range(r):
for ix in range(c + 1):
print(maze[iy][ix], end = '')
def main():
r = 30
c = 30
sz = r * c
path = kruskal(r, c, sz)
maze = create_maze(path, r, c, sz)
print_maze(maze, r, c)
if __name__ == "__main__":

ViVIT PyTorch: RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/

I am trying to run Video Vision Transformer (ViViT) code with my dataset but getting an error using CrossEntropyLoss from Pytorch as the Loss function.
There are 6 classes I have:
['Run', 'Sit', 'Walk', 'Wave', 'Sit', 'Stand']
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-9, momentum=0.9)
Class Weights
tensor([0.0045, 0.0042, 0.0048, 0.0038, 0.0070, 0.0065])
Loss Function
loss_func = nn.CrossEntropyLoss(
Code Throwning Error
train_epoch(model, optimizer, train_loader, train_loss_history, loss_func)
RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/
Code Calling the transformer
model = ViViT(224, 16, 100, 16).cuda()
Getting Video Frames
def get_frames(filename, n_frames=1):
frames = []
v_cap = cv2.VideoCapture(filename)
v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_list = np.linspace(0, v_len - 1, n_frames + 1, dtype=np.int16)
frame_dims = np.array([224, 224, 3])
for fn in range(v_len):
success, frame =
if success is False:
if (fn in frame_list):
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = cv2.resize(frame, (frame_dims[0], frame_dims[1]))
return frames, v_len
Dataset Preprocessing
class DatasetProcessing(data.Dataset):
def __init__(self, df, root_dir):
super(DatasetProcessing, self).__init__()
# List of all videos path
video_list = df["Video"].apply(lambda x: root_dir + '/' + x)
self.video_list = np.asarray(video_list)
self.df = df
def __getitem__(self, index):
# Ensure that the raw videos are in respective folders and folder name matches the output class label
video_label = self.video_list[index].split('/')[-2]
video_name = self.video_list[index].split('/')[-1]
video_frames, len_ = get_frames(self.video_list[index], n_frames = 15)
video_frames = np.asarray(video_frames)
video_frames = video_frames/255
class_list = ['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
class_id_loc = np.where(class_list == video_label)
label = class_id_loc
d = torch.as_tensor(np.array(video_frames).astype('float'))
l = torch.as_tensor(np.array(label).astype('float'))
return (d, l)
def __len__(self):
return self.video_list.shape[0]
Training Epochs
def train_epoch(model, optimizer, data_loader, loss_history, loss_func):
total_samples = len(data_loader.dataset)
for i, (data, target) in enumerate(data_loader):
x = data.cuda()
data = rearrange(x, 'b p h w c -> b p c h w').cuda()
target = target.type(torch.LongTensor).cuda()
pred = model(data.float())
output = F.log_softmax(pred, dim=1)
loss = loss_func(output, target.squeeze(1))
if i % 100 == 0:
print('[' + '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)] Loss: ' +
Evaluate Model
def evaluate(model, data_loader, loss_history, loss_func):
total_samples = len(data_loader.dataset)
correct_samples = 0
total_loss = 0
with torch.no_grad():
for data, target in data_loader:
x = data.cuda()
data = rearrange(x, 'b p h w c -> b p c h w').cuda()
target = target.type(torch.LongTensor).cuda()
output = F.log_softmax(model(data.float()), dim=1)
loss = loss_func(output, target)
_, pred = torch.max(output, dim=1)
total_loss += loss.item()
correct_samples += pred.eq(target).sum()
avg_loss = total_loss / total_samples
print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
' Accuracy:' + '{:5}'.format(correct_samples) + '/' +
'{:5}'.format(total_samples) + ' (' +
'{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')
class Transformer(nn.Module):
def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
self.layers = nn.ModuleList([])
self.norm = nn.LayerNorm(dim)
for _ in range(depth):
PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
def forward(self, x):
for attn, ff in self.layers:
x = attn(x) + x
x = ff(x) + x
return self.norm(x)
ViViT Code
class ViViT(nn.Module):
def __init__(self, image_size, patch_size, num_classes, num_frames, dim = 192, depth = 4, heads = 3, pool = 'cls', in_channels = 3, dim_head = 64, dropout = 0.,
emb_dropout = 0., scale_dim = 4, ):
assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
num_patches = (image_size // patch_size) ** 2
patch_dim = in_channels * patch_size ** 2
self.to_patch_embedding = nn.Sequential(
Rearrange('b t c (h p1) (w p2) -> b t (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
nn.Linear(patch_dim, dim),
self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, num_patches + 1, dim))
self.space_token = nn.Parameter(torch.randn(1, 1, dim))
self.space_transformer = Transformer(dim, depth, heads, dim_head, dim*scale_dim, dropout)
self.temporal_token = nn.Parameter(torch.randn(1, 1, dim))
self.temporal_transformer = Transformer(dim, depth, heads, dim_head, dim*scale_dim, dropout)
self.dropout = nn.Dropout(emb_dropout)
self.pool = pool
self.mlp_head = nn.Sequential(
nn.Linear(dim, num_classes)
def forward(self, x):
x = self.to_patch_embedding(x)
b, t, n, _ = x.shape
cls_space_tokens = repeat(self.space_token, '() n d -> b t n d', b = b, t=t)
x =, x), dim=2)
x += self.pos_embedding[:, :, :(n + 1)]
x = self.dropout(x)
x = rearrange(x, 'b t n d -> (b t) n d')
x = self.space_transformer(x)
x = rearrange(x[:, 0], '(b t) ... -> b t ...', b=b)
cls_temporal_tokens = repeat(self.temporal_token, '() n d -> b n d', b=b)
x =, x), dim=1)
x = self.temporal_transformer(x)
x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
return self.mlp_head(x)
Multi target appears to be a feature supported since version 1.10.0.
Please check your pytorch version.
Please refer to the example of using the UTF101 top5 dataset, which is available on my Colab. The version of pytorch is 1.12.0+cu113, and the code you listed was able to run the training almost exactly as it was written.

convert Tensorflow1 to Tensorflow 2

there is a code written with tensorflow1 on this link.
I want to use this class as a layer in TensorFlow.Keras. So it should be written with TensorFlow version 2.
How can do it?
this is this code:
import tensorflow as tf
class ConvLSTMCell(tf.nn.rnn_cell.RNNCell):
"""A LSTM cell with convolutions instead of multiplications.
Xingjian, S. H. I., et al. "Convolutional LSTM network: A machine learning approach for precipitation nowcasting." Advances in Neural Information Processing Systems. 2015.
def __init__(self, shape, filters, kernel, forget_bias=1.0, activation=tf.tanh, normalize=True, peephole=True, data_format='channels_last', reuse=None):
super(ConvLSTMCell, self).__init__(_reuse=reuse)
self._kernel = kernel
self._filters = filters
self._forget_bias = forget_bias
self._activation = activation
self._normalize = normalize
self._peephole = peephole
if data_format == 'channels_last':
self._size = tf.TensorShape(shape + [self._filters])
self._feature_axis = self._size.ndims
self._data_format = None
elif data_format == 'channels_first':
self._size = tf.TensorShape([self._filters] + shape)
self._feature_axis = 0
self._data_format = 'NC'
raise ValueError('Unknown data_format')
def state_size(self):
return tf.nn.rnn_cell.LSTMStateTuple(self._size, self._size)
def output_size(self):
return self._size
def call(self, x, state):
c, h = state
x = tf.concat([x, h], axis=self._feature_axis)
n = x.shape[-1].value
m = 4 * self._filters if self._filters > 1 else 4
W = tf.get_variable('kernel', self._kernel + [n, m])
y = tf.nn.convolution(x, W, 'SAME', data_format=self._data_format)
if not self._normalize:
y += tf.get_variable('bias', [m], initializer=tf.zeros_initializer())
j, i, f, o = tf.split(y, 4, axis=self._feature_axis)
if self._peephole:
i += tf.get_variable('W_ci', c.shape[1:]) * c
f += tf.get_variable('W_cf', c.shape[1:]) * c
if self._normalize:
j = tf.contrib.layers.layer_norm(j)
i = tf.contrib.layers.layer_norm(i)
f = tf.contrib.layers.layer_norm(f)
f = tf.sigmoid(f + self._forget_bias)
i = tf.sigmoid(i)
c = c * f + i * self._activation(j)
if self._peephole:
o += tf.get_variable('W_co', c.shape[1:]) * c
if self._normalize:
o = tf.contrib.layers.layer_norm(o)
c = tf.contrib.layers.layer_norm(c)
o = tf.sigmoid(o)
h = o * self._activation(c)
state = tf.nn.rnn_cell.LSTMStateTuple(c, h)
return h, state

Failure to achieve an effective speed increase while Cythonizing the python 3 code

I cythonized the Python 3 code, but I failed to speeding up it. Time elapsed during the pure Python 3 code's execution is ~29 seconds while the cythonized code's is ~25 seconds (details are given below). Where did I go wrong in the cythonized code. I will be glad if you help me. I added below the pure Python 3 code, the cythonized code and the setup file, respectively.
Python version: 3.7.5
Cython version: 0.29.14
Editor: Pycharm
OS: Windows 10
The code runs 100 times in for loop. Size of the used arrays at each loop are below:
velos = 3300
V = 3300
S = 3300 x 3300
vels = 201
line_centers (in masks) = ~100
If necessary, I can add a sample data to this post.
import numpy as np
import numpy.linalg as la
def lsd(velos, V, S, vels, masks, Lambda=0.):
m, n = len(vels), len(velos)
Nmask = len(masks)
V = V - 1
M = np.zeros((n, m * len(masks)))
for N, (line_centers, weights) in enumerate(masks):
for l, lc in enumerate(line_centers):
vi = velos - lc
for j in range(m - 1):
w = np.argwhere((vi < vels[j + 1]) & (vi > vels[j])).T[0]
if len(w) == 0: continue
M[w, j + N * m] = weights[l] * (vels[j + 1] - vi[w]) / (vels[j + 1] - vels[j])
M[w, j + 1 + N * m] = weights[l] * (vi[w] - vels[j]) / (vels[j + 1] - vels[j])
if np.abs(np.sum(M)) < 1e-8:
return np.zeros((1, len(vels)))
if Lambda:
R = np.zeros((m * Nmask, m * Nmask))
for i in range(1, m-1):
R[i, i] = 2
R[i-1, i] = -1
R[i+1, i] = -1
R[0, 0] = 1
R[1, 0] = -1
R[-1, -1] = 1
R[-2, -1] = -1
X = np.matmul(M.T, (S**2))
XM = np.matmul(X, M)
if Lambda:
XM = XM + Lambda * R
cc = np.matmul(X, V)
Z, res, rank, s = la.lstsq(XM, cc, rcond=None)
# ZT = Z.T
# ccT = cc.T
# Z_ = []
# C_ = []
# for i in range(len(Z)):
# Z_.append([])
# C_.append([])
# for N in range(Nmask):
# Z_[-1].append(Z[i][N * m: (N + 1) * m])
# C_[-1].append(cc[i][N * m: (N + 1) * m])
return Z.T
import numpy as np
cimport numpy as np
import cython
# from libcpp.vector cimport vector
DTYPE = np.float
ctypedef np.double_t DTYPE_t
# #cython.wraparound(False)
cpdef lsd(np.ndarray[DTYPE_t, ndim=1] velos, np.ndarray[DTYPE_t, ndim=2] V, np.ndarray[DTYPE_t, ndim=2] S,
np.ndarray[DTYPE_t, ndim=1] vels, np.ndarray[DTYPE_t, ndim=3] masks, float Lambda=0.):
cdef int m = vels.shape[0]
cdef int n = velos.shape[0]
cdef int Nmask = masks.shape[0]
cdef int N, l, j, i
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] M = np.zeros((n, m * Nmask), dtype=DTYPE)
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] R = np.zeros((m * Nmask, m * Nmask), dtype=DTYPE)
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] X
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] XM
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] cc
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Z
cdef np.ndarray[DTYPE_t, ndim=1, mode='c'] line_centers, weights, vi
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] zeros = np.zeros((1, m), dtype=DTYPE)
cdef np.ndarray w
# cdef double lc
V = V - 1
for N in range(Nmask):
line_centers = masks[N][0]
weights = masks[N][1]
for l in range(len(line_centers)):
vi = velos - line_centers[l]
for j in range(m - 1):
# print(np.argwhere((vi < vels[j + 1]) & (vi > vels[j])).T[0])
w = np.argwhere((vi < vels[j + 1]) & (vi > vels[j])).T[0]
if len(w) == 0: continue
M[w, j + N * m] = weights[l] * (vels[j + 1] - vi[w]) / (vels[j + 1] - vels[j])
M[w, j + 1 + N * m] = weights[l] * (vi[w] - vels[j]) / (vels[j + 1] - vels[j])
if np.abs(np.sum(M)) < 1e-8:
return zeros
if Lambda:
for i in range(1, m-1):
R[i, i] = 2
R[i-1, i] = -1
R[i+1, i] = -1
R[0, 0] = 1
R[1, 0] = -1
R[-1, -1] = 1
R[-2, -1] = -1
X = np.matmul(M.T, (S**2))
XM = np.matmul(X, M)
if Lambda:
XM = XM + Lambda * R
cc = np.matmul(X, V)
Z, _, _, _ = np.linalg.lstsq(XM, cc, rcond=None)
# ZT = Z.T
# ccT = cc.T
# Z_ = []
# C_ = []
# for i in range(len(Z)):
# Z_.append([])
# C_.append([])
# for N in range(Nmask):
# Z_[-1].append(Z[i][N * m: (N + 1) * m])
# C_[-1].append(cc[i][N * m: (N + 1) * m])
return Z.T
from setuptools import setup
from Cython.Build import cythonize
import sys
import numpy
compiler_directives={'language_level' : sys.version_info[0]}),

Implementing self attention

I am trying to implement self attention in Pytorch.
I need to calculate the following expressions.
Similarity function S (2 dimensional), P(2 dimensional), C'
S[i][j] = W1 * inp[i] + W2 * inp[j] + W3 * x1[i] * inp[j]
P[i][j] = e^(S[i][j]) / Sum for all j( e ^ (S[i]))
basically, P is a softmax function
C'[i] = Sum (for all j) P[i][j] * x1[j]
I tried the following code using for loops
for i in range(self.dim):
for j in range(self.dim):
S[i][j] = self.W1 * x1[i] + self.W2 * x1[j] + self.W3 * x1[i] * x1[j]
for i in range(self.dim):
for j in range(self.dim):
P[i][j] = torch.exp(S[i][j]) / torch.sum( torch.exp(S[i]))
# attend
for i in range(self.dim):
out[i] = 0
for j in range(self.dim):
out[i] += P[i][j] * x1[j]
Is there any faster way to implement this in Pytorch?
Here is an example of Self Attention I had implemented in Dual Attention for HSI Imagery
class PAM_Module(Module):
""" Position attention module"""
#Ref from SAGAN
def __init__(self, in_dim):
super(PAM_Module, self).__init__()
self.chanel_in = in_dim
self.query_conv = Conv2d(in_channels=in_dim, out_channels=in_dim//8, kernel_size=1)
self.key_conv = Conv2d(in_channels=in_dim, out_channels=in_dim//8, kernel_size=1)
self.value_conv = Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.gamma = Parameter(torch.zeros(1))
self.softmax = Softmax(dim=-1)
def forward(self, x):
inputs :
x : input feature maps( B X C X H X W)
returns :
out : attention value + input feature
attention: B X (HxW) X (HxW)
m_batchsize, C, height, width = x.size()
proj_query = self.query_conv(x).view(m_batchsize, -1, width*height).permute(0, 2, 1)
proj_key = self.key_conv(x).view(m_batchsize, -1, width*height)
energy = torch.bmm(proj_query, proj_key)
attention = self.softmax(energy)
proj_value = self.value_conv(x).view(m_batchsize, -1, width*height)
out = torch.bmm(proj_value, attention.permute(0, 2, 1))
out = out.view(m_batchsize, C, height, width)
out = self.gamma*out + x
#out = F.avg_pool2d(out, out.size()[2:4])
return out
