Why do "score.backward()" return a gradient tensor with all zeros? - pytorch

When I tried to modify a CNN visualization method, Grad-CAM, to work with my YOLOv3 model, I ran into a problem: after calling score.backward(), the computed gradient contains all zeros, so it cannot be used in the following steps. The score is the bounding box's confidence for that class, and it has already been processed by NMS.
Any hint or help would be appreciated!
class GradCAM(object):
    """
    1: the network does not update its gradients; the input requires gradients
    2: use the target class's score for backward propagation
    """
    def __init__(self, net, layer_name):
        self.net = net
        self.layer_name = layer_name
        self.feature = None
        self.gradient = None
        self.net.eval()
        self.handlers = []
        self._register_hook()

    def _get_features_hook(self, module, input, output):
        self.feature = output
        print("feature shape:{}".format(output.size()))

    def _get_grads_hook(self, module, input_grad, output_grad):
        """
        :param input_grad: tuple, input_grad[0]: None
                           input_grad[1]: weight
                           input_grad[2]: bias
        :param output_grad: tuple, len = 1
        :return:
        """
        # print(input_grad)
        # print(output_grad)
        self.gradient = output_grad[0]

    def _register_hook(self):
        for i, module in enumerate(self.net.module_list):
            if module == self.layer_name:
                self.handlers.append(module.register_forward_hook(self._get_features_hook))
                self.handlers.append(module.register_backward_hook(self._get_grads_hook))

    def remove_handlers(self):
        for handle in self.handlers:
            handle.remove()

    def __call__(self, inputs, index=0):
        """
        :param inputs: {"image": [C,H,W], "height": height, "width": width}
        :param index: which bbox
        :return:
        """
        self.net.zero_grad()
        output = self.net(inputs['image'])[0]
        output_nonmax = utils.non_max_suppression(output, conf_thres=0.25, iou_thres=0.45, multi_label=True)[0]
        print(output_nonmax)
        score = output_nonmax[index][4]
        print(score)
        score.retain_grad()
        #score = output[0]['instances'].scores[index]
        #proposal_idx = output[0]['instances'].indices[index]
        proposal_idx = index
        score.backward()
        print(score.is_leaf)
        print(score.grad)
        gradient = self.gradient[proposal_idx].cpu().data.numpy()  # [C,H,W]
        #print(gradient)
        weight = np.mean(gradient, axis=(1, 2))  # [C]
        #print(weight)
        feature = self.feature[proposal_idx].cpu().data.numpy()  # [C,H,W]
        cam = feature * weight[:, np.newaxis, np.newaxis]  # [C,H,W]
        cam = np.sum(cam, axis=0)  # [H,W]
        cam = np.maximum(cam, 0)  # ReLU
        # normalize
        cam = cam - np.min(cam)
        cam = cam / np.max(cam)
        # resize the CAM to the bounding box size
        box = output[index][:4].detach().numpy().astype(np.int32)
        x1, y1, x2, y2 = box
        cam = cv2.resize(cam, (x2 - x1, y2 - y1))
        class_id = output[index][-1].detach().numpy()
        return cam, box, class_id
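
For what it's worth, here is a minimal, self-contained illustration (not the asker's model) of the first thing to check: whether the post-NMS score still has a grad_fn. Plain indexing keeps the autograd graph, while any hop through .detach(), .data, .numpy() or a rebuilt tensor cuts it, in which case backward() can never reach the hooked layer and the captured gradient stays zero/None.

import torch

# Hypothetical check, not the YOLOv3 pipeline: a scalar picked out of a
# network output by indexing keeps its grad_fn, so backward() reaches the
# earlier layers; a detached copy does not.
x = torch.randn(1, 3, 8, 8, requires_grad=True)
conv = torch.nn.Conv2d(3, 5, 3)
feat = conv(x)

score = feat.flatten()[4]            # indexing keeps the graph
print(score.grad_fn is not None)     # True
score.backward()
print(x.grad.abs().sum())            # non-zero: gradient reached the input

detached = feat.detach().flatten()[4]
print(detached.grad_fn)              # None: backward() from here cannot work

If score.grad_fn printed inside __call__ above is None, the graph was cut inside (or before) utils.non_max_suppression, and backpropagating from the corresponding pre-NMS prediction is one possible workaround.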

Related

How to pad audio clips or mel spectrograms in pytorch custom dataloader?

I am trying to build an audio Siamese network, and in the training loop I get a size mismatch between my tensors: stack expects each tensor to be equal size, but got [1, 128, 121] at entry 0 and [1, 128, 205] at entry 1.
I am unsure where I messed up with my data, since while gathering it I made sure to pad all my audio clips to the same size with background audio. So I have to pad the audio clips some other way. I thought about padding the clips in my custom dataloader to a static size bigger than all my clips, but that still gives me the same error. Any ideas where I am going wrong?
class OHDataset(data.Dataset):
def __init__(self, audio_dir, audio_dataset, transform = "mel_spectrogram"):
self.audio_labels = pd.read_csv(audio_dataset)
self.audio_dir = audio_dir
self.output_format = transform
def __len__(self):
return len(self.audio_labels)
def __getitem__(self, item, n_fft = 200, hop_length = 120):
positive = self.audio_labels.iloc[item, 0]
if(not bool(re.search(r'\d', positive))):
positive = self.audio_labels.iloc[item+1, 0]
anchor = re.sub(r'\d+', '', self.audio_labels.iloc[item, 0])
negative = self.audio_labels.iloc[random.randint(0, len(self.audio_labels)), 0]
pos_audio_path = os.path.join(self.audio_dir, positive + ".wav")
neg_audio_path = os.path.join(self.audio_dir, negative + ".wav")
anchor_audio_path = os.path.join(self.audio_dir, anchor + ".wav")
if(self.output_format == "spectrogram"):
pos_spectrogram = getSpectrogram(pos_audio_path, n_fft, hop_length)
neg_spectrogram = getSpectrogram(neg_audio_path, n_fft, hop_length)
anchor_spectrogram = getSpectrogram(anchor_audio_path, n_fft, hop_length)
return anchor_spectrogram, pos_spectrogram, neg_spectrogram
elif(self.output_format == "mel_spectrogram"):
pos_mel_spectrogram = getMELSpectrogram(pos_audio_path, n_fft, hop_length)
neg_mel_spectrogram = getMELSpectrogram(neg_audio_path, n_fft, hop_length)
anchor_mel_spectrogram = getMELSpectrogram(anchor_audio_path, n_fft, hop_length)
return anchor_mel_spectrogram, pos_mel_spectrogram, neg_mel_spectrogram
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (A, P, N) in enumerate(dataloader):
anchor = model(A).to(device)
positive = model(P).to(device)
negative = model(N).to(device)
loss = loss_fn(anchor, positive, negative)
optimizer.zero_grad()
loss.backward()
optimizer.step()
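
A common way to handle variable-length spectrograms is to pad per batch in a collate_fn instead of inside the dataset. Below is a minimal sketch under the assumption that each dataset item is an (anchor, positive, negative) triple of [1, n_mels, time] tensors, as in the OHDataset above; the batch size and the arguments in the usage comment are placeholders.

import torch
import torch.nn.functional as F

def pad_collate(batch):
    # batch: list of (anchor, positive, negative) spectrogram tensors,
    # each shaped [1, n_mels, time] with a different time length
    def pad_group(tensors):
        max_len = max(t.shape[-1] for t in tensors)
        # right-pad the time axis with zeros so every tensor matches max_len
        return torch.stack([F.pad(t, (0, max_len - t.shape[-1])) for t in tensors])

    anchors, positives, negatives = zip(*batch)
    return pad_group(anchors), pad_group(positives), pad_group(negatives)

# usage sketch (placeholder arguments):
# loader = torch.utils.data.DataLoader(OHDataset(audio_dir, csv_path),
#                                      batch_size=8, collate_fn=pad_collate)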

Evaluation gives 0 score

I am working on an instance segmentation problem with Mask R-CNN in PyTorch. The training part works with the code below, but evaluation gives a score of 0 for every mAP metric. What is the problem in the code?
More info:
I use Albumentations for the transforms and some files from torchvision for training.
Some problems I have run into:
When I use the coco bbox format instead of pascal_voc, it gives the following error:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
When I move labels out of the convert_seg_to_boolMask function, it gives the same error:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
def get_transforms(train=False):
if train:
transform = A.Compose([
ToTensorV2()
],bbox_params=A.BboxParams("pascal_voc",label_fields=["labels","iscrowd"]))
else:
transform = A.Compose([
ToTensorV2()
],bbox_params=A.BboxParams("pascal_voc",label_fields=["labels","iscrowd"]))
return transform
class Dataset(datasets.VisionDataset):
def __init__(self, coco_, data_dir, transform=None, target_transform=None, transforms=None):
super().__init__(data_dir, transforms, transform, target_transform)
self.coco_info = coco_
self.data_dir = data_dir
self.transforms = transforms
if isinstance(self.coco_info,dict):
self.ids = [x["id"] for x in self.coco_info["images"] if len(self._load_target(x["id"]))>0]
def _load_image(self, id: int):
name = loadImgs(self.coco_info["images"],id)[0]['file_name']
image = cv2.imread(os.path.join(self.data_dir, name))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)/255
return image
def _load_target(self, id):
return loadAnns(self.coco_info["annotations"],id)
def n_classes(self):
category_names = list(set(sorted([a["name"] for a in self.coco_info["categories"]])))
self.classes = ["__background__"]+[i for i in category_names]
return self.classes
def __getitem__(self,idx):
id = self.ids[idx]
image = self._load_image(id)
target = copy.deepcopy(self._load_target(id))
image_shape = (image.shape[0],image.shape[1])
img_info = {
"img_shape":image_shape,
"image_id":id,
"labels":[t["category_id"]for t in target],
"segmentation":[t["segmentation"][0] for t in target],
"id": [t["id"] for t in target]
}
mask, labels = self.convert_seg_to_boolMask(img_info)
obj_ids = np.unique(mask)
obj_ids = obj_ids[1:]
masks = torch.tensor(mask == obj_ids[:, None, None])
boxes = []
bbox = np.array([t["bbox"] for t in target])
for xmin,ymin,width,height in bbox:
xmax = xmin+width
ymax = ymin+height
boxes.append([xmin, ymin, xmax, ymax])
boxes = torch.tensor(boxes)
labels = torch.tensor(labels)
image_id = torch.tensor([id])
iscrowd = torch.tensor([t["iscrowd"] for t in target])
transformed = self.transforms(image=image, masks=masks, bboxes=boxes, labels=labels, iscrowd=iscrowd)
image = transformed['image']
masks = torch.tensor(transformed["masks"])
boxes = torch.tensor(transformed['bboxes'])
labels = torch.tensor(transformed["labels"])
iscrowd = torch.tensor(transformed["iscrowd"])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
image_id = torch.tensor(image_id)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
def __len__(self):
return len(self.ids)
def convert_seg_to_boolMask(self,img_info):
mask = np.zeros(img_info["img_shape"], dtype=np.uint8)
mask = Image.fromarray(mask)
draw = ImageDraw.Draw(mask)
for seg, i in zip(img_info["segmentation"],img_info["id"]):
points = [tuple([k,l]) for k,l in zip(seg[0::2],seg[1::2])]
draw.polygon(xy=points,
outline=tuple([i]),
fill=tuple([i]))
mask = np.array(mask)
labels = img_info["labels"]
return mask, labels
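
Not a diagnosis, but one thing worth ruling out: torchvision's Mask R-CNN and the COCO-style evaluator are strict about target dtypes, and wrong dtypes (boolean masks, float labels) can break evaluation without visibly breaking training. A minimal sketch of a coercion helper under those assumptions; the function name is made up and the fields mirror the target dict built in __getitem__ above.

import torch

def as_maskrcnn_target(boxes, labels, masks, image_id, iscrowd):
    # Coerce a target dict into the dtypes torchvision's Mask R-CNN expects:
    # float32 boxes [N, 4], int64 labels, uint8 masks [N, H, W], int64 iscrowd.
    boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
    labels = torch.as_tensor(labels, dtype=torch.int64)
    masks = torch.as_tensor(masks, dtype=torch.uint8)
    iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)
    area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return {
        "boxes": boxes,
        "labels": labels,
        "masks": masks,
        "image_id": torch.tensor([image_id]),
        "area": area,
        "iscrowd": iscrowd,
    }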

How to define supervised contrastive loss for a semantic segmentation model?

I have found the code below, which defines a supervised contrastive loss for a classification task.
class SupConLoss(nn.Module):
def __init__(self, temperature=0.07, contrast_mode='all',
base_temperature=0.07):
super(SupConLoss, self).__init__()
self.temperature = temperature
self.contrast_mode = contrast_mode
self.base_temperature = base_temperature
def forward(self, features, labels=None, mask=None):
"""Args:
features: hidden vector of shape [bsz, n_views, ...].
labels: ground truth of shape [bsz].
mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
has the same class as sample i. Can be asymmetric.
Returns:
A loss scalar.
"""
device = (torch.device('cuda')
if features.is_cuda
else torch.device('cpu'))
if len(features.shape) < 3:
raise ValueError('`features` needs to be [bsz, n_views, ...],'
'at least 3 dimensions are required')
if len(features.shape) > 3:
features = features.view(features.shape[0], features.shape[1], -1)
batch_size = features.shape[0]
if labels is not None and mask is not None:
raise ValueError('Cannot define both `labels` and `mask`')
elif labels is None and mask is None:
mask = torch.eye(batch_size, dtype=torch.float32).to(device)
elif labels is not None:
labels = labels.contiguous().view(-1, 1)
if labels.shape[0] != batch_size:
raise ValueError('Num of labels does not match num of features')
mask = torch.eq(labels, labels.T).float().to(device)
else:
mask = mask.float().to(device)
contrast_count = features.shape[1]
contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
if self.contrast_mode == 'one':
anchor_feature = features[:, 0]
anchor_count = 1
elif self.contrast_mode == 'all':
anchor_feature = contrast_feature
anchor_count = contrast_count
else:
raise ValueError('Unknown mode: {}'.format(self.contrast_mode))
# compute logits
anchor_dot_contrast = torch.div(
torch.matmul(anchor_feature, contrast_feature.T),
self.temperature)
# for numerical stability
logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
logits = anchor_dot_contrast - logits_max.detach()
# tile mask
mask = mask.repeat(anchor_count, contrast_count)
# mask-out self-contrast cases
logits_mask = torch.scatter(
torch.ones_like(mask),
1,
torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
0
)
mask = mask * logits_mask
# compute log_prob
exp_logits = torch.exp(logits) * logits_mask
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))
# compute mean of log-likelihood over positive
mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)
# loss
loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
loss = loss.view(anchor_count, batch_size).mean()
return loss
My question is how I can use this loss for a semantic segmentation task on a pixel-wise level, where the input of the model is of size (batch, channels, height, width) and the labels are masks of size (batch, height, width).
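
One way this loss is commonly adapted to dense prediction is to treat every pixel embedding as a sample: flatten a decoder feature map of shape (batch, channels, height, width) to [n_pixels, 1, channels] and the mask to [n_pixels], then call SupConLoss unchanged. A minimal sketch of that reshaping under those assumptions; the subsampling is only there to keep the pairwise similarity matrix tractable, and the max_pixels / ignore_index values are illustrative.

import torch
import torch.nn.functional as F

def pixel_supcon_inputs(features, labels, max_pixels=4096, ignore_index=255):
    # features: [B, C, H, W] pixel embeddings, labels: [B, H, W] class masks
    B, C, H, W = features.shape
    # bring the masks to the feature-map resolution if they differ
    labels = F.interpolate(labels[:, None].float(), size=(H, W), mode="nearest")
    labels = labels[:, 0].long()

    feats = features.permute(0, 2, 3, 1).reshape(-1, C)     # [B*H*W, C]
    labs = labels.reshape(-1)                               # [B*H*W]

    keep = labs != ignore_index
    feats, labs = feats[keep], labs[keep]

    # subsample pixels so the [n, n] similarity matrix stays affordable
    if feats.shape[0] > max_pixels:
        idx = torch.randperm(feats.shape[0], device=feats.device)[:max_pixels]
        feats, labs = feats[idx], labs[idx]

    # SupConLoss expects [bsz, n_views, dim] with L2-normalised features
    feats = F.normalize(feats, dim=1).unsqueeze(1)          # [n, 1, C]
    return feats, labs

# usage sketch: feats, labs = pixel_supcon_inputs(decoder_features, masks)
#               loss = SupConLoss()(feats, labs)

Note that with a single view, any pixel whose class appears only once in the subsample has no positives, so mask.sum(1) is zero and the loss turns into NaN; sampling a minimum number of pixels per class, or guarding that division, is usually needed.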

When I make a Transformer for NLP it doesn't work. How can I fix it?

def get_pad_mask(tokens, i_pad=0):
"""
Function that computes the pad mask
:param tokens: tokens (bs, n_seq)
:param i_pad: id of pad
:return mask: pad mask (pad: 1, other: 0)
"""
# pad: True, others: False
mask = torch.eq(tokens, i_pad)
# boolean -> float 32
mask = mask.type(torch.FloatTensor)
# expand dimension for Q n_seq
mask = torch.unsqueeze(mask, 1)
return mask
def get_causal_mask(tokens, i_pad=0):
"""
Function that computes the causal mask
:param tokens: tokens (bs, n_seq)
:param i_pad: id of pad
:return mask: causal and pad mask (causal or pad: 1, other: 0)
"""
# get n_seq
n_seq = tokens.shape[1]
# all one mask
mask = torch.ones((n_seq, n_seq))
# make reverse causal mask
mask = mask.triu(1)
# 0 -> 1, 1 -> 0
# expand dim for bs
mask = torch.unsqueeze(mask, 0)
# get pad_mask
pad_mask = get_pad_mask(tokens, i_pad)
# mask all causal_mask or pad_mask
mask = torch.maximum(mask, pad_mask)
return mask
class ScaleDotProductAttention(nn.Module):
"""
Scale Dot Product Attention Class
"""
def __init__(self, name="scale_dot_product_attention"):
"""
Constructor
:param name: layer name
"""
super(ScaleDotProductAttention, self).__init__()
def forward(self, Q, K, V, attn_mask):
"""
Run the layer
:param Q: Query
:param K: Key
:param V: Value
:param attn_mask: attention mask
:return attn_out: attention output
"""
# matmul Q, K.T
attn_score = torch.matmul(Q, K.transpose(-2,-1))
# d_k
d_k = torch.tensor(K.shape[-1])
# scale = d_k ** 0.5
scale = torch.sqrt(d_k)
# divide by scale
attn_scale = torch.divide(attn_score, scale)
# do mask (subtract 1e-9 for masked value)
attn_scale -= 1.e9 * attn_mask
# calculate attention prob
attn_prob = torch.softmax(attn_scale, axis=-1)
# weighted sum of V
attn_out = torch.matmul(attn_prob, V)
return attn_out
class MultiHeadAttention(nn.Module):
#class MultiHeadAttention(tf.keras.layers.Layer):
"""
Multi Head Attention Class
"""
def __init__(self, args, name="MultiHeadAttention"):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(MultiHeadAttention, self).__init__()
self.d_model = args.d_model
self.n_head = args.n_head
self.d_head = args.d_head
# Q, K, V input dense layer
self.W_Q = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
self.W_K = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
self.W_V = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
'''TensorFLow
self.W_Q = tf.keras.layers.Dense(self.n_head * self.d_head)
self.W_K = tf.keras.layers.Dense(self.n_head * self.d_head)
self.W_V = tf.keras.layers.Dense(self.n_head * self.d_head)
TensorFLow'''
# Scale Dot Product Attention class
self.attention = ScaleDotProductAttention(name="self_attention")
# output dense layer
#self.W_O = torch.nn.Linear(args.n_head * args.d_head,self.d_model)
self.W_O = torch.nn.Linear(self.d_model,self.d_model)
'''TensorFLow
self.W_O = tf.keras.layers.Dense(self.d_model)
TensorFLow'''
def forward(self, Q, K, V, attn_mask):
"""
Run the layer
:param Q: Query
:param K: Key
:param V: Value
:param attn_mask: attention mask
:return attn_out: attention output
"""
# build multihead Q, K, V
self.Q_m = torch.transpose(torch.reshape(self.W_Q(Q), [-1, Q.shape[1], args.n_head, args.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
self.K_m = torch.transpose(torch.reshape(self.W_K(K), [-1, K.shape[1], args.n_head, args.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
self.V_m = torch.transpose(torch.reshape(self.W_V(V), [-1, V.shape[1], args.n_head, args.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
'''TensorFLow
Q_m = tf.transpose(tf.reshape(self.W_Q(Q), [-1, tf.shape(Q)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
K_m = tf.transpose(tf.reshape(self.W_K(K), [-1, tf.shape(K)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
V_m = tf.transpose(tf.reshape(self.W_V(V), [-1, tf.shape(V)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
TensorFLow'''
# build multihead mask
attn_mask_m = torch.unsqueeze(attn_mask, axis=1)
'''TensorFLow
attn_mask_m = tf.expand_dims(attn_mask, axis=1)
TensorFLow'''
# Scale Dot Product Attention with multi head Q, K, V, attn_mask
attn_out_m = self.attention(self.Q_m, self.K_m, self.V_m, attn_mask_m) # (bs, n_head, Q_len, d_head)
# transpose
attn_out_t = torch.transpose(attn_out_m,2, 1)
'''TensorFLow
attn_out_t = tf.transpose(attn_out_m, perm=[0, 2, 1, 3]) # (bs, n_head, Q_len, d_head) -> (bs, Q_len, n_head, d_head)
TensorFLow'''
# reshape
attn_out_c = torch.reshape(attn_out_t, [-1, Q.shape[1], args.n_head * args.d_head]) # (bs, Q_len, n_head * d_head) -> (bs, Q_len, n_head, d_head)
'''TensorFLow
attn_out_c = tf.reshape(attn_out_t, [-1, tf.shape(Q)[1], args.n_head * args.d_head]) # (bs, Q_len, n_head, d_head) -> (bs, Q_len, n_head * d_head)
TensorFLow'''
# linear for output
attn_out = self.W_O(attn_out_c) # (bs, Q_len, n_head * d_head) -> (bs, Q_len, d_model)
return attn_out
class PositionWiseFeedForward(nn.Module):
#class PositionWiseFeedForward(tf.keras.layers.Layer):
"""
Position Wise Feed Forward Class
"""
def __init__(self, args, name="PositionWiseFeedForward"):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(PositionWiseFeedForward, self).__init__()
#super().__init__(name=name)
relu_f = torch.nn.ReLU()
relu_W = nn.Linear(args.d_model,args.d_ff)
self.W_1 = torch.nn.Sequential(relu_W,relu_f)
self.W_2 = nn.Linear(args.d_ff,args.d_model)
#self.W_1 = tf.keras.layers.Dense(args.d_ff, activation=tf.nn.relu)
#self.W_2 = tf.keras.layers.Dense(args.d_model)
def forward(self,inputs):
#def call(self, inputs):
"""
Run the layer
:param inputs: inputs
:return ff_val: feed-forward output
"""
# linear W_1 and W_2
ff_val = self.W_1(inputs)
ff_val = self.W_2(ff_val)
return ff_val
class EncoderLayer(nn.Module):
#class EncoderLayer(tf.keras.layers.Layer):
"""
Encoder Layer Class
"""
def __init__(self, args, name='encoder_layer'):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(EncoderLayer, self).__init__()
#super().__init__(name=name)
self.enc_x_size = hidden_enc.size(dim=1)
self.enc_y_size = hidden_enc.size(dim=2)
self.self_attention = MultiHeadAttention(args)
self.norm1 = nn.LayerNorm([self.enc_x_size,self.enc_y_size],eps=args.norm_eps)
self.ffn = PositionWiseFeedForward(args)
self.norm2 = nn.LayerNorm([self.enc_x_size,self.enc_y_size],eps=args.norm_eps)
self.dropout = nn.Dropout(args.dropout)
def forward(self, enc_hidden, self_mask, training):
"""
Run the layer
:param enc_hidden: output of the previous layer
:param self_mask: self attention mask
:param training: training flag
:return enc_out: EncoderLayer output
"""
# self attention
if training == False :
self.dropout.p = 0.0 #drop out training = False
print('ENCODER')
print(self.enc_x_size,self.enc_y_size,self.dropout.p)
self_attn_val = self.self_attention(enc_hidden, enc_hidden, enc_hidden, self_mask)
# add and layer normal
norm1_val = self.norm1(enc_hidden + self.dropout(self_attn_val))
# feed forward
ffn_val = self.ffn(norm1_val)
# add and layer normal
enc_out = self.norm2(norm1_val + self.dropout(ffn_val))
self.dropout.p = args.dropout
return enc_out
class DecoderLayer(nn.Module):
#class DecoderLayer(tf.keras.layers.Layer):
"""
Decoder Layer Class
"""
def __init__(self, args, name='decoder_layer'):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(DecoderLayer, self).__init__()
#super().__init__(name=name)
self.dec_x_size = dec_hidden.size(dim=1)
self.dec_y_size = dec_hidden.size(dim=2)
self.self_attention = MultiHeadAttention(args)
self.norm1 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.ende_attn = MultiHeadAttention(args)
self.norm2 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.ffn = PositionWiseFeedForward(args)
self.norm3 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.dropout = nn.Dropout(args.dropout)
def forward(self, dec_hidden, enc_out, self_mask, ende_mask,training):
#def call(self, dec_hidden, enc_out, self_mask, ende_mask, training):
"""
Run the layer
:param dec_hidden: output of the previous layer
:param enc_out: final Encoder output
:param self_mask: self attention mask
:param ende_mask: Encoder Decoder attention mask
:param training: training flag
:return dec_out: DecoderLayer output
"""
print('DecoderLayer')
print(self.dec_x_size,self.dec_y_size)
if training == False :
self.dropout.p = 0.0 #drop out training = False
# self attention
self_attn_val = self.self_attention(dec_hidden, dec_hidden, dec_hidden, self_mask)
# add and layer normal
norm1_val = self.norm1(dec_hidden + self.dropout(self_attn_val))
# encoder and decoder attention
ende_attn_val = self.ende_attn(norm1_val, enc_out, enc_out, ende_mask)
# add and layer normal
norm2_val = self.norm2(norm1_val + self.dropout(ende_attn_val))
# feed forward
ffn_val = self.ffn(norm2_val)
# add and layer normal
dec_out = self.norm3(norm2_val + self.dropout(ffn_val))
self.dropout.p = args.dropout
return dec_out
class SharedEmbedding(nn.Module):
#class SharedEmbedding(tf.keras.layers.Layer):
"""
Weight-Shared Embedding Class
"""
def __init__(self, args, name='SharedEmbedding'):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(SharedEmbedding, self).__init__()
#super().__init__(name=name)
self.shared_weights = torch.empty(args.n_vocab, args.d_model)
self.shared_weights = torch.nn.init.trunc_normal_(self.shared_weights,std = args.d_model ** -0.5)
#with tf.name_scope('shared_embedding_weight'):
self.n_vocab = args.n_vocab
self.d_model = args.d_model
def forward(self, inputs, mode='embedding'):
#def call(self, inputs, mode='embedding'):
"""
Run the layer
:param inputs: inputs
:param mode: execution mode
:return: embedding or linear output
"""
# if mode is 'embedding', run the embedding lookup
if mode == 'embedding':
return self._embedding(inputs)
# if mode is 'linear', run the linear projection
elif mode == 'linear':
return self._linear(inputs)
# otherwise raise an error
else:
raise ValueError(f'mode {mode} is not valid.')
def _embedding(self, inputs):
"""
embedding lookup
:param inputs: inputs
"""
# lookup by gather
embed = torch.matmul(nn.functional.one_hot(inputs, len(vocab)).type(torch.float), self.shared_weights )
#embed = tf.gather(self.shared_weights, tf.cast(inputs, tf.int32))
# multiply by d_model ** 0.5
embed *= self.d_model ** 0.5
return embed
def _linear(self, inputs): # (bs, n_seq, d_model)
"""
Run the linear projection
:param inputs: inputs
"""
# matmul inputs, shared_weights (transpose_b=True)
outputs = torch.matmul(inputs, torch.transpose(self.shared_weights, 1, 0))
#outputs = tf.matmul(inputs, self.shared_weights, transpose_b=True)
return outputs
class PositionalEmbedding(nn.Module):
#class PositionalEmbedding(tf.keras.layers.Layer):
"""
Positional Embedding Class
"""
def __init__(self, args, name='position_embedding'):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(PositionalEmbedding, self).__init__()
#super().__init__(name=name)
pos_encoding = PositionalEmbedding.get_sinusoid_encoding(args.n_seq, args.d_model)
self.embedding = nn.Embedding(args.n_seq, args.d_model)
self.embedding.weights = [pos_encoding] #weights=[pos_encoding]
self.embedding.weight.requires_grad=False #trainable=False
#self.embedding = tf.keras.layers.Embedding(args.n_seq, args.d_model, trainable=False, weights=[pos_encoding])
def forward(self, inputs):
#def call(self, inputs):
"""
Run the layer
:param inputs: inputs
:return embed: positional embedding lookup result
"""
# make position (0...n_seq)
zero_inputs = torch.ones_like(inputs)
x_size = zero_inputs.size(dim=0)
for i in range(0,x_size):
zero_inputs[i][0] = 0
position = torch.cumsum(zero_inputs, dim=1)
#position = tf.math.cumsum(tf.ones_like(inputs), axis=1, exclusive=True)
# embedding lookup
embed = self.embedding(position)
return embed
#staticmethod
def get_sinusoid_encoding(n_seq, d_model):
"""
Generate the sinusoid encoding
:param n_seq: sequence number
:param d_model: model hidden dimension
:return: positional encoding table
"""
# calculate exp
exs = np.array([2 * (i_ang // 2) / d_model for i_ang in range(d_model)])
# exs = np.array([2 * (i_ang // 2) / args.d_model for i_ang in range(args.d_model)])
# calculate power
angles = np.power(10000, exs)
# make position
pos = np.array([[i] for i in range(n_seq)])
# position angle
pos_encoding = pos / angles
# sin even number
pos_encoding[:, 0::2] = np.sin(pos_encoding[:, 0::2])
# cos odd number
pos_encoding[:, 1::2] = np.cos(pos_encoding[:, 1::2])
return pos_encoding
#return tf.cast(pos_encoding, tf.float32)
class Transformer(nn.Module):
#class Transformer(tf.keras.Model):
"""
Transformer Class
"""
def __init__(self, args, name='transformer'):
"""
Constructor
:param args: Args object
:param name: layer name
"""
super(Transformer, self).__init__()
#super().__init__(name=name)
self.i_pad = args.i_pad
self.embedding = SharedEmbedding(args)
self.position = PositionalEmbedding(args)
self.encoder_layers = [EncoderLayer(args, name=f'encoder_layer_{i}') for i in range(args.n_layer)]
self.decoder_layers = [DecoderLayer(args, name=f'decoder_layer_{i}') for i in range(args.n_layer)]
self.dropout = nn.Dropout(args.dropout)
def forward(self, inputs, training=False):
#def call(self, inputs, training=False):
"""
Run the layer
:param inputs: enc_tokens, dec_tokens
:return logits: next-token prediction logits for dec_tokens
"""
enc_tokens, dec_tokens = inputs
# encoder self attention mask
enc_self_mask = get_pad_mask(enc_tokens, self.i_pad)
# decoder self attention mask
dec_self_mask = get_causal_mask(dec_tokens, self.i_pad)
# encoder and decoder attention mask
enc_dec_mask = get_pad_mask(enc_tokens, self.i_pad)
# enc_tokens embedding lookup
enc_hidden = self.embedding(enc_tokens) + self.position(enc_tokens)
enc_hidden = self.dropout(enc_hidden)
# call encoder layers
for encoder_layer in self.encoder_layers:
enc_hidden = encoder_layer(enc_hidden, enc_self_mask, training)
# dec_tokens embedding lookup
dec_hidden = self.embedding(dec_tokens) + self.position(dec_tokens)
if training == False :
self.dropout.p = 0.0 #drop out training = False
dec_hidden = self.dropout(dec_hidden)
# call decoder layers
for decoder_layer in self.decoder_layers:
dec_hidden = decoder_layer(dec_hidden, enc_hidden, dec_self_mask, enc_dec_mask, training)
# call weight shared embedding (model=linear)
logits = self.embedding(dec_hidden, mode='linear')
# softmax
logit_softmax = nn.Softmax(dim=-1)
y_pred = logit_softmax(logits)
self.dropout.p = args.dropout
return y_pred
def lm_loss(logits, labels):
logit_softmax = nn.Softmax(dim=-1)
logits = logit_softmax(logits)
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
loss = loss_fn(logits, labels)
mask = labels.ne(0)
loss_val = loss.masked_select(mask).sum()
total = mask.sum()
loss = loss_val / torch.maximum(total, torch.tensor(1))
return loss
def lm_acc(y_pred, y_true):
"""
Function that computes accuracy while excluding pad positions
:param y_true: ground truth
:param y_pred: predicted values
:return: accuracy with pad positions excluded
"""
y_true_clone = y_true.clone().detach()
y_pred_clone = y_pred.clone().detach()
y_pred_softmax = nn.Softmax(dim=-1)
y_pred_clone = y_pred_softmax(y_pred_clone )
y_pred_class = torch.argmax(y_pred_clone,dim=-1)
matches = torch.eq(y_true_clone, y_pred_class)
matches = matches.to(device).int()
mask = torch.ne(y_true_clone, 0)
mask = mask.to(device).int()
matches *= mask
mask_total = mask.sum()
matches_total = matches.sum()
accuracy = matches_total / torch.maximum(mask_total, torch.tensor(1))
print(y_true_clone)
print(y_pred_class)
print(accuracy)
return accuracy
model = Transformer(args)
function_predict = model((train_enc_inputs[:4], train_dec_inputs[:4]),training=True)
loss = lm_loss(function_predict[:4].view(-1, function_predict.size(-1)), train_dec_labels[:4].view(-1)).to(device)
acc = lm_acc(function_predict[:4].view(-1, function_predict.size(-1)), train_dec_labels[:4].view(-1)).to(device)
input : tensor([7116, 107, 1, ..., 0, 0, 0])
output : tensor([ 2, 7116, 107, ..., 0, 0, 0])
I built this Transformer in PyTorch, but the output is the same as the decoder input, starting with 2 and ending with 3.
I have run this code 1000 times and the result is the same.
I think this code has a problem somewhere; this is my first PyTorch code.
Can you help me fix this code that has been stressing me out?
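
I can't promise this is the only problem, but one concrete issue stands out: torch.nn.CrossEntropyLoss already applies log-softmax internally and expects raw logits, while this code applies nn.Softmax both at the end of Transformer.forward and again inside lm_loss before handing the result to CrossEntropyLoss. A minimal sketch of lm_loss rewritten to take raw logits (the i_pad default just mirrors the labels.ne(0) masking above):

import torch
import torch.nn as nn

def lm_loss(logits, labels, i_pad=0):
    # CrossEntropyLoss applies log-softmax internally, so feed it raw logits;
    # softmax-ing the outputs beforehand flattens the distribution and starves
    # training of gradient signal.
    loss_fn = nn.CrossEntropyLoss(reduction='none')
    loss = loss_fn(logits, labels)          # logits: [N, n_vocab], labels: [N]
    mask = labels.ne(i_pad)
    return loss.masked_select(mask).sum() / mask.sum().clamp(min=1)

Returning raw logits from Transformer.forward (dropping the final nn.Softmax) does not change argmax-based accuracy, so lm_acc keeps working; whether this alone explains the copying behaviour I can't say for sure.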

Gradient is equal to 'None'

I have two networks. The output of the first network is the input to the other. In order to calculate the loss for the second network, I use vanilla policy gradient. I want to backpropagate this loss into the first network. After checking whether the gradients have changed, I see that they are all None.
I first load the first network (a pre-trained autoencoder) this way:
def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
model.load_state_dict(checkpoint['state_dict'])
for parameter in model.parameters():
parameter.requires_grad = True
model.train()
return model
Then I define the optimizers for both networks this way:
class MultipleOptimizer(object):
def __init__(self, *op):
self.optimizers = op
def zero_grad(self):
for op in self.optimizers:
op.zero_grad()
def step(self):
for op in self.optimizers:
op.step()
opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr))
the reward function is:
#Reward function
def reward(x, act):
#print('action', act)
#print('x type', type(x))
km = KMeans(act, n_init=20, n_jobs=4)
y_pred = km.fit_predict(x.detach().cpu().numpy())# seems we can only get a centre from batch
#print('k-means output type', type(y_pred))
sil_score = sil(x.detach().cpu().numpy(), y_pred)
#print('sil score', sil_score)
return sil_score
The architecture of the second neural net, plus an alternative written to avoid logits = logits.mean(0):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network. outputs are the logits
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
def __init__(self):
super(mlp2, self).__init__()
self.linear1 = nn.Linear(10,100)
self.relu1 = nn.ReLU(inplace=True)
self.linear2 = torch.nn.Linear(100,100)
self.linear3 = torch.nn.Linear(100,20)
self.linear4 = torch.nn.Linear(2000,100)
self.ident = nn.Identity()
def forward(self, x):
a = self.linear1(x)
a = self.relu1(a)
a = self.linear2(a)
a = self.relu1(a)
a = self.linear3(a)
a = torch.flatten(a)
a = self.linear4(a)
a = self.relu1(a)
a = self.linear3(a)
out = self.ident(a)
return out
The loss is calculated in the following order:
def get_policy(obs):
logits = logits_net(obs)
return Categorical(logits=logits.mean(0))
def get_action(obs):
return get_policy(obs).sample().item()
def Logp(obs, act):
logp = get_policy(obs).log_prob(act.cuda())
return logp
def compute_loss(logp, weights):
return -(logp * weights).mean()
def train_one_epoch():
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for R(tau) weighting in policy gradient
batch_logp = []
# reset episode-specific variables
j = 1 # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep
for i, data in enumerate(train_loader):
#Create the mean image out of those 100 images
x, label = data
x = model(x.cuda())#torch.Size([100, 10])
obs = x.data.cpu().numpy()#[100, 10] - a trajectory with only one state
# Save obs
batch_obs.append(obs.copy())
#act in the environment
#act = get_action(torch.as_tensor(obs, dtype=torch.float32))
act = get_action(x)
print('action type', type(act))
#log probability
#logp = Logp(torch.as_tensor(obs, dtype=torch.float32),act = torch.as_tensor(act, dtype=torch.int32))
logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32))
#rew = reward(obs, act+2)
rew = reward(x, act+2)
# save action, reward
batch_acts.append(act)
batch_weights.append(rew)#episode rewards
batch_logp.append(logp)
opt.zero_grad()
batch_logp = torch.stack(batch_logp, dim=0)
batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32),
weights = torch.as_tensor(batch_weights, dtype=torch.float32))
batch_loss.backward() #does it return anything? gradients? print them!
opt.step()
for name, param in logits_net.named_parameters():
print(name, param.grad)
I applied some changes on the assumption that recreating some of the tensors might be the issue:
Previously the output of the first network, obs, was converted with obs = x.data.cpu().numpy() and then sent to the get_action function as act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changed this to act = get_action(x), so x is sent directly to that function, and I also changed the log-probability call to logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32)).
After these changes, I still get None for the gradients. Is there any way to backpropagate the gradient when the loss is calculated this way? Are there any changes I can apply?
Any help is appreciated.
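
I can't run the full pipeline, but a pattern worth checking is any round-trip through .data / .cpu().numpy() / a rebuilt tensor on values that end up inside compute_loss: each such hop creates a fresh leaf tensor, and every parameter upstream of it will show grad == None after backward(). A minimal illustration with a hypothetical net (not the asker's models):

import torch

net = torch.nn.Linear(4, 2)
x = torch.randn(3, 4)
out = net(x)

kept = out.mean()                       # graph intact
print(kept.grad_fn is not None)         # True

broken = torch.as_tensor(out.detach().cpu().numpy()).mean()
print(broken.grad_fn)                   # None: the numpy round-trip made a new leaf

kept.backward()
print(net.weight.grad is not None)      # True, but only via the intact path

In vanilla policy gradient the reward itself (here KMeans plus the silhouette score) does not need to be differentiable, since it only scales log_prob; what matters is that x and the stacked batch_logp stay attached to the graph all the way into compute_loss (e.g. keep the torch.stack(batch_logp) result as-is rather than rebuilding it from detached values).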
