I am trying to build an audio Siamese network, but in the training loop I get a size mismatch between my tensors: stack expects each tensor to be equal size, but got [1, 128, 121] at entry 0 and [1, 128, 205] at entry 1.
I am unsure where I messed up with my data, since when gathering it I made sure to pad all my audio clips to the same length with background audio, so apparently I have to pad the clips some other way. I also thought about padding the clips to a static size larger than all of my clips in my custom dataloader (see the collate sketch after the training loop below), but that still gives me the same error. Any ideas where I am going wrong?
import os
import re
import random
import pandas as pd
from torch.utils import data

class OHDataset(data.Dataset):
def __init__(self, audio_dir, audio_dataset, transform = "mel_spectrogram"):
self.audio_labels = pd.read_csv(audio_dataset)
self.audio_dir = audio_dir
self.output_format = transform
def __len__(self):
return len(self.audio_labels)
def __getitem__(self, item, n_fft = 200, hop_length = 120):
positive = self.audio_labels.iloc[item, 0]
if(not bool(re.search(r'\d', positive))):
positive = self.audio_labels.iloc[item+1, 0]
anchor = re.sub(r'\d+', '', self.audio_labels.iloc[item, 0])
negative = self.audio_labels.iloc[random.randint(0, len(self.audio_labels) - 1), 0]
pos_audio_path = os.path.join(self.audio_dir, positive + ".wav")
neg_audio_path = os.path.join(self.audio_dir, negative + ".wav")
anchor_audio_path = os.path.join(self.audio_dir, anchor + ".wav")
if(self.output_format == "spectrogram"):
pos_spectrogram = getSpectrogram(pos_audio_path, n_fft, hop_length)
neg_spectrogram = getSpectrogram(neg_audio_path, n_fft, hop_length)
anchor_spectrogram = getSpectrogram(anchor_audio_path, n_fft, hop_length)
return anchor_spectrogram, pos_spectrogram, neg_spectrogram
elif(self.output_format == "mel_spectrogram"):
pos_mel_spectrogram = getMELSpectrogram(pos_audio_path, n_fft, hop_length)
neg_mel_spectrogram = getMELSpectrogram(neg_audio_path, n_fft, hop_length)
anchor_mel_spectrogram = getMELSpectrogram(anchor_audio_path, n_fft, hop_length)
return anchor_mel_spectrogram, pos_mel_spectrogram, neg_mel_spectrogram
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (A, P, N) in enumerate(dataloader):
anchor = model(A).to(device)
positive = model(P).to(device)
negative = model(N).to(device)
loss = loss_fn(anchor, positive, negative)
optimizer.zero_grad()
loss.backward()
optimizer.step()
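For reference, here is the batch-time padding I experimented with: a collate function of my own (pad_collate is a made-up name, and dataset stands for the OHDataset instance) that right-pads every spectrogram in a batch to the longest time axis before stacking:

import torch
import torch.nn.functional as F

def pad_collate(batch):
    # batch is a list of (anchor, positive, negative) triples, each of shape [1, n_mels, T]
    anchors, positives, negatives = zip(*batch)
    max_t = max(t.shape[-1] for t in anchors + positives + negatives)
    def pad_group(tensors):
        # F.pad with (0, n) right-pads the last (time) dimension with zeros
        return torch.stack([F.pad(t, (0, max_t - t.shape[-1])) for t in tensors])
    return pad_group(anchors), pad_group(positives), pad_group(negatives)

loader = data.DataLoader(dataset, batch_size=16, collate_fn=pad_collate)

Even with something like this in place I was still hitting the same stack error.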
I am working on an instance segmentation problem with Mask R-CNN in PyTorch. The training part works with the code below, but evaluation gives a score of 0 for every mAP value. What is the problem in the code?
More info:
I use Albumentations for the transforms and some files from torchvision for training.
Some problems I have run into:
When I use the coco bbox format instead of pascal_voc, it gives the following error:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
When I take labels out of the convert_seg_to_boolMask function, it gives the same error:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
(A small sanity check of what I pass to Albumentations follows the dataset code below.)
import albumentations as A
from albumentations.pytorch import ToTensorV2

def get_transforms(train=False):
# note: no train-time augmentations yet, so both branches are currently identical
if train:
transform = A.Compose([
ToTensorV2()
],bbox_params=A.BboxParams("pascal_voc",label_fields=["labels","iscrowd"]))
else:
transform = A.Compose([
ToTensorV2()
],bbox_params=A.BboxParams("pascal_voc",label_fields=["labels","iscrowd"]))
return transform
import copy
import os
import cv2
import numpy as np
import torch
from PIL import Image, ImageDraw
from torchvision import datasets

class Dataset(datasets.VisionDataset):
def __init__(self, coco_, data_dir, transform=None, target_transform=None, transforms=None):
super().__init__(data_dir, transforms, transform, target_transform)
self.coco_info = coco_
self.data_dir = data_dir
self.transforms = transforms
if isinstance(self.coco_info,dict):
self.ids = [x["id"] for x in self.coco_info["images"] if len(self._load_target(x["id"]))>0]
def _load_image(self, id: int):
name = loadImgs(self.coco_info["images"],id)[0]['file_name']
image = cv2.imread(os.path.join(self.data_dir, name))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)/255
return image
def _load_target(self, id):
return loadAnns(self.coco_info["annotations"],id)
def n_classes(self):
category_names = sorted(set(a["name"] for a in self.coco_info["categories"]))
self.classes = ["__background__"] + category_names
return self.classes
def __getitem__(self,idx):
id = self.ids[idx]
image = self._load_image(id)
target = copy.deepcopy(self._load_target(id))
image_shape = (image.shape[0],image.shape[1])
img_info = {
"img_shape":image_shape,
"image_id":id,
"labels":[t["category_id"]for t in target],
"segmentation":[t["segmentation"][0] for t in target],
"id": [t["id"] for t in target]
}
mask, labels = self.convert_seg_to_boolMask(img_info)
obj_ids = np.unique(mask)
obj_ids = obj_ids[1:]
masks = torch.tensor(mask == obj_ids[:, None, None])
boxes = []
bbox = np.array([t["bbox"] for t in target])
for xmin,ymin,width,height in bbox:
xmax = xmin+width
ymax = ymin+height
boxes.append([xmin, ymin, xmax, ymax])
boxes = torch.tensor(boxes)
labels = torch.tensor(labels)
image_id = torch.tensor([id])
iscrowd = torch.tensor([t["iscrowd"] for t in target])
transformed = self.transforms(image=image, masks=masks, bboxes=boxes, labels=labels, iscrowd=iscrowd)
image = transformed['image']
masks = torch.tensor(transformed["masks"])
boxes = torch.tensor(transformed['bboxes'])
labels = torch.tensor(transformed["labels"])
iscrowd = torch.tensor(transformed["iscrowd"])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
def __len__(self):
return len(self.ids)
def convert_seg_to_boolMask(self,img_info):
mask = np.zeros(img_info["img_shape"], dtype=np.uint8)
mask = Image.fromarray(mask)
draw = ImageDraw.Draw(mask)
for seg, i in zip(img_info["segmentation"],img_info["id"]):
points = [tuple([k,l]) for k,l in zip(seg[0::2],seg[1::2])]
draw.polygon(xy=points,
outline=tuple([i]),
fill=tuple([i]))
mask = np.array(mask)
labels = img_info["labels"]
return mask, labels
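For reference, here is the small standalone check I used to see what A.Compose actually expects (all values are made up): the image and masks go in as numpy arrays and plain lists, and only ToTensorV2 produces tensors.

import numpy as np
import albumentations as A
from albumentations.pytorch import ToTensorV2

transform = A.Compose(
    [ToTensorV2()],
    bbox_params=A.BboxParams("pascal_voc", label_fields=["labels", "iscrowd"]))
image = np.zeros((256, 256, 3), dtype=np.float32)   # HxWxC numpy image
masks = [np.zeros((256, 256), dtype=np.uint8)]      # list of HxW numpy masks
out = transform(image=image, masks=masks,
                bboxes=[[10, 10, 50, 50]], labels=[1], iscrowd=[0])
print(out["image"].shape)   # torch.Size([3, 256, 256])

In my __getitem__ above I pass torch tensors for masks and bboxes instead, which may be related to the errors.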
I have found the code below, which defines a supervised contrastive loss for a classification task.
import torch
import torch.nn as nn

class SupConLoss(nn.Module):
def __init__(self, temperature=0.07, contrast_mode='all',
base_temperature=0.07):
super(SupConLoss, self).__init__()
self.temperature = temperature
self.contrast_mode = contrast_mode
self.base_temperature = base_temperature
def forward(self, features, labels=None, mask=None):
"""Args:
features: hidden vector of shape [bsz, n_views, ...].
labels: ground truth of shape [bsz].
mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
has the same class as sample i. Can be asymmetric.
Returns:
A loss scalar.
"""
device = (torch.device('cuda')
if features.is_cuda
else torch.device('cpu'))
if len(features.shape) < 3:
raise ValueError('`features` needs to be [bsz, n_views, ...],'
'at least 3 dimensions are required')
if len(features.shape) > 3:
features = features.view(features.shape[0], features.shape[1], -1)
batch_size = features.shape[0]
if labels is not None and mask is not None:
raise ValueError('Cannot define both `labels` and `mask`')
elif labels is None and mask is None:
mask = torch.eye(batch_size, dtype=torch.float32).to(device)
elif labels is not None:
labels = labels.contiguous().view(-1, 1)
if labels.shape[0] != batch_size:
raise ValueError('Num of labels does not match num of features')
mask = torch.eq(labels, labels.T).float().to(device)
else:
mask = mask.float().to(device)
contrast_count = features.shape[1]
contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
if self.contrast_mode == 'one':
anchor_feature = features[:, 0]
anchor_count = 1
elif self.contrast_mode == 'all':
anchor_feature = contrast_feature
anchor_count = contrast_count
else:
raise ValueError('Unknown mode: {}'.format(self.contrast_mode))
# compute logits
anchor_dot_contrast = torch.div(
torch.matmul(anchor_feature, contrast_feature.T),
self.temperature)
# for numerical stability
logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
logits = anchor_dot_contrast - logits_max.detach()
# tile mask
mask = mask.repeat(anchor_count, contrast_count)
# mask-out self-contrast cases
logits_mask = torch.scatter(
torch.ones_like(mask),
1,
torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
0
)
mask = mask * logits_mask
# compute log_prob
exp_logits = torch.exp(logits) * logits_mask
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))
# compute mean of log-likelihood over positive
mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)
# loss
loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
loss = loss.view(anchor_count, batch_size).mean()
return loss
My question is: how can I use this loss for a semantic segmentation task at the pixel level, where the input of the model is of size (batch, channels, height, width) and the labels are masks of size (batch, height, width)? A sketch of the adaptation I currently have in mind follows.
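To make this concrete, here is my own sketch, not from the original repository (pixelwise_supcon and max_pixels are made-up names): flatten the pixel embeddings into a batch of single-view samples, subsample pixels so the NxN similarity matrix stays tractable, and feed them to SupConLoss.

import torch
import torch.nn.functional as F

def pixelwise_supcon(features, labels, loss_fn, max_pixels=4096):
    # features: (B, C, H, W) pixel embeddings; labels: (B, H, W) class masks
    b, c, h, w = features.shape
    feats = features.permute(0, 2, 3, 1).reshape(-1, c)  # (B*H*W, C)
    labs = labels.reshape(-1)                            # (B*H*W,)
    # subsample pixels to keep the (N, N) similarity matrix tractable
    idx = torch.randperm(feats.shape[0], device=feats.device)[:max_pixels]
    feats = F.normalize(feats[idx], dim=1)               # SupCon assumes unit-norm features
    # treat each pixel as one sample with a single view: (N, 1, C)
    return loss_fn(feats.unsqueeze(1), labels=labs[idx])

# e.g. loss = pixelwise_supcon(embeddings, masks, SupConLoss(temperature=0.07))

I am unsure how to handle pixels whose class appears only once in the sampled subset (mask.sum(1) would be zero there, giving NaN), so corrections are welcome.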
def get_pad_mask(tokens, i_pad=0):
"""
Computes the pad mask.
:param tokens: tokens (bs, n_seq)
:param i_pad: id of pad
:return mask: pad mask (pad: 1, other: 0)
"""
# pad: True, others: False
mask = torch.eq(tokens, i_pad)
# boolean -> float 32
mask = mask.float()
# expand dimension for Q n_seq
mask = torch.unsqueeze(mask, 1)
return mask
def get_causal_mask(tokens, i_pad=0):
"""
Computes the causal (look-ahead) mask.
:param tokens: tokens (bs, n_seq)
:param i_pad: id of pad
:return mask: causal and pad mask (causal or pad: 1, other: 0)
"""
# get n_seq
n_seq = tokens.shape[1]
# all one mask
mask = torch.ones((n_seq, n_seq))
# make reverse causal mask
mask = mask.triu(1)
# expand dim for bs
mask = torch.unsqueeze(mask, 0)
# get pad_mask
pad_mask = get_pad_mask(tokens, i_pad)
# mask all causal_mask or pad_mask
mask = torch.maximum(mask, pad_mask)
return mask
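A toy check of the two masks (token ids are made up; id 0 is the pad):

import torch
tokens = torch.tensor([[5, 7, 0, 0]])
print(get_pad_mask(tokens))     # shape (1, 1, 4): 1.0 at the two pad positions
print(get_causal_mask(tokens))  # shape (1, 4, 4): 1.0 wherever attention is blocked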
class ScaleDotProductAttention(nn.Module):
"""
Scale Dot Product Attention Class
"""
def __init__(self, name="scale_dot_product_attention"):
"""
Constructor.
:param name: layer name
"""
super(ScaleDotProductAttention, self).__init__()
def forward(self, Q, K, V, attn_mask):
"""
Run the layer.
:param Q: Query
:param K: Key
:param V: Value
:param attn_mask: attention mask
:return attn_out: attention output
"""
# matmul Q, K.T
attn_score = torch.matmul(Q, K.transpose(-2,-1))
# d_k
d_k = torch.tensor(K.shape[-1])
# scale = d_k ** 0.5
scale = torch.sqrt(d_k)
# divide by scale
attn_scale = torch.divide(attn_score, scale)
# apply the mask (subtract 1e9 at masked positions)
attn_scale -= 1.e9 * attn_mask
# calculate attention prob
attn_prob = torch.softmax(attn_scale, dim=-1)
# weighted sum of V
attn_out = torch.matmul(attn_prob, V)
return attn_out
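A quick shape check I ran on the attention block (random values):

import torch
q = torch.rand(2, 3, 5, 8)        # (bs, n_head, n_seq, d_head)
mask = torch.zeros(2, 1, 5, 5)    # nothing masked
attn = ScaleDotProductAttention()
print(attn(q, q, q, mask).shape)  # torch.Size([2, 3, 5, 8])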
class MultiHeadAttention(nn.Module):
#class MultiHeadAttention(tf.keras.layers.Layer):
"""
Multi Head Attention Class
"""
def __init__(self, args, name="MultiHeadAttention"):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(MultiHeadAttention, self).__init__()
self.d_model = args.d_model
self.n_head = args.n_head
self.d_head = args.d_head
# Q, K, V input dense layer
self.W_Q = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
self.W_K = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
self.W_V = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
'''TensorFLow
self.W_Q = tf.keras.layers.Dense(self.n_head * self.d_head)
self.W_K = tf.keras.layers.Dense(self.n_head * self.d_head)
self.W_V = tf.keras.layers.Dense(self.n_head * self.d_head)
TensorFLow'''
# Scale Dot Product Attention class
self.attention = ScaleDotProductAttention(name="self_attention")
# output dense layer
#self.W_O = torch.nn.Linear(args.n_head * args.d_head,self.d_model)
self.W_O = torch.nn.Linear(self.d_model,self.d_model)
'''TensorFLow
self.W_O = tf.keras.layers.Dense(self.d_model)
TensorFLow'''
def forward(self, Q, K, V, attn_mask):
"""
Run the layer.
:param Q: Query
:param K: Key
:param V: Value
:param attn_mask: attention mask
:return attn_out: attention output
"""
# build multihead Q, K, V
self.Q_m = torch.transpose(torch.reshape(self.W_Q(Q), [-1, Q.shape[1], self.n_head, self.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
self.K_m = torch.transpose(torch.reshape(self.W_K(K), [-1, K.shape[1], self.n_head, self.d_head]), 2, 1) # (bs, n_head, K_len, d_head)
self.V_m = torch.transpose(torch.reshape(self.W_V(V), [-1, V.shape[1], self.n_head, self.d_head]), 2, 1) # (bs, n_head, V_len, d_head)
'''TensorFLow
Q_m = tf.transpose(tf.reshape(self.W_Q(Q), [-1, tf.shape(Q)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
K_m = tf.transpose(tf.reshape(self.W_K(K), [-1, tf.shape(K)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
V_m = tf.transpose(tf.reshape(self.W_V(V), [-1, tf.shape(V)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
TensorFLow'''
# build multihead mask
attn_mask_m = torch.unsqueeze(attn_mask, 1)
'''TensorFLow
attn_mask_m = tf.expand_dims(attn_mask, axis=1)
TensorFLow'''
# Scale Dot Product Attention with multi head Q, K, V, attn_mask
attn_out_m = self.attention(self.Q_m, self.K_m, self.V_m, attn_mask_m) # (bs, n_head, Q_len, d_head)
# transpose
attn_out_t = torch.transpose(attn_out_m,2, 1)
'''TensorFLow
attn_out_t = tf.transpose(attn_out_m, perm=[0, 2, 1, 3]) # (bs, n_head, Q_len, d_head) -> (bs, Q_len, n_head, d_head)
TensorFLow'''
# reshape
attn_out_c = torch.reshape(attn_out_t, [-1, Q.shape[1], self.n_head * self.d_head]) # (bs, Q_len, n_head, d_head) -> (bs, Q_len, n_head * d_head)
'''TensorFLow
attn_out_c = tf.reshape(attn_out_t, [-1, tf.shape(Q)[1], args.n_head * args.d_head]) # (bs, Q_len, n_head, d_head) -> (bs, Q_len, n_head * d_head)
TensorFLow'''
# linear for output
attn_out = self.W_O(attn_out_c) # (bs, Q_len, n_head * d_head) -> (bs, Q_len, d_model)
return attn_out
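And the corresponding shape check for the multi-head wrapper (toy_args is a stand-in namespace I made up for the test):

import torch
from types import SimpleNamespace
toy_args = SimpleNamespace(d_model=64, n_head=4, d_head=16)
mha = MultiHeadAttention(toy_args)
x = torch.rand(2, 5, 64)
mask = torch.zeros(2, 5, 5)
print(mha(x, x, x, mask).shape)   # torch.Size([2, 5, 64])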
class PositionWiseFeedForward(nn.Module):
#class PositionWiseFeedForward(tf.keras.layers.Layer):
"""
Position Wise Feed Forward Class
"""
def __init__(self, args, name="PositionWiseFeedForward"):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(PositionWiseFeedForward, self).__init__()
#super().__init__(name=name)
relu_f = torch.nn.ReLU()
relu_W = nn.Linear(args.d_model,args.d_ff)
self.W_1 = torch.nn.Sequential(relu_W,relu_f)
self.W_2 = nn.Linear(args.d_ff,args.d_model)
#self.W_1 = tf.keras.layers.Dense(args.d_ff, activation=tf.nn.relu)
#self.W_2 = tf.keras.layers.Dense(args.d_model)
def forward(self,inputs):
#def call(self, inputs):
"""
Run the layer.
:param inputs: inputs
:return ff_val: feed-forward output
"""
# linear W_1 and W_2
ff_val = self.W_1(inputs)
ff_val = self.W_2(ff_val)
return ff_val
class EncoderLayer(nn.Module):
#class EncoderLayer(tf.keras.layers.Layer):
"""
Encoder Layer Class
"""
def __init__(self, args, name='encoder_layer'):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(EncoderLayer, self).__init__()
#super().__init__(name=name)
# note: relies on a global hidden_enc tensor defined elsewhere in the notebook
self.enc_x_size = hidden_enc.size(dim=1)
self.enc_y_size = hidden_enc.size(dim=2)
self.self_attention = MultiHeadAttention(args)
self.norm1 = nn.LayerNorm([self.enc_x_size,self.enc_y_size],eps=args.norm_eps)
self.ffn = PositionWiseFeedForward(args)
self.norm2 = nn.LayerNorm([self.enc_x_size,self.enc_y_size],eps=args.norm_eps)
self.dropout = nn.Dropout(args.dropout)
def forward(self, enc_hidden, self_mask, training):
"""
Run the layer.
:param enc_hidden: output of the previous layer
:param self_mask: self attention mask
:param training: training flag
:return enc_out: EncoderLayer output
"""
# self attention
if not training:
self.dropout.p = 0.0  # disable dropout when not training
print('ENCODER')
print(self.enc_x_size,self.enc_y_size,self.dropout.p)
self_attn_val = self.self_attention(enc_hidden, enc_hidden, enc_hidden, self_mask)
# add and layer normal
norm1_val = self.norm1(enc_hidden + self.dropout(self_attn_val))
# feed forward
ffn_val = self.ffn(norm1_val)
# add and layer normal
enc_out = self.norm2(norm1_val + self.dropout(ffn_val))
self.dropout.p = args.dropout
return enc_out
class DecoderLayer(nn.Module):
#class DecoderLayer(tf.keras.layers.Layer):
"""
Decoder Layer Class
"""
def __init__(self, args, name='decoder_layer'):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(DecoderLayer, self).__init__()
#super().__init__(name=name)
# note: relies on a global dec_hidden tensor defined elsewhere in the notebook
self.dec_x_size = dec_hidden.size(dim=1)
self.dec_y_size = dec_hidden.size(dim=2)
self.self_attention = MultiHeadAttention(args)
self.norm1 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.ende_attn = MultiHeadAttention(args)
self.norm2 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.ffn = PositionWiseFeedForward(args)
self.norm3 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.dropout = nn.Dropout(args.dropout)
def forward(self, dec_hidden, enc_out, self_mask, ende_mask,training):
#def call(self, dec_hidden, enc_out, self_mask, ende_mask, training):
"""
Run the layer.
:param dec_hidden: output of the previous layer
:param enc_out: final Encoder output
:param self_mask: self attention mask
:param ende_mask: Encoder-Decoder attention mask
:param training: training flag
:return dec_out: DecoderLayer output
"""
print('DecoderLayer')
print(self.dec_x_size,self.dec_y_size)
if not training:
self.dropout.p = 0.0  # disable dropout when not training
# self attention
self_attn_val = self.self_attention(dec_hidden, dec_hidden, dec_hidden, self_mask)
# add and layer normal
norm1_val = self.norm1(dec_hidden + self.dropout(self_attn_val))
# encoder and decoder attention
ende_attn_val = self.ende_attn(norm1_val, enc_out, enc_out, ende_mask)
# add and layer normal
norm2_val = self.norm2(norm1_val + self.dropout(ende_attn_val))
# feed forward
ffn_val = self.ffn(norm2_val)
# add and layer normal
dec_out = self.norm3(norm2_val + self.dropout(ffn_val))
self.dropout.p = args.dropout
return dec_out
class SharedEmbedding(nn.Module):
#class SharedEmbedding(tf.keras.layers.Layer):
"""
Weight Shared Embedding Class
"""
def __init__(self, args, name='SharedEmbedding'):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(SharedEmbedding, self).__init__()
#super().__init__(name=name)
# note: shared_weights is a plain tensor, not an nn.Parameter, so it is not registered with the module
self.shared_weights = torch.empty(args.n_vocab, args.d_model)
self.shared_weights = torch.nn.init.trunc_normal_(self.shared_weights,std = args.d_model ** -0.5)
#with tf.name_scope('shared_embedding_weight'):
self.n_vocab = args.n_vocab
self.d_model = args.d_model
def forward(self, inputs, mode='embedding'):
#def call(self, inputs, mode='embedding'):
"""
Run the layer.
:param inputs: inputs
:param mode: execution mode ('embedding' or 'linear')
:return: embedding or linear output
"""
# run the embedding lookup when mode is 'embedding'
if mode == 'embedding':
return self._embedding(inputs)
# run the tied linear projection when mode is 'linear'
elif mode == 'linear':
return self._linear(inputs)
# otherwise raise an error
else:
raise ValueError(f'mode {mode} is not valid.')
def _embedding(self, inputs):
"""
Embedding lookup.
:param inputs: inputs
"""
# lookup via one-hot matmul (the TF version used gather)
embed = torch.matmul(nn.functional.one_hot(inputs, self.n_vocab).type(torch.float), self.shared_weights)
#embed = tf.gather(self.shared_weights, tf.cast(inputs, tf.int32))
# multiply by d_model ** 0.5
embed *= self.d_model ** 0.5
return embed
def _linear(self, inputs): # (bs, n_seq, d_model)
"""
Tied linear projection.
:param inputs: inputs
"""
# matmul inputs, shared_weights (transpose_b=True)
outputs = torch.matmul(inputs, torch.transpose(self.shared_weights, 1, 0))
#outputs = tf.matmul(inputs, self.shared_weights, transpose_b=True)
return outputs
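A shape check for the shared embedding (toy_args again is a made-up stand-in):

import torch
from types import SimpleNamespace
toy_args = SimpleNamespace(n_vocab=100, d_model=8)
emb = SharedEmbedding(toy_args)
tok = torch.tensor([[1, 2, 3]])
h = emb(tok)                      # (1, 3, 8): scaled embedding lookup
logits = emb(h, mode='linear')    # (1, 3, 100): projection with the tied weights
print(h.shape, logits.shape)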
class PositionalEmbedding(nn.Module):
#class PositionalEmbedding(tf.keras.layers.Layer):
"""
Positional Embedding Class
"""
def __init__(self, args, name='position_embedding'):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(PositionalEmbedding, self).__init__()
#super().__init__(name=name)
pos_encoding = PositionalEmbedding.get_sinusoid_encoding(args.n_seq, args.d_model)
self.embedding = nn.Embedding(args.n_seq, args.d_model)
# load the sinusoid table into the embedding weights (Keras: weights=[pos_encoding])
self.embedding.weight.data.copy_(torch.tensor(pos_encoding, dtype=torch.float32))
self.embedding.weight.requires_grad = False  # trainable=False
#self.embedding = tf.keras.layers.Embedding(args.n_seq, args.d_model, trainable=False, weights=[pos_encoding])
def forward(self, inputs):
#def call(self, inputs):
"""
Run the layer.
:param inputs: inputs
:return embed: positional embedding lookup result
"""
# make position (0...n_seq)
zero_inputs = torch.ones_like(inputs)
x_size = zero_inputs.size(dim=0)
for i in range(0,x_size):
zero_inputs[i][0] = 0
position = torch.cumsum(zero_inputs, dim=1)
#position = tf.math.cumsum(tf.ones_like(inputs), axis=1, exclusive=True)
# embedding lookup
embed = self.embedding(position)
return embed
@staticmethod
def get_sinusoid_encoding(n_seq, d_model):
"""
Generate the sinusoid encoding table.
:param n_seq: sequence length
:param d_model: model hidden dimension
:return: positional encoding table
"""
# calculate exp
exs = np.array([2 * (i_ang // 2) / d_model for i_ang in range(d_model)])
# exs = np.array([2 * (i_ang // 2) / args.d_model for i_ang in range(args.d_model)])
# calculate power
angles = np.power(10000, exs)
# make position
pos = np.array([[i] for i in range(n_seq)])
# position angle
pos_encoding = pos / angles
# sin even number
pos_encoding[:, 0::2] = np.sin(pos_encoding[:, 0::2])
# cos odd number
pos_encoding[:, 1::2] = np.cos(pos_encoding[:, 1::2])
return pos_encoding
#return tf.cast(pos_encoding, tf.float32)
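A quick sanity check of the table (at position 0 the sin entries should be 0 and the cos entries 1):

enc = PositionalEmbedding.get_sinusoid_encoding(4, 6)
print(enc.shape)  # (4, 6)
print(enc[0])     # [0. 1. 0. 1. 0. 1.]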
class Transformer(nn.Module):
#class Transformer(tf.keras.Model):
"""
Transformer Class
"""
def __init__(self, args, name='transformer'):
"""
Constructor.
:param args: Args object
:param name: layer name
"""
super(Transformer, self).__init__()
#super().__init__(name=name)
self.i_pad = args.i_pad
self.embedding = SharedEmbedding(args)
self.position = PositionalEmbedding(args)
# use nn.ModuleList so the layers register their parameters with the model
self.encoder_layers = nn.ModuleList([EncoderLayer(args, name=f'encoder_layer_{i}') for i in range(args.n_layer)])
self.decoder_layers = nn.ModuleList([DecoderLayer(args, name=f'decoder_layer_{i}') for i in range(args.n_layer)])
self.dropout = nn.Dropout(args.dropout)
def forward(self, inputs, training=False):
#def call(self, inputs, training=False):
"""
Run the model.
:param inputs: enc_tokens, dec_tokens
:return logits: next-token prediction logits for dec_tokens
"""
enc_tokens, dec_tokens = inputs
# encoder self attention mask
enc_self_mask = get_pad_mask(enc_tokens, self.i_pad)
# decoder self attention mask
dec_self_mask = get_causal_mask(dec_tokens, self.i_pad)
# encoder and decoder attention mask
enc_dec_mask = get_pad_mask(enc_tokens, self.i_pad)
# enc_tokens embedding lookup
enc_hidden = self.embedding(enc_tokens) + self.position(enc_tokens)
enc_hidden = self.dropout(enc_hidden)
# call encoder layers
for encoder_layer in self.encoder_layers:
enc_hidden = encoder_layer(enc_hidden, enc_self_mask, training)
# dec_tokens embedding lookup
dec_hidden = self.embedding(dec_tokens) + self.position(dec_tokens)
if not training:
self.dropout.p = 0.0  # disable dropout when not training
dec_hidden = self.dropout(dec_hidden)
# call decoder layers
for decoder_layer in self.decoder_layers:
dec_hidden = decoder_layer(dec_hidden, enc_hidden, dec_self_mask, enc_dec_mask, training)
# call weight shared embedding (model=linear)
logits = self.embedding(dec_hidden, mode='linear')
# softmax
logit_softmax = nn.Softmax(dim=-1)
y_pred = logit_softmax(logits)
self.dropout.p = args.dropout
return y_pred
def lm_loss(logits, labels):
logit_softmax = nn.Softmax(dim=-1)
logits = logit_softmax(logits)
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
loss = loss_fn(logits, labels)
mask = labels.ne(0)
loss_val = loss.masked_select(mask).sum()
total = mask.sum()
loss = loss_val / torch.maximum(total, torch.tensor(1))
return loss
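A toy check of the pad-masked loss (made-up numbers; label 0 is the pad id):

import torch
logits = torch.rand(6, 10)                 # (positions, vocab)
labels = torch.tensor([4, 2, 0, 0, 7, 0])
print(lm_loss(logits, labels))             # averaged over the three non-pad positions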
def lm_acc(y_pred, y_true):
"""
Computes accuracy while excluding pad positions.
:param y_true: ground truth
:param y_pred: predicted values
:return accuracy: accuracy with the pad positions excluded
"""
y_true_clone = y_true.clone().detach()
y_pred_clone = y_pred.clone().detach()
y_pred_softmax = nn.Softmax(dim=-1)
y_pred_clone = y_pred_softmax(y_pred_clone )
y_pred_class = torch.argmax(y_pred_clone,dim=-1)
matches = torch.eq(y_true_clone, y_pred_class)
matches = matches.to(device).int()
mask = torch.ne(y_true_clone, 0)
mask = mask.to(device).int()
matches *= mask
mask_total = mask.sum()
matches_total = matches.sum()
accuracy = matches_total / torch.maximum(mask_total, torch.tensor(1))
print(y_true_clone)
print(y_pred_class)
print(accuracy)
return accuracy
model = Transformer(args)
function_predict = model((train_enc_inputs[:4], train_dec_inputs[:4]),training=True)
loss = lm_loss(function_predict[:4].view(-1, function_predict.size(-1)), train_dec_labels[:4].view(-1)).to(device)
acc = lm_acc(function_predict[:4].view(-1, function_predict.size(-1)), train_dec_labels[:4].view(-1)).to(device)
input : tensor([7116, 107, 1, ..., 0, 0, 0])
output : tensor([ 2, 7116, 107, ..., 0, 0, 0])
I built this Transformer in PyTorch, but the output is the same as dec_input, starting with 2 and ending with 3. I have run this code 1000 times and the result is the same. I think this code has a problem somewhere; this is my first PyTorch code. Can you help me fix this stressed code?
I have two networks. The output of the first network is the input to the other. In order to calculate the loss for the second network, I use vanilla policy gradient. I want to backpropagate this loss into the first network. After checking whether the gradients have changed, I see that they are all None.
I first load the first network (a pre-trained autoencoder) this way:
def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
model.load_state_dict(checkpoint['state_dict'])
for parameter in model.parameters():
parameter.requires_grad = True
model.train()
return model
Then I define the optimizers for both networks this way:
class MultipleOptimizer(object):
def __init__(self, *op):
self.optimizers = op
def zero_grad(self):
for op in self.optimizers:
op.zero_grad()
def step(self):
for op in self.optimizers:
op.step()
opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr))
the reward function is:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score as sil  # silhouette score used as the reward

# Reward function
def reward(x, act):
#print('action', act)
#print('x type', type(x))
km = KMeans(act, n_init=20, n_jobs=4)
y_pred = km.fit_predict(x.detach().cpu().numpy())# seems we can only get a centre from batch
#print('k-means output type', type(y_pred))
sil_score = sil(x.detach().cpu().numpy(), y_pred)
#print('sil score', sil_score)
return sil_score
The architecture of the second neural network, and an alternative I wrote to avoid logits = logits.mean(0):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network. outputs are the logits
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
def __init__(self):
super(mlp2, self).__init__()
self.linear1 = nn.Linear(10,100)
self.relu1 = nn.ReLU(inplace=True)
self.linear2 = torch.nn.Linear(100,100)
self.linear3 = torch.nn.Linear(100,20)
self.linear4 = torch.nn.Linear(2000,100)
self.ident = nn.Identity()
def forward(self, x):
a = self.linear1(x)
a = self.relu1(a)
a = self.linear2(a)
a = self.relu1(a)
a = self.linear3(a)
a = torch.flatten(a)  # note: flattens across the batch too, (100, 20) -> (2000,)
a = self.linear4(a)
a = self.relu1(a)
a = self.linear3(a)
out = self.ident(a)
return out
The loss is calculated in the following order:
def get_policy(obs):
logits = logits_net(obs)
return Categorical(logits=logits.mean(0))
def get_action(obs):
return get_policy(obs).sample().item()
def Logp(obs, act):
logp = get_policy(obs).log_prob(act.cuda())
return logp
def compute_loss(logp, weights):
return -(logp * weights).mean()
def train_one_epoch():
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for R(tau) weighting in policy gradient
batch_logp = []
# reset episode-specific variables
j = 1 # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep
for i, data in enumerate(train_loader):
#Create the mean image out of those 100 images
x, label = data
x = model(x.cuda())#torch.Size([100, 10])
obs = x.data.cpu().numpy()#[100, 10] - a trajectory with only one state
# Save obs
batch_obs.append(obs.copy())
#act in the environment
#act = get_action(torch.as_tensor(obs, dtype=torch.float32))
act = get_action(x)
print('action type', type(act))
#log probability
#logp = Logp(torch.as_tensor(obs, dtype=torch.float32),act = torch.as_tensor(act, dtype=torch.int32))
logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32))
#rew = reward(obs, act+2)
rew = reward(x, act+2)
# save action, reward
batch_acts.append(act)
batch_weights.append(rew)#episode rewards
batch_logp.append(logp)
opt.zero_grad()
batch_logp = torch.stack(batch_logp, dim=0)
batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32),
weights = torch.as_tensor(batch_weights, dtype=torch.float32))
batch_loss.backward() #does it return anything? gradients? print them!
opt.step()
for name, param in logits_net.named_parameters():
print(name, param.grad)
I applied some changes on the assumption that recreating some of the tensors might be the issue:
Originally, the output of the first network, obs, was converted with obs = x.data.cpu().numpy() and then sent to the get_action function as act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changed this to act = get_action(x), so x is sent directly to this function, and likewise changed the arguments of the log-probability call to logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32)).
After these changes, I still get None for the gradients. Is there any way to backpropagate the gradient when the loss is calculated this way? Are there any changes I can apply? Any help is appreciated. (A minimal experiment I ran to see where the graph breaks is below.)
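For what it's worth, this is the minimal experiment I used to convince myself where a graph can break (toy tensors, not my actual networks):

import torch

w = torch.nn.Parameter(torch.ones(3))

# graph-breaking round trip: .detach().cpu().numpy() cuts the graph,
# so a loss built from this tensor can never reach w
out = torch.as_tensor(w.detach().cpu().numpy(), dtype=torch.float32)
# out.sum().backward() here would raise an error: out has no grad_fn

# graph-preserving version: keep working with the original tensors
logps = torch.stack([w.sum(), (2 * w).sum()])
loss = -logps.mean()
loss.backward()
print(w.grad)  # tensor([-1.5000, -1.5000, -1.5000])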