Transformer model only predicts START or END tokens - PyTorch

I've been trying to build and train a Transformer model from scratch for empathetic dialogue generation, and I'm currently struggling with the training process: the model only seems to predict START and END tokens at the final output layer, irrespective of the target token given to the Transformer decoder. I've gone through the implementation multiple times, and spotted and corrected some issues (mostly in the multi-head attention layer and the tokenization), but I still haven't had any luck.
I am using F.cross_entropy to compute the cross-entropy between the final logits output by the transformer, out[:, :-1, :], and the target sequence in my dataset, target[:, 1:]. The shift is necessary because each output position of the transformer corresponds to the next predicted token. I also tried excluding the START and END tokens from this loss (i.e., out[:, :-2, :] and target[:, 1:-1]), but that didn't help either. The logits and targets are shaped according to the PyTorch documentation, i.e., (batch_size, classes, sequence_length) and (batch_size, sequence_length) respectively, with the target containing class indices (the padding index is ignored). The training output looks something like this:
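For concreteness, the loss step boils down to something like this (a simplified sketch with illustrative names, not my exact training code; out holds the decoder logits shaped (batch_size, seq_len, vocab_size) and padding_idx is my padding token id):

logits = out[:, :-1, :].permute(0, 2, 1)   # (batch_size, vocab_size, seq_len - 1)
labels = target[:, 1:]                     # shifted left: position t predicts token t + 1
loss = F.cross_entropy(logits, labels, ignore_index=padding_idx)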
Epoch 0: 1%| | 1/180 [00:05<16:28, 5.53s/it, loss=11, v_num=3, train_loss=11.00]
Epoch 0: 1%| | 2/180 [00:25<37:55, 12.78s/it, loss=11, v_num=3, train_loss=11.00]
...
Epoch 5: 90%|█████████ | 162/180 [00:58<00:06, 2.77it/s, loss=5.54, v_num=3, train_loss=5.520]
Epoch 5: 90%|█████████ | 162/180 [00:58<00:06, 2.77it/s, loss=5.53, v_num=3, train_loss=5.430]
As seen above, the loss decays to a value between 5 and 6 and then stays constant (even up to the 50th epoch). I printed the probability tensors at each training step by softmax-ing the logits: the highest probabilities are always assigned to the START and END tokens, irrespective of the target token fed into the transformer decoder.
To confirm this behaviour, I wrote a script that predicts a response from the trained model (using beam search) given a context sequence, with the first target token set to [START]. No matter what context sequence I feed into the model or what beam width I use, the next token is always predicted to be [END]. I'm not sure whether this has something to do with tokenization or with some weights in the model exploding, but I can't seem to get rid of this behaviour. I even added dropout layers to rule out the latter, and still no luck. The issue persists even if I remove the emotion embeddings I am adding in the decoder.
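A stripped-down greedy version of that check looks roughly like this (the real script uses beam search; START_IDX, END_IDX and MAX_LEN are placeholders for my actual special-token ids and length limit):

model.eval()
generated = torch.full((1, 1), START_IDX, dtype=torch.long)  # decoder input starts with [START]
with torch.no_grad():
    for _ in range(MAX_LEN):
        target_ds_state = torch.ones_like(generated)         # dialogue-state ids for the reply
        out = model(context, generated, context_ds_state, target_ds_state, emotion_label)
        next_token = out[:, -1, :].argmax(dim=-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=1)
        if next_token.item() == END_IDX:                      # [END] comes out immediately, every time
            break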
Here is the full implementation of the Model for reference:
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size: int, heads: int) -> None:
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = self.embed_size // self.heads
        assert self.head_dim * self.heads == self.embed_size

        self.values = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.keys = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.queries = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.fc_out = nn.Linear(self.embed_size, self.embed_size, bias=False)

    def forward(
        self,
        keys: torch.Tensor,
        values: torch.Tensor,
        queries: torch.Tensor,
        mask: torch.Tensor
    ) -> torch.Tensor:
        N = queries.shape[0]
        keys_len, values_len, queries_len = keys.shape[1], values.shape[1], queries.shape[1]

        values = self.values(values).reshape(N, values_len, self.heads, self.head_dim)
        keys = self.keys(keys).reshape(N, keys_len, self.heads, self.head_dim)
        queries = self.queries(queries).reshape(N, queries_len, self.heads, self.head_dim)

        scores = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # Apply mask to attention scores if specified
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))
        # Normalise with respect to all keys
        attention = F.softmax(scores / (self.embed_size ** 0.5), dim=-1)

        out = torch.einsum("nhqk,nvhd->nqhd", [attention, values])
        out = self.fc_out(out.reshape(N, queries_len, self.embed_size))
        return out
class TransformerBlock(nn.Module):
    def __init__(
        self,
        embed_size: int,
        heads: int,
        dropout: float,
        forward_expansion: int
    ) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.dropout = nn.Dropout(dropout)
        self.ff = nn.Sequential(
            nn.Linear(embed_size, embed_size * forward_expansion),
            nn.ReLU(),
            nn.Linear(embed_size * forward_expansion, embed_size)
        )

    def forward(
        self,
        keys: torch.Tensor,
        values: torch.Tensor,
        queries: torch.Tensor,
        mask: torch.Tensor
    ) -> torch.Tensor:
        attention = self.attention(keys, values, queries, mask)
        contextualised = self.dropout(self.norm1(attention + queries))
        forward = self.ff(contextualised)
        out = self.dropout(self.norm2(forward + contextualised))
        return out


class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        padding_idx: int,
        num_layers: int,
        embed_size: int,
        heads: int,
        dropout: float,
        forward_expansion: int,
        max_seq_len: int,
        num_of_emo_labels: int
    ) -> None:
        super().__init__()
        self.word_embeddings = nn.Embedding(
            vocab_size + 1, embed_size, padding_idx=padding_idx)
        self.pos_embeddings = nn.Embedding(max_seq_len, embed_size)
        self.ds_embeddings = nn.Embedding(2 + 1, embed_size, padding_idx=0)
        self.layers = nn.ModuleList(
            [TransformerBlock(embed_size, heads, dropout, forward_expansion)
             for _ in range(num_layers)]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        context: torch.Tensor,
        context_ds_state: torch.Tensor,
        mask: torch.Tensor,
        emotion_label: torch.Tensor
    ) -> torch.Tensor:
        N, seq_len = context.shape
        positions = torch.arange(0, seq_len, device=context.device).expand(N, seq_len)
        word_embeddings = self.word_embeddings(context)
        pos_embeddings = self.pos_embeddings(positions)
        ds_embeddings = self.ds_embeddings(context_ds_state)
        out = self.dropout(word_embeddings + pos_embeddings + ds_embeddings)
        for layer in self.layers:
            out = layer(out, out, out, mask)
        return out
class DecoderBlock(nn.Module):
    def __init__(
        self,
        embed_size: int,
        heads: int,
        dropout: float,
        forward_expansion: int
    ) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(
            embed_size,
            heads,
            dropout,
            forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        keys: torch.Tensor,
        values: torch.Tensor,
        target_mask: torch.Tensor,
        input_mask: torch.Tensor
    ) -> torch.Tensor:
        attention = self.attention(x, x, x, target_mask)
        queries = self.dropout(self.norm(attention + x))
        out = self.transformer_block(keys, values, queries, input_mask)
        return out


class Decoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        padding_idx: int,
        num_layers: int,
        embed_size: int,
        heads: int,
        dropout: float,
        forward_expansion: int,
        max_seq_len: int,
        num_of_emo_labels: int
    ) -> None:
        super().__init__()
        self.word_embeddings = nn.Embedding(
            vocab_size + 1, embed_size, padding_idx=padding_idx)
        self.pos_embeddings = nn.Embedding(max_seq_len, embed_size)
        self.ds_embeddings = nn.Embedding(2 + 1, embed_size, padding_idx=0)
        self.emotion_embedding = nn.Embedding(num_of_emo_labels, embed_size)
        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, dropout, forward_expansion)
             for _ in range(num_layers)]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(
        self,
        target: torch.Tensor,
        target_ds_state: torch.Tensor,
        encoder_out: torch.Tensor,
        target_mask: torch.Tensor,
        input_mask: torch.Tensor,
        emotion_label: torch.Tensor
    ) -> torch.Tensor:
        N, seq_len = target.shape
        positions = torch.arange(0, seq_len, device=target.device).expand(N, seq_len)
        word_embeddings = self.word_embeddings(target)
        pos_embeddings = self.pos_embeddings(positions)
        ds_embeddings = self.ds_embeddings(target_ds_state)
        out = self.dropout(word_embeddings + pos_embeddings + ds_embeddings)
        for layer in self.layers:
            out = layer(out, encoder_out, encoder_out, target_mask, input_mask)
        emotion_embedding = self.emotion_embedding(
            emotion_label).unsqueeze(1).expand(-1, seq_len, -1)
        out = self.fc_out(out + emotion_embedding)
        return out
class Transformer(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        num_of_emo_labels: int,
        max_seq_len: int,
        padding_idx: int,
        num_layers: int = 6,
        embed_size: int = 256,
        heads: int = 8,
        dropout: float = 0.5,
        forward_expansion: int = 4
    ) -> None:
        super().__init__()
        self.padding_idx = padding_idx
        self.encoder = Encoder(
            vocab_size,
            padding_idx,
            num_layers,
            embed_size,
            heads,
            dropout,
            forward_expansion,
            max_seq_len,
            num_of_emo_labels
        )
        self.decoder = Decoder(
            vocab_size,
            padding_idx,
            num_layers,
            embed_size,
            heads,
            dropout,
            forward_expansion,
            max_seq_len,
            num_of_emo_labels
        )

    def create_padding_mask(self, batch_seq):
        N = batch_seq.size(dim=0)
        padding_mask = (batch_seq != self.padding_idx).unsqueeze(1).unsqueeze(2)
        return padding_mask

    def create_lookahead_mask(self, batch_seq):
        N, seq_len = batch_seq.shape
        lookahead_mask = torch.tril(torch.ones(
            N, 1, seq_len, seq_len, device=batch_seq.device))
        return lookahead_mask

    def forward(
        self,
        context: torch.Tensor,
        target: torch.Tensor,
        context_ds_state: torch.Tensor,
        target_ds_state: torch.Tensor,
        emotion_label: torch.Tensor
    ) -> torch.Tensor:
        input_mask = self.create_padding_mask(context)
        target_mask = torch.minimum(
            self.create_lookahead_mask(target),
            self.create_padding_mask(target)
        )
        encoder_out = self.encoder(
            context,
            context_ds_state,
            input_mask,
            emotion_label
        )
        out = self.decoder(
            target,
            target_ds_state,
            encoder_out,
            target_mask,
            input_mask,
            emotion_label
        )
        return out
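The training step itself is essentially the following (a simplified sketch; the optimiser and scheduler hyperparameters, the batch keys and num_epochs are placeholders rather than my exact training code):

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch["context"], batch["target"], batch["context_ds_state"],
                    batch["target_ds_state"], batch["emotion_label"])
        loss = F.cross_entropy(out[:, :-1, :].permute(0, 2, 1), batch["target"][:, 1:],
                               ignore_index=padding_idx)
        loss.backward()
        optimizer.step()
    scheduler.step()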
I have used both Adam and AdamW as optimizers, with a StepLR scheduler, if that's relevant. I've been stuck on this problem for a while now, so any help would be appreciated. Thanks in advance :)

Related

Normalizing multivariate time-series data with different sequence lengths

I have a multivariate time-series dataset with different sequence lengths, and I filled the missing values in the sequences with zeros. I am trying to use a recurrent neural network for time-series forecasting, and I noticed that the model's results degrade when the range of the data falls outside -1 and 1. I wrote the following normalization class using MinMaxScaler; however, I don't know how to exclude the missing values in the sequences when the MinMaxScaler statistics are computed. Here is my code:
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_mask_from_sequence_lengths(
    sequence_lengths: torch.Tensor, max_length: int
) -> torch.BoolTensor:
    # (batch_size, max_length)
    ones = sequence_lengths.new_ones(sequence_lengths.size(0), max_length)
    range_tensor = ones.cumsum(dim=1)
    return sequence_lengths.unsqueeze(1) >= range_tensor


class Normalizer1D(nn.Module):
    # Data size of (batch_size, seq_len, input_size)
    def __init__(self, input_dim, inputs):
        super(Normalizer1D, self).__init__()
        self.input_dim = input_dim
        self.to(device)
        self._norm = self.build_normalizers(inputs)
        max_len = inputs.shape[-1]
        data = torch.from_numpy(inputs)
        length = torch.LongTensor([torch.max((data[i, 0, :] != 0).nonzero()).item() + 1
                                   for i in range(data.shape[0])])
        mask = get_mask_from_sequence_lengths(length, max_len)

    def build_normalizers(self, x):
        normalizers = OrderedDict()
        for i in range(self.input_dim):
            if np.min(x[:, i, :]) < 0:
                scaler = MinMaxScaler(feature_range=(-1, 1))
            else:
                scaler = MinMaxScaler(feature_range=(0, 1))
            scaler = scaler.fit(x[:, i, :])
            normalizers[str(i)] = scaler
        return normalizers

    def normalize(self, x):
        # (B, D, T)
        d = x.cpu().detach().numpy()
        n_x = []
        for i in range(x.shape[1]):
            n_x.append(self._norm[str(i)].fit_transform(d[:, i, :]))
        x = np.stack(n_x, axis=1)
        return torch.from_numpy(x).to(device)

    def unnormalize(self, x):
        # (T, B, D) ==> (B, T, D)
        d = x.cpu().detach().numpy()
        n_x = []
        for i in range(x.shape[1]):
            n_x.append(self._norm[str(i)].inverse_transform(d[:, i, :]))
        x = np.stack(n_x, axis=1)
        return torch.from_numpy(x).to(device)

    #property
    def min_(self):
        # (T, B, D)
        min_ = []
        for i in range(len(self._norm)):
            min_.append(self._norm[str(i)].min_)
        return torch.from_numpy(np.stack(min_, axis=1))

    #property
    def scale_(self):
        # (T, B, D)
        scale_ = []
        for i in range(len(self._norm)):
            scale_.append(self._norm[str(i)].scale_)
        return torch.from_numpy(np.stack(scale_, axis=1))

    def unnormalize_mean(self, x_mu):
        Xscale = self.scale_()
        Xmin = self.min_()
        normX = x_mu.mul_(Xscale)
        return normX.add_(Xmin)

    def unnormalize_sigma(self, x_sigma):
        Xscale = self.scale_()
        return x_sigma.mul_(Xscale)


# compute the normalizers
def compute_normalizer(loader_train):
    # batch_size, input_dim, seq_len
    for i, (u, y) in enumerate(loader_train):
        if i == 0:
            # input u torch.Size([B, D, T])
            inputs = u
            outputs = y
        else:
            inputs = torch.cat([inputs, u], dim=0)
            outputs = torch.cat([outputs, y], dim=0)
    inputs = inputs.cpu().detach().numpy()
    outputs = outputs.cpu().detach().numpy()
    # initialization
    u_normalizer = Normalizer1D(inputs.shape[1], inputs)
    y_normalizer = Normalizer1D(outputs.shape[1], outputs)
    return u_normalizer, y_normalizer
I would appreciate it if someone could suggest a way to exclude the missing values from the normalization process.
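For illustration, this is the behaviour I am after on a single feature (a sketch only: it marks the padded zeros as NaN so that MinMaxScaler, which disregards NaNs during fit in recent scikit-learn versions, computes its statistics from real time steps only; it also assumes genuine observations are never exactly zero):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

def fit_scaler_ignoring_padding(x: np.ndarray, feature_range=(-1, 1)) -> MinMaxScaler:
    # x: (batch, seq_len) for one feature, zero-padded at the end of each sequence
    x = x.astype(float).copy()
    x[x == 0] = np.nan                      # mark padded entries as missing
    scaler = MinMaxScaler(feature_range=feature_range)
    scaler.fit(x)                           # NaNs are ignored when computing min/max
    return scaler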

How can I predict only 5 days of prices with this LSTM model (PyTorch)?

class StockDataset(Dataset):
    # The dataset returns the i-th record
    def __init__(self, symbol, x_frames, y_frames, start, end):
        self.symbol = symbol
        self.x_frames = x_frames
        self.y_frames = y_frames
        self.start = datetime.datetime(*start)
        self.end = datetime.datetime(*end)
        # Download the data for the symbol and date range specified above
        self.data = pdr.DataReader(self.symbol, 'yahoo', self.start, self.end)

    def __len__(self):
        return len(self.data) - (self.x_frames + self.y_frames) + 1

    def __getitem__(self, idx):
        global data
        #global data_set
        # Given index idx, return that window of the dataset
        idx += self.x_frames
        data = self.data.iloc[idx - self.x_frames:idx + self.y_frames]
        data = data[['High', 'Low', 'Open', 'Close', 'Adj Close', 'Volume']]
        # Convert to log returns; add 1 to guard against possible missing values
        data = data.apply(lambda x: np.log(x + 1) - np.log(x[self.x_frames - 1] + 1))
        global x_ex
        global y_ex
        x_ex = data[:self.x_frames]
        y_ex = data[self.x_frames:]
        data = data.values  # convert to a numpy array
        X = data[:self.x_frames]
        y = data[self.x_frames:]
        return X, y
This is the dataset.
class LSTM(nn.Module):
    # Model explained from the 50-minute mark of the lecture video
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, batch_size, dropout, use_bn):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.dropout = dropout
        self.use_bn = use_bn
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        self.hidden = self.init_hidden()
        self.regressor = self.make_regressor()

    def init_hidden(self):
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def make_regressor(self):
        layers = []
        if self.use_bn:
            layers.append(nn.BatchNorm1d(self.hidden_dim))
        layers.append(nn.Dropout(self.dropout))
        layers.append(nn.Linear(self.hidden_dim, self.hidden_dim // 2))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(self.hidden_dim // 2, self.output_dim))
        regressor = nn.Sequential(*layers)
        return regressor

    def forward(self, x):
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y_pred = self.regressor(lstm_out[-1].view(self.batch_size, -1))
        return y_pred
This is the model.
def test(model, partition, args):
    global y_true
    global y_pred
    global X
    testloader = DataLoader(partition['test'],
                            batch_size=args.batch_size,
                            shuffle=False, drop_last=True)
    model.eval()
    test_acc = 0.0
    with torch.no_grad():
        for i, (X, y) in enumerate(testloader):
            X = X.transpose(0, 1).float().to(args.device)
            y_true = y[:, :, 3].float().to(args.device)
            model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]
            y_pred = model(X)
            test_acc += metric(y_pred, y_true)[0]
    test_acc = test_acc / len(testloader)
    return test_acc
This is the test data loader.
# ====== Random Seed Initialization ====== #
seed = 666
np.random.seed(seed)
torch.manual_seed(seed)

parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.exp_name = "exp1_lr"
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ====== Data Loading ====== #
args.symbol = '005930.KS'  # desired ticker
args.batch_size = 4        # batch size
args.x_frames = 5          # input window: previous n days of data; if too long, a one-week prediction is impossible
args.y_frames = 5          # output window: next n days of data; if too long, a one-week prediction is impossible

# ====== Model Capacity ===== #
args.input_dim = 6
args.hid_dim = 50
args.n_layers = 2  # (number of hidden layers) see https://justkode.kr/deep-learning/pytorch-rnn

# ====== Regularization ======= #
args.l2 = 0.0001
args.dropout = 0.3
args.use_bn = True

# ====== Optimizer & Training ====== #
args.optim = 'RMSprop'  # 'RMSprop' # SGD, RMSprop, ADAM...
args.lr = 0.001
args.epoch = 1

# ====== Experiment Variable ====== #
name_var1 = 'lr'        # learning rate
name_var2 = 'n_layers'  # how many layers to stack in the network?
list_var1 = [0.001, 0.0001, 0.00001]
list_var2 = [1, 2, 3]

# Actually build the datasets
trainset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2012, 1, 1), (2021, 1, 1))   # training period
valset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2021, 1, 2), (2021, 12, 30))   # validation period: without at least ~6 months + 19 days a "float division by zero" error occurs - why?? (as of 2021)
testset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2022, 1, 10), (2022, 1, 14))  # test period: without at least ~6 months + 25 days a "float division by zero" error occurs (as of 2022)
# The period seems to require a minimum number of trading days (>= ~146); giving fewer raises an error - why??
partition = {'train': trainset, 'val': valset, 'test': testset}

for var1 in list_var1:
    for var2 in list_var2:
        setattr(args, name_var1, var1)
        setattr(args, name_var2, var2)
        print(args)
        setting, result = experiment(partition, deepcopy(args))
        save_exp_result(setting, result)
        # Be sure to delete the files in the directory before plotting, otherwise all the results overlap
This is the hyperparameter setup.
How can I get results when the test set spans only 5 days (e.g. (2022,1,10) to (2022,1,14))?
This code didn't work when the test set was shorter than about 7 months (roughly 146 trading days); with fewer days it fails with a "float division by zero" error, while with 146+ days it works fine.
I think this line causes the error:
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1))
My guess is that the log-return values become too small, which triggers the error. The data comes from Yahoo Finance. Thanks for reading.
When I comment out the line below, the data becomes infinite:
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1))
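To see concretely what that line computes, here is a toy example on a single window (made-up prices, x_frames = 5):

import numpy as np
import pandas as pd

x_frames = 5
window = pd.DataFrame({'Close': [100.0, 101.0, 99.5, 102.0, 103.0,    # the 5 input days
                                 104.0, 103.5, 105.0, 106.0, 107.0]}) # the 5 target days

# every value becomes its log return relative to the last input day (index x_frames - 1)
log_returns = window.apply(lambda x: np.log(x + 1) - np.log(x[x_frames - 1] + 1))
print(log_returns)  # row x_frames - 1 is exactly 0 by construction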

How to re-use a single weight per group across all channels in PyTorch?

Let's suppose I have the following 2D convolution layer:
nn.Conv2d(kernel_size=(1,20), stride=1, groups=5, out_channels=30, in_channels=30, bias=False),
This creates a weight of shape 30x6x1x20, and in my model it results in overfitting.
Since the data is similar for every group, I want to reuse a single weight per group across all output channels associated with that group.
I.e., I would like my weight to have shape 5x1x1x20, where 5 corresponds to the groups, and then repeat it 6 times across the input channels and 6 times across the output channels of each group.
How do I do this in PyTorch?
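To make the shape bookkeeping concrete, this is the expansion I have in mind (a sketch; the repeat mirrors the one used in the code below):

import torch

groups, out_channels, in_channels = 5, 30, 30
shared = torch.randn(groups, 1, 1, 20)  # one 1x20 kernel per group

# tile it out to the (out_channels, in_channels // groups, kH, kW) weight a grouped Conv2d expects
full = shared.repeat(out_channels // groups, in_channels // groups, 1, 1)
print(full.shape)  # torch.Size([30, 6, 1, 20])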
Well, I defined a custom dimension weight and then repeated it before the convolution.
import torch
from typing import Optional, List, Tuple, Union
from torch import Tensor
from torch.nn.parameter import Parameter
from torch.nn.common_types import _size_2_t
from torch.nn.modules.utils import _single, _pair, _triple, _reverse_repeat_tuple
from torch.nn import init
import math
from torch.nn import functional as F
class _ConvNd(torch.nn.Module):
__constants__ = ['stride', 'padding', 'dilation', 'groups',
'padding_mode', 'output_padding', 'in_channels',
'out_channels', 'kernel_size']
__annotations__ = {'bias': Optional[torch.Tensor]}
def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor:
...
_in_channels: int
_reversed_padding_repeated_twice: List[int]
out_channels: int
kernel_size: Tuple[int, ...]
stride: Tuple[int, ...]
padding: Union[str, Tuple[int, ...]]
dilation: Tuple[int, ...]
transposed: bool
output_padding: Tuple[int, ...]
groups: int
padding_mode: str
weight: Tensor
bias: Optional[Tensor]
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: Tuple[int, ...],
stride: Tuple[int, ...],
padding: Tuple[int, ...],
dilation: Tuple[int, ...],
transposed: bool,
output_padding: Tuple[int, ...],
groups: int,
bias: bool,
padding_mode: str,
device=None,
dtype=None) -> None:
factory_kwargs = {'device': device, 'dtype': dtype}
super(_ConvNd, self).__init__()
if in_channels % groups != 0:
raise ValueError('in_channels must be divisible by groups')
if out_channels % groups != 0:
raise ValueError('out_channels must be divisible by groups')
valid_padding_strings = {'same', 'valid'}
if isinstance(padding, str):
if padding not in valid_padding_strings:
raise ValueError(
"Invalid padding string {!r}, should be one of {}".format(
padding, valid_padding_strings))
if padding == 'same' and any(s != 1 for s in stride):
raise ValueError("padding='same' is not supported for strided convolutions")
valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
if padding_mode not in valid_padding_modes:
raise ValueError("padding_mode must be one of {}, but got padding_mode='{}'".format(
valid_padding_modes, padding_mode))
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.output_padding = output_padding
self.groups = groups
self.padding_mode = padding_mode
# `_reversed_padding_repeated_twice` is the padding to be passed to
# `F.pad` if needed (e.g., for non-zero padding types that are
# implemented as two ops: padding + conv). `F.pad` accepts paddings in
# reverse order than the dimension.
if isinstance(self.padding, str):
self._reversed_padding_repeated_twice = [0, 0] * len(kernel_size)
if padding == 'same':
for d, k, i in zip(dilation, kernel_size,
range(len(kernel_size) - 1, -1, -1)):
total_padding = d * (k - 1)
left_pad = total_padding // 2
self._reversed_padding_repeated_twice[2 * i] = left_pad
self._reversed_padding_repeated_twice[2 * i + 1] = (
total_padding - left_pad)
else:
self._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2)
if transposed:
self.weight = Parameter(torch.empty(
(in_channels, out_channels // groups, *kernel_size), **factory_kwargs))
else:
self.weight = Parameter(torch.empty(
(groups, 1, *kernel_size), **factory_kwargs))
if bias:
self.bias = Parameter(torch.empty(out_channels, **factory_kwargs))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self) -> None:
# Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
# uniform(-1/sqrt(k), 1/sqrt(k)), where k = weight.size(1) * prod(*kernel_size)
# For more details see: https://github.com/pytorch/pytorch/issues/15314#issuecomment-477448573
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
if fan_in != 0:
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def extra_repr(self):
s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
', stride={stride}')
if self.padding != (0,) * len(self.padding):
s += ', padding={padding}'
if self.dilation != (1,) * len(self.dilation):
s += ', dilation={dilation}'
if self.output_padding != (0,) * len(self.output_padding):
s += ', output_padding={output_padding}'
if self.groups != 1:
s += ', groups={groups}'
if self.bias is None:
s += ', bias=False'
if self.padding_mode != 'zeros':
s += ', padding_mode={padding_mode}'
return s.format(**self.__dict__)
def __setstate__(self, state):
super(_ConvNd, self).__setstate__(state)
if not hasattr(self, 'padding_mode'):
self.padding_mode = 'zeros'
class SharedConv2d(_ConvNd):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: _size_2_t,
stride: _size_2_t = 1,
padding: Union[str, _size_2_t] = 0,
dilation: _size_2_t = 1,
groups: int = 1,
bias: bool = True,
padding_mode: str = 'zeros', # TODO: refine this type
device=None,
dtype=None
) -> None:
factory_kwargs = {'device': device, 'dtype': dtype}
kernel_size_ = _pair(kernel_size)
stride_ = _pair(stride)
padding_ = padding if isinstance(padding, str) else _pair(padding)
dilation_ = _pair(dilation)
super(SharedConv2d, self).__init__(
in_channels, out_channels, kernel_size_, stride_, padding_, dilation_,
False, _pair(0), groups, bias, padding_mode, **factory_kwargs)
def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]):
if self.padding_mode != 'zeros':
return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
weight.repeat(self.out_channels//self.groups, self.in_channels//self.groups, 1, 1), bias, self.stride,
_pair(0), self.dilation, self.groups)
return F.conv2d(input, weight.repeat(self.out_channels//self.groups, self.in_channels//self.groups, 1, 1), bias, self.stride,
self.padding, self.dilation, self.groups)
def forward(self, input: Tensor) -> Tensor:
return self._conv_forward(input, self.weight, self.bias)
Then we can use it like this:
SharedConv2d(kernel_size=(1,20), stride=1, groups=5, out_channels=30, in_channels=30, bias=False)
The question is: are gradients going to backpropagate properly through the repeat applied to the weight?
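A minimal way to probe this (a sketch: it only checks that a gradient of the shared shape reaches the parameter, not that the sharing semantics are what I intend):

import torch

conv = SharedConv2d(kernel_size=(1, 20), stride=1, groups=5,
                    out_channels=30, in_channels=30, bias=False)
x = torch.randn(2, 30, 8, 64)  # made-up input: batch of 2, 30 channels, 8x64
conv(x).sum().backward()
print(conv.weight.shape, conv.weight.grad.shape)  # expected: both torch.Size([5, 1, 1, 20])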

My Transformer for NLP doesn't work - how can I fix it?

def get_pad_mask(tokens, i_pad=0):
"""
pad mask 계산하는 함수
:param tokens: tokens (bs, n_seq)
:param i_pad: id of pad
:return mask: pad mask (pad: 1, other: 0)
"""
# pad: True, others: False
mask = torch.eq(tokens, i_pad)
# boolean -> float 32
mask = mask.type(torch.FloatTensor)
# expand dimension for Q n_seq
mask = torch.unsqueeze(mask, 1)
return mask
def get_causal_mask(tokens, i_pad=0):
"""
causal mask 계산하는 함수
:param tokens: tokens (bs, n_seq)
:param i_pad: id of pad
:return mask: causal and pad mask (causal or pad: 1, other: 0)
"""
# n_seq 조회
n_seq = tokens.shape[1]
# all one mask
mask = torch.ones((n_seq, n_seq))
# make reverse causal mask
mask = mask.triu(1)
# 0 -> 1, 1 -> 0
# expand dim for bs
mask = torch.unsqueeze(mask, 0)
# get pad_mask
pad_mask = get_pad_mask(tokens, i_pad)
# mask all causal_mask or pad_mask
mask = torch.maximum(mask, pad_mask)
return mask
class ScaleDotProductAttention(nn.Module):
"""
Scale Dot Product Attention Class
"""
def __init__(self, name="scale_dot_product_attention"):
"""
생성자
:param name: layer name
"""
super(ScaleDotProductAttention, self).__init__()
def forward(self, Q, K, V, attn_mask):
"""
layer 실행
:param Q: Query
:param K: Key
:param V: Value
:param attn_mask: attention mask
:return attn_out: attention 실행 결과
"""
# matmul Q, K.T
attn_score = torch.matmul(Q, K.transpose(-2,-1))
# d_k
d_k = torch.tensor(K.shape[-1])
# scale = d_k ** 0.5
scale = torch.sqrt(d_k)
# divide by scale
attn_scale = torch.divide(attn_score, scale)
# do mask (subtract 1e-9 for masked value)
attn_scale -= 1.e9 * attn_mask
# calculate attention prob
attn_prob = torch.softmax(attn_scale, axis=-1)
# weighted sum of V
attn_out = torch.matmul(attn_prob, V)
return attn_out
class MultiHeadAttention(nn.Module):
#class MultiHeadAttention(tf.keras.layers.Layer):
"""
Multi Head Attention Class
"""
def __init__(self, args, name="MultiHeadAttention"):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(MultiHeadAttention, self).__init__()
self.d_model = args.d_model
self.n_head = args.n_head
self.d_head = args.d_head
# Q, K, V input dense layer
self.W_Q = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
self.W_K = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
self.W_V = nn.Linear(args.n_head * args.d_head,args.n_head * args.d_head)
'''TensorFLow
self.W_Q = tf.keras.layers.Dense(self.n_head * self.d_head)
self.W_K = tf.keras.layers.Dense(self.n_head * self.d_head)
self.W_V = tf.keras.layers.Dense(self.n_head * self.d_head)
TensorFLow'''
# Scale Dot Product Attention class
self.attention = ScaleDotProductAttention(name="self_attention")
# output dense layer
#self.W_O = torch.nn.Linear(args.n_head * args.d_head,self.d_model)
self.W_O = torch.nn.Linear(self.d_model,self.d_model)
'''TensorFLow
self.W_O = tf.keras.layers.Dense(self.d_model)
TensorFLow'''
def forward(self, Q, K, V, attn_mask):
"""
layer 실행
:param Q: Query
:param K: Key
:param V: Value
:param attn_mask: attention mask
:return attn_out: attention 실행 결과
"""
# build multihead Q, K, V
self.Q_m = torch.transpose(torch.reshape(self.W_Q(Q), [-1, Q.shape[1], args.n_head, args.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
self.K_m = torch.transpose(torch.reshape(self.W_K(K), [-1, K.shape[1], args.n_head, args.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
self.V_m = torch.transpose(torch.reshape(self.W_V(V), [-1, V.shape[1], args.n_head, args.d_head]), 2, 1) # (bs, n_head, Q_len, d_head)
'''TensorFLow
Q_m = tf.transpose(tf.reshape(self.W_Q(Q), [-1, tf.shape(Q)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
K_m = tf.transpose(tf.reshape(self.W_K(K), [-1, tf.shape(K)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
V_m = tf.transpose(tf.reshape(self.W_V(V), [-1, tf.shape(V)[1], args.n_head, args.d_head]), [0, 2, 1, 3]) # (bs, n_head, Q_len, d_head)
TensorFLow'''
# build multihead mask
attn_mask_m = torch.unsqueeze(attn_mask, axis=1)
'''TensorFLow
attn_mask_m = tf.expand_dims(attn_mask, axis=1)
TensorFLow'''
# Scale Dot Product Attention with multi head Q, K, V, attn_mask
attn_out_m = self.attention(self.Q_m, self.K_m, self.V_m, attn_mask_m) # (bs, n_head, Q_len, d_head)
# transpose
attn_out_t = torch.transpose(attn_out_m,2, 1)
'''TensorFLow
attn_out_t = tf.transpose(attn_out_m, perm=[0, 2, 1, 3]) # (bs, n_head, Q_len, d_head) -> (bs, Q_len, n_head, d_head)
TensorFLow'''
# reshape
attn_out_c = torch.reshape(attn_out_t, [-1, Q.shape[1], args.n_head * args.d_head]) # (bs, Q_len, n_head * d_head) -> (bs, Q_len, n_head, d_head)
'''TensorFLow
attn_out_c = tf.reshape(attn_out_t, [-1, tf.shape(Q)[1], args.n_head * args.d_head]) # (bs, Q_len, n_head, d_head) -> (bs, Q_len, n_head * d_head)
TensorFLow'''
# linear for output
attn_out = self.W_O(attn_out_c) # (bs, Q_len, n_head * d_head) -> (bs, Q_len, d_model)
return attn_out
class PositionWiseFeedForward(nn.Module):
#class PositionWiseFeedForward(tf.keras.layers.Layer):
"""
Position Wise Feed Forward Class
"""
def __init__(self, args, name="PositionWiseFeedForward"):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(PositionWiseFeedForward, self).__init__()
#super().__init__(name=name)
relu_f = torch.nn.ReLU()
relu_W = nn.Linear(args.d_model,args.d_ff)
self.W_1 = torch.nn.Sequential(relu_W,relu_f)
self.W_2 = nn.Linear(args.d_ff,args.d_model)
#self.W_1 = tf.keras.layers.Dense(args.d_ff, activation=tf.nn.relu)
#self.W_2 = tf.keras.layers.Dense(args.d_model)
def forward(self,inputs):
#def call(self, inputs):
"""
layer 실행
:param inputs: inputs
:return ff_val: feed forward 실행 결과
"""
# linear W_1 and W_2
ff_val = self.W_1(inputs)
ff_val = self.W_2(ff_val)
return ff_val
class EncoderLayer(nn.Module):
#class EncoderLayer(tf.keras.layers.Layer):
"""
Encoder Layer Class
"""
def __init__(self, args, name='encoder_layer'):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(EncoderLayer, self).__init__()
#super().__init__(name=name)
self.enc_x_size = hidden_enc.size(dim=1)
self.enc_y_size = hidden_enc.size(dim=2)
self.self_attention = MultiHeadAttention(args)
self.norm1 = nn.LayerNorm([self.enc_x_size,self.enc_y_size],eps=args.norm_eps)
self.ffn = PositionWiseFeedForward(args)
self.norm2 = nn.LayerNorm([self.enc_x_size,self.enc_y_size],eps=args.norm_eps)
self.dropout = nn.Dropout(args.dropout)
def forward(self, enc_hidden, self_mask, training):
"""
layer 실행
:param enc_hidden: 이전 layer 출력
:param self_mask: self attention mask
:param training: training flag
:return enc_out: EncoderLayer 실행 결과
"""
# self attention
if training == False :
self.dropout.p = 0.0 #drop out training = False
print('ENCODER')
print(self.enc_x_size,self.enc_y_size,self.dropout.p)
self_attn_val = self.self_attention(enc_hidden, enc_hidden, enc_hidden, self_mask)
# add and layer normal
norm1_val = self.norm1(enc_hidden + self.dropout(self_attn_val))
# feed forward
ffn_val = self.ffn(norm1_val)
# add and layer normal
enc_out = self.norm2(norm1_val + self.dropout(ffn_val))
self.dropout.p = args.dropout
return enc_out
class DecoderLayer(nn.Module):
#class DecoderLayer(tf.keras.layers.Layer):
"""
Decoder Layer Class
"""
def __init__(self, args, name='decoder_layer'):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(DecoderLayer, self).__init__()
#super().__init__(name=name)
self.dec_x_size = dec_hidden.size(dim=1)
self.dec_y_size = dec_hidden.size(dim=2)
self.self_attention = MultiHeadAttention(args)
self.norm1 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.ende_attn = MultiHeadAttention(args)
self.norm2 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.ffn = PositionWiseFeedForward(args)
self.norm3 = nn.LayerNorm([self.dec_x_size, self.dec_y_size],eps=args.norm_eps)
self.dropout = nn.Dropout(args.dropout)
def forward(self, dec_hidden, enc_out, self_mask, ende_mask,training):
#def call(self, dec_hidden, enc_out, self_mask, ende_mask, training):
"""
layer 실행
:param dec_hidden: 이전 layer 출력
:param enc_out: Encoder final 출력
:param self_mask: self attention mask
:param ende_mask: Encoder Decoder attention mask
:param training: training flag
:return dec_out: DecoderLayer 실행 결과
"""
print('DecoderLayer')
print(self.dec_x_size,self.dec_y_size)
if training == False :
self.dropout.p = 0.0 #drop out training = False
# self attention
self_attn_val = self.self_attention(dec_hidden, dec_hidden, dec_hidden, self_mask)
# add and layer normal
norm1_val = self.norm1(dec_hidden + self.dropout(self_attn_val))
# encoder and decoder attention
ende_attn_val = self.ende_attn(norm1_val, enc_out, enc_out, ende_mask)
# add and layer normal
norm2_val = self.norm2(norm1_val + self.dropout(ende_attn_val))
# feed forward
ffn_val = self.ffn(norm2_val)
# add and layer normal
dec_out = self.norm3(norm2_val + self.dropout(ffn_val))
self.dropout.p = args.dropout
return dec_out
class SharedEmbedding(nn.Module):
#class SharedEmbedding(tf.keras.layers.Layer):
"""
Weighed Shaed Embedding Class
"""
def __init__(self, args, name='SharedEmbedding'):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(SharedEmbedding, self).__init__()
#super().__init__(name=name)
self.shared_weights = torch.empty(args.n_vocab, args.d_model)
self.shared_weights = torch.nn.init.trunc_normal_(self.shared_weights,std = args.d_model ** -0.5)
#with tf.name_scope('shared_embedding_weight'):
self.n_vocab = args.n_vocab
self.d_model = args.d_model
def forward(self, inputs, mode='embedding'):
#def call(self, inputs, mode='embedding'):
"""
layer 실행
:param inputs: 입력
:param mode: 실행 모드
:return: embedding or linear 실행 결과
"""
# mode가 embedding일 경우 embedding lookup 실행
if mode == 'embedding':
return self._embedding(inputs)
# mode가 linear일 경우 linear 실행
elif mode == 'linear':
return self._linear(inputs)
# mode가 기타일 경우 오류 발생
else:
raise ValueError(f'mode {mode} is not valid.')
def _embedding(self, inputs):
"""
embedding lookup
:param inputs: 입력
"""
# lookup by gather
embed = torch.matmul(nn.functional.one_hot(inputs, len(vocab)).type(torch.float), self.shared_weights )
#embed = tf.gather(self.shared_weights, tf.cast(inputs, tf.int32))
# muliply d_model ** 0.5
embed *= self.d_model ** 0.5
return embed
def _linear(self, inputs): # (bs, n_seq, d_model)
"""
linear 실행
:param inputs: 입력
"""
# matmul inputs, shared_weights (transpose_b=True)
outputs = torch.matmul(inputs, torch.transpose(self.shared_weights, 1, 0))
#outputs = tf.matmul(inputs, self.shared_weights, transpose_b=True)
return outputs
class PositionalEmbedding(nn.Module):
#class PositionalEmbedding(tf.keras.layers.Layer):
"""
Positional Embedding Class
"""
def __init__(self, args, name='position_embedding'):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(PositionalEmbedding, self).__init__()
#super().__init__(name=name)
pos_encoding = PositionalEmbedding.get_sinusoid_encoding(args.n_seq, args.d_model)
self.embedding = nn.Embedding(args.n_seq, args.d_model)
self.embedding.weights = [pos_encoding] #weights=[pos_encoding]
self.embedding.weight.requires_grad=False #trainable=False
#self.embedding = tf.keras.layers.Embedding(args.n_seq, args.d_model, trainable=False, weights=[pos_encoding])
def forward(self, inputs):
#def call(self, inputs):
"""
layer 실행
:param inputs: 입력
:return embed: positional embedding lookup 결과
"""
# make position (0...n_seq)
zero_inputs = torch.ones_like(inputs)
x_size = zero_inputs.size(dim=0)
for i in range(0,x_size):
zero_inputs[i][0] = 0
position = torch.cumsum(zero_inputs, dim=1)
#position = tf.math.cumsum(tf.ones_like(inputs), axis=1, exclusive=True)
# embedding lookup
embed = self.embedding(position)
return embed
#staticmethod
def get_sinusoid_encoding(n_seq, d_model):
"""
sinusoid encoding 생성
:param n_seq: sequence number
:param n_seq: model hidden dimension
:return: positional encoding table
"""
# calculate exp
exs = np.array([2 * (i_ang // 2) / d_model for i_ang in range(d_model)])
# exs = np.array([2 * (i_ang // 2) / args.d_model for i_ang in range(args.d_model)])
# calculate power
angles = np.power(10000, exs)
# make position
pos = np.array([[i] for i in range(n_seq)])
# position angle
pos_encoding = pos / angles
# sin even number
pos_encoding[:, 0::2] = np.sin(pos_encoding[:, 0::2])
# cos odd number
pos_encoding[:, 1::2] = np.cos(pos_encoding[:, 1::2])
return pos_encoding
#return tf.cast(pos_encoding, tf.float32)
class Transformer(nn.Module):
#class Transformer(tf.keras.Model):
"""
Transformer Class
"""
def __init__(self, args, name='transformer'):
"""
생성자
:param args: Args 객체
:param name: layer name
"""
super(Transformer, self).__init__()
#super().__init__(name=name)
self.i_pad = args.i_pad
self.embedding = SharedEmbedding(args)
self.position = PositionalEmbedding(args)
self.encoder_layers = [EncoderLayer(args, name=f'encoder_layer_{i}') for i in range(args.n_layer)]
self.decoder_layers = [DecoderLayer(args, name=f'decoder_layer_{i}') for i in range(args.n_layer)]
self.dropout = nn.Dropout(args.dropout)
def forward(self, inputs, training=False):
#def call(self, inputs, training=False):
"""
layer 실행
:param inputs: enc_tokens, dec_tokens
:return logits: dec_tokens에 대한 다음 토큰 예측 결과 logits
"""
enc_tokens, dec_tokens = inputs
# encoder self attention mask
enc_self_mask = get_pad_mask(enc_tokens, self.i_pad)
# decoder self attention mask
dec_self_mask = get_causal_mask(dec_tokens, self.i_pad)
# encoder and decoder attention mask
enc_dec_mask = get_pad_mask(enc_tokens, self.i_pad)
# enc_tokens embedding lookup
enc_hidden = self.embedding(enc_tokens) + self.position(enc_tokens)
enc_hidden = self.dropout(enc_hidden)
# call encoder layers
for encoder_layer in self.encoder_layers:
enc_hidden = encoder_layer(enc_hidden, enc_self_mask, training)
# dec_tokens embedding lookup
dec_hidden = self.embedding(dec_tokens) + self.position(dec_tokens)
if training == False :
self.dropout.p = 0.0 #drop out training = False
dec_hidden = self.dropout(dec_hidden)
# call decoder layers
for decoder_layer in self.decoder_layers:
dec_hidden = decoder_layer(dec_hidden, enc_hidden, dec_self_mask, enc_dec_mask, training)
# call weight shared embedding (model=linear)
logits = self.embedding(dec_hidden, mode='linear')
# softmax
logit_softmax = nn.Softmax(dim=-1)
y_pred = logit_softmax(logits)
self.dropout.p = args.dropout
return y_pred
def lm_loss(logits, labels):
logit_softmax = nn.Softmax(dim=-1)
logits = logit_softmax(logits)
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
loss = loss_fn(logits, labels)
mask = labels.ne(0)
loss_val = loss.masked_select(mask).sum()
total = mask.sum()
loss = loss_val / torch.maximum(total, torch.tensor(1))
return loss
def lm_acc(y_pred, y_true):
"""
pad 부분을 제외하고 accuracy를 계산하는 함수
:param y_true: 정답
:param y_pred: 예측 값
:retrun loss: pad 부분이 제외된 accuracy 값
"""
y_true_clone = y_true.clone().detach()
y_pred_clone = y_pred.clone().detach()
y_pred_softmax = nn.Softmax(dim=-1)
y_pred_clone = y_pred_softmax(y_pred_clone )
y_pred_class = torch.argmax(y_pred_clone,dim=-1)
matches = torch.eq(y_true_clone, y_pred_class)
matches = matches.to(device).int()
mask = torch.ne(y_true_clone, 0)
mask = mask.to(device).int()
matches *= mask
mask_total = mask.sum()
matches_total = matches.sum()
accuracy = matches_total / torch.maximum(mask_total, torch.tensor(1))
print(y_true_clone)
print(y_pred_class)
print(accuracy)
return accuracy
model = Transformer(args)
function_predict = model((train_enc_inputs[:4], train_dec_inputs[:4]),training=True)
loss = lm_loss(function_predict[:4].view(-1, function_predict.size(-1)), train_dec_labels[:4].view(-1)).to(device)
acc = lm_acc(function_predict[:4].view(-1, function_predict.size(-1)), train_dec_labels[:4].view(-1)).to(device)
input : tensor([7116, 107, 1, ..., 0, 0, 0])
output : tensor([ 2, 7116, 107, ..., 0, 0, 0])
I built this Transformer in PyTorch, but the output is the same as the decoder input, starting with 2 and ending with 3.
I have run this code 1000 times and the result is always the same.
I think the code has some problem; this is my first PyTorch code.
Can you help me fix this code that has been stressing me out?

PyTorch: GRU, one-to-many / many-to-one

I would like to implement a GRU able to encode a sequence of vectors into one vector (many-to-one), and then another GRU able to decode a vector into a sequence of vectors (one-to-many). The size of the vectors wouldn't change. I would like to get opinions on what I implemented.
Here is the code:
class AEGRU(nn.Module):
    def __init__(self, opt):
        super(AEGRU, self).__init__()
        self.length = 256
        self.latent_space = 256
        self.num_layers = 1
        self.GRU_enc = nn.GRU(input_size=3, hidden_size=self.latent_space,
                              num_layers=self.num_layers, batch_first=True)
        self.fc_enc = nn.Linear(self.latent_space, self.latent_space)
        self.GRU_dec = nn.GRU(input_size=self.latent_space, hidden_size=3,
                              num_layers=self.num_layers, batch_first=True)
        self.fc_dec = nn.Linear(3, 3)

    def enc(self, x):
        # x has shape: Batch_size x self.length x 3
        h0 = torch.zeros(self.num_layers, x.shape[0], self.latent_space).cuda()
        out, _ = self.GRU_enc(x, h0)
        out = out[:, -1, :]
        out = self.fc_enc(out)
        return out

    def dec(self, x):
        # x has shape: Batch_size x self.latent_space
        x = x[:, None, :]
        h = torch.zeros(self.num_layers, x.shape[0], 3).cuda()
        # method 1 ??
        '''outputs = torch.zeros(x.shape[0], self.length, 3).cuda()
        for i in range(self.length):
            out, h = self.GRU_dec(x, h)
            outputs[:, i, :] = out[:, 0, :]'''
        # method 2 ??
        x = x.repeat(1, self.length, 1)
        outputs, _ = self.GRU_dec(x, h)
        # linear layer
        outputs = self.fc_dec(outputs)
        return outputs

    def forward(self, x):
        self.indices = []
        latent = self.enc(x)
        output = self.dec(latent)
        return output
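For reference, this is roughly how I exercise it (a shape-check sketch; opt is not used inside __init__, so I just pass None, and it assumes a GPU since the hidden states are created with .cuda()):

import torch

model = AEGRU(opt=None).cuda()
x = torch.randn(8, 256, 3).cuda()  # batch of 8 sequences, length 256, 3 features

latent = model.enc(x)              # (8, 256): many-to-one
recon = model(x)                   # (8, 256, 3): one-to-many back to a full sequence
print(latent.shape, recon.shape)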
I am not sure whether this is the right way to do a one-to-many GRU. Could I have some opinions on this?
Thanks for reading!
