Standard interpretation: in the original RNN, the hidden state and output are calculated as
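(Roughly, in the usual Elman formulation:)

$$h_t = \sigma_h(W_{xh}\, x_t + W_{hh}\, h_{t-1} + b_h)$$
$$y_t = \sigma_y(W_{hy}\, h_t + b_y)$$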
In other words, we obtain the output from the hidden state.
According to Wiki, the RNN architecture can be unfolded like this:
And the code I have been using looks like this:
class Model(nn.Module):
def __init__(self, input_size, output_size, hidden_dim, n_layers):
super(Model, self).__init__()
self.hidden_dim = hidden_dim
self.rnn = nn.RNN(input_size, hidden_dim, 1)
self.fc = nn.Linear(hidden_dim, output_size)
def forward(self, x):
batch_size = x.size(0)
out, hidden = self.rnn(x)
# getting output from the hidden state
out = out.view(-1, self.hidden_dim)
out = self.fc(out)
return out, hidden
RNN as "pure" feed-forward layers: but today, I see another implementation from the Pytorch Tutorial
And their code is like
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
self.i2o = nn.Linear(input_size + hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
combined = torch.cat((input, hidden), 1)
hidden = self.i2h(combined)
output = self.i2o(combined)
output = self.softmax(output)
return output, hidden
def initHidden(self):
return torch.zeros(1, self.hidden_size)
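For context, this module is meant to be stepped manually over a sequence, feeding the hidden state back in at each step (a sketch with made-up sizes, following the tutorial's usage):

rnn = RNN(input_size=57, hidden_size=128, output_size=18)  # illustrative sizes
hidden = rnn.initHidden()
line_tensor = torch.zeros(5, 1, 57)                        # a dummy sequence of 5 one-hot steps
for i in range(line_tensor.size(0)):
    # each step consumes one input and the previous hidden state
    output, hidden = rnn(line_tensor[i], hidden)           # output: (1, 18) log-probabilities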
The hidden-layer calculation is the same as in the standard interpretation, but the output is calculated independently of the current hidden state h.
To me, the math behind this implementation is:
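(writing $[x_t; h_{t-1}]$ for the concatenation of the input and the previous hidden state)

$$h_t = W_{ih}\,[x_t; h_{t-1}] + b_{ih}$$
$$y_t = \mathrm{LogSoftmax}\left(W_{io}\,[x_t; h_{t-1}] + b_{io}\right)$$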
So, is this implementation different from the original RNN?
I have been using RNNs for almost a year, and I thought I understood them, until today when I saw this post from PyTorch. I am really confused now.
I'm working on a basic RNN-NLP classifier using PyTorch and trying to use CUDA for acceleration (on Google Colab), but I can't solve this error.
The code is written like this.
error message
Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu
RNN class
class RNN(nn.Module):
def __init__(self, vocab_size, emb_size, hidden_size, output_size):
super().__init__()
self.hidden_size = hidden_size
self.emb = nn.Embedding(vocab_size, emb_size)
self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
self.batch_size = x.size()[0]
hidden = self.init_hidden()
emb = self.emb(x)
out, hidden = self.rnn(emb, hidden)
out = self.fc(out[:, -1, :])
return out
def init_hidden(self):
hidden = torch.zeros(1, self.batch_size, self.hidden_size)
return hidden
device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Setting var
VOCAB_SIZE = len(word_id.keys()) +1
EMB_SIZE = 300
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
model = RNN(VOCAB_SIZE,EMB_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
model = model.to(device)
Predict
for i in range(10):
# take the input element at list index 0 of the dataset
X, y = dataset_train[i]
X = X.to(device)
print(torch.softmax(model(X.unsqueeze(0)), dim=1))
This code works on the CPU, but it doesn't work on the GPU.
Following this error, I tried to fix the code, e.g. hidden.to(device), but I couldn't solve it.
Please, can someone tell me how to solve this?
Thank you very much.
Doesn't doing something like the following work?
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class RNN(nn.Module):
def __init__(self, vocab_size, emb_size, hidden_size, output_size):
super().__init__()
self.hidden_size = hidden_size
self.emb = nn.Embedding(vocab_size, emb_size)
self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
self.to(device)
def forward(self, x):
self.batch_size = x.size()[0]
hidden = self.init_hidden()
emb = self.emb(x)
out, hidden = self.rnn(emb, hidden)
out = self.fc(out[:, -1, :])
return out
def init_hidden(self):
hidden = torch.zeros(1, self.batch_size, self.hidden_size).to(device)  # create the initial hidden state on the same device as the model
return hidden
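Alternatively (just a sketch of another common pattern, not the only fix), the hidden state can be created on whatever device the model's parameters already live on, so no global device variable is needed inside the class:

def init_hidden(self):
    # take any parameter to discover the model's current device and dtype
    weight = next(self.parameters())
    return torch.zeros(1, self.batch_size, self.hidden_size,
                       device=weight.device, dtype=weight.dtype)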
I am trying to use TBPTT (truncated backpropagation through time) on a multivariate time series, and I am facing a problem: my loss doesn't decrease, and I don't know what I am doing wrong.
Inputs shape: (batch_size, 1270, 6)
Output shape: (batch_size, 1270)
There is a particularity with the inputs:
The 6 features correspond to the pairs A-B, A-C, A-D, where A is the time step.
Between two inputs (Inputs[0] and Inputs[1]) the features don't have the same length, so I padded all the inputs using
torch.nn.utils.rnn.pad_sequence(Mise_en_donnees,padding_value=-1,batch_first=True)
(I also tried padding_value=0, but it doesn't change anything.)
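For context, a minimal sketch of how pad_sequence behaves (the shapes here are made up, not my real data):

import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.randn(500, 6), torch.randn(320, 6)]    # two samples with different lengths
lengths = torch.tensor([s.size(0) for s in seqs])    # keep the true lengths for later masking/packing
padded = pad_sequence(seqs, padding_value=-1, batch_first=True)
print(padded.shape)                                  # torch.Size([2, 500, 6])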
All Inputs are normalized using get_mean_std
def get_mean_std(loader,ignore_idx=-1.):
channels_sum,channels_squared_sum,num_batches=0,0,0
for data in loader:
a=torch.sum((data[:,0]!=ignore_idx)).item()-1
channels_sum+=torch.mean(data[:a],dim=[0])
channels_squared_sum+=torch.mean(data[:a]**2,dim=[0])
num_batches+=1
mean=channels_sum/num_batches
std=(channels_squared_sum/num_batches -mean**2)**0.5
return mean,std
Here is my model:
#A classic Conv_Block
class conv_block (nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(conv_block, self).__init__()
self.relu = nn.LeakyReLU()
self.conv = nn.Conv1d(in_channels, out_channels, **kwargs)
self.batchnorm = nn.BatchNorm1d(out_channels)
def forward(self, x):
x=self.conv(x)
x= self.batchnorm(x)
return self.relu(x)
class Test (nn.Module):
def __init__(self,in_channels,num_layers,hidden_size, p,out_size):
super(Test ,self).__init__()
self.CNN=nn.Sequential(
#I am trying to apply filters on every two columns (A-B A-C A-D) using groups
conv_block(in_channels,3,kernel_size=2,stride=1,padding=1,groups=3),#,padding_mode="reflect"),
conv_block(3,32,kernel_size=2,stride=1,padding=0),
#SqueezeExcitation(32,16), #i tried but same results
conv_block(32,16,kernel_size=3,stride=1,padding=1),
conv_block(16,8,kernel_size=3,stride=1,padding=1),
)
self.rnn = nn.LSTM(8, hidden_size, num_layers)
self.rnn1 = nn.LSTM(hidden_size, hidden_size, num_layers)
#self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size) # in case of using bidirectional
#self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
self.dropout = nn.Dropout(p)
self.num_layers=num_layers
self.fc_f=nn.Linear(out_size*hidden_size,out_size)
def forward(self,x,hidden, cell):
x=x.permute(0,2,1)
x=self.CNN(x)
x=x.permute(2,0,1)
x, (hidden, cell) = self.rnn(x) #i tried bidirectional but same results
#hidden = self.dropout(self.fc_hidden(torch.cat((hidden[0:self.num_layers], hidden[self.num_layers:2*self.num_layers]), dim=2)))
#cell = self.dropout(self.fc_cell(torch.cat((cell[0:self.num_layers], cell[self.num_layers:2*self.num_layers]), dim=2)))
x, (hidden, cell) = self.rnn1(x, (hidden, cell))
#hidden=hidden.repeat(2,1,1)
#cell=cell.repeat(2,1,1)
x=x.permute(1,0,2)
x=x.reshape(x.shape[0],-1)
x=self.fc_f(x) #final result
return x, hidden, cell
#hyperparameters
in_channels=6
num_layers=64
hidden_size=90
p=0.2
out_size=tbptt_steps=20 #truncated bptt steps
split_dim=1
nb_epoch=100
learning_rate=3e-4
Model=Test(in_channels,num_layers,hidden_size, p,out_size).to(device)
optimizer = optim.Adam(Model.parameters(), lr=learning_rate)
# I tried to test my model on the same inputs
X=Inputs[:5,:500,:-1].to(device)
Y=Inputs[:5,:500,-1].to(device)
#training loop
hidden=None
cell=None
for ep in range (nb_epoch):
Losses=0
for i, (x_, y_) in enumerate(zip(X.split(tbptt_steps, dim=split_dim), Y.split(tbptt_steps, dim=split_dim))):
optimizer.zero_grad()
#Model.train()
# Detach last hidden state, so the backprop-graph will be cut
if hidden is not None:
hidden.detach_()
if cell is not None:
cell.detach_()
# Forward path
y_pred, hidden, cell = Model(x_, hidden, cell)
#print("predict",y_pred.shape,y_.shape)
# Compute loss
loss = nn.functional.mse_loss(y_, y_pred)
# Backward path
loss.backward()
Losses+=loss.item()
# Update weights
optimizer.step()
if i==0:
print("Epoch ",ep," Loss ",loss.item())
print("#################################################")
print(Losses)
print("#################################################")
There are two problems with this model:
- It doesn't handle the padding_value.
- The loss is high and doesn't decrease.
I really hope the model is understandable and that we can correct it.
As you can see, I am not a professional in machine learning, but I am really eager to understand my errors.
Thank you very much for your help.
I'm creating an LSTM Autoencoder for feature extraction for my master's thesis. However, I'm having a lot of trouble with combining dropout with LSTM layers.
Since it's an autoencoder, there is a bottleneck, which is achieved by having two separate LSTM layers, each with num_layers=1, and a dropout in between. I have time series with very different lengths and have found packed sequences to be a good idea for that reason.
But, from my experiments, I must pack the data before the first LSTM, unpack before the dropout, then pack again before the second LSTM. This seems wildly inefficient. Is there a better way? I'm providing some example code and an alternative way to implement it below.
Current, working, but possibly suboptimal solution:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
def __init__(self, seq_len, n_features, embedding_dim, hidden_dim, dropout):
super(Encoder, self).__init__()
self.seq_len = seq_len
self.n_features = n_features
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.lstm1 = nn.LSTM(
input_size=n_features,
hidden_size=self.hidden_dim,
num_layers=1,
batch_first=True,
)
self.lstm2 = nn.LSTM(
input_size=self.hidden_dim,
hidden_size=embedding_dim,
num_layers=1,
batch_first=True,
)
self.drop1 = nn.Dropout(p=dropout, inplace=False)
def forward(self, x):
x, (_, _) = self.lstm1(x)
x, lens = pad_packed_sequence(x, batch_first=True, total_length=self.seq_len)
x = self.drop1(x)
x = pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
x, (hidden_n, _) = self.lstm2(x)
return hidden_n.reshape((-1, self.n_features, self.embedding_dim)), lens
Alternative, possibly better, but currently not working solution:
class Encoder2(nn.Module):
def __init__(self, seq_len, n_features, embedding_dim, hidden_dim, dropout):
super(Encoder2, self).__init__()
self.seq_len = seq_len
self.n_features = n_features
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.lstm1 = nn.LSTM(
input_size=n_features,
hidden_size=self.hidden_dim,
num_layers=2,
batch_first=True,
dropout=dropout,
proj_size=self.embedding_dim,
)
def forward(self, x):
_, (h_n, _) = self.lstm1(x)
return h_n[-1].unsqueeze(1), lens  # note: `lens` is not defined anywhere in this class, which is part of why this version fails
Any help and tips about working with time-series, packed sequences, lstm-cells and dropout would be immensely appreciated, as I'm not finding much documentation/guidance elsewhere on the internet. Thank you!
Best, Lars Ankile
For posterity: after a lot of trial and error, the following full code for the autoencoder seems to work very well. Getting the packing and unpacking to work correctly was the main hurdle. The key, I think, is to utilize the LSTM modules for what they're worth by using the proj_size, num_layers, and dropout parameters.
class EncoderV4(nn.Module):
def __init__(
self, seq_len, n_features, embedding_dim, hidden_dim, dropout, num_layers
):
super().__init__()
self.seq_len = seq_len
self.n_features = n_features
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.lstm1 = nn.LSTM(
input_size=n_features,
hidden_size=self.hidden_dim,
num_layers=num_layers,
batch_first=True,
dropout=dropout,
proj_size=self.embedding_dim,
)
def forward(self, x):
_, (h_n, _) = self.lstm1(x)
return h_n[-1].unsqueeze(1)
class DecoderV4(nn.Module):
def __init__(self, seq_len, input_dim, hidden_dim, n_features, num_layers):
super().__init__()
self.seq_len = seq_len
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.n_features = n_features
self.num_layers = num_layers
self.lstm1 = nn.LSTM(
input_size=input_dim,
hidden_size=hidden_dim,
num_layers=num_layers,
proj_size=n_features,
batch_first=True,
)
def forward(self, x, lens):
x = x.repeat(1, self.seq_len, 1)
x = pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
x, _ = self.lstm1(x)
return x
class RecurrentAutoencoderV4(nn.Module):
def __init__(
self, seq_len, n_features, embedding_dim, hidden_dim, dropout, num_layers
):
super().__init__()
self.encoder = EncoderV4(
seq_len, n_features, embedding_dim, hidden_dim, dropout, num_layers
)
self.decoder = DecoderV4(
seq_len, embedding_dim, hidden_dim, n_features, num_layers
)
def forward(self, x, lens):
x = self.encoder(x)
x = self.decoder(x, lens)
return x
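A minimal usage sketch (the sizes and lengths below are made-up assumptions, and this relies on LSTM's proj_size argument, i.e. PyTorch 1.8+):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

seq_len, n_features, embedding_dim, hidden_dim = 100, 1, 8, 32   # illustrative sizes only
model = RecurrentAutoencoderV4(seq_len, n_features, embedding_dim,
                               hidden_dim, dropout=0.2, num_layers=2)

batch = torch.randn(4, seq_len, n_features)   # an already padded batch of 4 series
lens = torch.tensor([100, 80, 60, 30])        # true length of each series
packed = pack_padded_sequence(batch, lens, batch_first=True, enforce_sorted=False)

reconstruction = model(packed, lens)          # a PackedSequence with n_features values per step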
The full code and a paper using this Autoencoder can be found at GitHub and arXiv, respectively.
I looked through different implementations of BERT's Masked Language Model.
For pre-training there are two common versions:
The decoder would simply take the final embedding of the [MASK]ed token and pass it through a linear layer (without any modifications):
class LMPrediction(nn.Module):
def __init__(self, hidden_size, vocab_size):
super().__init__()
self.decoder = nn.Linear(hidden_size, vocab_size, bias = False)
self.bias = nn.Parameter(torch.zeros(vocab_size))
self.decoder.bias = self.bias
def forward(self, x):
return self.decoder(x)
Some implementations would use the weights of the input embeddings as the weights of the decoder linear layer:
class LMPrediction(nn.Module):
def __init__(self, hidden_size, vocab_size, embeddings):
super().__init__()
self.decoder = nn.Linear(hidden_size, vocab_size, bias = False)
self.bias = nn.Parameter(torch.zeros(vocab_size))
self.decoder.weight = embeddings.weight ## <- THIS LINE
self.decoder.bias = self.bias
def forward(self, x):
return self.decoder(x)
Which one is correct? Mostly, I see the first implementation. However, the second one makes sense as well, but I cannot find it mentioned in any papers (I would like to see whether the second version is somehow superior to the first one).
For those who are interested, it is called weight tying or joint input-output embedding. There are two papers that argue for the benefit of this approach:
Beyond Weight Tying: Learning Joint Input-Output Embeddings for Neural Machine Translation
Using the Output Embedding to Improve Language Models
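For completeness, a minimal sketch of what the tying looks like end to end, using the second LMPrediction class above (sizes are illustrative only):

import torch
import torch.nn as nn

vocab_size, hidden_size = 30522, 768                # roughly BERT-base-sized, for illustration
embeddings = nn.Embedding(vocab_size, hidden_size)
head = LMPrediction(hidden_size, vocab_size, embeddings)

# the decoder and the input embedding now share one weight matrix,
# so the MLM loss also updates the input embeddings directly
assert head.decoder.weight is embeddings.weight

hidden_states = torch.randn(2, 5, hidden_size)      # (batch, seq_len, hidden)
logits = head(hidden_states)                        # (batch, seq_len, vocab_size)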
I'm trying to understand whether my two-layer LSTM is like the one in the image. Can someone help me?
P.S. I have removed the mean pooling layer.
Input layer (green), hidden LSTM layer (blue), linear layer (yellow):
https://ibb.co/1qN6q1m
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.num_layer=2
self.i2h = nn.LSTM(input_size, hidden_size, self.num_layer, dropout=0.5)
self.i2o = nn.Linear(hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
self.hidden = self.initHidden()
def forward(self, input):
lstmout, self.hidden= self.i2h(input.view(len(input), 1, -1), self.hidden)
output = self.i2o(lstmout.view(len(input), -1))
output = self.softmax(output)
return output
def initHidden(self):
return (torch.zeros(self.num_layer, 1, self.hidden_size).to(device),torch.zeros(self.num_layer, 1, self.hidden_size).to(device))
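A quick shape check (illustrative sizes; this assumes device is defined as in the earlier snippets):

rnn = RNN(input_size=10, hidden_size=20, output_size=5).to(device)
seq = torch.randn(7, 10).to(device)   # a sequence of 7 steps with 10 features each
out = rnn(seq)
print(out.shape)                      # torch.Size([7, 5]): one log-probability vector per time step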