RuntimeError: Expected hidden[0] size (1, 1, 512), got (1, 128, 512) for LSTM pytorch - pytorch

I trained the LSTM with a batch size of 128 and during testing my batch size is 1, why do I get this error? I'm suppose to initialize the hidden size when doing testing?
Here is the code that i'm using, I initialize the hidden state init_hidden function as (number_of_layers, batch_size, hidden_size) since batch_first=True
class ImageLSTM(nn.Module):
def __init__(self, n_inputs:int=49,
n_outputs:int=4096,
n_hidden:int=256,
n_layers:int=1,
bidirectional:bool=False):
"""
Takes a 1D flatten images.
"""
super(ImageLSTM, self).__init__()
self.n_inputs = n_inputs
self.n_hidden = n_hidden
self.n_outputs = n_outputs
self.n_layers = n_layers
self.bidirectional = bidirectional
self.lstm = nn.LSTM( input_size=self.n_inputs,
hidden_size=self.n_hidden,
num_layers=self.n_layers,
dropout = 0.5 if self.n_layers>1 else 0,
bidirectional=self.bidirectional,
batch_first=True)
if (self.bidirectional):
self.FC = nn.Sequential(
nn.Linear(self.n_hidden*2, self.n_outputs),
nn.Dropout(p=0.5),
nn.Sigmoid()
)
else:
self.FC = nn.Sequential(
nn.Linear(self.n_hidden, self.n_outputs),
# nn.Dropout(p=0.5),
nn.Sigmoid()
)
def init_hidden(self, batch_size, device=None): # input 4D tensor: (batch size, channels, width, height)
# initialize the hidden and cell state to zero
# vectors:(number of layer, batch size, number of hidden nodes)
if (self.bidirectional):
h0 = torch.zeros(2*self.n_layers, batch_size, self.n_hidden)
c0 = torch.zeros(2*self.n_layers, batch_size, self.n_hidden)
else:
h0 = torch.zeros(self.n_layers, batch_size, self.n_hidden)
c0 = torch.zeros(self.n_layers, batch_size, self.n_hidden)
if device is not None:
h0 = h0.to(device)
c0 = c0.to(device)
self.hidden = (h0,c0)
def forward(self, X): # X: tensor of shape (batch_size, channels, width, height)
# forward propagate LSTM
lstm_out, self.hidden = self.lstm(X, self.hidden) # lstm_out: tensor of shape (batch_size, seq_length, hidden_size)
# Decode the hidden state of the last time step
out = self.FC(lstm_out[:, -1, :])
return out

please edit your post and add code. How did you initialize the hidden-state? What does you model look like.
hidden[0] is not your hidden-size, its the hidden-state of the lstm. The shape of the hidden-state has to be initialized like this:
hidden = ( torch.zeros((batch_size, layers, hidden_size)), torch.zeros((layers, batch_size, hidden_size)) )
You seem to have done this correctly. But the error tells you that you gave a batch of size 1 (because as you said you want to test with only one sample) but the hidden-state is initialized with batch-size=128.
So I guess (please add code) that you hard-coded that the batch-size = 128. Dont do that. Since you have to reinitialize the hidden-state every forward pass you can do this:
...
def forward(self, x):
batch_size = x.shape[0]
hidden = (torch.zeros(self.layers, batch_size, self.hidden_size).to(device=device), torch.zeros(self.layers, batch_size, self.hidden_size).to(device=device))
output, hidden = lstm(x, hidden)
# then do what every you want with the output
I guess that this is what causes this error but please post your code, too!

Related

mat1 and mat2 shapes cannot be multiplied (1x110 and 100x256)

I am trying to build a GAN which accepts a label to generate new image.
class Generator(nn.Module):
def __init__(self):
super(Generator, self).__init__()
self.fc1 = nn.Linear(100, 256)
self.fc2 = nn.Linear(256, 512)
self.fc3 = nn.Linear(512, 1024)
self.fc4 = nn.Linear(1024, 784)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = torch.tanh(self.fc4(x))
return x
# set label
label = 3
# create one-hot encoded vector
one_hot = torch.zeros(1, 10)
one_hot[0][label] = 1
# set noise vector
noise = torch.randn(1, 100)
# concatenate label and noise
noise_with_label = torch.cat([one_hot, noise], dim=1)
# pass through generator
generated_image = generator(noise_with_label)
But its throwing :
112
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
115
116 def extra_repr(self) -> str:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x110 and 100x256)
I am using MNIST dataset.
I tried to resolve it, but couldn’t find a way to fix it.
Original comment by #jodag helped to fix the issue:
The input you're providing to your model has shape [1,110] because you're concatenating noise (shape [1,100]) and one_hot (shape [1,10]). But the first layer expects inputs of shape [..., 100]. One way to get rid of the error would be to change the first layer of the model to self.fc1 = nn.Linear(110, 256). An alternative would be to define noise as noise = torch.randn(1, 90)

LSTM Autoencoder set-up for multiple features using Pytorch

I am building an LSTM autoencoder to denoise signals and will take more than 1 feature as it's input.
I have setup the model Encoder part as follows which works for single feature inputs (i.e. sequences with just one feature):
class Encoder(nn.Module):
def __init__(self, seq_len, n_features, num_layers=1, embedding_dim=64):
super(Encoder, self).__init__()
self.seq_len = seq_len
self.n_features = n_features
self.num_layers = num_layers
self.embedding_dim = embedding_dim
self.hidden_dim = 2 * embedding_dim
# input: batch_size, seq_len, features
self.lstm1 = nn.LSTM(
input_size=self.n_features,
hidden_size=self.hidden_dim,
num_layers=self.num_layers,
batch_first=True
) # output: batch size, seq_len, hidden_dim
# input: batch_size, seq_len, hidden_dim
self.lstm2 = nn.LSTM(
input_size=self.hidden_dim,
hidden_size = self.embedding_dim,
num_layers = self.num_layers,
batch_first=True
) # output: batch_size, seq_len, embedding_dim
def forward(self, x):
print(x)
x = x.reshape((1, self.seq_len, self.n_features))
print(x.shape)
x, (_, _) = self.lstm1(x)
print(x.shape)
x, (hidden_n, _) = self.lstm2(x)
print(x.shape, hidden_n.shape)
print(hidden_n)
return hidden_n.reshape((self.n_features, self.embedding_dim))
When I test this setup as follows:
model = Encoder(1024, 1)
model.forward(torch.randn(1024, 1))
with the 1 representing a single feature all is well. However, when I do the following (where 2 represents a sequence of 2 features):
model = Encoder(1024, 2)
model.forward(torch.randn(1024, 2))
I get the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [296], in <cell line: 1>()
----> 1 model.forward(torch.randn(1024, 2))
Input In [294], in Encoder.forward(self, x)
36 print(hidden_n)
37 # print(hidden_n.reshape((self.n_features, self.embedding_dim)).shape)
---> 39 return hidden_n.reshape((self.n_features, self.embedding_dim))
RuntimeError: shape '[2, 64]' is invalid for input of size 64
The hidden_n shape comes out as torch.Size([1, 1, 64]). I would like to understand that if we have more than 1 feature, e.g. 2, do we want to get that shape into the format of 1, 2, 64 such that the hidden state has weights for both features?
Can someone please explain why reshape is not liking the way I'm trying to restructure the output of the Encoder and how I should do it such that the model is able to take any feature size into account.
What am I missing/ perhaps misunderstanding here?

Extracting Autoencoder features from the hidden layer

I have developed some code to apply Autoencoder on my dataset, in order to extract hidden features from it. I have a dataset that consists of 84 variables, and they have been normalised.
epochs = 10
batch_size = 128
lr = 0.008
# Convert Input and Output data to Tensors and create a TensorDataset
input = torch.Tensor(input.to_numpy())
output = torch.tensor(output.to_numpy())
data = torch.utils.data.TensorDataset(input, output)
# Split to Train, Validate and Test sets using random_split
number_rows = len(input) # The size of our dataset or the number of rows in excel table.
test_split = int(number_rows*0.3)
train_split = number_rows - test_split
train_set, test_set = random_split(data, [train_split, test_split])
# Create Dataloader to read the data within batch sizes and put into memory.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
The model structure:
# Model structure
class AutoEncoder(nn.Module):
def __init__(self):
super(AutoEncoder, self).__init__()
# Encoder
self.encoder = nn.Sequential(
nn.Linear(84, 128),
nn.Tanh(),
nn.Linear(128, 64),
nn.Tanh(),
nn.Linear(64, 16),
nn.Tanh(),
nn.Linear(16, 2),
)
# Decoder
self.decoder = nn.Sequential(
nn.Linear(2, 16),
nn.Tanh(),
nn.Linear(16, 64),
nn.Tanh(),
nn.Linear(64, 128),
nn.Tanh(),
nn.Linear(128, 84),
nn.Sigmoid()
)
def forward(self, inputs):
codes = self.encoder(inputs)
decoded = self.decoder(codes)
return codes, decoded
Optimiser and Loss function
# Optimizer and loss function
model = AutoEncoder()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.MSELoss()
The training steps:
# Train
for epoch in range(epochs):
for data, labels in train_loader:
inputs = data.view(-1, 84)
# Forward
codes, decoded = model(inputs)
# Backward
optimizer.zero_grad()
loss = loss_function(decoded, inputs)
loss.backward()
optimizer.step()
# Show progress
print('[{}/{}] Loss:'.format(epoch+1, epochs), loss.item())
The Autoencoder model is saved as:
# Save
torch.save(model,'autoencoder.pth')
At this point, I would like to ask some help to understand how I could extract the features from the hidden layer. These features extracted from the hidden layer will be used in another classification algorithm.
You need to place an hook to your model. And you can use this hook to extract features from any layer. However it is a lot easier if you don't use nn.Sequential because it combines the layer together and they act as one. I run your code using this function:
There is a function for Feature Extraction which basically takes model as an input and place a hook using index of layer.
class FE(nn.Module):
def __init__(self,model_instance, output_layers, *args):
super().__init__(*args)
self.output_layers = output_layers
self.selected_out = OrderedDict()
self.pretrained = model_instance
self.fhooks = []
print("model_instance._modules.keys():",model_instance._modules.keys())
for i,l in enumerate(list(self.pretrained._modules.keys())):
print("index:",i, ", keys:",l )
if i in self.output_layers:
print("------------------------ > Hook is placed output of :" , l )
self.fhooks.append(getattr(self.pretrained,l).register_forward_hook(self.forward_hook(l)))
def forward_hook(self,layer_name):
def hook(module, input, output):
self.selected_out[layer_name] = output
return hook
def forward(self, x):
out = self.pretrained(x,None)
return out, self.selected_out
And to use:
model_hooked=FE(model ,output_layers = [0])
model_instance._modules.keys(): odict_keys(['encoder', 'decoder'])
index: 0 , keys: encoder
------------------------ > Hook is placed output of : encoder
index: 1 , keys: decoder
After placing the hook you can simply put data to new hooked model and it will output 2 values.First one is original output from last layer and second output will be the output from hooked layer
out, layerout = model_hooked(data_sample)
If you want to extract features from a loaders you can use this function:
def extract_features(FE ,layer_name, train_loader, test_loader):
extracted_features=[]
lbls=[]
extracted_features_test=[]
lbls_test=[]
for data , target in train_loader:
out, layerout = FE(data)
a=layerout[layer_name]
extracted_features.extend(a)
lbls.extend(target)
for data , target in test_loader:
out, layerout = FE(data)
a=layerout[layer_name]
extracted_features_test.extend(a)
lbls_test.extend(target)
extracted_features = torch.stack(extracted_features)
extracted_features_test = torch.stack(extracted_features_test)
lbls = torch.stack(lbls)
lbls_test = torch.stack(lbls_test)
return extracted_features, lbls ,extracted_features_test, lbls_test
And usage is like this :
Features_TRAINLOADER , lbls , Features_TESTLOADER, lbls_test =extract_features(model_hooked, "encoder", train_loader, test_loader)

Expected input batch_size (18) to match target batch_size (6)

Is RNN for image classification available only for gray image?
The following program works for gray image classification.
If RGB images are used, I have this error:
Expected input batch_size (18) to match target batch_size (6)
at this line loss = criterion(outputs, labels).
My data loading for train, valid and test are as follows.
input_size = 300
inputH = 300
inputW = 300
#Data transform (normalization & data augmentation)
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_resize_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
train_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.RandomHorizontalFlip(),
tt.ToTensor(),
tt.Normalize(*stats)])
valid_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
test_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
#Create dataset
train_ds = ImageFolder('./data/train', train_tfms)
valid_ds = ImageFolder('./data/valid', valid_tfms)
test_ds = ImageFolder('./data/test', test_tfms)
from torch.utils.data.dataloader import DataLoader
batch_size = 6
#Training data loader
train_dl = DataLoader(train_ds, batch_size, shuffle = True, num_workers = 8, pin_memory=True)
#Validation data loader
valid_dl = DataLoader(valid_ds, batch_size, shuffle = True, num_workers = 8, pin_memory=True)
#Test data loader
test_dl = DataLoader(test_ds, 1, shuffle = False, num_workers = 1, pin_memory=True)
My model is as follows.
num_steps = 300
hidden_size = 256 #size of hidden layers
num_classes = 5
num_epochs = 20
learning_rate = 0.001
# Fully connected neural network with one hidden layer
num_layers = 2 # 2 RNN layers are stacked
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN, self).__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)#batch must have first dimension
#our inpyt needs to have shape
#x -> (batch_size, seq, input_size)
self.fc = nn.Linear(hidden_size, num_classes)#this fc is after RNN. So needs the last hidden size of RNN
def forward(self, x):
#according to ducumentation of RNN in pytorch
#rnn needs input, h_0 for inputs at RNN (h_0 is initial hidden state)
#the following one is initial hidden layer
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)#first one is number of layers and second one is batch size
#output has two outputs. The first tensor contains the output features of the hidden last layer for all time steps
#the second one is hidden state f
out, _ = self.rnn(x, h0)
#output has batch_size, num_steps, hidden size
#we need to decode hidden state only the last time step
#out (N, 30, 128)
#Since we need only the last time step
#Out (N, 128)
out = out[:, -1, :] #-1 for last time step, take all for N and 128
out = self.fc(out)
return out
stacked_rnn_model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()#cross entropy has softmax at output
#optimizer = torch.optim.Adam(stacked_rnn_model.parameters(), lr=learning_rate) #optimizer used gradient optimization using Adam
optimizer = torch.optim.SGD(stacked_rnn_model.parameters(), lr=learning_rate)
# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
t_losses=[]
for i, (images, labels) in enumerate(train_dl):
# origin shape: [6, 3, 300, 300]
# resized: [6, 300, 300]
images = images.reshape(-1, num_steps, input_size).to(device)
print('images shape')
print(images.shape)
labels = labels.to(device)
# Forward pass
outputs = stacked_rnn_model(images)
print('outputs shape')
print(outputs.shape)
loss = criterion(outputs, labels)
t_losses.append(loss)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
Printing images and outputs shapes are
images shape
torch.Size([18, 300, 300])
outputs shape
torch.Size([18, 5])
Where is the mistake?
Tl;dr: You are flattening the first two axes, namely batch and channels.
I am not sure you are taking the right approach but I will write about that layer.
In any case, let's look at the issue you are facing. You have a data loader that produces (6, 3, 300, 300), i.e. batches of 6 three-channel 300x300 images. By the look of it you are looking to reshape each batch element (3, 300, 300) into (step_size=300, -1).
However instead of that you are affecting the first axis - which you shouldn't - with images.reshape(-1, num_steps, input_size). This will have the desired effect when working with a single-channel images since dim=1 wouldn't be the "channel axis". In your case your have 3 channels, therefore, the resulting shape is: (6*3*300*300//300//300, 300, 300) which is (18, 300, 300) since num_steps=300 and input_size=300. As a result you are left with 18 batch elements instead of 6.
Instead what you want is to reshape with (batch_size, num_steps, -1). Leaving the last axis (a.k.a. seq_length) of variable size. This will result in a shape (6, 300, 900).
Here is a corrected and reduced snippet:
batch_size = 6
channels = 3
inputH, inputW = 300, 300
train_ds = TensorDataset(torch.rand(100, 3, inputH, inputW), torch.rand(100, 5))
train_dl = DataLoader(train_ds, batch_size)
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN, self).__init__()
# (batch_size, seq, input_size)
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
# (batch_size, hidden_size)
self.fc = nn.Linear(hidden_size, num_classes)
# (batch_size, num_classes)
def forward(self, x):
out, _ = self.rnn(x)
out = out[:, -1, :]
out = self.fc(out)
return out
num_steps = 300
input_size = inputH*inputW*channels//num_steps
hidden_size = 256
num_classes = 5
num_layers = 2
rnn = RNN(input_size, hidden_size, num_layers, num_classes)
for x, y in train_dl:
print(x.shape, y.shape)
images = images.reshape(batch_size, num_steps, -1)
print(images.shape)
outputs = rnn(images)
print(outputs.shape)
break
As I said in the beginning I am a bit wary about this approach because you are essentially feeding your RNN a RGB 300x300 image in the form of a sequence of 300 flattened vectors... I can't say if that makes sense and terms of training and if the model will be able to learn from that. I could be wrong!

Using LSTM stateful for passing context b/w batches; may be some error in context passing, not getting good results?

I have checked the data before giving it to the network. The data is correct.
Using LSTM and passing the context b/w batches. per_class_accuracy is changing, but the loss is not going down. Been stuck for long, not sure if there is an error in the Code?
I have multi-class classification problem based upon an imbalanced dataset
Dataset_type: CSV
Dataset_size: 20000
Based upon CSV data of sensors
X = 0.6986111111111111,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0
Y = leaveHouse
Per class accuracy:
{'leaveHouse': 0.34932855, 'getDressed': 1.0, 'idle': 0.8074534, 'prepareBreakfast': 0.8, 'goToBed': 0.35583413, 'getDrink': 0.0, 'takeShower': 1.0, 'useToilet': 0.0, 'eatBreakfast': 0.8857143}
Training:
# Using loss weights, the inverse of class frequency
criterion = nn.CrossEntropyLoss(weight = class_weights)
hn, cn = model.init_hidden(batch_size)
for i, (input, label) in enumerate(trainLoader):
hn.detach_()
cn.detach_()
input = input.view(-1, seq_dim, input_dim)
if torch.cuda.is_available():
input = input.float().cuda()
label = label.cuda()
else:
input = input.float()
label = label
# Forward pass to get output/logits
output, (hn, cn) = model((input, (hn, cn)))
# Calculate Loss: softmax --> cross entropy loss
loss = criterion(output, label)#weig pram
running_loss += loss
loss.backward() # Backward pass
optimizer.step() # Now we can do an optimizer step
optimizer.zero_grad() # Reset gradients tensors
Network
class LSTMModel(nn.Module):
def init_hidden(self, batch_size):
self.batch_size = batch_size
if torch.cuda.is_available():
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
else:
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
return hn, cn
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, seq_dim):
super(LSTMModel, self).__init__()
# Hidden dimensions
self.hidden_dim = hidden_dim
# Number of hidden layers
self.layer_dim = layer_dim
self.input_dim = input_dim
# Building your LSTM
# batch_first=True causes input/output tensors to be of shape
# (batch_dim, seq_dim, feature_dim)
self.lstm = nn.LSTM(self.input_dim, hidden_dim, layer_dim, batch_first=True)
# Readout layer
self.fc = nn.Linear(hidden_dim, output_dim)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.seq_dim = seq_dim
def forward(self, inputs):
# Initialize hidden state with zeros
input, (hn, cn) = inputs
input = input.view(-1, self.seq_dim, self.input_dim)
# time steps
out, (hn, cn) = self.lstm(input, (hn, cn))
# Index hidden state of last time step
out = self.fc(out[:, -1, :])
out = self.softmax(out)
return out, (hn,cn)
One problem you might have is CrossEntropyLoss combines a log softmax operation with negative log likelihood loss, but you're applying a softmax in your model. You should pass the raw logits out of the final layer to CrossEntropyLoss.
Also I an't say without seeing the models forward pass, but it looks like you're applying the softmax on dimension 1 to a tensor that (I'm inferring) has shape batch_size, sequence_length, output_dim, when you should be applying it along the output dim.

Resources