PyTorch multi-class: ValueError: Expected input batch_size (416) to match target batch_size (32) - nlp

I have created a mutli-class classification neural network. Training, and validation iterators where created with BigBucketIterator method with fields {'text_normalized_tweet':TEXT, 'label': LABEL}
TEXT = a tweet
LABEL = a float number (with 3 values: 0,1,2)
Below I execute a dummy example of my neural network:
import torch.nn as nn
class MultiClassClassifer(nn.Module):
#define all the layers used in model
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
#Constructor
super(MultiClassClassifer, self).__init__()
#embedding layer
self.embedding = nn.Embedding(vocab_size, embedding_dim)
#dense layer
self.hiddenLayer = nn.Linear(embedding_dim, hidden_dim)
#Batch normalization layer
self.batchnorm = nn.BatchNorm1d(hidden_dim)
#output layer
self.output = nn.Linear(hidden_dim, output_dim)
#activation layer
self.act = nn.Softmax(dim=1) #2d-tensor
#initialize weights of embedding layer
self.init_weights()
def init_weights(self):
initrange = 1.0
self.embedding.weight.data.uniform_(-initrange, initrange)
def forward(self, text, text_lengths):
embedded = self.embedding(text)
#packed sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
tensor, batch_size = packed_embedded[0], packed_embedded[1]
hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
return self.act(self.output(hidden_1))
Instantiate the model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 3
model = MultiClassClassifer(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
When I call
text, text_lengths = batch.text_normalized_tweet
predictions = model(text, text_lengths).squeeze()
loss = criterion(predictions, batch.label)
it returns,
ValueError: Expected input batch_size (416) to match target batch_size (32).
model(text, text_lengths).squeeze() = torch.Size([416, 3])
batch.label = torch.Size([32])
I can see that the two objects have different sizes, but I have no clue how to fix this?
You may find the Google Colab notebook here
Shapes of each in, out tensor of my forward() method:
torch.Size([32, 10, 100]) #self.embedding(text)
torch.Size([320, 100]) #nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
torch.Size([320, 64]) #self.batchnorm(self.hiddenLayer(tensor))
torch.Size([320, 3]) #self.act(self.output(hidden_1))

You shouldn't be using the squeeze function after the forward pass, that doesn't make sense.
After removing the squeeze function, as you see, the shape of your final output is [320,3] whereas it is expecting [32,3]. One way to fix this is to average out the embeddings you obtain for each word after the self.Embedding function like shown below:
def forward(self, text, text_lengths):
embedded = self.embedding(text)
embedded = torch.mean(embedded, dim=1, keepdim=True)
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
tensor, batch_size = packed_embedded[0], packed_embedded[1]
hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
return self.act(self.output(hidden_1))

Related

CNN-LSTM for image sequences classification | high loss

I'm working on a project where I need to classify image sequences of some plants (growing over time). I tried implementing a CNN-LSTM with a pretrained ResNet18 as a feature extractor and then feeding those feature sequences to the LSTM.
The issue is that I'm not used to train LSTMs, and I'm afraid I'm doing something wrong. I made a clear architecture and everything seems ok, but the loss is not decreasing.
here's the architecture:
class RecurrentCNN(nn.Module):
def __init__(self, embed_dim, hidden_size, num_layers, num_classes):
super(RecurrentCNN, self).__init__()
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.num_layers = num_layers
self.num_classes = num_classes
self.cnn = torchvision.models.resnet18(weights='DEFAULT')
self.cnn.fc = nn.Sequential(
nn.Linear(in_features=512, out_features=self.embed_dim, bias=False),
nn.BatchNorm1d(num_features=self.embed_dim)
)
self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
self.fc = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.ReLU(),
nn.BatchNorm1d(num_features=hidden_size),
nn.Dropout(0.2),
nn.Linear(hidden_size, num_classes)
)
def forward(self, x):
batch_size, img_size = x.shape[0], x.shape[2:]
x = x.reshape(-1, *img_size) # i merge the batch_size and num_seq in order to feed everything to the cnn
x = self.cnn(x)
x = x.reshape(batch_size, -1, self.embed_dim) # then i comeback the original shape
# lstm part
h_0 = torch.autograd.Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
c_0 = torch.autograd.Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
x, (hn, cn) = self.lstm(x, (h_0, c_0))
x = x[:, -1, :]
x = self.fc(x)
return x
I have 40 classes to output. My sequences are of different lengths, so I was forced to pad with some black images sometimes! (mean seq length: 39, max: 55, min: 15)
I'm feeding the model with sequences of shape (batch_size, seq_len=55, 3, 112, 112).
It may be wrong but for now I just want to make sure that the model is at least working correctly, then I'll probably change the strategy of learning.
here's the training code:
EPOCHS = 10
BATCH_SIZE = 4
dataset = PlantDataset(data_path, max_sequence_len=55, transform=None)
train_loader = torch.utils.data.DataLoader(
dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True
)
rcnn = RecurrentCNN(embed_dim=128, hidden_size=256, num_layers=2, num_classes=len(class_list)).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(rcnn.parameters(), lr=0.0001)
loss_am = list() #AverageMeter()
rcnn.train()
for epoch in range(EPOCHS):
progress = tqdm(range(dataset.__len__() * BATCH_SIZE))
for i, data in enumerate(train_loader):
optimizer.zero_grad()
sequences, targets = data
sequences, targets = sequences.to(device, dtype=torch.float), torch.Tensor(targets).to(device)
output = torch.nn.functional.log_softmax(rcnn(sequences), dim=1)
loss_value = criterion(output, targets)
loss_value.backward()
optimizer.step()
with torch.no_grad():
loss_am.append(loss_value.item())
progress.update(i)
progress.set_description('Epoch: {}, Loss: {:.4f}'.format(epoch, loss_value.item()))
progress.close()
The loss on each batch goes like
3.53 => 4.22 => 4.62 => 3.83 => 3.75 => 3.80 => 3.70, etc
Do you have any idea ?
I am facing the same issue. But I am able to find the problem. Since I am using the Image-sequences dataset, my model is not able to predict the tokens, instead, I ended up with a whole set of garbage tokens. I am still trying to figure out why this is happening.

Extracting Autoencoder features from the hidden layer

I have developed some code to apply Autoencoder on my dataset, in order to extract hidden features from it. I have a dataset that consists of 84 variables, and they have been normalised.
epochs = 10
batch_size = 128
lr = 0.008
# Convert Input and Output data to Tensors and create a TensorDataset
input = torch.Tensor(input.to_numpy())
output = torch.tensor(output.to_numpy())
data = torch.utils.data.TensorDataset(input, output)
# Split to Train, Validate and Test sets using random_split
number_rows = len(input) # The size of our dataset or the number of rows in excel table.
test_split = int(number_rows*0.3)
train_split = number_rows - test_split
train_set, test_set = random_split(data, [train_split, test_split])
# Create Dataloader to read the data within batch sizes and put into memory.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
The model structure:
# Model structure
class AutoEncoder(nn.Module):
def __init__(self):
super(AutoEncoder, self).__init__()
# Encoder
self.encoder = nn.Sequential(
nn.Linear(84, 128),
nn.Tanh(),
nn.Linear(128, 64),
nn.Tanh(),
nn.Linear(64, 16),
nn.Tanh(),
nn.Linear(16, 2),
)
# Decoder
self.decoder = nn.Sequential(
nn.Linear(2, 16),
nn.Tanh(),
nn.Linear(16, 64),
nn.Tanh(),
nn.Linear(64, 128),
nn.Tanh(),
nn.Linear(128, 84),
nn.Sigmoid()
)
def forward(self, inputs):
codes = self.encoder(inputs)
decoded = self.decoder(codes)
return codes, decoded
Optimiser and Loss function
# Optimizer and loss function
model = AutoEncoder()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.MSELoss()
The training steps:
# Train
for epoch in range(epochs):
for data, labels in train_loader:
inputs = data.view(-1, 84)
# Forward
codes, decoded = model(inputs)
# Backward
optimizer.zero_grad()
loss = loss_function(decoded, inputs)
loss.backward()
optimizer.step()
# Show progress
print('[{}/{}] Loss:'.format(epoch+1, epochs), loss.item())
The Autoencoder model is saved as:
# Save
torch.save(model,'autoencoder.pth')
At this point, I would like to ask some help to understand how I could extract the features from the hidden layer. These features extracted from the hidden layer will be used in another classification algorithm.
You need to place an hook to your model. And you can use this hook to extract features from any layer. However it is a lot easier if you don't use nn.Sequential because it combines the layer together and they act as one. I run your code using this function:
There is a function for Feature Extraction which basically takes model as an input and place a hook using index of layer.
class FE(nn.Module):
def __init__(self,model_instance, output_layers, *args):
super().__init__(*args)
self.output_layers = output_layers
self.selected_out = OrderedDict()
self.pretrained = model_instance
self.fhooks = []
print("model_instance._modules.keys():",model_instance._modules.keys())
for i,l in enumerate(list(self.pretrained._modules.keys())):
print("index:",i, ", keys:",l )
if i in self.output_layers:
print("------------------------ > Hook is placed output of :" , l )
self.fhooks.append(getattr(self.pretrained,l).register_forward_hook(self.forward_hook(l)))
def forward_hook(self,layer_name):
def hook(module, input, output):
self.selected_out[layer_name] = output
return hook
def forward(self, x):
out = self.pretrained(x,None)
return out, self.selected_out
And to use:
model_hooked=FE(model ,output_layers = [0])
model_instance._modules.keys(): odict_keys(['encoder', 'decoder'])
index: 0 , keys: encoder
------------------------ > Hook is placed output of : encoder
index: 1 , keys: decoder
After placing the hook you can simply put data to new hooked model and it will output 2 values.First one is original output from last layer and second output will be the output from hooked layer
out, layerout = model_hooked(data_sample)
If you want to extract features from a loaders you can use this function:
def extract_features(FE ,layer_name, train_loader, test_loader):
extracted_features=[]
lbls=[]
extracted_features_test=[]
lbls_test=[]
for data , target in train_loader:
out, layerout = FE(data)
a=layerout[layer_name]
extracted_features.extend(a)
lbls.extend(target)
for data , target in test_loader:
out, layerout = FE(data)
a=layerout[layer_name]
extracted_features_test.extend(a)
lbls_test.extend(target)
extracted_features = torch.stack(extracted_features)
extracted_features_test = torch.stack(extracted_features_test)
lbls = torch.stack(lbls)
lbls_test = torch.stack(lbls_test)
return extracted_features, lbls ,extracted_features_test, lbls_test
And usage is like this :
Features_TRAINLOADER , lbls , Features_TESTLOADER, lbls_test =extract_features(model_hooked, "encoder", train_loader, test_loader)

Different training result obtained from training simple LSTM in Keras and Pytorch

I’m trying to implement my LSTM model from Keras to Pytorch, but the results in Pytorch seem really bad at the moment. The network is really simple as below.
model = Sequential()
model.add(LSTM(10, input_length=shape[1], input_dim=shape[2]))
# output shape: (1, 1)
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(1,activation="linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()
And I migrate it to the Pytorch framework,
class LSTM(nn.Module):
def __init__(self, input_dim, hidden_dim, num_layers, output_dim,bilstm=False):
super(LSTM, self).__init__()
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.isBi = bilstm
self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True,bidirectional=bilstm).double()
# for name, param in self.lstm.named_parameters():
# if name.startswith("weight"):
# nn.init.orthogonal_(param)
# else:
# pass
self.fc1 = nn.Sequential(nn.Linear(hidden_dim, 10).double(),nn.Tanh())
self.final_layer1 = nn.Sequential(nn.Linear(10,10).double(),nn.Tanh())
self.final_layer2 = nn.Sequential(nn.Linear(10,10).double(),nn.Tanh())
self.final_layer3 = nn.Sequential(nn.Linear(10,10).double(),nn.Tanh())
self.final_layer4 = nn.Sequential(nn.Linear(10,output_dim).double())
def forward(self, x):
out, (hn, cn) = self.lstm(x)
out = out[:, -1, :]
out = self.fc1(out)
out = self.final_layer1(out)
out = self.final_layer2(out)
out = self.final_layer3(out)
out = self.final_layer4(out)
return out
The result is really bad. I was wondering if the initializing methods/activation functions used in Keras are different from the one I used in Pytorch(Keras seems to be using hard_sigmoid where Pytorch uses sigmoid?).
Would really appreciate it if somebody could help me with this problem!
UPDATED
My training code in Pytorch.
criterion = nn.MSELoss()
model = LSTM(input_dim,hidden_dim,num_layers,output_dim,bilstm)
model = model.cuda()
optimizer = optim.Adam(model.parameters(),lr=0.001)
for epoch in range(1,epoch_number+1):
model.train()
iteration = 0
for i,data in enumerate(train_loader):
dat, label = data
dat = dat.double()
label = label.double()
if torch.cuda.is_available():
dat = dat.cuda()
label = label.cuda()
else:
dat = Variable(dat)
label = Variable(label)
out = model(dat)
optimizer.zero_grad()
loss = criterion(out, label)
loss.backward()
optimizer.step()

Expected input batch_size (18) to match target batch_size (6)

Is RNN for image classification available only for gray image?
The following program works for gray image classification.
If RGB images are used, I have this error:
Expected input batch_size (18) to match target batch_size (6)
at this line loss = criterion(outputs, labels).
My data loading for train, valid and test are as follows.
input_size = 300
inputH = 300
inputW = 300
#Data transform (normalization & data augmentation)
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_resize_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
train_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.RandomHorizontalFlip(),
tt.ToTensor(),
tt.Normalize(*stats)])
valid_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
test_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
#Create dataset
train_ds = ImageFolder('./data/train', train_tfms)
valid_ds = ImageFolder('./data/valid', valid_tfms)
test_ds = ImageFolder('./data/test', test_tfms)
from torch.utils.data.dataloader import DataLoader
batch_size = 6
#Training data loader
train_dl = DataLoader(train_ds, batch_size, shuffle = True, num_workers = 8, pin_memory=True)
#Validation data loader
valid_dl = DataLoader(valid_ds, batch_size, shuffle = True, num_workers = 8, pin_memory=True)
#Test data loader
test_dl = DataLoader(test_ds, 1, shuffle = False, num_workers = 1, pin_memory=True)
My model is as follows.
num_steps = 300
hidden_size = 256 #size of hidden layers
num_classes = 5
num_epochs = 20
learning_rate = 0.001
# Fully connected neural network with one hidden layer
num_layers = 2 # 2 RNN layers are stacked
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN, self).__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)#batch must have first dimension
#our inpyt needs to have shape
#x -> (batch_size, seq, input_size)
self.fc = nn.Linear(hidden_size, num_classes)#this fc is after RNN. So needs the last hidden size of RNN
def forward(self, x):
#according to ducumentation of RNN in pytorch
#rnn needs input, h_0 for inputs at RNN (h_0 is initial hidden state)
#the following one is initial hidden layer
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)#first one is number of layers and second one is batch size
#output has two outputs. The first tensor contains the output features of the hidden last layer for all time steps
#the second one is hidden state f
out, _ = self.rnn(x, h0)
#output has batch_size, num_steps, hidden size
#we need to decode hidden state only the last time step
#out (N, 30, 128)
#Since we need only the last time step
#Out (N, 128)
out = out[:, -1, :] #-1 for last time step, take all for N and 128
out = self.fc(out)
return out
stacked_rnn_model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()#cross entropy has softmax at output
#optimizer = torch.optim.Adam(stacked_rnn_model.parameters(), lr=learning_rate) #optimizer used gradient optimization using Adam
optimizer = torch.optim.SGD(stacked_rnn_model.parameters(), lr=learning_rate)
# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
t_losses=[]
for i, (images, labels) in enumerate(train_dl):
# origin shape: [6, 3, 300, 300]
# resized: [6, 300, 300]
images = images.reshape(-1, num_steps, input_size).to(device)
print('images shape')
print(images.shape)
labels = labels.to(device)
# Forward pass
outputs = stacked_rnn_model(images)
print('outputs shape')
print(outputs.shape)
loss = criterion(outputs, labels)
t_losses.append(loss)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
Printing images and outputs shapes are
images shape
torch.Size([18, 300, 300])
outputs shape
torch.Size([18, 5])
Where is the mistake?
Tl;dr: You are flattening the first two axes, namely batch and channels.
I am not sure you are taking the right approach but I will write about that layer.
In any case, let's look at the issue you are facing. You have a data loader that produces (6, 3, 300, 300), i.e. batches of 6 three-channel 300x300 images. By the look of it you are looking to reshape each batch element (3, 300, 300) into (step_size=300, -1).
However instead of that you are affecting the first axis - which you shouldn't - with images.reshape(-1, num_steps, input_size). This will have the desired effect when working with a single-channel images since dim=1 wouldn't be the "channel axis". In your case your have 3 channels, therefore, the resulting shape is: (6*3*300*300//300//300, 300, 300) which is (18, 300, 300) since num_steps=300 and input_size=300. As a result you are left with 18 batch elements instead of 6.
Instead what you want is to reshape with (batch_size, num_steps, -1). Leaving the last axis (a.k.a. seq_length) of variable size. This will result in a shape (6, 300, 900).
Here is a corrected and reduced snippet:
batch_size = 6
channels = 3
inputH, inputW = 300, 300
train_ds = TensorDataset(torch.rand(100, 3, inputH, inputW), torch.rand(100, 5))
train_dl = DataLoader(train_ds, batch_size)
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN, self).__init__()
# (batch_size, seq, input_size)
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
# (batch_size, hidden_size)
self.fc = nn.Linear(hidden_size, num_classes)
# (batch_size, num_classes)
def forward(self, x):
out, _ = self.rnn(x)
out = out[:, -1, :]
out = self.fc(out)
return out
num_steps = 300
input_size = inputH*inputW*channels//num_steps
hidden_size = 256
num_classes = 5
num_layers = 2
rnn = RNN(input_size, hidden_size, num_layers, num_classes)
for x, y in train_dl:
print(x.shape, y.shape)
images = images.reshape(batch_size, num_steps, -1)
print(images.shape)
outputs = rnn(images)
print(outputs.shape)
break
As I said in the beginning I am a bit wary about this approach because you are essentially feeding your RNN a RGB 300x300 image in the form of a sequence of 300 flattened vectors... I can't say if that makes sense and terms of training and if the model will be able to learn from that. I could be wrong!

Using LSTM stateful for passing context b/w batches; may be some error in context passing, not getting good results?

I have checked the data before giving it to the network. The data is correct.
Using LSTM and passing the context b/w batches. per_class_accuracy is changing, but the loss is not going down. Been stuck for long, not sure if there is an error in the Code?
I have multi-class classification problem based upon an imbalanced dataset
Dataset_type: CSV
Dataset_size: 20000
Based upon CSV data of sensors
X = 0.6986111111111111,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0
Y = leaveHouse
Per class accuracy:
{'leaveHouse': 0.34932855, 'getDressed': 1.0, 'idle': 0.8074534, 'prepareBreakfast': 0.8, 'goToBed': 0.35583413, 'getDrink': 0.0, 'takeShower': 1.0, 'useToilet': 0.0, 'eatBreakfast': 0.8857143}
Training:
# Using loss weights, the inverse of class frequency
criterion = nn.CrossEntropyLoss(weight = class_weights)
hn, cn = model.init_hidden(batch_size)
for i, (input, label) in enumerate(trainLoader):
hn.detach_()
cn.detach_()
input = input.view(-1, seq_dim, input_dim)
if torch.cuda.is_available():
input = input.float().cuda()
label = label.cuda()
else:
input = input.float()
label = label
# Forward pass to get output/logits
output, (hn, cn) = model((input, (hn, cn)))
# Calculate Loss: softmax --> cross entropy loss
loss = criterion(output, label)#weig pram
running_loss += loss
loss.backward() # Backward pass
optimizer.step() # Now we can do an optimizer step
optimizer.zero_grad() # Reset gradients tensors
Network
class LSTMModel(nn.Module):
def init_hidden(self, batch_size):
self.batch_size = batch_size
if torch.cuda.is_available():
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
else:
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
return hn, cn
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, seq_dim):
super(LSTMModel, self).__init__()
# Hidden dimensions
self.hidden_dim = hidden_dim
# Number of hidden layers
self.layer_dim = layer_dim
self.input_dim = input_dim
# Building your LSTM
# batch_first=True causes input/output tensors to be of shape
# (batch_dim, seq_dim, feature_dim)
self.lstm = nn.LSTM(self.input_dim, hidden_dim, layer_dim, batch_first=True)
# Readout layer
self.fc = nn.Linear(hidden_dim, output_dim)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.seq_dim = seq_dim
def forward(self, inputs):
# Initialize hidden state with zeros
input, (hn, cn) = inputs
input = input.view(-1, self.seq_dim, self.input_dim)
# time steps
out, (hn, cn) = self.lstm(input, (hn, cn))
# Index hidden state of last time step
out = self.fc(out[:, -1, :])
out = self.softmax(out)
return out, (hn,cn)
One problem you might have is CrossEntropyLoss combines a log softmax operation with negative log likelihood loss, but you're applying a softmax in your model. You should pass the raw logits out of the final layer to CrossEntropyLoss.
Also I an't say without seeing the models forward pass, but it looks like you're applying the softmax on dimension 1 to a tensor that (I'm inferring) has shape batch_size, sequence_length, output_dim, when you should be applying it along the output dim.

Resources