PyTorch: LSTM predicts the same constant value - pytorch

I want to predict one variable using 7 features with time steps of 4:
# Shape X_train: torch.Size([24433, 4, 7]
# Shape Y_train: torch.Size([24433, 4, 1]
# Shape X_test: torch.Size([6109, 4, 7]
# Shape Y_test: torch.Size([6109, 4, 1]
train_dataset = TensorDataset(X_train, Y_train)
test_dataset = TensorDataset(X_test, Y_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)
My (initial) LSTM model:
class LSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size)
self.linear = nn.Linear(hidden_size, output_size)
def forward(self, x):
x, _ = self.lstm(x)
x = self.linear(x)
return x
model = LSTMModel(input_size=7, hidden_size=256, output_size=1)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
Apply model:
# Loop over the training set
for X, Y in train_loader:
optimizer.zero_grad()
Y_pred = model(X)
loss = loss_fn(Y_pred, Y)
loss.backward()
optimizer.step()
model.eval()
# Loop over the test set
for X, Y in test_loader:
Y_pred = model(X)
loss = loss_fn(Y_pred, Y)
An example of Y (true data):
tensor([[[59.],
[59.],
[59.],
[59.]],
[[70.],
[70.],
[70.],
[70.]],
[[ 100.],
[ 0.],
[ 0.],
[ 0.]],
# etc.
However, my Y_pred is somewhat like this:
tensor([[[15.8224],
[15.8224],
[15.8224],
[15.8224]],
[[16.1654],
[16.1654],
[16.1654],
[16.1654]],
[[16.2127],
[16.2127],
[16.2127],
[16.2127]],
# etc.
I have tried numerous different things:
Changing the model architecture (different batch size, different number of layers)
Adding dropout and decay parameters
Using epochs and changing the number of epochs when looping over training and test data
Different optimizers (Adam, SGD) with different learning rates
Log transforming my input data
Examples of my data in a previous question.
I am fairly new with PyTorch and LSTMs so I might do it wrong, but, whatever I change, I keep getting a (near) constant value from the predictions. What am I doing wrong/what should I be doing?

I solved this by normalizing my input data. I now obtain different predictions for every output:
# Calculate the mean and standard deviation of each feature in the training set
X_mean = X_train.mean(dim=0)
X_std = X_train.std(dim=0)
# Standardize the training set
X_train = (X_train - X_mean) / X_std
# Standardize the test set using the mean and standard deviation of the training set
X_test = (X_test - X_mean) / X_std

Related

LSTM: calculating MSELoss in for loop returns NAN when backward pass

I am new with LSTM and ran into a problem. I'm trying to predict a variable using 7 features in time steps of 4. I am working with PyTorch.
Data
From my initial data frame (traindf), I created tensors for every feature and the target (Y) by:
featureX_train = torch.tensor(traindf.featureX[:test].values).view(-1, 4, 1)
Y_train = torch.tensor(traindf.Y[:test].values).view(-1, 4, 1)
...
featureX_test = torch.tensor(traindf.featureX[test:].values).view(-1, 4, 1)
Y_test = torch.tensor(traindf.Y[test:].values).view(-1, 4, 1)
I concatenated all the feature tensors into one X_train and one X_test. All tensors are float32:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
torch.Size([24436, 4, 7]) torch.Size([24436, 4, 1])
torch.Size([6109, 4, 7]) torch.Size([6109, 4, 1])
Eventually, I have a train and test data set:
train_dataset = TensorDataset(X_train, Y_train)
test_dataset = TensorDataset(X_test, Y_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)
Preview of my data:
print(train_dataset[0])
print(test_dataset[0])
(tensor([[ 7909.0000, 8094.0000, 9119.0000, 8666.0000, 17599.0000, 13657.0000,
10158.0000],
[ 7909.0000, 8073.0000, 9119.0000, 8636.0000, 17609.0000, 13975.0000,
10109.0000],
[ 7939.5000, 8083.5000, 9166.5000, 8659.5000, 18124.5000, 13971.0000,
10142.0000],
[ 7951.0000, 8064.0000, 9201.0000, 8663.0000, 17985.0000, 13967.0000,
10076.0000]]), tensor([[41.],
[41.],
[41.],
[41.]]))
(tensor([[ 8411.0000, 8530.0000, 9439.0000, 9101.0000, 17368.0000, 14174.0000,
11111.0000],
[ 8460.0000, 8651.5000, 9579.5000, 9355.5000, 17402.0000, 14509.0000,
11474.5000],
[ 8436.0000, 8617.0000, 9579.0000, 9343.0000, 17318.0000, 14288.0000,
11404.0000],
[ 8519.0000, 8655.0000, 9580.0000, 9348.0000, 17566.0000, 14640.0000,
11404.0000]]), tensor([[59.],
[59.],
[59.],
[59.]]))
Applying LSTM model
My LSTM model:
class LSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size)
self.linear = nn.Linear(hidden_size, output_size)
def forward(self, x):
x, _ = self.lstm(x)
# x = self.linear(x[:, -1, :])
x = self.linear(x)
return x
model = LSTMModel(input_size=7, hidden_size=32, output_size=1)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
model.train()
When I try:
for X, Y in train_loader:
optimizer.zero_grad()
Y_pred = model(X)
loss = loss_fn(Y_pred, Y)
print(loss)
I get (correctly I assume) Loss: tensor(1318.9419, grad_fn=<MseLossBackward0>)
However, when I run:
for X, Y in train_loader:
optimizer.zero_grad()
Y_pred = model(X)
loss = loss_fn(Y_pred, Y)
# Now apply backward pass
loss.backward()
optimizer.step()
print(loss)
I get: tensor(nan, grad_fn=<MseLossBackward0>)
Tried normalizing
I have tried normalizing the data:
mean = X.mean()
std = X.std()
X_normalized = (X - mean) / std
Y_pred = model(X_normalized)
But it yields the same result. Why do I yield 'nan' after applying loss.backward() in such a loop? How can I fix this? Thanks in advance!
My X_train contained few nan values. By removing the matrices with nan values, I solved this issue:
mask = torch.isnan(X_train).any(dim=1).any(dim=1)
X_train = X_train[~mask]
# Do the same for Y_train as it needs to be the same size
Y_train = Y_train[~mask]
# Create the TensorDataset for the training set
train_dataset = TensorDataset(X_train, Y_train)

Unable to assign all tensors to the GPU

I am trying to predict the outcome of a football fixture (using backpropagation) across 3 classes: home team wins, draw or away team wins; they are encoded as 0, 1, and 2, respectively.
Features: home_team, away_team, home_score, away_score, home_adv, match_imp
Target: outcome_final
Training, validation and test tensors:
X_train: torch.Size([25365, 554])
y_train: torch.Size([25365])
X_test: torch.Size([5436, 554])
y_test: torch.Size([5436])
X_val: torch.Size([5436, 554])
y_val: torch.Size([5436])
Network architecture:
Net(
(fc1): Linear(in_features=555, out_features=100, bias=True)
(fc2): Linear(in_features=100, out_features=3, bias=True)
(dropout): Dropout(p=0.2, inplace=False)
)
Weights and biases are generated at first:
fc1.weight: torch.Size([100, 554])
fc1.bias: torch.Size([100])
fc2.weight: torch.Size([3, 100])
fc2.bias: torch.Size([3])
ReLU activation function is used for the hidden layer, and Softmax activation function is used for the output layer.
The following code returns the error below.
# Creating the class for the neural network
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(554, 100)
self.fc2 = nn.Linear(100, 3)
self.dropout = nn.Dropout(p = 0.2)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.dropout(x)
x = F.softmax(self.fc2(x), dim = 1)
return x
# Initializing model
model = Net().to(device)
X_train.to(device)
y_train.to(device)
X_val.to(device)
y_val.to(device)
# Initializing weights and biases
model.fc1.weight.data.normal_(0, 0.01)
model.fc1.bias.data.normal_(0, 0.01)
model.fc2.weight.data.normal_(0, 0.01)
model.fc2.bias.data.normal_(0, 0.01)
# TRAIN the model
def train_model(model, X_train, y_train, X_val, y_val, epochs = 10, learning_rate = 0.003):
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
# Losses and accuracies
train_losses = []
val_losses = []
train_accs = []
val_accs = []
# Training happens here
for epoch in range(epochs):
# Shuffling data
permutation = torch.randperm(X_train.size()[0]).to(device)
X_train = X_train[permutation]
y_train = y_train[permutation]
# Creating batches
batch_size = 5
n_batches = X_train.size()[0] // batch_size
for i in range(n_batches):
# Zeroing gradients
optimizer.zero_grad()
# Forward pass
output = model(X_train[i * batch_size : (i + 1) * batch_size])
loss = criterion(output, y_train[i * batch_size : (i + 1) * batch_size].long())
# Backward pass
loss.backward()
# Updating weights and biases
optimizer.step()
# Sending to CPU
model.to('cpu')
# Training loss and accuracy
train_loss = criterion(model(X_train), y_train.long())
train_losses.append(train_loss)
train_acc = accuracy_score(y_train, torch.argmax(model(X_train), dim = 1))
train_accs.append(train_acc)
print('Epoch: ', epoch + 1, 'Training Loss: ', train_loss, 'Training Accuracy: ', train_acc)
# Validation loss and accuracy
val_loss = criterion(model(X_val), y_val.long())
val_losses.append(val_loss)
val_acc = accuracy_score(y_val, torch.argmax(model(X_val), dim = 1))
val_accs.append(val_acc)
print('Epoch: ', epoch + 1, 'Validation Loss: ', val_loss, 'Validation Accuracy: ', val_acc)
# Sending back to GPU
model.to(device)
X_train.to(device)
y_train.to(device)
X_val.to(device)
y_val.to(device)
return train_losses, val_losses, train_accs, val_accs
# Let's train the model
model = Net().to(device)
train_losses, val_losses, train_accs, val_accs = train_model(model, X_train, y_train, X_val, y_val)
ERROR:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
I have tried ensuring all training and validation sets are converted as tensors and sent to the GPU. Yet, I am still getting this error.
Am I missing something here? Thanks in advance.

CNN-LSTM for image sequences classification | high loss

I'm working on a project where I need to classify image sequences of some plants (growing over time). I tried implementing a CNN-LSTM with a pretrained ResNet18 as a feature extractor and then feeding those feature sequences to the LSTM.
The issue is that I'm not used to train LSTMs, and I'm afraid I'm doing something wrong. I made a clear architecture and everything seems ok, but the loss is not decreasing.
here's the architecture:
class RecurrentCNN(nn.Module):
def __init__(self, embed_dim, hidden_size, num_layers, num_classes):
super(RecurrentCNN, self).__init__()
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.num_layers = num_layers
self.num_classes = num_classes
self.cnn = torchvision.models.resnet18(weights='DEFAULT')
self.cnn.fc = nn.Sequential(
nn.Linear(in_features=512, out_features=self.embed_dim, bias=False),
nn.BatchNorm1d(num_features=self.embed_dim)
)
self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
self.fc = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.ReLU(),
nn.BatchNorm1d(num_features=hidden_size),
nn.Dropout(0.2),
nn.Linear(hidden_size, num_classes)
)
def forward(self, x):
batch_size, img_size = x.shape[0], x.shape[2:]
x = x.reshape(-1, *img_size) # i merge the batch_size and num_seq in order to feed everything to the cnn
x = self.cnn(x)
x = x.reshape(batch_size, -1, self.embed_dim) # then i comeback the original shape
# lstm part
h_0 = torch.autograd.Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
c_0 = torch.autograd.Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
x, (hn, cn) = self.lstm(x, (h_0, c_0))
x = x[:, -1, :]
x = self.fc(x)
return x
I have 40 classes to output. My sequences are of different lengths, so I was forced to pad with some black images sometimes! (mean seq length: 39, max: 55, min: 15)
I'm feeding the model with sequences of shape (batch_size, seq_len=55, 3, 112, 112).
It may be wrong but for now I just want to make sure that the model is at least working correctly, then I'll probably change the strategy of learning.
here's the training code:
EPOCHS = 10
BATCH_SIZE = 4
dataset = PlantDataset(data_path, max_sequence_len=55, transform=None)
train_loader = torch.utils.data.DataLoader(
dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True
)
rcnn = RecurrentCNN(embed_dim=128, hidden_size=256, num_layers=2, num_classes=len(class_list)).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(rcnn.parameters(), lr=0.0001)
loss_am = list() #AverageMeter()
rcnn.train()
for epoch in range(EPOCHS):
progress = tqdm(range(dataset.__len__() * BATCH_SIZE))
for i, data in enumerate(train_loader):
optimizer.zero_grad()
sequences, targets = data
sequences, targets = sequences.to(device, dtype=torch.float), torch.Tensor(targets).to(device)
output = torch.nn.functional.log_softmax(rcnn(sequences), dim=1)
loss_value = criterion(output, targets)
loss_value.backward()
optimizer.step()
with torch.no_grad():
loss_am.append(loss_value.item())
progress.update(i)
progress.set_description('Epoch: {}, Loss: {:.4f}'.format(epoch, loss_value.item()))
progress.close()
The loss on each batch goes like
3.53 => 4.22 => 4.62 => 3.83 => 3.75 => 3.80 => 3.70, etc
Do you have any idea ?
I am facing the same issue. But I am able to find the problem. Since I am using the Image-sequences dataset, my model is not able to predict the tokens, instead, I ended up with a whole set of garbage tokens. I am still trying to figure out why this is happening.

Using LSTM stateful for passing context b/w batches; may be some error in context passing, not getting good results?

I have checked the data before giving it to the network. The data is correct.
Using LSTM and passing the context b/w batches. per_class_accuracy is changing, but the loss is not going down. Been stuck for long, not sure if there is an error in the Code?
I have multi-class classification problem based upon an imbalanced dataset
Dataset_type: CSV
Dataset_size: 20000
Based upon CSV data of sensors
X = 0.6986111111111111,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0
Y = leaveHouse
Per class accuracy:
{'leaveHouse': 0.34932855, 'getDressed': 1.0, 'idle': 0.8074534, 'prepareBreakfast': 0.8, 'goToBed': 0.35583413, 'getDrink': 0.0, 'takeShower': 1.0, 'useToilet': 0.0, 'eatBreakfast': 0.8857143}
Training:
# Using loss weights, the inverse of class frequency
criterion = nn.CrossEntropyLoss(weight = class_weights)
hn, cn = model.init_hidden(batch_size)
for i, (input, label) in enumerate(trainLoader):
hn.detach_()
cn.detach_()
input = input.view(-1, seq_dim, input_dim)
if torch.cuda.is_available():
input = input.float().cuda()
label = label.cuda()
else:
input = input.float()
label = label
# Forward pass to get output/logits
output, (hn, cn) = model((input, (hn, cn)))
# Calculate Loss: softmax --> cross entropy loss
loss = criterion(output, label)#weig pram
running_loss += loss
loss.backward() # Backward pass
optimizer.step() # Now we can do an optimizer step
optimizer.zero_grad() # Reset gradients tensors
Network
class LSTMModel(nn.Module):
def init_hidden(self, batch_size):
self.batch_size = batch_size
if torch.cuda.is_available():
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
else:
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
return hn, cn
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, seq_dim):
super(LSTMModel, self).__init__()
# Hidden dimensions
self.hidden_dim = hidden_dim
# Number of hidden layers
self.layer_dim = layer_dim
self.input_dim = input_dim
# Building your LSTM
# batch_first=True causes input/output tensors to be of shape
# (batch_dim, seq_dim, feature_dim)
self.lstm = nn.LSTM(self.input_dim, hidden_dim, layer_dim, batch_first=True)
# Readout layer
self.fc = nn.Linear(hidden_dim, output_dim)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.seq_dim = seq_dim
def forward(self, inputs):
# Initialize hidden state with zeros
input, (hn, cn) = inputs
input = input.view(-1, self.seq_dim, self.input_dim)
# time steps
out, (hn, cn) = self.lstm(input, (hn, cn))
# Index hidden state of last time step
out = self.fc(out[:, -1, :])
out = self.softmax(out)
return out, (hn,cn)
One problem you might have is CrossEntropyLoss combines a log softmax operation with negative log likelihood loss, but you're applying a softmax in your model. You should pass the raw logits out of the final layer to CrossEntropyLoss.
Also I an't say without seeing the models forward pass, but it looks like you're applying the softmax on dimension 1 to a tensor that (I'm inferring) has shape batch_size, sequence_length, output_dim, when you should be applying it along the output dim.

TensorFlow 2.0 GradientTape with EarlyStopping

I am using Python 3.7.5 and TensorFlow 2.0's 'GradientTape' API for classification of MNIST dataset using 300 100 dense fully connected architecture. I would like to use TensorFlow's 'EarlyStopping' with GradientTape() so that the training stops according to the variable being watched or monitored and according to patience parameters.
The code I have is below:
# Use tf.data to batch and shuffle the dataset
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(100).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
# Choose an optimizer and loss function for training-
loss_fn = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(lr = 0.001)
def create_nn_gradienttape():
"""
Function to create neural network for use
with GradientTape API following MNIST
300 100 architecture
"""
model = Sequential()
model.add(
Dense(
units = 300, activation = 'relu',
kernel_initializer = tf.keras.initializers.GlorotNormal,
input_shape = (784,)
)
)
model.add(
Dense(
units = 100, activation = 'relu',
kernel_initializer = tf.keras.initializers.GlorotNormal
)
)
model.add(
Dense(
units = 10, activation = 'softmax'
)
)
return model
# Instantiate the model to be trained using GradientTape-
model = create_nn_gradienttape()
# Select metrics to measure the error & accuracy of model.
# These metrics accumulate the values over epochs and then
# print the overall result-
train_loss = tf.keras.metrics.Mean(name = 'train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name = 'train_accuracy')
test_loss = tf.keras.metrics.Mean(name = 'test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name = 'train_accuracy')
# Use tf.GradientTape to train the model-
#tf.function
def train_step(data, labels):
"""
Function to perform one step of Gradient
Descent optimization
"""
with tf.GradientTape() as tape:
# 'training=True' is only needed if there are layers with different
# behavior during training versus inference (e.g. Dropout).
# predictions = model(data, training=True)
predictions = model(data)
loss = loss_fn(labels, predictions)
# 'gradients' is a list variable!
gradients = tape.gradient(loss, model.trainable_variables)
# IMPORTANT:
# Multiply mask with computed gradients-
# List to hold element-wise multiplication between-
# computed gradient and masks-
grad_mask_mul = []
# Perform element-wise multiplication between computed gradients and masks-
for grad_layer, mask in zip(gradients, mask_model_stripped.trainable_weights):
grad_mask_mul.append(tf.math.multiply(grad_layer, mask))
# optimizer.apply_gradients(zip(gradients, model.trainable_variables))
optimizer.apply_gradients(zip(grad_mask_mul, model.trainable_variables))
train_loss(loss)
train_accuracy(labels, predictions)
#tf.function
def test_step(data, labels):
"""
Function to test model performance
on testing dataset
"""
# training=False is only needed if there are layers with different
# behavior during training versus inference (e.g. Dropout).
predictions = model(data)
t_loss = loss_fn(labels, predictions)
test_loss(t_loss)
test_accuracy(labels, predictions)
EPOCHS = 15
for epoch in range(EPOCHS):
# Reset the metrics at the start of the next epoch
train_loss.reset_states()
train_accuracy.reset_states()
test_loss.reset_states()
test_accuracy.reset_states()
for x, y in train_ds:
train_step(x, y)
for x_t, y_t in test_ds:
test_step(x_t, y_t)
template = 'Epoch {0}, Loss: {1:.4f}, Accuracy: {2:.4f}, Test Loss: {3:.4f}, Test Accuracy: {4:4f}'
print(template.format(epoch + 1,
train_loss.result(), train_accuracy.result()*100,
test_loss.result(), test_accuracy.result()*100))
# Count number of non-zero parameters in each layer and in total-
# print("layer-wise manner model, number of nonzero parameters in each layer are: \n")
model_sum_params = 0
for layer in model.trainable_weights:
# print(tf.math.count_nonzero(layer, axis = None).numpy())
model_sum_params += tf.math.count_nonzero(layer, axis = None).numpy()
print("Total number of trainable parameters = {0}\n".format(model_sum_params))
In the code above, How can I use 'tf.keras.callbacks.EarlyStopping' with GradientTape() API ?
Thanks!

Resources