Exploding gradient Problem with LSTM model build with LSTMCell (PyTorch Implementation) - pytorch

I'm tried to solve timeseries prediction. Where my input is multivariate. My input has 4 variable, and my target is another variable.
I've processed the data as following.
4 variables and 60 timesteps input sequence.
So, each input shape is (1, 240).
I'll try to predict the next n-steps future output. During training, it will be 60 steps.
So, the target shape is (1,60)
Here is my LSTMPredictor class.
class LSTMPredictor(nn.Module):
def __init__(self,n_feature, n_hidden=51):
super(LSTMPredictor, self).__init__()
self.n_hidden = n_hidden
# lstm1, lstm2, linear
self.lstm1 = nn.LSTMCell(n_feature, self.n_hidden)
self.lstm2 = nn.LSTMCell(self.n_hidden, self.n_hidden)
self.lstm3 = nn.LSTMCell(self.n_hidden, self.n_hidden)
self.linear = nn.Linear(self.n_hidden, 1)
def forward(self, x, future=0):
outputs = []
# lstm1
h_t = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
c_t = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
# lstm2
h_t2 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
c_t2 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
# lstm3
h_t3 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
c_t3 = torch.zeros(1, self.n_hidden, dtype=torch.float32).cuda()
h_t, c_t = self.lstm1(x, (h_t, c_t))
h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
output = None
for i in range(future):
if i == 0:
# first prediction
output = self.linear(h_t3) # h_t3?
outputs.append(output)
continue
h_t3, c_t3 = self.lstm3(h_t3, (h_t3, c_t3))
output = self.linear(h_t3)
outputs.append(output)
output = torch.cat(outputs, dim=1)
return output
Here, lstm1 and lstm2 receives the input with shape (1, 240), and then lstm3 is used to generate prediction to the future n steps successively. During training it is 60 steps.
However, my model is facing exploding gradient in the first step.
Model Initialization is shown bellow:
n_hidden = 512
n_feature = 240
model = LSTMPredictor(n_feature, n_hidden).to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.LBFGS(model.parameters(), lr=0.8)
Training Loop:
n_steps = 1
losses = []
print("--- Training Start ---")
for i in tqdm(range(n_steps)):
print("Step", i)
for i, sample_i in enumerate(train_input):
def closure():
optimizer.zero_grad()
out = model(sample_i.cuda(),future=60)
loss = criterion(out[0], train_target[i].cuda())
losses.append(loss.item())
loss.backward()
return loss
optimizer.step(closure)
print("loss", losses[-1])
Is there anything wrong in my implementation?

Related

Why loss is not decreasing in a Siamese BERT-Network training (Entity matching task)

I'm trying to finetune a model for an entity matching task (kind of a sentence similarity task).
The idea is that if I give as input two sentences the model should output if they represent the same entity or not. I'm interested in the products' domain.
So for example:
sentences_left = ('logitech harmony 890 advanced universal remote control h890', 'sony silver digital voice recorder icdb600')
sentences_right = ('logitech harmony 890 advanced universal remote hdtv , tv , dvd player ( s ) , lighting , audio system 100 ft universal remote 966193-0403', 'canon black ef 70-300mm f/4 -5.6 is usm telephoto zoom lens 0345b002')
The output should be 1 for the first left-right pair of sentences and 0 for the second.
I want to test two approaches. The first is a sequence classification setup. So I take a pair of sentences, concat them with a [SEP] token in-between, encode it and feed it to BERT.
This approach kind of work, but I wanted to explore a second one that, in theory, should work too.
In few words, using mpnet as pre-trained language model I'm trying to implement this setup:
This is taken from the paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. The idea is to compute not only a single embedding as before, but two separate embeddings for each of the sentences. Then concatenate the embeddings and feeds them to a softmax classifier.
After lots of struggles I'm still unable to make it work, since the loss has no intention of decreasing. It starts at 0.25 and never goes up neither down.
I'm using the Abt-Buy, Amazon-Google and Walmart-Amazon datasets.
This is my model:
class FinalClassifier(nn.Module):
def __init__(self, pos_neg=None, frozen=False):
super(FinalClassifier, self).__init__()
use_cuda = torch.cuda.is_available()
self.device = torch.device("cuda" if use_cuda else "cpu")
self.encoder = AutoModel.from_pretrained(
'all-mpnet-base-v2')
if frozen:
for param in self.encoder.parameters():
param.requires_grad = False
self.tokenizer = AutoTokenizer.from_pretrained(
'all-mpnet-base-v2')
if pos_neg:
self.criterion = BCEWithLogitsLoss(pos_weight=torch.Tensor([pos_neg]))
self.linear = nn.Linear(3*768, 1)
self.relu = nn.ReLu()
def forward(self, texts_left, texts_right, labels=None):
encoded_inputs_left = self.tokenizer(texts_left, padding='max_length',
truncation=True, return_tensors='pt')
encoded_inputs_left = encoded_inputs_left.to(self.device)
output_left = self.encoder(**encoded_inputs_left)
output_left = _mean_pooling(output_left, encoded_inputs_left['attention_mask'])
# output_left = F.normalize(output_left, p=2, dim=1)
encoded_inputs_right = self.tokenizer(texts_right, padding='max_length',
truncation=True, return_tensors='pt')
encoded_inputs_right = encoded_inputs_right.to(self.device)
output_right = self.encoder(**encoded_inputs_right)
output_right = _mean_pooling(output_right, encoded_inputs_right['attention_mask'])
# output_right = F.normalize(output_right, p=2, dim=1)
# Look at sBERT paper (u, v, |u-v|)
pooled_output = torch.cat((output_left, output_right, torch.abs(output_left - output_right)), -1)
linear_output = self.linear(pooled_output)
relu_output = self.relu(linear_output)
labels = labels.to(self.device)
loss = self.criterion(linear_output.view(-1), labels.float())
return (loss, relu_output)
Here's the Dataset
class FinalDataset(torch.utils.data.Dataset):
def __init__(self, df):
self.labels = [int(label) for label in df['label']]
self.examples = df
def classes(self):
return self.labels
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
examples = self.examples.iloc[idx]
text_left = examples['text_left']
text_right = examples['text_right']
label = np.array(self.labels[idx])
return text_left, text_right, label
and finally the training loop
def train(model, train, val, learning_rate=1e-6, epochs=5, batch_size=8):
train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=8)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
optimizer = Adam(model.parameters(), lr= learning_rate)
if use_cuda:
model = model.cuda()
for epoch_num in range(epochs):
total_loss_train = 0
tmp_loss = 0
step = 0
model.train()
for i, data in enumerate(tqdm(train_dataloader)):
left_batch, right_batch, labels = data
(batch_loss, _) = model(left_batch, right_batch, labels)
total_loss_train += batch_loss
tmp_loss += batch_loss
model.zero_grad()
batch_loss.backward()
optimizer.step()
# every 100 mini-batches
if i % 100 == 99:
print(f' Loss/train at epoch {epoch_num+1} (batch {i}): {tmp_loss/500}')
writer.add_scalar('Loss/train',
tmp_loss / 100,
epoch_num * len(train_dataloader) + i)
tmp_loss = 0
total_loss_val = 0
predictions = None
total_labels = None
step = 0
model.eval()
with torch.no_grad():
for i, data in enumerate(val_dataloader):
left_batch, right_batch, labels = data
(batch_loss, linear_output) = model(left_batch, right_batch, labels)
labels = labels.detach().cpu().numpy()
linear_output = linear_output.detach().cpu().numpy()
if predictions is None:
predictions = np.where(linear_output>0.5, 1, 0)
total_labels = labels
else:
predictions = np.append(predictions, np.where(linear_output>0.5, 1, 0), axis=0)
total_labels = np.append(total_labels, labels, axis=0)
total_loss_val += batch_loss.item()
tmp_loss += batch_loss.item()
# every 100 mini-batches
if i % 100 == 99:
print(f' Loss/val at epoch {epoch_num+1} (batch {i}): {tmp_loss/500}')
writer.add_scalar('Loss/val',
tmp_loss / 100,
epoch_num * len(val_dataloader) + i)
writer.add_scalar('F1/val',
f1_score(y_true=total_labels.flatten()[step:i], y_pred=predictions.flatten()[step:i]),
epoch_num * len(val_dataloader) + i)
tmp_loss = 0
step += 100
f1 = f1_score(y_true=total_labels.flatten(), y_pred=predictions.flatten())
report = classification_report(total_labels, predictions, zero_division=0)
# plot all the pr curves
for i in range(len([0, 1])):
add_pr_curve_tensorboard(i, predictions.flatten(), total_labels.flatten())
for name, p in model.named_parameters():
writer.add_histogram(name, p, bins='auto')
print(
f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train): .3f} \
| Val Loss: {total_loss_val / len(val): .3f} \
| Val F1: {f1: .3f}')
tqdm.write(report)
writer = SummaryWriter(log_dir=tensorboard_path)
EPOCHS = 5
LR = 1e-6
train_pos_neg_ratio = 9
model = FinalClassifier(train_pos_neg_ratio, frozen=False)
train_data, val_data = FinalDataset(df_train), FinalDataset(df_dev)
train(model, train_data, val_data, LR, EPOCHS)
writer.flush()
writer.close()
The issue is that the loss does NOT decrease, and the F1 accuracy as a result. I tried to normalize the outputs, add a dropout layer, analized the dataset to be sure that the problem wasn't there but now I ran out of ideas. An help would be extremely valuable.

LSTM multi-sequence input to one sequence output

I am new with neural networks and am currently trying to make an LSTM model that predicts an output sequence based on multiple parameters. Excuse my ignorance and dummyness in advance.
I have obtained training and validation datasets, which look somewhat like the following:
For every ID four rows are recorded, which uses columns holding certain parameters and the corresponding Y output. Practically, there are thus ~122,000 / 4 = ~30,500 samples (I mistakenly put 122,000 as ID, it is in fact the number of rows). Since the parameter values and the corresponding Y values follow temporal patterns, I am interested if a model such as LSTM improves the prediction.
I want to predict the Y in my validation dataset (~73,000/4 = ~18,000 samples), based on the temporal patterns of the parameters. But is this possible? Most tutorials I followed use a single sequence, for which an LSTM is used to extend a similar input sequence. I thus want an LSTM with 'multi-sequence' input, which outputs one sequence. How do I go about this?
I'm using PyTorch as framework. A simple LSTM model I created using a tutorial, which would not incorporate the parameters:
training_y = traindf.reset_index()['Y']
validation_y = traindf.reset_index()['Y']
Then create a dataset for this:
class YDataset(Dataset):
def __init__(self, data, seq_len = 100):
self.data = data
self.data = torch.from_numpy(data).float().view(-1)
self.seq_len = seq_len
def __len__(self):
return len(self.data)-self.seq_len-1
def __getitem__(self,index):
return self.data[index : index+self.seq_len] , self.data[index+self.seq_len]
train_y = YDataset(training_y_df)
vali_y = YDataset(validation_y_df)
batch_size = 64
train_dataloader = DataLoader(train_y, batch_size, drop_last=True)
vali_dataloader = DataLoader(vali_y, batch_size, drop_last=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
Then create the model:
class Lstm_model(nn.Module):
def __init__(self, input_dim, hidden_size, num_layers):
super(Lstm_model, self).__init__()
self.num_layers = num_layers
self.input_size = input_dim
self.hidden_size = hidden_size
self.lstm = nn.LSTM(input_size=input_dim, hidden_size = hidden_size, num_layers = num_layers)
self.fc = nn.Linear(hidden_size, 1)
def forward(self,x,hn,cn):
out , (hn,cn) = self.lstm(x, (hn, cn))
final_out = self.fc(out[-1])
return final_out, hn,cn
def predict(self,x):
hn, cn = self.init()
final_out = self.fc(out[-1])
return final_out
def init(self):
h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
return h0 , c0
input_dim = 1
hidden_size = 50
num_layers = 3
model = Lstm_model(input_dim , hidden_size , num_layers).to(device)
Loss function and training loop (more or less same as for validation):
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
def train(dataloader):
hn, cn = model.init()
model.train()
for batch , item in enumerate(dataloader):
x , y = item
x = x.to(device)
y = y.to(device)
out , hn , cn = model(x.reshape(100,batch_size,1),hn,cn)
loss = loss_fn(out.reshape(batch_size), y)
hn = hn.detach()
cn = cn.detach()
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch == len(dataloader)-1:
loss = loss.item
print(f"train loss: {loss:>7f} ")
Epochs and loss metrics:
epochs = 200 # Takes really long for me
for epoch in range(epochs):
print(f"epoch {epoch} ")
train(train_dataloader)
test(vali_dataloader)
Final metrics:
import math
from sklearn.metrics import mean_squared_error
import numpy as np
def calculate_metrics(data_loader):
pred_arr = []
y_arr = []
with torch.no_grad():
hn , cn = model.init()
for batch , item = in enumerate(data_loader):
x , y = item
x , y = x.to(device) , y.to(device)
x = x.view(100,64,1)
pred = model(x, hn, cn)[0]
pred = scalar.inverse_transform(pred.detach().cpu().numpy().reshape(-1))
y = scalar.inverse_transform(y.detach().cpu().numpy().reshape(1,-1)).reshape(-1)
pred_arr = pred_arr + list(pred)
y_arr = y_arr + list(y)
return math.sqrt(mean_squared_error(y_arr,pred_arr))
I used this code more as an example of how LSTM would work. Nevertheless, I don't know if this is the right track for me. Does someone know what I should do or a tutorial that does work for my example? Thanks in advance!

Pytorch couldn't build multi scaled kernel nested model

I'm trying to create a modified MNIST model which takes input 1x28x28 MNIST tensor images, and it kind of branches into different models with different sized kernels, and accumulates at the end, so as to give a multi-scale-kerneled response in the spatial domain of the images. I'm worried about the model, since, I'm unable to construct it.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torchvision import datasets, transforms
import torch.nn.functional as F
import timeit
import unittest
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)
# check availability of GPU and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# define a transforms for preparing the dataset
transform = transforms.Compose([
transforms.ToTensor(), # convert the image to a pytorch tensor
transforms.Normalize((0.1307,), (0.3081,)) # normalise the images with mean and std of the dataset
])
# Load the MNIST training, test datasets using `torchvision.datasets.MNIST` using the transform defined above
train_dataset = datasets.MNIST('./data',train=True,transform=transform,download=True)
test_dataset = datasets.MNIST('./data',train=False,transform=transform,download=True)
# create dataloaders for training and test datasets
# use a batch size of 32 and set shuffle=True for the training set
train_dataloader = Data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = Data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)
# My Net
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# define a conv layer with output channels as 16, kernel size of 3 and stride of 1
self.conv11 = nn.Conv2d(1, 16, 3, 1) # Input = 1x28x28 Output = 16x26x26
self.conv12 = nn.Conv2d(1, 16, 5, 1) # Input = 1x28x28 Output = 16x24x24
self.conv13 = nn.Conv2d(1, 16, 7, 1) # Input = 1x28x28 Output = 16x22x22
# define a conv layer with output channels as 32, kernel size of 3 and stride of 1
self.conv21 = nn.Conv2d(16, 32, 3, 1) # Input = 16x26x26 Output = 32x24x24
self.conv22 = nn.Conv2d(16, 32, 5, 1) # Input = 16x24x24 Output = 32x20x20
self.conv23 = nn.Conv2d(16, 32, 7, 1) # Input = 16x22x22 Output = 32x16x16
# define a conv layer with output channels as 64, kernel size of 3 and stride of 1
self.conv31 = nn.Conv2d(32, 64, 3, 1) # Input = 32x24x24 Output = 64x22x22
self.conv32 = nn.Conv2d(32, 64, 5, 1) # Input = 32x20x20 Output = 64x16x16
self.conv33 = nn.Conv2d(32, 64, 7, 1) # Input = 32x16x16 Output = 64x10x10
# define a max pooling layer with kernel size 2
self.maxpool = nn.MaxPool2d(2), # Output = 64x11x11
# define dropout layer with a probability of 0.25
self.dropout1 = nn.Dropout(0.25)
# define dropout layer with a probability of 0.5
self.dropout2 = nn.Dropout(0.5)
# define a linear(dense) layer with 128 output features
self.fc11 = nn.Linear(64*11*11, 128)
self.fc12 = nn.Linear(64*8*8, 128) # after maxpooling 2x2
self.fc13 = nn.Linear(64*5*5, 128)
# define a linear(dense) layer with output features corresponding to the number of classes in the dataset
self.fc21 = nn.Linear(128, 10)
self.fc22 = nn.Linear(128, 10)
self.fc23 = nn.Linear(128, 10)
self.fc33 = nn.Linear(30,10)
def forward(self, x1):
# Use the layers defined above in a sequential way (folow the same as the layer definitions above) and
# write the forward pass, after each of conv1, conv2, conv3 and fc1 use a relu activation.
x = F.relu(self.conv11(x1))
x = F.relu(self.conv21(x))
x = F.relu(self.maxpool(self.conv31(x)))
#x = torch.flatten(x, 1)
x = x.view(-1,64*11*11)
x = self.dropout1(x)
x = F.relu(self.fc11(x))
x = self.dropout2(x)
x = self.fc21(x)
y = F.relu(self.conv12(x1))
y = F.relu(self.conv22(y))
y = F.relu(self.maxpool(self.conv32(y)))
#x = torch.flatten(x, 1)
y = y.view(-1,64*8*8)
y = self.dropout1(y)
y = F.relu(self.fc12(y))
y = self.dropout2(y)
y = self.fc22(y)
z = F.relu(self.conv13(x1))
z = F.relu(self.conv23(z))
z = F.relu(self.maxpool(self.conv33(z)))
#x = torch.flatten(x, 1)
z = z.view(-1,64*5*5)
z = self.dropout1(z)
z = F.relu(self.fc13(z))
z = self.dropout2(z)
z = self.fc23(z)
out = self.fc33(torch.cat((x, y, z), 0))
output = F.log_softmax(out, dim=1)
return output
import unittest
class TestImplementations(unittest.TestCase):
# Dataloading tests
def test_dataset(self):
self.dataset_classes = ['0 - zero',
'1 - one',
'2 - two',
'3 - three',
'4 - four',
'5 - five',
'6 - six',
'7 - seven',
'8 - eight',
'9 - nine']
self.assertTrue(train_dataset.classes == self.dataset_classes)
self.assertTrue(train_dataset.train == True)
def test_dataloader(self):
self.assertTrue(train_dataloader.batch_size == 32)
self.assertTrue(test_dataloader.batch_size == 32)
def test_total_parameters(self):
model = Net().to(device)
#self.assertTrue(sum(p.numel() for p in model.parameters()) == 1015946)
suite = unittest.TestLoader().loadTestsFromModule(TestImplementations())
unittest.TextTestRunner().run(suite)
def train(model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
# send the image, target to the device
data, target = data.to(device), target.to(device)
# flush out the gradients stored in optimizer
optimizer.zero_grad()
# pass the image to the model and assign the output to variable named output
output = model(data)
# calculate the loss (use nll_loss in pytorch)
loss = F.nll_loss(output, target)
# do a backward pass
loss.backward()
# update the weights
optimizer.step()
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
# send the image, target to the device
data, target = data.to(device), target.to(device)
# pass the image to the model and assign the output to variable named output
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
model = Net().to(device)
## Define Adam Optimiser with a learning rate of 0.01
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
start = timeit.default_timer()
for epoch in range(1, 11):
train(model, device, train_dataloader, optimizer, epoch)
test(model, device, test_dataloader)
stop = timeit.default_timer()
print('Total time taken: {} seconds'.format(int(stop - start)) )
Here is my full code. I couldn't understand what could possibly go wrong...
It is giving
<ipython-input-72-194680537dcc> in forward(self, x1)
46 x = F.relu(self.conv11(x1))
47 x = F.relu(self.conv21(x))
---> 48 x = F.relu(self.maxpool(self.conv31(x)))
49 #x = torch.flatten(x, 1)
50 x = x.view(-1,64*11*11)
TypeError: 'tuple' object is not callable
Error.
P.S.: Pytorch Noob here.
You have mistakenly placed a comma at the end of the line where you define self.maxpool : self.maxpool = nn.MaxPool2d(2), # Output = 64x11x11 see?
This comma makes self.maxpool a tuple instead of a torch.nn.modules.pooling.MaxPool2d. Drop the comma at the end and this error is fixed.
I see you haven't given the stride argument in you definition of self.maxpool = nn.MaxPool2d(2). Choose one: e.g. self.maxpool = nn.MaxPool2d(2, stride = 2).

Using LSTM stateful for passing context b/w batches; may be some error in context passing, not getting good results?

I have checked the data before giving it to the network. The data is correct.
Using LSTM and passing the context b/w batches. per_class_accuracy is changing, but the loss is not going down. Been stuck for long, not sure if there is an error in the Code?
I have multi-class classification problem based upon an imbalanced dataset
Dataset_type: CSV
Dataset_size: 20000
Based upon CSV data of sensors
X = 0.6986111111111111,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0
Y = leaveHouse
Per class accuracy:
{'leaveHouse': 0.34932855, 'getDressed': 1.0, 'idle': 0.8074534, 'prepareBreakfast': 0.8, 'goToBed': 0.35583413, 'getDrink': 0.0, 'takeShower': 1.0, 'useToilet': 0.0, 'eatBreakfast': 0.8857143}
Training:
# Using loss weights, the inverse of class frequency
criterion = nn.CrossEntropyLoss(weight = class_weights)
hn, cn = model.init_hidden(batch_size)
for i, (input, label) in enumerate(trainLoader):
hn.detach_()
cn.detach_()
input = input.view(-1, seq_dim, input_dim)
if torch.cuda.is_available():
input = input.float().cuda()
label = label.cuda()
else:
input = input.float()
label = label
# Forward pass to get output/logits
output, (hn, cn) = model((input, (hn, cn)))
# Calculate Loss: softmax --> cross entropy loss
loss = criterion(output, label)#weig pram
running_loss += loss
loss.backward() # Backward pass
optimizer.step() # Now we can do an optimizer step
optimizer.zero_grad() # Reset gradients tensors
Network
class LSTMModel(nn.Module):
def init_hidden(self, batch_size):
self.batch_size = batch_size
if torch.cuda.is_available():
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
else:
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
return hn, cn
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, seq_dim):
super(LSTMModel, self).__init__()
# Hidden dimensions
self.hidden_dim = hidden_dim
# Number of hidden layers
self.layer_dim = layer_dim
self.input_dim = input_dim
# Building your LSTM
# batch_first=True causes input/output tensors to be of shape
# (batch_dim, seq_dim, feature_dim)
self.lstm = nn.LSTM(self.input_dim, hidden_dim, layer_dim, batch_first=True)
# Readout layer
self.fc = nn.Linear(hidden_dim, output_dim)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.seq_dim = seq_dim
def forward(self, inputs):
# Initialize hidden state with zeros
input, (hn, cn) = inputs
input = input.view(-1, self.seq_dim, self.input_dim)
# time steps
out, (hn, cn) = self.lstm(input, (hn, cn))
# Index hidden state of last time step
out = self.fc(out[:, -1, :])
out = self.softmax(out)
return out, (hn,cn)
One problem you might have is CrossEntropyLoss combines a log softmax operation with negative log likelihood loss, but you're applying a softmax in your model. You should pass the raw logits out of the final layer to CrossEntropyLoss.
Also I an't say without seeing the models forward pass, but it looks like you're applying the softmax on dimension 1 to a tensor that (I'm inferring) has shape batch_size, sequence_length, output_dim, when you should be applying it along the output dim.

PyTorch network produces constant output

I am trying to train a simple MLP to approximate y=f(a,b,c).
My code is as below.
import torch
import torch.nn as nn
from torch.autograd import Variable
# hyper parameters
input_size = 3
output_size = 1
num_epochs = 50
learning_rate = 0.001
# Network definition
class FeedForwardNet(nn.Module):
def __init__(self, l1_size, l2_size):
super(FeedForwardNet, self).__init__()
self.fc1 = nn.Linear(input_size, l1_size)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(l1_size, l2_size)
self.relu2 = nn.ReLU()
self.fc3 = nn.Linear(l2_size, output_size)
def forward(self, x):
out = self.fc1(x)
out = self.relu1(out)
out = self.fc2(out)
out = self.relu2(out)
out = self.fc3(out)
return out
model = FeedForwardNet(5 , 3)
# sgd optimizer
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)
for epoch in range(11):
print ('Epoch ', epoch)
for i in range(trainX_light.shape[0]):
X = Variable( torch.from_numpy(trainX_light[i]).view(-1, 3) )
Y = Variable( torch.from_numpy(trainY_light[i]).view(-1, 1) )
# forward
optimizer.zero_grad()
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
loss.backward()
optimizer.step()
totalnorm = 0
for p in model.parameters():
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = math.sqrt(totalnorm)
print (totalnorm)
# validation code
if (epoch + 1) % 5 == 0:
print (' test points',testX_light.shape[0])
total_loss = 0
for t in range(testX_light.shape[0]):
X = Variable( torch.from_numpy(testX_light[t]).view(-1, 3) )
Y = Variable( torch.from_numpy(testY_light[t]).view(-1, 1) )
output = model(X)
loss = (Y - output).pow(2).sum()
print (output.data[0,0])
total_loss += loss
print ('epoch ', epoch, 'avg_loss ', total_loss.data[0] / testX_light.shape[0])
print ('Done')
The problem that I have now is, the validation code
output = model(X)
is always producing an exact same output value (I guess this value is some sort of garbage). I am not sure what mistake I am doing in this part. Could some help me figure out the mistake in my code?
The reason that network produced random values (and inf later) was the exploding gradient problem. Clipping the gradient (torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)) helped.

Resources