TBPTT with a multivariate time series - pytorch

I am trying to use TBPTT (truncated backpropagation through time) on a multivariate time series, and I am facing a problem: my loss doesn't decrease, and I don't know what I am doing wrong.
Inputs shape: (Batch_size, 1270, 6)
Output shape: (Batch_size, 1270)
There is a particularity with the Inputs:
The 6 features correspond to the pairs A-B, A-C, A-D, where A is the time step.
Between two inputs (Inputs[0] and Inputs[1]) the features don't have the same length, so I padded all the Inputs using
torch.nn.utils.rnn.pad_sequence(Mise_en_donnees, padding_value=-1, batch_first=True)
(I also tried padding_value=0, but it doesn't change anything.)
All Inputs are normalized using get_mean_std:
def get_mean_std(loader, ignore_idx=-1.):
    channels_sum, channels_squared_sum, num_batches = 0, 0, 0
    for data in loader:
        a = torch.sum((data[:, 0] != ignore_idx)).item() - 1
        channels_sum += torch.mean(data[:a], dim=[0])
        channels_squared_sum += torch.mean(data[:a] ** 2, dim=[0])
        num_batches += 1
    mean = channels_sum / num_batches
    std = (channels_squared_sum / num_batches - mean ** 2) ** 0.5
    return mean, std
Here is my model:
# A classic conv block
class conv_block(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(conv_block, self).__init__()
        self.relu = nn.LeakyReLU()
        self.conv = nn.Conv1d(in_channels, out_channels, **kwargs)
        self.batchnorm = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.batchnorm(x)
        return self.relu(x)

class Test(nn.Module):
    def __init__(self, in_channels, num_layers, hidden_size, p, out_size):
        super(Test, self).__init__()
        self.CNN = nn.Sequential(
            # I am trying to apply filters on every two columns (A-B, A-C, A-D) using groups
            conv_block(in_channels, 3, kernel_size=2, stride=1, padding=1, groups=3),  # ,padding_mode="reflect"),
            conv_block(3, 32, kernel_size=2, stride=1, padding=0),
            # SqueezeExcitation(32, 16),  # I tried this, but same results
            conv_block(32, 16, kernel_size=3, stride=1, padding=1),
            conv_block(16, 8, kernel_size=3, stride=1, padding=1),
        )
        self.rnn = nn.LSTM(8, hidden_size, num_layers)
        self.rnn1 = nn.LSTM(hidden_size, hidden_size, num_layers)
        # self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)  # in case of using bidirectional
        # self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)
        self.num_layers = num_layers
        self.fc_f = nn.Linear(out_size * hidden_size, out_size)

    def forward(self, x, hidden, cell):
        x = x.permute(0, 2, 1)
        x = self.CNN(x)
        x = x.permute(2, 0, 1)
        x, (hidden, cell) = self.rnn(x)  # I tried bidirectional, but same results
        # hidden = self.dropout(self.fc_hidden(torch.cat((hidden[0:self.num_layers], hidden[self.num_layers:2*self.num_layers]), dim=2)))
        # cell = self.dropout(self.fc_cell(torch.cat((cell[0:self.num_layers], cell[self.num_layers:2*self.num_layers]), dim=2)))
        x, (hidden, cell) = self.rnn1(x, (hidden, cell))
        # hidden = hidden.repeat(2, 1, 1)
        # cell = cell.repeat(2, 1, 1)
        x = x.permute(1, 0, 2)
        x = x.reshape(x.shape[0], -1)
        x = self.fc_f(x)  # final result
        return x, hidden, cell
# hyperparameters
in_channels = 6
num_layers = 64
hidden_size = 90
p = 0.2
out_size = tbptt_steps = 20  # truncated BPTT steps
split_dim = 1
nb_epoch = 100
learning_rate = 3e-4

Model = Test(in_channels, num_layers, hidden_size, p, out_size).to(device)
optimizer = optim.Adam(Model.parameters(), lr=learning_rate)

# I tried to test my model on the same inputs
X = Inputs[:5, :500, :-1].to(device)
Y = Inputs[:5, :500, -1].to(device)
# training loop
hidden = None
cell = None
for ep in range(nb_epoch):
    Losses = 0
    for i, (x_, y_) in enumerate(zip(X.split(tbptt_steps, dim=split_dim), Y.split(tbptt_steps, dim=split_dim))):
        optimizer.zero_grad()
        # Model.train()
        # Detach the last hidden state, so the backprop graph will be cut
        if hidden is not None:
            hidden.detach_()
        if cell is not None:
            cell.detach_()
        # Forward pass
        y_pred, hidden, cell = Model(x_, hidden, cell)
        # print("predict", y_pred.shape, y_.shape)
        # Compute loss
        loss = nn.functional.mse_loss(y_, y_pred)
        # Backward pass
        loss.backward()
        Losses += loss.item()
        # Update weights
        optimizer.step()
        if i == 0:
            print("Epoch ", ep, " Loss ", loss.item())
    print("#################################################")
    print(Losses)
    print("#################################################")
There are two problems with this model:
-It doesn't handle the padding value (see the masked-loss sketch below).
-The loss is high and doesn't decrease.
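For reference, here is a minimal sketch (my addition, not part of the original question) of how the padded positions could be masked out of the MSE inside the training loop, assuming the padding value of -1 from pad_sequence is kept:

# Mask out positions where the target equals the padding value before computing MSE
pad_value = -1.0                      # assumed padding value, matching pad_sequence above
mask = (y_ != pad_value)              # boolean mask over (batch, tbptt_steps)
loss = nn.functional.mse_loss(y_pred[mask], y_[mask])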
I really hope the model is understandable and that we can correct it.
As you can see, I am not a professional in machine learning, and I am really eager to understand my errors.
Thank you very much for your help.

Related

Loss does not decrease

I'm a beginner and just trying to get into PyTorch and neural networks. Therefore I created a dataset. The dataset consists of two input variables and one output variable (basically the output is a linear function with some noise). Now I want to set up a neural network and train it with the dataset. I followed a tutorial and wrote this code:
df = pd.read_csv(r" ... .csv")
X = df[["x", "y"]]
y = df[["goal"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)

# Convert data to torch tensors
class Data(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y.astype(np.float32))
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.len

batch_size = 32

# Instantiate training and test data
train_data = Data(X_train, y_train)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_data = Data(X_test, y_test)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

input_dim = 2
hidden_dim_1 = 2
output_dim = 1

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, output_dim):
        super(NeuralNetwork, self).__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim_1)
        self.layer_out = nn.Linear(hidden_dim_1, output_dim)

    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = self.layer_out(x)
        return x

model = NeuralNetwork(input_dim, hidden_dim_1, output_dim)
optimizer = optim.SGD(model.parameters(), lr=0.01)

def train(epoch):
    model.train()
    for batch_id, (data, target) in enumerate(train_data):
        data = Variable(data)
        target = Variable(target)
        target = target.to(dtype=torch.float32)
        optimizer.zero_grad()
        out = model(data)
        criterion = F.mse_loss
        loss = criterion(out, target)
        print(loss.detach().numpy())
        loss.backward()
        optimizer.step()

for epoch in range(1, 30):
    train(epoch)
My problem is that the printed loss is extremely high (in the e8 range) and does not decrease.
I tried changing some settings of the neural network, changed the batch size and learning rate, and tried other optimizers and loss functions. But none of the changes really helped, and my research also didn't bring any success. It seems to me that there is a more basic mistake in my code. What did I do wrong?
Thanks in advance!
Your code seems fine to me (although I might have missed a bug). It is in general never safe to say which networks will be successful and which won't, but here are some suggestions if you can't see any progress:
Check the input data. Maybe try plotting it to make sure that it actually contains what you think it does. You may print out the inputs, predicted and expected values (or better, view them in a debugger) to see what's wrong.
Normalize the input data. If there are high values in the input / output data, losses may explode. Ensure that most of the values are roughly between -1 and 1 (see the sketch after these suggestions).
Lower the learning rate. 0.01 is generally a good starting point, but who knows.
Try training for more epochs. Depending on the noise in your data, this could be necessary.
Try adding more neurons. A linear function should in theory be fine with not that many, but maybe the noise is too 'complex'.
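As a minimal sketch of that normalization step (my addition, not part of the original answer), you could standardize with statistics from the training split only, before building the Data datasets:

x_mean, x_std = X_train.mean(axis=0), X_train.std(axis=0)
y_mean, y_std = y_train.mean(), y_train.std()
X_train, X_test = (X_train - x_mean) / x_std, (X_test - x_mean) / x_std
y_train, y_test = (y_train - y_mean) / y_std, (y_test - y_mean) / y_std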

What should I think about when writing a custom loss function?

I'm trying to get my toy network to learn a sine wave.
I output (via tanh) a number between -1 and 1, and I want the network to minimise the following loss, where self(x) are the predictions.
loss = -torch.mean(self(x)*y)
This should be equivalent to trading a stock with a sinusoidal price, where self(x) is our desired position, and y are the returns of the next time step.
The issue I'm having is that the network doesn't learn anything. It does work if I change the loss function to be torch.mean((self(x)-y)**2) (MSE), but this isn't what I want. I'm trying to focus the network on 'making a profit', not making a prediction.
I think the issue may be related to the convexity of the loss function, but I'm not sure, and I'm not certain how to proceed. I've experimented with differing learning rates, but alas nothing works.
What should I be thinking about?
Actual code:
%load_ext tensorboard
import matplotlib.pyplot as plt; plt.rcParams["figure.figsize"] = (30,8)
import torch;from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F;import pytorch_lightning as pl
from torch import nn, tensor
def piecewise(x): return 2*(x>0)-1
class TsDs(torch.utils.data.Dataset):
def __init__(self, s, l=5): super().__init__();self.l,self.s=l,s
def __len__(self): return self.s.shape[0] - 1 - self.l
def __getitem__(self, i): return self.s[i:i+self.l], torch.log(self.s[i+self.l+1]/self.s[i+self.l])
def plt(self): plt.plot(self.s)
class TsDm(pl.LightningDataModule):
def __init__(self, length=5000, batch_size=1000): super().__init__();self.batch_size=batch_size;self.s = torch.sin(torch.arange(length)*0.2) + 5 + 0*torch.rand(length)
def train_dataloader(self): return DataLoader(TsDs(self.s[:3999]), batch_size=self.batch_size, shuffle=True)
def val_dataloader(self): return DataLoader(TsDs(self.s[4000:]), batch_size=self.batch_size)
dm = TsDm()
class MyModel(pl.LightningModule):
def __init__(self, learning_rate=0.01):
super().__init__();self.learning_rate = learning_rate
super().__init__();self.learning_rate = learning_rate
self.conv1 = nn.Conv1d(1,5,2)
self.lin1 = nn.Linear(20,3);self.lin2 = nn.Linear(3,1)
# self.network = nn.Sequential(nn.Conv1d(1,5,2),nn.ReLU(),nn.Linear(20,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
# self.network = nn.Sequential(nn.Linear(5,5),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
def forward(self, x):
out = x.unsqueeze(1)
out = self.conv1(out)
out = out.reshape(-1,20)
out = nn.ReLU()(out)
out = self.lin1(out)
out = nn.ReLU()(out)
out = self.lin2(out)
return nn.Tanh()(out)
def step(self, batch, batch_idx, stage):
x, y = batch
loss = -torch.mean(self(x)*y)
# loss = torch.mean((self(x)-y)**2)
print(loss)
self.log("loss", loss, prog_bar=True)
return loss
def training_step(self, batch, batch_idx): return self.step(batch, batch_idx, "train")
def validation_step(self, batch, batch_idx): return self.step(batch, batch_idx, "val")
def configure_optimizers(self): return torch.optim.SGD(self.parameters(), lr=self.learning_rate)
#logger = pl.loggers.TensorBoardLogger(save_dir="/content/")
mm = MyModel(0.1);trainer = pl.Trainer(max_epochs=10)
# trainer.tune(mm, dm)
trainer.fit(mm, datamodule=dm)
#
If I understand you correctly, I think that you were trying to maximize the unnormalized correlation between the network's prediction, self(x), and the target value y.
As you mention, the problem is the convexity of the loss with respect to the model weights. One way to see the problem is to consider that the model is a simple linear predictor w'*x, where w is the model weights, w' its transpose, and x the input feature vector (assume a scalar prediction for now). Then, if you look at the derivative of the loss with respect to the weight vector (i.e., the gradient), you'll find that it no longer depends on w!
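To make that concrete (my notation, not the original answer's), in the linear case the loss and its gradient are
L(w) = -\frac{1}{N}\sum_{i=1}^{N} (w^\top x_i)\, y_i, \qquad \nabla_w L = -\frac{1}{N}\sum_{i=1}^{N} x_i\, y_i,
so the gradient is a constant direction independent of w, and gradient descent just keeps pushing the weights along that direction instead of converging.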
One way to fix this is to change the loss to
loss = -torch.mean(torch.square(self(x)*y))
or
loss = -torch.mean(torch.abs(self(x)*y))
You will have another big problem, however: these loss functions encourage unbounded growth of the model weights. In the linear case, one solves this by a Lagrangian relaxation of a hard constraint on, for example, the norm of the model weight vector. I'm not sure how this would be done with neural networks, as each layer would need its own Lagrangian parameter...
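As a rough stand-in (my addition, not part of the original answer), a single L2 penalty on all weights is the crudest version of such a constraint, and in PyTorch it can be applied through the optimizer's weight_decay argument, e.g. in configure_optimizers from the question:

def configure_optimizers(self):
    # weight_decay adds an L2 penalty on all parameters; 1e-3 is a hypothetical coefficient
    return torch.optim.SGD(self.parameters(), lr=self.learning_rate, weight_decay=1e-3)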

How to check accuracy on BCELoss Pytorch?

I'm trying to use PyTorch to take a HeartDisease.csv and predict whether the patient has heart disease or not... the .csv provides 13 inputs and 1 target.
I'm using BCELoss and I'm having trouble understanding how to write an accuracy check function.
My num_samples is correct but not my num_correct. I think this is a result of not understanding the predictions tensor. Right now my num_correct is usually over 8000, while my num_samples is 303...
Any insight on how to write this check accuracy function is much appreciated.
I wrote this in a Google Colab notebook.
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# create fully connected network
class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NN, self).__init__()
        self.outputs = nn.Linear(input_size, 1)

    def forward(self, x):
        x = self.outputs(x)
        return torch.sigmoid(x)

# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
input_size = 13    # 13 inputs
num_classes = 1    # heart disease or not
learning_rate = 0.001
batch_size = 64
num_epochs = 1

# load data
class MyDataset(Dataset):
    def __init__(self, root, n_inp):
        self.df = pd.read_csv(root)
        self.data = self.df.to_numpy()
        self.x, self.y = (torch.from_numpy(self.data[:, :n_inp]),
                          torch.from_numpy(self.data[:, n_inp:]))

    def __getitem__(self, idx):
        return self.x[idx, :], self.y[idx, :]

    def __len__(self):
        return len(self.data)

train_dataset = MyDataset("heart.csv", input_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = MyDataset("heart.csv", input_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)

# loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# train network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # get data to cuda if possible
        data = data.to(device=device)
        targets = targets.to(device=device)
        # forward
        scores = model(data.float())
        targets = targets.float()
        loss = criterion(scores, targets)
        # backward
        optimizer.zero_grad()
        loss.backward()
        # grad descent or adam step
        optimizer.step()

# check accuracy of model
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x.float())
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
    print("Got {} / {} with accuracy {}".format(num_correct, num_samples, float(num_correct)/float(num_samples)*100))
    model.train()

print("checking accuracy on training data")
check_accuracy(train_loader, model)
print("checking accuracy on test data")
check_accuracy(test_loader, model)
Note: Don't fool yourself. A single linear layer + a sigmoid + BCE loss = logistic regression. This is a linear model, so just take note of that when referring to it as a "neural network", which is a term usually reserved for similar networks but with at least one hidden layer and nonlinear activations.
The sigmoid layer at the end of your model's forward() function returns an (N,1)-sized tensor, where N is the batch size. In other words, it returns a scalar for every data point. Each scalar is a value between 0 and 1 (this is the range of the sigmoid function).
The idea is to interpret those scalars as probabilities corresponding to the positive class. Suppose 1 corresponds to heart disease, and 0 corresponds to no heart disease; heart disease is the positive class, and no heart disease is the negative class. Now suppose a score is 0.6. This might be interpreted as a 60% chance that the associated label is heart disease, and a 40% chance that the associated label is no heart disease. This interpretation of the sigmoid output is what motivates the BCE loss to begin with (it's ultimately just a negative log likelihood).
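For reference (my addition), the binary cross-entropy behind BCELoss, for a predicted probability p and a label y in {0, 1}, is
\mathrm{BCE}(p, y) = -\bigl[\, y \log p + (1 - y)\log(1 - p) \,\bigr],
and minimizing its average over the dataset is the same as maximizing the likelihood of the labels under those predicted probabilities.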
So what you might do is check if your scores are greater than 0.5. If so, predict heart disease. If not, predict no heart disease.
Right now, you're computing maximums from the scores across dimension 1, which does nothing because dimension 1 is already of size 1; taking the maximum of a single value simply gives you that value.
Try something like this:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x.float())
            # Create a Boolean tensor (True for scores > 0.5, False for others)
            # and then cast it to a long tensor (Trues -> 1, Falses -> 0)
            predictions = (scores > 0.5).long()
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
    print("Got {} / {} with accuracy {}".format(num_correct, num_samples, float(num_correct)/float(num_samples)*100))
    model.train()
You may also want to squeeze your prediction and target tensors to size (N) instead of (N,1), though I'm not sure it's necessary in your case.
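That squeeze would look something like this (my illustration, not from the original answer):

predictions = predictions.squeeze(1)  # (N, 1) -> (N)
y = y.squeeze(1)                      # (N, 1) -> (N)
num_correct += (predictions == y).sum()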

Using LSTM stateful for passing context b/w batches; may be some error in context passing, not getting good results?

I have checked the data before giving it to the network. The data is correct.
I'm using an LSTM and passing the context between batches. The per_class_accuracy is changing, but the loss is not going down. I've been stuck for a long time and am not sure whether there is an error in the code.
I have a multi-class classification problem based upon an imbalanced dataset.
Dataset_type: CSV
Dataset_size: 20000
Based upon CSV data of sensors
X = 0.6986111111111111,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0
Y = leaveHouse
Per class accuracy:
{'leaveHouse': 0.34932855, 'getDressed': 1.0, 'idle': 0.8074534, 'prepareBreakfast': 0.8, 'goToBed': 0.35583413, 'getDrink': 0.0, 'takeShower': 1.0, 'useToilet': 0.0, 'eatBreakfast': 0.8857143}
Training:
# Using loss weights, the inverse of class frequency
criterion = nn.CrossEntropyLoss(weight = class_weights)
hn, cn = model.init_hidden(batch_size)
for i, (input, label) in enumerate(trainLoader):
hn.detach_()
cn.detach_()
input = input.view(-1, seq_dim, input_dim)
if torch.cuda.is_available():
input = input.float().cuda()
label = label.cuda()
else:
input = input.float()
label = label
# Forward pass to get output/logits
output, (hn, cn) = model((input, (hn, cn)))
# Calculate Loss: softmax --> cross entropy loss
loss = criterion(output, label)#weig pram
running_loss += loss
loss.backward() # Backward pass
optimizer.step() # Now we can do an optimizer step
optimizer.zero_grad() # Reset gradients tensors
Network
class LSTMModel(nn.Module):
def init_hidden(self, batch_size):
self.batch_size = batch_size
if torch.cuda.is_available():
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).cuda()
else:
hn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
# Initialize cell state
cn = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim)
return hn, cn
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, seq_dim):
super(LSTMModel, self).__init__()
# Hidden dimensions
self.hidden_dim = hidden_dim
# Number of hidden layers
self.layer_dim = layer_dim
self.input_dim = input_dim
# Building your LSTM
# batch_first=True causes input/output tensors to be of shape
# (batch_dim, seq_dim, feature_dim)
self.lstm = nn.LSTM(self.input_dim, hidden_dim, layer_dim, batch_first=True)
# Readout layer
self.fc = nn.Linear(hidden_dim, output_dim)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.seq_dim = seq_dim
def forward(self, inputs):
# Initialize hidden state with zeros
input, (hn, cn) = inputs
input = input.view(-1, self.seq_dim, self.input_dim)
# time steps
out, (hn, cn) = self.lstm(input, (hn, cn))
# Index hidden state of last time step
out = self.fc(out[:, -1, :])
out = self.softmax(out)
return out, (hn,cn)
One problem you might have is CrossEntropyLoss combines a log softmax operation with negative log likelihood loss, but you're applying a softmax in your model. You should pass the raw logits out of the final layer to CrossEntropyLoss.
Also, I can't say for certain without running the model, but it looks like you're applying the softmax on dimension 1 to a tensor that (I'm inferring) has shape batch_size, sequence_length, output_dim, when you should be applying it along the output dim.
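A minimal sketch of the first point (my illustration, adapted from the code in the question): drop the softmax from forward and pass the raw logits to CrossEntropyLoss, which applies log-softmax internally.

def forward(self, inputs):
    input, (hn, cn) = inputs
    input = input.view(-1, self.seq_dim, self.input_dim)
    out, (hn, cn) = self.lstm(input, (hn, cn))
    out = self.fc(out[:, -1, :])   # raw logits, shape (batch_size, output_dim)
    return out, (hn, cn)           # no softmax here; CrossEntropyLoss handles it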

How can I use LSTM in pytorch for classification?

My code is as below:
class Mymodel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, batch_size):
        super(Mymodel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.proj = nn.Linear(hidden_size, output_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size)),
                Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size)))

    def forward(self, x):
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        output = self.proj(lstm_out)
        result = F.sigmoid(output)
        return result
I want to use an LSTM to classify a sentence as good (1) or bad (0). Using this code, I get a result of shape time_step * batch_size * 1, not 0 or 1. How should I edit the code in order to get the classification result?
Theory:
Recall that an LSTM outputs a vector for every input in the series. You are using sentences, which are a series of words (probably converted to indices and then embedded as vectors). This code from the LSTM PyTorch tutorial makes clear exactly what I mean (*** marks my emphasis):
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3)))
          for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# *** (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
One more time: compare the last slice of "out" with "hidden" below, they are the same. Why? Well...
If you're familiar with LSTM's, I'd recommend the PyTorch LSTM docs at this point. Under the output section, notice h_t is output at every t.
Now if you aren't used to LSTM-style equations, take a look at Chris Olah's LSTM blog post. Scroll down to the diagram of the unrolled network:
As you feed your sentence in word-by-word (x_i-by-x_i+1), you get an output from each timestep. You want to interpret the entire sentence to classify it. So you must wait until the LSTM has seen all the words. That is, you need to take h_t where t is the number of words in your sentence.
Code:
Here's a coding reference. I'm not going to copy-paste the entire thing, just the relevant parts. The magic happens at self.hidden2label(lstm_out[-1])
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        ...
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        x = embeds.view(len(sentence), self.batch_size, -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(lstm_out[-1])
        log_probs = F.log_softmax(y)
        return log_probs
The main problem you need to figure out is in which dim to place your batch size when you prepare your data. As far as I know, if you didn't set it in your nn.LSTM() init function, it will automatically assume that the second dim is your batch size, which is quite different compared to other DNN frameworks. Maybe you can try:
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
like this, to ask your model to treat the first dim as the batch dim.
As the last layer you have to have a linear layer for however many classes you want, i.e. 10 if you are doing digit classification as in MNIST. For your case, since you are doing a yes/no (1/0) classification, you have two labels/classes, so the linear layer has two outputs. I suggest adding a linear layer as
nn.Linear(feature_size_from_previous_layer, 2)
and then training the model using a cross-entropy loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
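Putting those pieces together, a minimal end-to-end sketch (my illustration, with hypothetical sizes, not the exact model from the question):

import torch
import torch.nn as nn
import torch.optim as optim

input_size, hidden_size, num_classes = 50, 64, 2   # hypothetical sizes
seq_len, batch_size = 7, 16

lstm = nn.LSTM(input_size, hidden_size)            # expects (seq_len, batch, input_size)
classifier = nn.Linear(hidden_size, num_classes)   # two outputs for the good/bad classes
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(list(lstm.parameters()) + list(classifier.parameters()), lr=0.001, momentum=0.9)

x = torch.randn(seq_len, batch_size, input_size)        # dummy batch of already-embedded sentences
labels = torch.randint(0, num_classes, (batch_size,))   # integer class labels, not one-hot

lstm_out, _ = lstm(x)               # (seq_len, batch, hidden_size)
logits = classifier(lstm_out[-1])   # last time step -> (batch, num_classes), raw logits
loss = criterion(logits, labels)    # CrossEntropyLoss applies log-softmax internally

optimizer.zero_grad()
loss.backward()
optimizer.step()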
