Different training results obtained from training a simple LSTM in Keras and PyTorch

I'm trying to port my LSTM model from Keras to PyTorch, but the results in PyTorch look really bad at the moment. The network is really simple, as shown below.
model = Sequential()
model.add(LSTM(10, input_length=shape[1], input_dim=shape[2]))
# output shape: (1, 1)
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(1,activation="linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()
And here is my migration to the PyTorch framework:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, bilstm=False):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.isBi = bilstm
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bilstm).double()
        # for name, param in self.lstm.named_parameters():
        #     if name.startswith("weight"):
        #         nn.init.orthogonal_(param)
        #     else:
        #         pass
        self.fc1 = nn.Sequential(nn.Linear(hidden_dim, 10).double(), nn.Tanh())
        self.final_layer1 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer2 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer3 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer4 = nn.Sequential(nn.Linear(10, output_dim).double())

    def forward(self, x):
        out, (hn, cn) = self.lstm(x)
        out = out[:, -1, :]  # last time step, matching the Keras LSTM with return_sequences=False
        out = self.fc1(out)
        out = self.final_layer1(out)
        out = self.final_layer2(out)
        out = self.final_layer3(out)
        out = self.final_layer4(out)
        return out
The result is really bad. I was wondering whether the initialization methods/activation functions used in Keras are different from the ones I used in PyTorch (Keras seems to use hard_sigmoid for the recurrent activation where PyTorch uses sigmoid?).
Would really appreciate it if somebody could help me with this problem!
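For reference, here is a minimal sketch of what reproducing Keras' default LSTM initialization on top of nn.LSTM could look like (assuming Keras' defaults of glorot_uniform kernels, orthogonal recurrent kernels, and zero biases with unit_forget_bias=True; the recurrent-activation difference, hard_sigmoid vs. sigmoid, cannot be changed this way and would need a custom cell):

import torch
import torch.nn as nn

def init_lstm_like_keras(lstm: nn.LSTM):
    # Sketch only: copies Keras-style defaults onto PyTorch's parameter layout.
    for name, param in lstm.named_parameters():
        if name.startswith("weight_ih"):
            nn.init.xavier_uniform_(param)   # kernel_initializer='glorot_uniform'
        elif name.startswith("weight_hh"):
            nn.init.orthogonal_(param)       # recurrent_initializer='orthogonal'
        elif name.startswith("bias"):
            nn.init.zeros_(param)            # bias_initializer='zeros'
            if name.startswith("bias_ih"):
                # PyTorch keeps separate bias_ih/bias_hh vectors; setting the
                # forget-gate slice of one of them to 1 mimics unit_forget_bias=True
                # (gate order in PyTorch is i, f, g, o).
                hs = lstm.hidden_size
                param.data[hs:2 * hs] = 1.0

lstm = nn.LSTM(input_size=4, hidden_size=10, batch_first=True)
init_lstm_like_keras(lstm)

This only covers the initialization side; whether it explains the gap is an open question.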
UPDATED
My training code in PyTorch:
criterion = nn.MSELoss()
model = LSTM(input_dim, hidden_dim, num_layers, output_dim, bilstm)
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, epoch_number + 1):
    model.train()
    iteration = 0
    for i, data in enumerate(train_loader):
        dat, label = data
        dat = dat.double()
        label = label.double()
        if torch.cuda.is_available():
            dat = dat.cuda()
            label = label.cuda()
        else:
            dat = Variable(dat)
            label = Variable(label)
        out = model(dat)
        optimizer.zero_grad()
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()

Related

PyTorch text classification model not improving

I am training a simple LSTM model for binary text classification. Here is the model class:
class LSTM(nn.Module):
    def __init__(self, vocabulary_size, embeddings_size, num_classes):
        super(LSTM, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embeddings_size = embeddings_size
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                      embedding_dim=embeddings_size,
                                      padding_idx=0)
        self.lstm = nn.LSTM(input_size=embeddings_size,
                            hidden_size=128,
                            num_layers=1,
                            batch_first=True)
        self.fc = nn.Linear(in_features=128,
                            out_features=num_classes)

    def forward(self, x):
        out = self.embedding(x)
        out, _ = self.lstm(out)
        out = out[:, -1]
        out = self.fc(out)
        out = torch.sigmoid(out)
        return out
I am using BCELoss and Adam optimizer created with the following code:
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=learning_rate)
This is the training loop that I am using:
train_steps = len(train_data_loader)
for epoch in range(epochs):
    train_loss = 0
    model.train()
    for i, (sequences, labels) in enumerate(train_data_loader):
        optimizer.zero_grad()
        sequences = sequences.to(device)
        labels = labels.to(device)
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss / train_steps:.4f}')
I have experimented with different datasets, numbers of epochs, learning rates, and batch sizes. However, the model does not seem to learn: the loss is always around 0.7 and only the 0 class is predicted.
Does anyone know what the issue could be?
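As a point of reference, nn.BCELoss expects the model output and the target to have the same shape and a floating-point dtype; below is a minimal sketch with a single output unit (num_classes=1 is an assumption for illustration, not taken from the question):

import torch
import torch.nn as nn

criterion = nn.BCELoss()

batch_size = 4
outputs = torch.sigmoid(torch.randn(batch_size, 1))    # model output after sigmoid, shape (B, 1)
labels = torch.tensor([0., 1., 1., 0.]).unsqueeze(1)   # float targets with the same shape (B, 1)

loss = criterion(outputs, labels)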

How to create a data preprocessing pipeline in PyTorch outside the DataLoader class?

I am trying to make a model for data with 40 features which have to be classified into 10 classes. I am new to PyTorch and this is my first project in it.
I am given a custom Dataset class (which I am not allowed to change) which is as follows:
class MyData(Dataset):
    def __init__(self, mode):
        with open(mode + '.pkl', 'rb') as handle:
            data = pickle.load(handle)
        self.X = data['x'].astype('float')
        self.y = data['y'].astype('long')

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = (self.X[idx], self.y[idx])
        return sample
I have done some preprocessing on the data, like normalization, and then trained and saved the model. As I wasn't allowed to change the Dataset class, I made the changes outside of it and then used a DataLoader. The preprocessing is as follows:
train_data=MyData("train")
features, labels = train_data[:]
df = pd.DataFrame(features)
x = df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
input_array = x_scaled
output_array = labels
inputs = torch.Tensor(input_array)
targets = torch.Tensor(output_array).type(torch.LongTensor)
dataset = TensorDataset(inputs, targets)
train_ds, val_ds = random_split(dataset, [3300, 300])
batch_size = 300
n_epochs = 200
log_interval = 10
train_losses = []
train_counter = []
test_losses = []
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
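One detail worth keeping in mind (an assumption about intent, since only the training pickle is shown): the MinMaxScaler should be fitted on the training features only and then reused via transform, not fit_transform, on any held-out data. Continuing the snippet above with a hypothetical "test" pickle:

# Hypothetical held-out split; reuse the scaler fitted on the training features.
test_data = MyData("test")
test_features, test_labels = test_data[:]
test_scaled = min_max_scaler.transform(pd.DataFrame(test_features).values)
test_inputs = torch.Tensor(test_scaled)
test_targets = torch.Tensor(test_labels).type(torch.LongTensor)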
After this I define the training and testing functions (and remove the print statements, since the autograder will not be able to grade my assignment if I leave them in) as follows:
def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.double())
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            train_losses.append(loss.item())
            train_counter.append(
                (batch_idx * 32) + ((epoch - 1) * len(train_loader.dataset)))
            save_model(model)

def test():
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data.double())
            test_loss += criterion(output, target).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
    test_loss /= len(val_loader.dataset)
    test_losses.append(test_loss)

test()
for epoch in range(1, n_epochs + 1):
    train(epoch)
    test()
Even after doing that, the autograder is still not able to grade my code. I think it's mainly because I am making an error in how I feed the data to the model, but I am not able to narrow down what exactly the problem is and how to correct it. As I'm new to PyTorch, I looked at how to do the preprocessing, but all the examples I found involved the Dataset class, so I'm not sure how to go about it.
My model is as follows:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # self.flatten = nn.Flatten()
        self.net_stack = nn.Sequential(
            nn.Conv1d(in_channels=40, out_channels=256, kernel_size=1, stride=2),  # applying batch norm
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=1),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(256, affine=True),
            nn.Conv1d(in_channels=256, out_channels=128, kernel_size=1, stride=2),  # applying batch norm
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=1),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(128, affine=True),
            nn.Conv1d(in_channels=128, out_channels=64, kernel_size=1, stride=2),  # applying batch norm
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=1),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(64, affine=True),
            nn.Conv1d(in_channels=64, out_channels=32, kernel_size=1, stride=2),  # applying batch norm
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=1),
            nn.Dropout(p=0.1),
            nn.BatchNorm1d(32, affine=True),
            nn.Flatten(),
            nn.Linear(32, 10),
            nn.Softmax(dim=1)).double()

    def forward(self, x):
        # result = self.net_stack(x[None])
        x = x.double()
        result = self.net_stack(x[:, :, None]).double()
        print(result.size())
        return result
One instruction I've been given is the following note they've written:
# Please make sure we can load your model with:
# model = MyModel()
# This means you must give default values to all parameters you may wish to set, such as output size.
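In other words (a minimal sketch of what that instruction seems to ask for, with a hypothetical output_size parameter):

class MyModel(nn.Module):
    def __init__(self, output_size=10):  # default value, so the grader can call MyModel() with no arguments
        super(MyModel, self).__init__()
        ...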
You can try to do it within the training loop:
for batch_idx, (data, target) in enumerate(train_loader):
    # you can do something here to manipulate your input
    data = transform(data)
    data = data.to('cuda')  # move to GPU; I noticed you didn't do it in your training loop
    # Forward pass
    output = model(data)
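A minimal sketch of what that transform could be here, reusing the min-max idea from the question (data_min and data_max are hypothetical names for per-feature statistics computed from the training set beforehand, not variables from the original code):

# Per-batch min-max normalization inside the loop; assumes model.to('cuda') was called.
def transform(batch):
    return (batch - data_min) / (data_max - data_min + 1e-8)

for batch_idx, (data, target) in enumerate(train_loader):
    optimizer.zero_grad()
    data = transform(data)
    data, target = data.to('cuda'), target.to('cuda')
    output = model(data.double())
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()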

Training loss is not changing at all while training the model

I'm trying to solve a VQA classification problem. My training loss is not changing at all while training the model.
I commented out the CNN model and tried to run it with the text only, but still there is no change in the loss value.
The data passes through these models:
class question_lstm(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, output_dim, que_size):
        super(question_lstm, self).__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        # self.fc1 = nn.Linear(n_layers*hid_dim, que_size)
        self.fc1 = nn.Linear(n_layers*output_dim, que_size)

    def forward(self, question):
        emb_question = self.embedding(question)      # (batchsize, input_dim, emb_dim=256)
        emb_question = self.dropout(emb_question)
        emb_question = self.tanh(emb_question)
        emb_question = emb_question.transpose(0, 1)  # (input_dim, batchsize, emb_dim)
        output, (hidden, cell) = self.lstm(emb_question)
        qu_feature = torch.cat((hidden, cell), dim=2)
        qu_feature = qu_feature.transpose(0, 1)      # (batchsize=100, num_layer=2, hid_dim=2048)
        question_output = self.fc1(qu_feature)
        return question_output
class vqamodel(nn.Module):
    def __init__(self, output_dim, input_dim, emb_dim, hid_dim, n_layers, dropout, answer_len, que_size):
        super(vqamodel, self).__init__()
        # self.image = img_CNN(img_size, image_feature)
        self.question = question_lstm(input_dim, emb_dim, hid_dim, n_layers, dropout, output_dim, que_size)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(que_size, output_dim)
        self.fc2 = nn.Linear(output_dim, answer_len)

    def forward(self, image, question):
        question_emb = self.question(question)
        combine = question_emb  # *img_emb
        out_feature = self.fc1(combine)         # (batchsize=100, output_dim=2048)
        out_feature = self.relu(out_feature)
        out_feature = self.dropout(out_feature)
        out_feature = self.fc2(out_feature)     # (batchsize=100, answer_len=1000)
        return out_feature
I’m using cross entropy loss and Adam:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vqa_model.parameters(),lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
Any idea what could cause this constant loss value?
The training loop:
def train(model, criterion, optimizer, scheduler):
    start_time = time.time()  # the time we start the training
    for epoch in range(num_epochs):
        train_loss = 0
        # test_loss = 0
        train_correct = 0
        # test_correct = 0
        vqa_model.train()
        for i, sample in enumerate(train_VQAdataset_loader):
            # image = sample['image'].to(device=device)
            question = sample['question'].to(torch.int64).to(device=device)
            label = sample['answer'].to(device=device)
            output = vqa_model(image, question)  # forward
            loss = criterion(output, label)
            optimizer.zero_grad()  # zero the gradients
            loss.backward()        # backprop
            optimizer.step()       # update weights
            scheduler.step()
            # Statistics
            train_loss += loss.item()  # save the loss for the entire epoch
            _, predictions = torch.max(output, 1)
            train_correct += (predictions == label).sum()  # number of successes - cumulative
        train_losses.append(train_loss / len(train_VQAdataset_loader))
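One thing that may be worth checking, offered as an observation rather than a confirmed diagnosis: scheduler.step() is called once per batch in this loop, and StepLR(step_size=7, gamma=0.1) multiplies the learning rate by 0.1 every 7 calls, so the learning rate collapses towards zero within the first epoch. A minimal sketch of stepping it per epoch and printing the current rate:

# Step StepLR once per epoch instead of once per batch, and print the rate to see whether it has collapsed.
for epoch in range(num_epochs):
    for i, sample in enumerate(train_VQAdataset_loader):
        ...  # forward / backward / optimizer.step() as in the loop above
    scheduler.step()
    print(f"epoch {epoch}: lr = {optimizer.param_groups[0]['lr']}")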

I get something wrong when using model.train() and model.eval() in PyTorch

I have prepared features and their labels as below; I want to build a model which is constructed from a Transformer encoder followed by a linear layer to predict a value, but I got a problem when I use the model to predict after its training.
First I run the code below:
import torch
from torch import nn

features = torch.rand(batch_size, channels, length)
labels = torch.rand(batch_size)

class TransformerModel(nn.Module):
    def __init__(self):
        super(TransformerModel, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=8, nhead=8, dropout=0.5)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, 6)
        self.decoder = nn.Linear(40, 1)

    def forward(self, src):
        encoded = self.transformer_encoder(src.transpose(1, 0)).transpose(1, 0)
        pred = self.decoder(encoded.reshape(encoded.shape[0], -1))
        return pred

model = TransformerModel()
criterion = nn.MSELoss()
lr = 0.3  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def train():
    model.train()  # turn on train mode
    optimizer.zero_grad()
    output = model(features)
    loss = criterion(output.view(-1, 1), labels.view(-1, 1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    return loss.item()

for _ in range(100):
    train()
After that, I predict on the features with the code below:
model.eval()
output = model(features)
All values of output are the same, but if I use model.train(), the output seems OK; so what is the problem? Or was the model built wrong?
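For what it's worth, the dropout layers (dropout=0.5 in the encoder layers) are the only part of this model that behaves differently between train() and eval() mode. A small sketch for checking the reported behaviour, comparing the spread of the predictions in the two modes on the same inputs:

# Compare eval-mode vs. train-mode outputs on identical inputs.
model.eval()
with torch.no_grad():
    eval_out = model(features)

model.train()
with torch.no_grad():
    train_out = model(features)

print(eval_out.std().item(), train_out.std().item())  # a near-zero std means the outputs collapsed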

PyTorch LSTMCell Teacher Forcing

I'm fairly new to PyTorch and I'm trying to design an 18-node LSTM using LSTMCell with teacher forcing. I'm having quite a few difficulties.
Here's my model:
class tryLSTM(nn.moduleList):
    def __init__(self, input_size, hidden_size, batch_size):
        super(tryLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.lstm0 = nn.LSTMCell(input_size, hidden_size, bias=True)
        self.lstm1 = nn.LSTMCell(input_size, hidden_size, bias=True)
        self.lstm2 = nn.LSTMCell(input_size, hidden_size, bias=True)
        .........
        self.lstm17 = nn.LSTMCell(input_size, hidden_size, bias=True)

    def init_hidden(self):
        # initialize the hidden state and the cell state to zeros
        hidden = torch.zeros(self.batch_size, self.hidden_size)
        cell = torch.zeros(self.batch_size, self.hidden_size)
        return hidden, cell

    def forward(self, x, hc):
        out = []
        h_0, c_0 = hc
        h_1, c_1 = self.lstm1(x[0], h_0, c_0)
        out[0] = h_1
        h_2, c_2 = self.lstm2(x[1], h_1, c_1)
        out[1] = h_2
        ......
        h_17, c_17 = self.lstm17(x[16], h_16, c_16)
        out[16] = h_17
model = tryLSTM(input_size=128, hidden_size=128, batch_size=18)
if gpu: model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.BCELoss(weight=None, reduction='mean')
Here's the training loop:
def train(epoch):
    model.train()
    # initialize hidden and cell state
    hc = model.init_hidden()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Zero out the gradients
        optimizer.zero_grad()
        target = data[1:]
        print(target.size())
        # Put data on GPU
        if gpu:
            data = data.cuda()
            target = target.cuda()
        # Get outputs of LSTM
        output = model(data, hc)
        print(output.size)
        # Calculate loss
        loss = criterion(output, target)
        # Calculate gradients
        loss.backward()
        # Update model parameters
        optimizer.step()
        train_loss.append(loss.item())
Q.1 I'm getting the following error:
TypeError: forward() takes from 2 to 3 positional arguments but 4 were given
Please help, Thanks!
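For reference, nn.LSTMCell takes the input for one time step plus a single (h, c) tuple, not h and c as separate positional arguments, which matches the TypeError above. A minimal sketch of the expected call (sizes chosen to mirror the model here):

import torch
import torch.nn as nn

cell = nn.LSTMCell(input_size=128, hidden_size=128)
x_t = torch.randn(18, 128)        # one time step for a batch of 18
h_0 = torch.zeros(18, 128)
c_0 = torch.zeros(18, 128)
h_1, c_1 = cell(x_t, (h_0, c_0))  # hidden and cell state passed as one tuple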
