Translating LSTM model from Keras to Pytorch - keras

I am having a hard time translating a quite simple LSTM model from Keras to Pytorch. X (get it here) corresponds to 1152 samples of 90 timesteps, each timestep has only 1 dimension. y (here) is a single prediction at t = 91 for all 1152 samples.
In Keras:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
import numpy as np
import pandas as pd
X = pd.read_csv('X.csv', header = None).values
X.shape
y = pd.read_csv('y.csv', header = None).values
y.shape
# From Keras documentation [https://keras.io/layers/recurrent/]:
# Input shape 3D tensor with shape (batch_size, timesteps, input_dim).
X = np.reshape(X, (1152, 90, 1))
regressor = Sequential()
regressor.add(LSTM(units = 100, return_sequences = True, input_shape = (90, 1)))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.3))
regressor.add(Dense(units = 1, activation = 'linear'))
regressor.compile(optimizer = 'rmsprop', loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
regressor.fit(X, y, epochs = 10, batch_size = 32)
... leads me to:
# Epoch 10/10
# 1152/1152 [==============================] - 33s 29ms/sample - loss: 0.0068 - mean_absolute_error: 0.0628
Then in Pytorch:
import torch
from torch import nn, optim
from sklearn.metrics import mean_absolute_error
X = pd.read_csv('X.csv', header = None).values
y = pd.read_csv('y.csv', header = None).values
X = torch.tensor(X, dtype = torch.float32)
y = torch.tensor(y, dtype = torch.float32)
dataset = torch.utils.data.TensorDataset(X, y)
loader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle = True)
class regressor_LSTM(nn.Module):
def __init__(self):
super().__init__()
self.lstm1 = nn.LSTM(input_size = 1, hidden_size = 100)
self.lstm2 = nn.LSTM(100, 50)
self.lstm3 = nn.LSTM(50, 50, dropout = 0.3, num_layers = 2)
self.dropout = nn.Dropout(p = 0.3)
self.linear = nn.Linear(in_features = 50, out_features = 1)
def forward(self, X):
# From the Pytorch documentation [https://pytorch.org/docs/stable/_modules/torch/nn/modules/rnn.html]:
# **input** of shape `(seq_len, batch, input_size)`
X = X.view(90, 32, 1)
# I am discarding hidden/cell states since in Keras I am using a stateless approach
# [https://keras.io/examples/lstm_stateful/]
X, _ = self.lstm1(X)
X = self.dropout(X)
X, _ = self.lstm2(X)
X = self.dropout(X)
X, _ = self.lstm3(X)
X = self.dropout(X)
X = self.linear(X)
return X
regressor = regressor_LSTM()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(regressor.parameters())
for epoch in range(10):
running_loss = 0.
running_mae = 0.
for i, data in enumerate(loader):
inputs, labels = data
optimizer.zero_grad()
outputs = regressor(inputs)
outputs = outputs[-1].view(*labels.shape)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
mae = mean_absolute_error(labels.detach().cpu().numpy().flatten(), outputs.detach().cpu().numpy().flatten())
running_mae += mae
print('EPOCH %3d: loss %.5f - MAE %.5f' % (epoch+1, running_loss/len(loader), running_mae/len(loader)))
... leads me to:
# EPOCH 10: loss 0.04220 - MAE 0.16762
You can notice that both loss and MAE are quite different (Pytorch's are much higher). If I use Pytorch's model to predict the values, they all return as a constant.
What am I doing wrong?

Oh I believe I made considerable progress. It seems that the way to represent y is different between Keras and Pytorch. In Keras, we should pass it as a single value representing one timestep in the future (or, at least, for the problem I am trying to solve). But in Pytorch, y must be X shifted one timestep to the future. It is like this:
time_series = [0, 1, 2, 3, 4, 5]
X = [0, 1, 2, 3, 4]
# Keras:
y = [5]
# Pytorch:
y = [1, 2, 3, 4, 5]
This way, Pytorch compares all values in the time slice when calculating loss. I believe Keras rearranges the data under the hood to conform to this approach, as the code works when fed the variables just like that. But in Pytorch, I was estimating loss based only on one value (the one I was trying to predict), not the whole series, therefore I believe it could not correctly capture the time dependency.
When taking this in consideration, I got to:
EPOCH 100: loss 0.00551 - MAE 0.058435
And, most importantly, comparing true and predicted values in a separate dataset got me to
The patterns were clearly captured by the model.
Hooray!

Related

PyTorch simple ConvNet diverge so easly

So I'm studiying pytorch coming from a background with tensorflow.
I'm trying to replicate a simple convnet, that I've developed with success in tensorflow, to classify cat vs dogs images.
In pytorch I see some strange behaviors:
Using a Learning Rate of 0.001 make the CNet predicting only 0 after the first batch (might be exploding gradients?)
Using a Learning Rate of 0.0005 gives a smooth learning curve and the CNet converge
Can anyone help me to understand what I'm doing wrong? that the code:
import pathlib
import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data.dataloader import DataLoader
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CNet(torch.nn.Module):
def __init__(self):
super(CNet, self).__init__() #input is 180x180 image
self.conv1 = torch.nn.Conv2d(3, 32, 3) # out -> 178x178x32
self.conv2 = torch.nn.Conv2d(32, 64, 3)
self.conv3 = torch.nn.Conv2d(64, 128, 3)
self.conv4 = torch.nn.Conv2d(128, 256, 3)
self.conv5 = torch.nn.Conv2d(256, 256, 3)
self.flatten = torch.nn.Flatten()
#self.fc = torch.nn.LazyLinear(1)
self.fc = torch.nn.Linear(7*7*256, 1)
def forward(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv4(x)), (2, 2))
x = F.relu(self.conv5(x))
x = self.flatten(x)
o = torch.sigmoid(self.fc(x))
return o
def train(model : CNet, train_data : DataLoader, criterion, optimizer : torch.optim.Optimizer, epochs = 10, validation_data : DataLoader = None):
losses = []
for epoch in range(epochs):
epoch_loss = 0.0
running_loss = 0.0
for i, data in enumerate(train_data, 0):
imgs, labels = data
imgs, labels = imgs.to(device), labels.to(device, dtype=torch.float)
labels = labels.unsqueeze(-1)
# run
output = net(imgs)
# zero out accumulated grads
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
epoch_loss += loss.item()
#if i % 50 == 49:
# print(f'[{epoch+1}, {i:5d}] loss: {running_loss / 50.0:.3f}')
# running_loss = 0.0
losses.append(epoch_loss / len(train_data.dataset))
print(f'[{epoch+1}, {epochs:5d}] loss: {losses[-1]:.3f}')
return losses
if __name__=="__main__":
transforms = torchvision.transforms.Compose([
torchvision.transforms.Resize((180, 180)),
torchvision.transforms.ToTensor(),
])
dataset_dir = pathlib.Path("E:\Datasets\\torch\Cat_Dog\cats_vs_dogs_small")
train_data = torchvision.datasets.ImageFolder(dataset_dir / "train", transform=transforms)
validation_data = torchvision.datasets.ImageFolder(dataset_dir / "validation", transform=transforms)
test_data = torchvision.datasets.ImageFolder(dataset_dir / "test", transform=transforms)
train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2, persistent_workers=True, pin_memory=True)
validation_data_loader = DataLoader(validation_data, batch_size=32, num_workers=2, shuffle=True, pin_memory=True)
test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True, pin_memory=True, num_workers=2)
import matplotlib.pyplot as plt
#plt.figure()
#for i in range(1, 10):
# plt.subplot(3, 3, i)
# plt.axis('off')
# rand_idx = np.random.random_integers(0, len(train_data))
# plt.imshow(np.moveaxis(test_data[rand_idx][0].numpy(), 0, 2))
#plt.show()
net = CNet()
net = net.to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.RMSprop(net.parameters(), 0.001)
net.train()
# TODO save best model
losses = train(net, train_data_loader, criterion, optimizer, epochs=30)
epochs = range(1, len(losses) + 1)
plt.plot(epochs, losses, 'bo', label='Training Loss')
plt.show()
print('Training Finished')
correct_count, all_count = 0, 0
for images,labels in test_data_loader:
images,labels = images.to(device), labels.to(device, dtype=torch.float)
with torch.no_grad():
ps = net(images)
pred_label = (ps > 0.5).to(torch.float)
true_label = labels.unsqueeze(1)
correct_count += (pred_label == true_label).sum().item()
all_count += len(labels)
print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))
and here some screenshot of the loss for each point:
LR=0.001 (not convering on pytorch, converging on tensorflow)
LR=0.0005 (converging in 30 epochs) [I know that the validation loss is not 0, accuracy is ~70% but is expected]
As you can see the loss on the two experiment are very different in scale. What might cause that such a weird behavior? I call it 'wierd' cause I never seen that happen on tensorflow.
Is typicall such different behavior between those 2 framework? or am I loosing something?

Pytorch couldn't build multi scaled kernel nested model

I'm trying to create a modified MNIST model which takes input 1x28x28 MNIST tensor images, and it kind of branches into different models with different sized kernels, and accumulates at the end, so as to give a multi-scale-kerneled response in the spatial domain of the images. I'm worried about the model, since, I'm unable to construct it.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torchvision import datasets, transforms
import torch.nn.functional as F
import timeit
import unittest
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)
# check availability of GPU and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# define a transforms for preparing the dataset
transform = transforms.Compose([
transforms.ToTensor(), # convert the image to a pytorch tensor
transforms.Normalize((0.1307,), (0.3081,)) # normalise the images with mean and std of the dataset
])
# Load the MNIST training, test datasets using `torchvision.datasets.MNIST` using the transform defined above
train_dataset = datasets.MNIST('./data',train=True,transform=transform,download=True)
test_dataset = datasets.MNIST('./data',train=False,transform=transform,download=True)
# create dataloaders for training and test datasets
# use a batch size of 32 and set shuffle=True for the training set
train_dataloader = Data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = Data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)
# My Net
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# define a conv layer with output channels as 16, kernel size of 3 and stride of 1
self.conv11 = nn.Conv2d(1, 16, 3, 1) # Input = 1x28x28 Output = 16x26x26
self.conv12 = nn.Conv2d(1, 16, 5, 1) # Input = 1x28x28 Output = 16x24x24
self.conv13 = nn.Conv2d(1, 16, 7, 1) # Input = 1x28x28 Output = 16x22x22
# define a conv layer with output channels as 32, kernel size of 3 and stride of 1
self.conv21 = nn.Conv2d(16, 32, 3, 1) # Input = 16x26x26 Output = 32x24x24
self.conv22 = nn.Conv2d(16, 32, 5, 1) # Input = 16x24x24 Output = 32x20x20
self.conv23 = nn.Conv2d(16, 32, 7, 1) # Input = 16x22x22 Output = 32x16x16
# define a conv layer with output channels as 64, kernel size of 3 and stride of 1
self.conv31 = nn.Conv2d(32, 64, 3, 1) # Input = 32x24x24 Output = 64x22x22
self.conv32 = nn.Conv2d(32, 64, 5, 1) # Input = 32x20x20 Output = 64x16x16
self.conv33 = nn.Conv2d(32, 64, 7, 1) # Input = 32x16x16 Output = 64x10x10
# define a max pooling layer with kernel size 2
self.maxpool = nn.MaxPool2d(2), # Output = 64x11x11
# define dropout layer with a probability of 0.25
self.dropout1 = nn.Dropout(0.25)
# define dropout layer with a probability of 0.5
self.dropout2 = nn.Dropout(0.5)
# define a linear(dense) layer with 128 output features
self.fc11 = nn.Linear(64*11*11, 128)
self.fc12 = nn.Linear(64*8*8, 128) # after maxpooling 2x2
self.fc13 = nn.Linear(64*5*5, 128)
# define a linear(dense) layer with output features corresponding to the number of classes in the dataset
self.fc21 = nn.Linear(128, 10)
self.fc22 = nn.Linear(128, 10)
self.fc23 = nn.Linear(128, 10)
self.fc33 = nn.Linear(30,10)
def forward(self, x1):
# Use the layers defined above in a sequential way (folow the same as the layer definitions above) and
# write the forward pass, after each of conv1, conv2, conv3 and fc1 use a relu activation.
x = F.relu(self.conv11(x1))
x = F.relu(self.conv21(x))
x = F.relu(self.maxpool(self.conv31(x)))
#x = torch.flatten(x, 1)
x = x.view(-1,64*11*11)
x = self.dropout1(x)
x = F.relu(self.fc11(x))
x = self.dropout2(x)
x = self.fc21(x)
y = F.relu(self.conv12(x1))
y = F.relu(self.conv22(y))
y = F.relu(self.maxpool(self.conv32(y)))
#x = torch.flatten(x, 1)
y = y.view(-1,64*8*8)
y = self.dropout1(y)
y = F.relu(self.fc12(y))
y = self.dropout2(y)
y = self.fc22(y)
z = F.relu(self.conv13(x1))
z = F.relu(self.conv23(z))
z = F.relu(self.maxpool(self.conv33(z)))
#x = torch.flatten(x, 1)
z = z.view(-1,64*5*5)
z = self.dropout1(z)
z = F.relu(self.fc13(z))
z = self.dropout2(z)
z = self.fc23(z)
out = self.fc33(torch.cat((x, y, z), 0))
output = F.log_softmax(out, dim=1)
return output
import unittest
class TestImplementations(unittest.TestCase):
# Dataloading tests
def test_dataset(self):
self.dataset_classes = ['0 - zero',
'1 - one',
'2 - two',
'3 - three',
'4 - four',
'5 - five',
'6 - six',
'7 - seven',
'8 - eight',
'9 - nine']
self.assertTrue(train_dataset.classes == self.dataset_classes)
self.assertTrue(train_dataset.train == True)
def test_dataloader(self):
self.assertTrue(train_dataloader.batch_size == 32)
self.assertTrue(test_dataloader.batch_size == 32)
def test_total_parameters(self):
model = Net().to(device)
#self.assertTrue(sum(p.numel() for p in model.parameters()) == 1015946)
suite = unittest.TestLoader().loadTestsFromModule(TestImplementations())
unittest.TextTestRunner().run(suite)
def train(model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
# send the image, target to the device
data, target = data.to(device), target.to(device)
# flush out the gradients stored in optimizer
optimizer.zero_grad()
# pass the image to the model and assign the output to variable named output
output = model(data)
# calculate the loss (use nll_loss in pytorch)
loss = F.nll_loss(output, target)
# do a backward pass
loss.backward()
# update the weights
optimizer.step()
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
# send the image, target to the device
data, target = data.to(device), target.to(device)
# pass the image to the model and assign the output to variable named output
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
model = Net().to(device)
## Define Adam Optimiser with a learning rate of 0.01
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
start = timeit.default_timer()
for epoch in range(1, 11):
train(model, device, train_dataloader, optimizer, epoch)
test(model, device, test_dataloader)
stop = timeit.default_timer()
print('Total time taken: {} seconds'.format(int(stop - start)) )
Here is my full code. I couldn't understand what could possibly go wrong...
It is giving
<ipython-input-72-194680537dcc> in forward(self, x1)
46 x = F.relu(self.conv11(x1))
47 x = F.relu(self.conv21(x))
---> 48 x = F.relu(self.maxpool(self.conv31(x)))
49 #x = torch.flatten(x, 1)
50 x = x.view(-1,64*11*11)
TypeError: 'tuple' object is not callable
Error.
P.S.: Pytorch Noob here.
You have mistakenly placed a comma at the end of the line where you define self.maxpool : self.maxpool = nn.MaxPool2d(2), # Output = 64x11x11 see?
This comma makes self.maxpool a tuple instead of a torch.nn.modules.pooling.MaxPool2d. Drop the comma at the end and this error is fixed.
I see you haven't given the stride argument in you definition of self.maxpool = nn.MaxPool2d(2). Choose one: e.g. self.maxpool = nn.MaxPool2d(2, stride = 2).

PyTorch version of as simple Keras LSTM model

Trying to translate a simple LSTM model in Keras to PyTorch code. The Keras model converges after just 200 epochs, while the PyTorch model:
needs many more epochs to reach the same loss level (200 vs. ~8000)
seems to overfit the inputs because the predicted value is not near 100
This is the Keras code:
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
X = array([10,20,30,20,30,40,30,40,50,40,50,60,50,60,70,60,70,80]).reshape((6,3,1))
y = array([40,50,60,70,80,90])
model = Sequential()
model.add(LSTM(50, activation='relu', recurrent_activation='sigmoid', input_shape=(3, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, epochs=200, verbose=1)
x_input = array([70, 80, 90]).reshape((1, 3, 1))
yhat = model.predict(x_input, verbose=0)
print(yhat)
And this is the equivalent PyTorch code:
from numpy import array
import torch
import torch.nn as nn
import torch.nn.functional as F
X = torch.tensor([10,20,30,20,30,40,30,40,50,40,50,60,50,60,70,60,70,80]).float().reshape(6,3,1)
y = torch.tensor([40,50,60,70,80,90]).float().reshape(6,1)
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.lstm = nn.LSTM(input_size=1, hidden_size=50, num_layers=1, batch_first=True)
self.fc = nn.Linear(50, 1)
def forward(self, x):
batches = x.size(0)
h0 = torch.zeros([1, batches, 50])
c0 = torch.zeros([1, batches, 50])
(x, _) = self.lstm(x, (h0, c0))
x = x[:,-1,:] # Keep only the output of the last iteration. Before shape (6,3,50), after shape (6,50)
x = F.relu(x)
x = self.fc(x)
return x
model = Model()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
n_epochs = 8000
for epoch in range(n_epochs):
model.train()
optimizer.zero_grad()
y_ = model(X)
loss = criterion(y_, y)
loss.backward()
optimizer.step()
print(f"Epoch {epoch+1}/{n_epochs}, loss = {loss.item()}")
model.eval()
x_input = torch.tensor([70, 80, 90]).float().reshape((1, 3, 1))
yhat = model(x_input)
print(yhat)
The only possible difference is the initial weight and bias values, but I don't think that slightly different weights and biases may account for such a big difference in behavior.
What am I missing in the PyTorch code?
The behaviour difference is because of the activation function in the LSTM API. By changing the activation to tanh, I can reproduce the problem in Keras too.
model.add(LSTM(50, activation='tanh', recurrent_activation='sigmoid', input_shape=(3, 1)))
There is no option to change the activation function to 'relu' in the pytorch LSTM API.
https://pytorch.org/docs/stable/nn.html#lstm
Taking the LSTM implementation from here, https://github.com/huggingface/torchMoji/blob/master/torchmoji/lstm.py
and changing hardsigmoid/tanh to sigmoid/relu, the model converges in pytorch as well.
I think you are initializing h0,c0 every time which is require at initial. So, better use the code below that i have modified. You can go through this link for RNN in pytorch: https://pytorch.org/docs/stable/nn.html?highlight=rnn#torch.nn.RNN
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.rnn = nn.RNN(input_size=1, hidden_size=50, num_layers=1, nonlinearity="relu", batch_first=True)
self.fc = nn.Linear(50, 1)
def forward(self, x):
# batches = x.size(0)
# h0 = torch.zeros([1, batches, 50])
# c0 = torch.zeros([1, batches, 50])
# (x, _) = self.lstm(x, (h0, c0))
(x, _) = self.rnn(x)
x = x[:,-1,:] # Keep only the output of the last iteration. Before shape (6,3,50), after shape (6,50)
x = F.relu(x)
x = self.fc(x)
return x
This gives good result of prediction within 2500 epochs.
I want to know why have you written below line of code and what is the purpose of it. So, that i can try to make it better.
x = x[:,-1,:] # Keep only the output of the last iteration. Before shape (6,3,50), after shape (6,50)

Tensor flow, making predictions using a trained network

So I am training a network to classify images in tensor flow. After I trained the network I began work on trying to use it to classify other images. The goal is to import an image, feed it to the classifier and have it print the result. I am having some trouble getting that part off the ground though. Here is what I have so far. I found that having tf.argmax(y,1) gave an error. I found that changing it to 0 fixed that error. However I am not convinced that it is actually working. I tossed 2 images through the classifier and they both got the same class even though they are vastly different. Just need some perspective here. Is this valid? Or is there something wrong here that will always feed me the same class (in this case I got class 0 for both of the images I tried).
Is this even the right way to approach making predictions in tensor flow? This is just the culmination of my debugging, not sure if it is what should be done or not.
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
X_train,X_validation,y_train,y_validation=train_test_split(X_train,y_train, test_size=20,random_state=0)
X_train, y_train = shuffle(X_train, y_train)
def LeNet(x):
# Arguments used for tf.truncated_normal, randomly defines variables
for the weights and biases for each layer
mu = 0
sigma = 0.1
# SOLUTION: Layer 1: Convolutional. Input = 32x32x3. Output = 28x28x6.
conv1_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 3, 6), mean = mu, stddev = sigma))
conv1_b = tf.Variable(tf.zeros(6))
conv1 = tf.nn.conv2d(x, conv1_W, strides=[1, 1, 1, 1], padding='VALID') + conv1_b
# SOLUTION: Activation.
conv1 = tf.nn.relu(conv1)
# SOLUTION: Pooling. Input = 28x28x6. Output = 14x14x6.
conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
# SOLUTION: Layer 2: Convolutional. Output = 10x10x16.
conv2_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 6, 16), mean = mu, stddev = sigma))
conv2_b = tf.Variable(tf.zeros(16))
conv2 = tf.nn.conv2d(conv1, conv2_W, strides=[1, 1, 1, 1], padding='VALID') + conv2_b
# SOLUTION: Activation.
conv2 = tf.nn.relu(conv2)
# SOLUTION: Pooling. Input = 10x10x16. Output = 5x5x16.
conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
# SOLUTION: Flatten. Input = 5x5x16. Output = 400.
fc0 = flatten(conv2)
# SOLUTION: Layer 3: Fully Connected. Input = 400. Output = 120.
fc1_W = tf.Variable(tf.truncated_normal(shape=(400, 120), mean = mu, stddev = sigma))
fc1_b = tf.Variable(tf.zeros(120))
fc1 = tf.matmul(fc0, fc1_W) + fc1_b
# SOLUTION: Activation.
fc1 = tf.nn.relu(fc1)
# SOLUTION: Layer 4: Fully Connected. Input = 120. Output = 84.
fc2_W = tf.Variable(tf.truncated_normal(shape=(120, 84), mean = mu, stddev = sigma))
fc2_b = tf.Variable(tf.zeros(84))
fc2 = tf.matmul(fc1, fc2_W) + fc2_b
# SOLUTION: Activation.
fc2 = tf.nn.relu(fc2)
# SOLUTION: Layer 5: Fully Connected. Input = 84. Output = 43.
fc3_W = tf.Variable(tf.truncated_normal(shape=(84, 43), mean = mu, stddev = sigma))
fc3_b = tf.Variable(tf.zeros(43))
logits = tf.matmul(fc2, fc3_W) + fc3_b
return logits
import tensorflow as tf
x = tf.placeholder(tf.float32, (None, 32, 32, 3))
y = tf.placeholder(tf.int32, (None))
one_hot_y = tf.one_hot(y, 43)
EPOCHS=10
BATCH_SIZE=128
rate = 0.001
logits = LeNet(x)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, one_hot_y)
loss_operation = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate = rate)
training_operation = optimizer.minimize(loss_operation)
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1))
accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
saver = tf.train.Saver()
def evaluate(X_data, y_data):
num_examples = len(X_data)
total_accuracy = 0
sess = tf.get_default_session()
for offset in range(0, num_examples, BATCH_SIZE):
batch_x, batch_y = X_data[offset:offset+BATCH_SIZE], y_data[offset:offset+BATCH_SIZE]
accuracy = sess.run(accuracy_operation, feed_dict={x: batch_x, y: batch_y})
total_accuracy += (accuracy * len(batch_x))
return total_accuracy / num_examples
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
num_examples = len(X_train)
print("Training...")
print()
for i in range(EPOCHS):
X_train, y_train = shuffle(X_train, y_train)
for offset in range(0, num_examples, BATCH_SIZE):
end = offset + BATCH_SIZE
batch_x, batch_y = X_train[offset:end], y_train[offset:end]
sess.run(training_operation, feed_dict={x: batch_x, y: batch_y})
validation_accuracy = evaluate(X_validation, y_validation)
print("EPOCH {} ...".format(i+1))
print("Validation Accuracy = {:.3f}".format(validation_accuracy))
print()
saver.save(sess, './lenet')
print("Model saved")
import cv2
image=cv2.imread('File path')
image=cv2.resize(image,(32,32)) #classifier takes 32X32 images
image=np.array(image)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
saver3 = tf.train.import_meta_graph('./lenet.meta')
saver3.restore(sess, "./lenet")
pred = tf.nn.softmax(logits)
predictions = sess.run(tf.argmax(y,0), feed_dict={x: image})
print (predictions)
So what had to happen here was first clear the kernel and outputs. Somewhere along the way my placeholders got muddled up and clearing the kernel fixed that right up. Then I had to realize what really had to get done here: I had to call up the softmax function on my new data.
Like this:
pred = tf.nn.softmax(logits)
classification = sess.run(pred, feed_dict={x: image_array})

How do I create a variable-length input LSTM in Keras?

I am trying to do some vanilla pattern recognition with an LSTM using Keras to predict the next element in a sequence.
My data look like this:
where the label of the training sequence is the last element in the list: X_train['Sequence'][n][-1].
Because my Sequence column can have a variable number of elements in the sequence, I believe an RNN to be the best model to use. Below is my attempt to build an LSTM in Keras:
# Build the model
# A few arbitrary constants...
max_features = 20000
out_size = 128
# The max length should be the length of the longest sequence (minus one to account for the label)
max_length = X_train['Sequence'].apply(len).max() - 1
# Normal LSTM model construction with sigmoid activation
model = Sequential()
model.add(Embedding(max_features, out_size, input_length=max_length, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
And here's how I attempt to train my model:
# Train the model
for seq in X_train['Sequence']:
print("Length of training is {0}".format(len(seq[:-1])))
print("Training set is {0}".format(seq[:-1]))
model.fit(np.array([seq[:-1]]), [seq[-1]])
My output is this:
Length of training is 13
Training set is [1, 3, 13, 87, 1053, 28576, 2141733, 508147108, 402135275365, 1073376057490373, 9700385489355970183, 298434346895322960005291, 31479360095907908092817694945]
However, I get the following error:
Exception: Error when checking model input: expected embedding_input_1 to have shape (None, 347) but got array with shape (1, 13)
I believe my training step is correctly setup, so my model construction must be wrong. Note that 347 is max_length.
How can I correctly build a variable-length input LSTM in Keras? I'd prefer not to pad the data. Not sure if it's relevant, but I'm using the Theano backend.
I am not clear about the embedding procedure. But still here is a way to implement a variable-length input LSTM. Just do not specify the timespan dimension when building LSTM.
import keras.backend as K
from keras.layers import LSTM, Input
I = Input(shape=(None, 200)) # unknown timespan, fixed feature size
lstm = LSTM(20)
f = K.function(inputs=[I], outputs=[lstm(I)])
import numpy as np
data1 = np.random.random(size=(1, 100, 200)) # batch_size = 1, timespan = 100
print f([data1])[0].shape
# (1, 20)
data2 = np.random.random(size=(1, 314, 200)) # batch_size = 1, timespan = 314
print f([data2])[0].shape
# (1, 20)
The trick to training and classifying sequences is training with masking and classifying using a stateful network. Here's an example that I made that classifies whether a sequence of variable length starts with zero or not.
import numpy as np
np.random.seed(1)
import tensorflow as tf
tf.set_random_seed(1)
from keras import models
from keras.layers import Dense, Masking, LSTM
import matplotlib.pyplot as plt
def stateful_model():
hidden_units = 256
model = models.Sequential()
model.add(LSTM(hidden_units, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='relu', name='output'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
return model
def train_rnn(x_train, y_train, max_len, mask):
epochs = 10
batch_size = 200
vec_dims = 1
hidden_units = 256
in_shape = (max_len, vec_dims)
model = models.Sequential()
model.add(Masking(mask, name="in_layer", input_shape=in_shape,))
model.add(LSTM(hidden_units, return_sequences=False))
model.add(Dense(1, activation='relu', name='output'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
validation_split=0.05)
return model
def gen_train_sig_cls_pair(t_stops, num_examples, mask):
x = []
y = []
max_t = int(np.max(t_stops))
for t_stop in t_stops:
one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
sig = np.zeros((num_examples, max_t), dtype=np.int8)
sig[one_indices, 0] = 1
sig[:, t_stop:] = mask
x.append(sig)
cls = np.zeros(num_examples, dtype=np.bool)
cls[one_indices] = 1
y.append(cls)
return np.concatenate(x, axis=0), np.concatenate(y, axis=0)
def gen_test_sig_cls_pair(t_stops, num_examples):
x = []
y = []
for t_stop in t_stops:
one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
sig = np.zeros((num_examples, t_stop), dtype=np.bool)
sig[one_indices, 0] = 1
x.extend(list(sig))
cls = np.zeros((num_examples, t_stop), dtype=np.bool)
cls[one_indices] = 1
y.extend(list(cls))
return x, y
if __name__ == '__main__':
noise_mag = 0.01
mask_val = -10
signal_lengths = (10, 15, 20)
x_in, y_in = gen_train_sig_cls_pair(signal_lengths, 10, mask_val)
mod = train_rnn(x_in[:, :, None], y_in, int(np.max(signal_lengths)), mask_val)
testing_dat, expected = gen_test_sig_cls_pair(signal_lengths, 3)
state_mod = stateful_model()
state_mod.set_weights(mod.get_weights())
res = []
for s_i in range(len(testing_dat)):
seq_in = list(testing_dat[s_i])
seq_len = len(seq_in)
for t_i in range(seq_len):
res.extend(state_mod.predict(np.array([[[seq_in[t_i]]]])))
state_mod.reset_states()
fig, axes = plt.subplots(2)
axes[0].plot(np.concatenate(testing_dat), label="input")
axes[1].plot(res, "ro", label="result", alpha=0.2)
axes[1].plot(np.concatenate(expected, axis=0), "bo", label="expected", alpha=0.2)
axes[1].legend(bbox_to_anchor=(1.1, 1))
plt.show()
Not sure how applicable recurrent networks are for your sequences, ie how strongly dependent each element is on its preceding sequence as opposed to other factors. That being said (which doesn't help you one bit of course), if you don't want to pad your input with some bad value, a stateful model that processes a single timestep at once is the only alternative for variable length sequences IMHO. If you don't mind taking an alternative approach to encoding:
import numpy as np
import keras.models as kem
import keras.layers as kel
import keras.callbacks as kec
import sklearn.preprocessing as skprep
X_train, max_features = {'Sequence': [[1, 2, 4, 5, 8, 10, 16], [1, 2, 1, 5, 5, 1, 11, 16, 7]]}, 16
num_mem_units = 64
size_batch = 1
num_timesteps = 1
num_features = 1
num_targets = 1
num_epochs = 1500
model = kem.Sequential()
model.add(kel.LSTM(num_mem_units, stateful=True, batch_input_shape=(size_batch, num_timesteps, num_features),
return_sequences=True))
model.add(kel.Dense(num_targets, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
range_act = (0, 1) # sigmoid
range_features = np.array([0, max_features]).reshape(-1, 1)
normalizer = skprep.MinMaxScaler(feature_range=range_act)
normalizer.fit(range_features)
reset_state = kec.LambdaCallback(on_epoch_end=lambda *_ : model.reset_states())
# training
for seq in X_train['Sequence']:
X = seq[:-1]
y = seq[1:] # predict next element
X_norm = normalizer.transform(np.array(X).reshape(-1, 1)).reshape(-1, num_timesteps, num_features)
y_norm = normalizer.transform(np.array(y).reshape(-1, 1)).reshape(-1, num_timesteps, num_targets)
model.fit(X_norm, y_norm, epochs=num_epochs, batch_size=size_batch, shuffle=False,
callbacks=[reset_state])
# prediction
for seq in X_train['Sequence']:
model.reset_states()
for istep in range(len(seq)-1): # input up to not incl last
val = seq[istep]
X = np.array([val]).reshape(-1, 1)
X_norm = normalizer.transform(X).reshape(-1, num_timesteps, num_features)
y_norm = model.predict(X_norm)
yhat = int(normalizer.inverse_transform(y_norm[0])[0, 0])
y = seq[-1] # last
put = '{0} predicts {1:d}, expecting {2:d}'.format(', '.join(str(val) for val in seq[:-1]), yhat, y)
print(put)
which produces sth like:
1, 2, 4, 5, 8, 10 predicts 11, expecting 16
1, 2, 1, 5, 5, 1, 11, 16 predicts 7, expecting 7
with ridiculous loss, however.
It turns out that you can do this using ragged inputs.
Firstly, you need to convert your input data to classes using the to_categorical function
from tensorflow.keras.utils import to_categorical
from tensorflow.ragged import constant
X_train = constant(list(map(lambda x: to_categorical(x, num_classes=max_features),X_train)))
Then, you need to edit your model slightly:
model = Sequential()
model.add(Input((None,max_features),ragged=True)) # use this instead of an Embedding
model.add(Embedding(max_features, out_size, input_length=max_length, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
And then work from there!

Resources