Multiple propositions for multiple class prediction - keras

I am working on a word prediction problem. I have examples of career paths, and I would like to predict a person's next job from their last 2 jobs. I have built an LSTM model to do this.
I have a problem when attempting to get multiple results from the Keras model.predict_classes function: it only returns 1 result. I would like to get multiple results, ordered by their probability.
Here is the code :
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with ``n_words`` words predicted by ``model``.

    Args:
        model: Fitted model whose ``predict`` returns one row of per-index
            scores for each encoded input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length the encoded text is pre-padded to.
        seed_text: Space-separated words to start from.
        n_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, space-separated.
    """
    # Build the index -> word table once instead of scanning word_index
    # for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict() yields the score of every word; the most probable index
        # is the argmax of each row (predict_classes() is no longer
        # available in recent Keras releases).
        probabilities = model.predict(encoded, verbose=1)
        yhat = probabilities.argmax(axis=-1)
        # str() is required: 'yhat = ' + yhat fails for an array operand.
        print('yhat = ' + str(yhat))
        # map predicted word index to word; an index with no entry in
        # word_index yields an empty word, as the original scan did
        out_word = index_to_word.get(int(yhat[0]), '')
        # append to input
        in_text += ' ' + out_word
    return in_text
# source text
data = """apprenti electricien chefOdeOprojet \n
soudeur chefOdeOsection directeurOusine\n
mecanicien chefOdeOsection directeurOadjoint\n
ingenieur chefOdeOprojet directeurOadjoint directeurOusine\n
ingenieur chefOdeOprojet \n
apprenti soudeur chefOdeOsection chefOdeOprojet\n
ingenieurOetude chefOdeOprojet\n
ingenieurOetude manager chefOdeOprojet directeurOdepartement\n
apprenti gestionOproduction manager directeurOdepartement\n
ingenieurOetude commercial\n
soudeur ingenieurOetude manager directeurOadjoint\n
ingenieurOetude directeurOdepartement directeurOusine\n
apprenti soudeur\n
agentOsecurite chefOsecurite\n
apprenti mecanicien ouvrier manager\n
commercial directeurOadjoint\n
agentOsecurite chefOsecurite\n
directeurOusine retraite\n
ouvrier manager\n
ingenieur vente\n
secretaire comptable\n
comptable chefOcomptable\n
chefOcomptable directeurOdepartement\n
assistant secretaire comptable\n
assistant comptable\n
assistant secretaire commercial\n
commercial chefOdeOprojet\n
commercial vente chefOdeOprojet\n
electricien chefOdeOsection\n
apprenti ouvrier chefOdeOsection\n"""
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# retrieve vocabulary size; the +1 makes the largest word index usable as a
# class / embedding index (vocab_size is passed to to_categorical and Embedding)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# encode 2 words -> 1 word: every window of three consecutive words of a
# line becomes one training sequence (lines with fewer than 3 words add none)
sequences = []
for line in data.split('\n'):
    line_ids = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(line_ids[i - 2:i + 1] for i in range(2, len(line_ids)))
print('Total Sequences: %d' % len(sequences))
# pad sequences
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
# split into input and output elements: the last column is the word to
# predict, the preceding columns are the model input
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
# define model: embedding -> LSTM -> dropout -> softmax over the vocabulary
model = Sequential()
# each input row holds max_length-1 word indices (the words preceding the target)
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dropout(0.2))
#model.add(Dense(units = 3, activation = 'relu'))
# one output per vocabulary index
model.add(Dense(vocab_size, activation='softmax'))
# NOTE: the console output shows the summary table followed by "None",
# i.e. the value handed to print() here is None
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, epochs=500, verbose=0)
# evaluate model: append one predicted word to a two-word seed
print(generate_seq(model, tokenizer, max_length-1, 'electricien secretaire', 1))
And here is the console output:
Vocabulary Size: 24
Total Sequences: 20
Max Sequence Length: 3
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_2 (Embedding) (None, 2, 10) 240
_________________________________________________________________
lstm_2 (LSTM) (None, 50) 12200
_________________________________________________________________
dropout_2 (Dropout) (None, 50) 0
_________________________________________________________________
dense_2 (Dense) (None, 24) 1224
=================================================================
Total params: 13,664
Trainable params: 13,664
Non-trainable params: 0
_________________________________________________________________
None
1/1 [==============================] - 0s 86ms/step
yhat = [1]
electricien secretaire chefodeoprojet

If I understand your question correctly, you would like to see the probabilities associated with each class of a multi-classification problem?
The code looks pretty correct to me, but I would recommend trying a different evaluation step. I have gotten multi-classifier outputs with the following snippet:
# Fit the model
print("Fitting model...")
model.fit(np.asarray(self.X), np.asarray(self.Y), epochs=200, batch_size=10)
print("Model fitting complete.")
# Reshape the unseen samples to (n_samples, 1, 128) before predicting
self.TEST = np.asarray(self.TEST).reshape((test_data.shape[0], 1, 128))
print("Predicting on Test (unseen) data...")
# predict() returns the raw score of every class for each sample
predictions = model.predict(self.TEST)
# Sigmoid predictions: flag every class whose score exceeds 0.5
labels = np.zeros(predictions.shape)
labels[predictions > 0.5] = 1
print("Prediction labels for unseen: " + str(labels))
The output:
Prediction labels for unseen:
[[ 0. 1. 0. 0.]
[ 0. 1. 0. 0.]
[ 0. 1. 0. 0.]
[ 0. 1. 0. 0.]
[ 0. 1. 0. 0.]
[ 0. 0. 1. 0.]
[ 0. 0. 1. 0.]
[ 0. 0. 1. 0.]]
Each row denotes the classification of one sample; the index of the 1 represents which class (A,B,C,D) the sample fell into.

Related

why does my model not reach zero loss / still makes mistakes even though I intentionally overfit?

I am using the cartpole environment of openai and tried to make a DQN that solves it. I saw that the DQN doesn't get better scores even though I trained it for 200 episodes. Thus I tried to see how well the keras model performs that should learn the q values for each action.
# imports & constants
import numpy as np
import time, random
import matplotlib.pyplot as plt
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.callbacks import TensorBoard
import tensorflow as tf
import pickle
from collections import deque
NO_NEURONS = 20
BATCH_SIZE = 32
# make a model: four hidden ReLU layers followed by one output per action
model = Sequential()
model.add(Dense(NO_NEURONS, batch_input_shape=(32,4), activation=tf.nn.relu, name="dense1"))
model.add(Dense(NO_NEURONS, activation=tf.nn.relu, name="dense2"))
model.add(Dense(NO_NEURONS, activation=tf.nn.relu, name="dense3"))
model.add(Dense(NO_NEURONS, activation=tf.nn.relu, name="dense4"))
# Linear output: the targets are reward + 0.90 * max(next Q) and can exceed 1
# (e.g. 1.5391604 in the printed targets). A sigmoid is bounded to (0, 1) and
# can never reach them, so the MSE could not go to zero.
model.add(Dense(2, activation="linear", name="dense5"))
model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError(), metrics=['MSE'])
print(current_qs.shape, future_qs.shape, len(transitions))
# this gives (32, 2) (32, 2) 32 which is good.
print("transitions: ", transitions[0])
print("old states q: ",old_states[0])
print("new states q: ", new_states[0])
"""
This gives:
transitions: (array([ 0.0209578 , -0.34232047, -0.00570952, 0.5816825 ], dtype=float32), 0, 1.0, array([ 0.01411139, -0.537362 , 0.00592413, 0.8725614 ], dtype=float32), False)
old states q: [ 0.00633551 -0.22556473 -0.00497439 0.24501044]
new states q: [ 1.8242138e-03 -4.2061529e-01 -7.4184951e-05 5.3612018e-01]
"""
# Build the training set: each old state is an input, and its target is the
# current Q row with the entry of the taken action replaced by the updated value.
# The reward should always be 1, because every extra step in which the
# environment isn't done is rewarded.
for row, (state, taken_action, step_reward, _next_state, terminal) in enumerate(transitions):
    # Terminal step: the target is the reward alone. Otherwise it is the
    # reward plus the discount factor (0.90) times the best next Q value.
    target = step_reward if terminal else step_reward + 0.90 * np.max(future_qs[row])
    q_row = current_qs[row]
    q_row[taken_action] = target
    x.append(state)
    y.append(q_row)
x, y = np.array(x), np.array(y)
print(x.shape, y.shape)
# gives (32, 4) (32, 2) which is good.
# actually fit the model
model.fit(x, y, epochs=500, batch_size=64, shuffle=False, verbose=2)
The last 12 epochs look like this and have exactly the same loss
Epoch 500/500
1/1 - 0s - loss: 0.1357 - MSE: 0.1357 - 5ms/epoch - 5ms/step
For printing the results:
# see the mistakes still: report every sample whose best predicted action
# differs from the best action of its target row
preds = model.predict(x)
for i, (pred, target) in enumerate(zip(preds, y)):
    if np.argmax(pred) != np.argmax(target):
        print("WRONG: ", i, np.argmax(pred), pred, np.argmax(target), target)
I get:
WRONG: 6 0 [0.9999985 0.65476906] 1 [0.55540055 1.5391604 ]
WRONG: 8 0 [1. 0.6853456] 1 [0.5077556 1. ]
WRONG: 20 0 [0.9998766 0.99974597] 1 [0.55703723 1.5438437 ]
WRONG: 25 0 [1. 0.6897808] 1 [0.54609764 1. ]
I thought that, since my model has so many weights, it should be able to fit these targets perfectly. Is that assumption wrong, or does anyone see something that I missed?

Bi-LSTM with Keras : dimensions must be equal but are 7 and 300

I am creating for the first time a bilstm with keras but I am having difficulties. So that you understand, here are the steps I have done:
I created an embedding matrix with Glove for my x ;
def create_embeddings(fichier, dictionnaire, dictionnaire_tokens, max_size_dimensions=300):
    """Build an embedding matrix whose row ``i`` holds the vector of the token indexed ``i``.

    Args:
        fichier: Unused. The original opened this file and discarded the
            line it read; the parameter is kept so existing calls still work.
        dictionnaire: Mapping from token to its embedding vector.
        dictionnaire_tokens: Mapping from token to its integer row index.
        max_size_dimensions: Width of every embedding vector (default 300).

    Returns:
        Array of shape ``(largest index + 1, max_size_dimensions)``. Rows of
        tokens missing from ``dictionnaire`` stay all-zero, and an empty
        ``dictionnaire_tokens`` gives an array with zero rows.
    """
    max_words = max(dictionnaire_tokens.values(), default=-1) + 1
    emb_matrix = np.zeros((max_words, max_size_dimensions))
    for item, count in dictionnaire_tokens.items():
        # Look each token up on its own: with a bare ``except: pass`` a missing
        # token silently reused the vector left over from the previous token.
        try:
            vecteur = dictionnaire[item]
        except KeyError:
            continue
        if vecteur is not None:
            emb_matrix[count] = vecteur
    return emb_matrix
I did some one hot encoding with my y's;
def one_hot_encoding(file):
    """One-hot encode the tags found in the second column of ``file``.

    Args:
        file: Path of a text file with space-separated columns, the tag being
            the second column of each line.

    Returns:
        The array produced by ``MultiLabelBinarizer.fit_transform``, with one
        row per line that carries a tag.
    """
    liste = []
    with open(file) as handle:
        for line in handle:
            # Drop the line ending first; otherwise a tag in the last column
            # keeps its "\n" and differs from the same tag on a final line
            # that has no newline.
            fields = line.rstrip("\r\n").split(" ")
            # Lines without a second column (e.g. blank lines) carry no tag;
            # indexing them raised IndexError before.
            if len(fields) < 2:
                continue
            liste.append([fields[1]])
    one_hot = MultiLabelBinarizer()
    return one_hot.fit_transform(liste)
I compiled my model with keras
from tensorflow.keras.layers import Bidirectional
# Sequential model: frozen pre-trained embeddings -> bidirectional LSTM -> dense layer
model = Sequential()
# 1031 + 1 rows of 300-d vectors, initialised from embedding_matrix and not trained
embedding_layer = Embedding(input_dim=1031 + 1,
output_dim=300,
weights=[embedding_matrix],
trainable=False)
model.add(embedding_layer)
# return_sequences=True keeps one output per time step
bilstm_layer = Bidirectional(LSTM(units=300, return_sequences=True))
model.add(bilstm_layer)
# NOTE(review): this layer emits 300 values, while the reported error shows
# targets of shape [?,7] against an output of shape [?,300,300]
model.add(Dense(300, activation="relu"))
#crf_layer = CRF(units=len(self.tags), sparse_target=True)
#model.add(crf_layer)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics='acc')
model.summary()
Input of my embedding layer (embedding matrix) :
[[ 0. 0. 0. ... 0. 0. 0. ]
[ 0. 0. 0. ... 0. 0. 0. ]
[ 0. 0. 0. ... 0. 0. 0. ]
...
[-0.068577 -0.71314 0.3898 ... -0.077923 -1.0469 0.56874 ]
[ 0.32461 0.50463 0.72544 ... 0.17634 -0.28961 0.29007 ]
[-0.33771 -0.24912 -0.032685 ... -0.033254 -0.45513 -0.13319 ]]
I train my model. However when I want to train it, I get the following message: ValueError: Dimensions must be equal, but are 7 and 300 for '{{node binary_crossentropy/mul}} = Mul[T=DT_FLOAT](binary_crossentropy/Cast, binary_crossentropy/Log)' with input shapes: [?,7], [?,300,300].
My embedding matrix was built with GloVe 300d, so it has 300 dimensions, whereas I have only 7 labels. So I have to make my x and y have the same dimensions, but how? Thank you!!!
keras.backend.clear_session()
from tensorflow.keras.layers import Bidirectional
model = Sequential()
_input = keras.layers.Input(shape=(300,1))
model.add(_input)
# return_sequences=False: only the final LSTM output is passed on, so the
# model emits one vector per sample instead of one per time step
bilstm_layer = Bidirectional(LSTM(units=300, return_sequences=False))
model.add(bilstm_layer)
# 7 is the number of classes. sigmoid keeps every output in (0, 1), which
# binary_crossentropy expects; relu outputs are not probabilities.
model.add(Dense(7, activation="sigmoid"))
#crf_layer = CRF(units=len(self.tags), sparse_target=True)
#model.add(crf_layer)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics='acc')
model.summary()

ValueError while performing 'binary-crossentropy' ==> array shapes not matching

I was using tensorflow + keras while trying to implement a "Text classification" model to classify different types of movie reviews. I am running into an error which says that the shapes aren't equal.
Because I am not sure where the error is hidden, I can't produce a minimal reproducible example, as I am not certain how to isolate the problem. It may be worth looking at the lines that define x_val, as there may be a problem with the partitioning.
Note: this is not the final code. As I had already encountered an error at this point, I stopped writing it.
from __future__ import absolute_import, division, print_function
import numpy as np
import tensorflow as tf
from tensorflow import keras
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
#print ("The length if training data: "len(train_data[0]), "And labels: "len(test_data[0]))
# Shift every word rank by 3 so that indices 0-3 are free for the special tokens.
word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
word_index.update({"<PAD>": 0, "<START>": 1, "<UNKNOWN>": 2, "<END>": 3})
# Inverse table: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}


def decode_review(text):
    """Return the words of ``text`` (an iterable of word indices) joined by spaces.

    Indices missing from ``reverse_word_index`` are rendered as "?".
    """
    words = [reverse_word_index.get(index, "?") for index in text]
    return ' '.join(words)


print(decode_review(train_data[0]))
# Pad every review at the end ("post") with the <PAD> index, using maxlen=256
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
value=word_index["<PAD>"],
padding="post",
maxlen=256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data,
value=word_index["<PAD>"],
padding="post",
maxlen=256)
#print ('train length :' ,len(train_data[0]), 'test length: ', len(train_data[1]))
# Same value as the num_words passed to imdb.load_data
vocab_size = 10000
# Model: embedding -> average over the sequence -> two dense layers
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size , 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
# NOTE(review): 16 output units; the reported ValueError is about a target
# of shape (15000, 1) being passed for this output of shape (None, 16)
model.add(keras.layers.Dense(16, activation=tf.nn.sigmoid))
# NOTE: per the output shown, the summary table is printed first and the
# second value printed on this line is None
print ("the model summary is :======>>" , model.summary())
model.compile(optimizer="adam" , loss="binary_crossentropy", metrics=["acc"])
# Hold out the first 10000 training samples for validation
x_val = train_data[:10000]
partial_x_train = train_data[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]
history = model.fit(partial_x_train , partial_y_train , epochs=40 , batch_size=512,
validation_data=(x_val, y_val), verbose=1)
This is the error message I was getting:-----
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, None, 16) 160000
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16) 0
_________________________________________________________________
dense_2 (Dense) (None, 16) 272
_________________________________________________________________
dense_3 (Dense) (None, 16) 272
=================================================================
Total params: 160,544
Trainable params: 160,544
Non-trainable params: 0
_________________________________________________________________
the model summary is :======>> None
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-02082e1f39d4> in <module>()
57
58 history = model.fit(partial_x_train , partial_y_train , epochs=40 , batch_size=512,
---> 59 validation_data=(x_val, y_val), verbose=1)
ValueError: A target array with shape (15000, 1) was passed for an output of shape (None, 16) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.
You need to update the final (output) layer of your model. Since it's a binary classification problem, the output Dense layer should have a single node, as follows:
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
You may want to check out this tutorial on text classification using IMDB dataset.

Error in model performance metrics

Well my neural network is as follows :
# Leaks data input is a 2-D vector of window*size*341 features
# Reshape to match picture format [Height x Width x Channel]
# Tensor input becomes 4-D: [Batch Size, Height, Width, Channel] = [-1, 16, 341, 2]
x = tf.reshape(x, shape= [-1, 16, 341, 2])
# Convolution Layer with 6 filters and a kernel size of 2
conv1 = tf.layers.conv2d(x, 6, 2, activation=tf.nn.relu)
# Max Pooling (down-sampling) with strides of 2 and kernel size of 2
conv1 = tf.layers.max_pooling2d(conv1, 2, 2)
# Convolution Layer with 8 filters and a kernel size of 3
conv2 = tf.layers.conv2d(conv1, 8, 3, activation=tf.nn.relu)
# Max Pooling (down-sampling) with strides of 2 and kernel size of 2
conv2 = tf.layers.max_pooling2d(conv2, 2, 2)
# Flatten the data to a 1-D vector for the fully connected layer
fc1 = tf.contrib.layers.flatten(conv2)
# Fully connected layer with 1024 units (no activation argument is passed)
fc1 = tf.layers.dense(fc1, 1024)
# Apply Dropout (if is_training is False, dropout is not applied)
fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)
# Output layer: dense projection to n_classes values (no activation argument,
# so callers apply the sigmoid themselves)
out = tf.layers.dense(fc1, n_classes)
It predicts a multi-label classification vector of length 339. First, I wanted to make sure that I am able to overfit a small sample of data, to check that everything works and is well defined.
I trained my neural network on a dataset of length 1700. To measure my model's performance, I added accuracy as follows:
logits_train = conv_net(features, num_classes, dropout, reuse=False,
is_training=True)
logits_test = conv_net(features, num_classes, dropout, reuse=True,
is_training=False)
# Predictions
pred_classes = tf.cast(tf.greater(logits_test,0.5), tf.float32)
pred_probas = tf.nn.sigmoid(logits_test)
# If prediction mode, early return
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)
# Define loss and optimizer
#tf.one_hot(tf.cast(labels,dtype=tf.int32),depth=2)
loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(labels,dtype=tf.float32),logits=logits_train))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op,global_step=tf.train.get_global_step())
# Evaluate the accuracy of the model
accuracy = tf.metrics.accuracy(labels=labels , predictions = pred_classes )
#correct_prediction = tf.equal(tf.round(tf.nn.sigmoid(logits_test)), tf.round(labels))
#accuracy1 = tf.metrics.mean(tf.cast(correct_prediction, tf.float32))
#acc_op = tf.metrics.mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred_classes,labels=labels))
# TF Estimators requires to return a EstimatorSpec, that specify
# the different ops for training, evaluating, ...
estim_specs = tf.estimator.EstimatorSpec(
mode=mode,
predictions=pred_probas,
loss=loss_op,
train_op=train_op,
eval_metric_ops={'accuracy': accuracy})
return estim_specs
The problem is that with few epochs the performance seems to be very good
# Train for 49 rounds on the same batch, printing round numbers 2..50.
for i in range(1, 50):
    epoch = i + 1
    print('Epoch', epoch)
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x=curr_data_batch,
        y=curr_target_batch[:, :339],
        batch_size=96,
        shuffle=False,
    )
    model.train(input_fn=input_fn)
    # As written, evaluation runs whenever the round number is NOT a
    # multiple of 10 (a non-zero remainder is truthy).
    if epoch % 10 != 0:
        # eval the model
        eval_model = model.evaluate(input_fn=input_fn)
        print('Loss ,', eval_model['loss'])
        print('accuracy ,', eval_model['accuracy'])
Loss , 0.029562088
accuracy , 0.9958855
Epoch 3:
Loss , 0.028194984
accuracy , 0.99588597
Epoch 4:
Loss , 0.027557796
accuracy , 0.9958862
But when I try to predict on the same training data, I get completely opposite metrics:
loss = 0.65
accuracy = 0.33
I don't know where this issue comes from. Did I define something incorrectly?
Thank you.

Keras, stateless LSTM

Here is a very simple example of an LSTM in stateless mode, which we train on a very simple sequence: [0–>1] and [0–>2].
Any idea why it won't converge in stateless mode?
We have a batch of size 2 with 2 samples, and it is supposed to keep the state within the batch. When predicting, we would like to receive 1 and 2 successively.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import numpy
# The sequence to learn: 0 is followed once by 1 and once by 2.
seq = [0, 1, 0, 2]
# Two one-step samples, [0->1] and [0->2], as one-hot vectors:
# both inputs encode 0; the targets encode 1 and 2.
seqX = numpy.array([[[1., 0., 0.]], [[1., 0., 0.]]])
seqY = numpy.array([[0., 1., 0.], [0., 0., 1.]])
# LSTM configuration
n_unique = len(set(seq))
n_neurons = 20
n_batch = 2
n_features = n_unique  # which is 3
# Stateless LSTM over one time step, followed by a sigmoid output layer
model = Sequential([
    LSTM(n_neurons, input_shape=(1, n_features)),
    Dense(n_unique, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='Adam')
# Train on the two samples as a single, unshuffled batch
model.fit(seqX, seqY, epochs=300, batch_size=n_batch, verbose=2, shuffle=False)
# Report the predicted class next to the expected one for each sample
print('Sequence')
result = model.predict_classes(seqX, batch_size=n_batch, verbose=0)
for step in range(2):
    print('X=%.1f y=%.1f, yhat=%.1f' % (0, step + 1, result[step]))
Example 2
Here I want to clarify what result I am after.
This is the same code example, but in stateful mode (stateful=True). It works perfectly: we feed the network zeros 2 times and get 1 and then 2. But I want to get the same result in stateless mode, as it is supposed to keep the state within the batch.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import numpy
# The sequence to learn: 0 is followed once by 1 and once by 2.
seq = [0, 1, 0, 2]
# Two one-step samples as one-hot vectors: both inputs encode 0;
# the targets encode 1 and 2.
seqX = numpy.array([[[1., 0., 0.]], [[1., 0., 0.]]])
seqY = numpy.array([[0., 1., 0.], [0., 0., 1.]])
# LSTM configuration
n_unique = len(set(seq))
n_neurons = 20
n_batch = 1
n_features = n_unique
# Stateful LSTM with a fixed batch size of n_batch, followed by a sigmoid output layer
model = Sequential([
    LSTM(n_neurons, batch_input_shape=(n_batch, 1, n_features), stateful=True),
    Dense(n_unique, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='Adam')
# Train one unshuffled pass at a time, resetting the LSTM state after each pass
for _ in range(300):
    model.fit(seqX, seqY, epochs=1, batch_size=n_batch, verbose=2, shuffle=False)
    model.reset_states()
# Report the predicted class next to the expected one for each sample
print('Sequence')
result = model.predict_classes(seqX, batch_size=1, verbose=0)
for step in range(2):
    print('X=%.1f y=%.1f, yhat=%.1f' % (0, step + 1, result[step]))
As a correct result we should get:
Sequence
X=0.0 y=1.0, yhat=1.0
X=0.0 y=2.0, yhat=2.0
You must feed one sequence with two steps instead of two sequences with one step:
One sequence, two steps: seqX.shape = (1,2,3)
Two sequences, one step: seqX.shape = (2,1,3)
The input shape is (numberOfSequences, stepsPerSequence, featuresPerStep)
seqX = [[[1,0,0],[1,0,0]]]
If you want to get both steps for y as output, you must use return_sequences=True.
# input_shape is (stepsPerSequence, featuresPerStep): two steps per sequence
LSTM(n_neurons, input_shape=(2, n_features), return_sequences=True)
The entire working code:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import numpy
# define sequences
seq = [0, 1, 0, 2]
# convert sequence into required data format.
# A single sequence with two steps, one-hot encoded: both input steps encode 0;
# the target steps encode 1 and then 2.
seqX = numpy.array([[[1., 0., 0.], [1., 0., 0.]]])
seqY = numpy.array([[[0., 1., 0.], [0., 0., 1.]]])
# shapes are (1,2,3) - 1 sequence, 2 steps, 3 features
# define LSTM configuration
n_unique = len(set(seq))
n_neurons = 20
n_features = n_unique  # which is =3
# no need for batch size
# create LSTM
model = Sequential()
# the input shape must have two steps; return_sequences=True yields one output per step
model.add(LSTM(n_neurons, input_shape=(2, n_features), return_sequences=True))
model.add(Dense(n_unique, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam')
# train LSTM
model.fit(seqX, seqY, epochs=300, verbose=2)
# no shuffling and no batch size needed.
# evaluate LSTM
print('Sequence')
# predict_classes() is no longer available in recent Keras releases; the
# class of each step is the argmax over the last axis of predict()
result = model.predict(seqX, verbose=0).argmax(axis=-1)
print(seqX)
print(result)  # all steps are predicted in a single array (with return_sequences=True)

Resources