IndexError: List Index out of range Keras Tokenizer - python-3.x

I'm working with the sentiment140 dataset to try and learn sentiment analysis using RNNs. I found this tutorial online that uses the keras.imdb datasource, but I want to try and use my own datasource, so I have tried to adapt the code to my own data.
Tutorial: https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e
The data preprocessing involves extracting series data and then tokenizing and padding it before sending it to the model for training. I performed these operations in my code below, but whenever I try to run the training I get the error "if isinstance(data[0], list): IndexError: list index out of range". I did not define data, so this leads me to believe that I did something that keras or tensorflow did not like. Any ideas as to what is causing this error?
My data is currently in a csv file format with the headers being SENTIMENT and TEXT. SENTIMENT is 0 for negative and 1 for positive. TEXT is the processed tweet that was collected. Here is a sample.
Dataset CSV (Only a few lines to save space)
SENTIMENT,TEXT
0,about to file tax
0,ahh i hate dogs
1,My paycheck came in today
1,lot to do before chi this weekend
1,lol love food
Code
import pandas as pd
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import json
import numpy as np
# Load in DS
df = pd.read_csv('./train.csv')
print(df.head())
#Create sequence
vocabulary_size = 1000
tokenizer = Tokenizer(num_words= vocabulary_size, split=' ')
tokenizer.fit_on_texts(df['TEXT'].values)
X_train = tokenizer.texts_to_sequences(df['TEXT'].values)
#Pad Sequence
X_train = pad_sequences(X_train)
print(X_train)
#Get Sentiment
y_train = df['SENTIMENT'].tolist()
#create model
max_words = 24
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
batch_size = 64
num_epochs = 3
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]
model.fit(X_train2, y_train2,
validation_data=(X_valid, y_valid),
batch_size=batch_size,
epochs=num_epochs)
Output
Using TensorFlow backend.
SENTIMENT TEXT
0 0 aww that be bummer You shoulda get david carr ...
1 0 be upset that he can not update his facebook b...
2 0 I dive many time for the ball manage to save t...
3 0 my whole body feel itchy and like its on fire
4 0 no it be not behave at all be mad why be here ...
[[ 0 0 0 ... 3 10 5]
[ 0 0 0 ... 46 47 89]
[ 0 0 0 ... 29 9 96]
...
[ 0 0 0 ... 30 309 310]
[ 0 0 0 ... 0 0 72]
[ 0 0 0 ... 33 312 313]]
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 24, 32) 32000
_________________________________________________________________
lstm_1 (LSTM) (None, 100) 53200
_________________________________________________________________
dense_1 (Dense) (None, 1) 101
=================================================================
Total params: 85,301
Trainable params: 85,301
Non-trainable params: 0
_________________________________________________________________
None
Traceback (most recent call last):
File "mcve.py", line 50, in <module>
epochs=num_epochs)
File "/home/dv/tensorflow/venv/lib/python3.6/site-packages/keras/engine/training.py", line 950, in fit
batch_size=batch_size)
File "/home/dv/tensorflow/venv/lib/python3.6/site-packages/keras/engine/training.py", line 787, in _standardize_user_data
exception_prefix='target')
File "/home/dv/tensorflow/venv/lib/python3.6/site-packages/keras/engine/training_utils.py", line 79, in standardize_input_data
if isinstance(data[0], list):
IndexError: list index out of range
JUPYTER NOTEBOOK ERROR
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-25-184505b70981> in <module>()
20 model.fit(X_train2, y_train2,
21 batch_size=batch_size,
---> 22 epochs=num_epochs)
23
~/tensorflow/venv/lib/python3.6/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
948 sample_weight=sample_weight,
949 class_weight=class_weight,
--> 950 batch_size=batch_size)
951 # Prepare validation data.
952 do_validation = False
~/tensorflow/venv/lib/python3.6/site-packages/keras/engine/training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, check_array_lengths, batch_size)
785 feed_output_shapes,
786 check_batch_axis=False, # Don't enforce the batch size.
--> 787 exception_prefix='target')
788
789 # Generate sample-wise weight values given the `sample_weight` and
~/tensorflow/venv/lib/python3.6/site-packages/keras/engine/training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
77 'for each key in: ' + str(names))
78 elif isinstance(data, list):
---> 79 if isinstance(data[0], list):
80 data = [np.asarray(d) for d in data]
81 elif len(names) == 1 and isinstance(data[0], (float, int)):
IndexError: list index out of range

Edit
My former suggestion is wrong. I've checked your code and run it, and it works without errors for me.
Then I've looked at the source code, standardize_input_data function. There's a line which checks a data argument:
def standardize_input_data(data,
names,
shapes=None,
check_batch_axis=True,
exception_prefix=''):
"""Normalizes inputs and targets provided by users.
Users may pass data as a list of arrays, dictionary of arrays,
or as a single array. We normalize this to an ordered list of
arrays (same order as `names`), while checking that the provided
arrays have shapes that match the network's expectations.
# Arguments
data: User-provided input data (polymorphic).
...
At the line 79:
elif isinstance(data, list):
if isinstance(data[0], list):
...
So, it looks like, when the error occurs, the input data is a list, but a list of zero length.
The standardize_input_data function is called inside the Model.fit(...) method through a call to Model._standardize_user_data(...). Through this chain of functions, the passed data argument gets the value of the x argument of Model.fit(x, y, ...). So, my guess is that the problem is with the type or content of X_train2 or X_valid. Would you provide X_train2 and X_valid in addition to the X_train content?
Old wrong suggestion
You should increase vocabulary size by one to deal with out-of-vocabulary tokens, I guess.
I.e, change initialization of the Embedding layer:
model.add(Embedding(vocabulary_size + 1, embedding_size, input_length=max_words))
According to the docs, "input_dim: int > 0. Size of the vocabulary, i.e. maximum integer index + 1".
You may check a max. value of the max(X_train) (edited).
Hope it helps!

Related

Tensorflow HammingLoss gives ValueError with keras.utils.Sequence

I am working on a multi-label image classification problem with 13 labels. I want to use Hamming Loss to evaluate the performance of the model. So I specified tfa.metrics.HammingLoss(mode = 'multilabel') in the metrics parameter during model compilation. This worked when I provided both X_train and y_train to model.fit(), but it threw a ValueError when I used a Sequence object (described below) for training.
Data Generator description
I used a keras.utils.Sequence input object similar to what is present here. The generator returns 2 numpy arrays for each batch - the first array consists of the input images of shape (128, 128, 3) and the second array consists of labels each of shape (13,).
This is what my code looks like:
model.compile(
loss='binary_crossentropy',
optimizer='rmsprop',
metrics=[tfa.metrics.HammingLoss(mode = 'multilabel')]
)
model.fit(
train_datagen,
epochs = 5,
batch_size = BATCH_SIZE,
steps_per_epoch = TOTAL // BATCH_SIZE
)
And this is the error that I obtained:
Epoch 1/5
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-140-978987a2bbaa> in <module>
3 epochs=5,
4 batch_size=BATCH_SIZE,
----> 5 steps_per_epoch = 2000 // BATCH_SIZE
6 # validation_data=validation_generator,
7 )
4 frames
/usr/local/lib/python3.7/dist-packages/tensorflow_addons/metrics/hamming.py in else_body_2()
64 try:
65 do_return = True
---> 66 retval_ = (ag__.ld(nonzero) / ag__.converted_call(ag__.ld(y_true).get_shape, (), None, fscope)[(- 1)])
67 except:
68 do_return = False
ValueError: in user code:
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1051, in train_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_addons/metrics/utils.py", line 66, in update_state *
matches = self._fn(y_true, y_pred, **self._fn_kwargs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow_addons/metrics/hamming.py", line 133, in hamming_loss_fn *
return nonzero / y_true.get_shape()[-1]
ValueError: None values not supported.
How do I correct this? Is there any issue with the format of the labels?

Problem of Predict of 1 example in a Keras model - input must be a vector, got shape: []

New to the Keras model implementation and despite looking for answers:
https://stackoverflow.com/questions/60991253/invalidargumenterror-input-must-be-a-vector-got-shape
https://stackoverflow.com/questions/41736677/how-could-keras-model-predict-only-one-sample
But I still couldn't make it work :(
I successfully trained my Keras model (words embeddings using "https://tfhub.dev/google/universal-sentence-encoder/4")
But when I try to predict with:
test_text = ["We are looking for Data Scientists"]
test_text = np.array(test_text , dtype=object)[:, np.newaxis]
predicts = model.predict(test_text , batch_size=32)
predicts
But I get the following error:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-150-078cff510ad4> in <module>
----> 1 predicts = model.predict(test_text, batch_size=32)
2
3 predicts
1 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
53 ctx.ensure_initialized()
54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 55 inputs, attrs, num_outputs)
56 except core._NotOkStatusException as e:
57 if name is not None:
InvalidArgumentError: Graph execution error:
input must be a vector, got shape: []
[[{{node text_preprocessor/tokenize/StringSplit/StringSplit}}]] [Op:__inference_predict_function_838262]
Below are the model summary and the required batch input shape - but I really can't get it to work :(
Any HELP more than welcome!!!
Thanks
>> model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_18 (InputLayer) [(None, 1)] 0
lambda_14 (Lambda) (None, 512) 0
dense_1 (Dense) (None, 256) 131328
dense_2 (Dense) (None, 788) 202516
=================================================================
Total params: 333,844
Trainable params: 333,844
Non-trainable params: 0
config = model.get_config() # Returns pretty much every information about your model
print(config\["layers"\]\[0\]\["config"\]\["batch_input_shape"\]) # returns a tuple of width, height and channels
(None, 1)
Looking at some answers like
Keras model input shape wrong
I also tried to reshape like that
predicts = model.predict(test_text.reshape((1, -1)))
predicts
But I get exactly the same error as previously

I want to know how can we give a categorical variable as an input to an embedding layer in keras and train that embedding layer?

let's say we have a data frame where we have a categorical column which has 7 categories - Monday, Tuesday, Wednesday, Thursday, Friday, Saturday and Sunday. Let's say we have 100 data points and we want to give the categorical data as an input to the embedding layer and train the embedding layer using Keras. How do we actually achieve it? Can you share some intuition with code examples?
I have tried this code but it gives me an error which says "ValueError: "input_length" is 1, but received input has shape (None, 26)". I have referred to this blog https://medium.com/@satnalikamayank12/on-learning-embeddings-for-categorical-data-using-keras-165ff2773fc9, but I didn't get how to use it for my particular case.
from sklearn.preprocessing import LabelEncoder
l_encoder=LabelEncoder()
l_encoder.fit(X_train["Weekdays"])
encoded_weekdays_train=l_encoder.transform(X_train["Weekdays"])
encoded_weekdays_test=l_encoder.transform(X_test["Weekdays"])
no_of_unique_cat=len(X_train.school_state.unique())
embedding_size = min(np.ceil((no_of_unique_cat)/2),50)
embedding_size = int(embedding_size)
vocab = no_of_unique_cat+1
#Get the flattened LSTM output for categorical text
input_layer2 = Input(shape=(embedding_size,))
embedding = Embedding(input_dim=vocab, output_dim=embedding_size, input_length=1, trainable=True)(input_layer2)
flatten_school_state = Flatten()(embedding)
I want to know in case of 7 categories, what will be the shape of input_layer2? What should be the vocab size, output dim and input_length? Can anyone explain, or correct my code? Your insights will be really helpful.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-46-e28d41acae85> in <module>
1 #Get the flattened LSTM output for input text
2 input_layer2 = Input(shape=(embedding_size,))
----> 3 embedding = Embedding(input_dim=vocab, output_dim=embedding_size, input_length=1, trainable=True)(input_layer2)
4 flatten_school_state = Flatten()(embedding)
~/anaconda3/lib/python3.7/site-packages/keras/engine/base_layer.py in __call__(self, inputs, **kwargs)
472 if all([s is not None
473 for s in to_list(input_shape)]):
--> 474 output_shape = self.compute_output_shape(input_shape)
475 else:
476 if isinstance(input_shape, list):
~/anaconda3/lib/python3.7/site-packages/keras/layers/embeddings.py in compute_output_shape(self, input_shape)
131 raise ValueError(
132 '"input_length" is %s, but received input has shape %s' %
--> 133 (str(self.input_length), str(input_shape)))
134 elif s1 is None:
135 in_lens[i] = s2
ValueError: "input_length" is 1, but received input has shape (None, 26)
embedding_size can never be the input size.
A Keras embedding takes "integers" as input. You should have your data as numbers from 0 to 6.
If your 100 data points form a sequence of days, you cannot restrict the length of the sequences in the embedding to 1.
Your input shape should be (length_of_sequence,). Which means your training data should have shape (any, length_of_sequence). Which is probably (1, 100) by your description.
All the rest is automatic.

How to debug "ValueError: Error when checking input"

I trained a model using keras, and now I want to predict the value from each row in a pandas dataframe. I don't know where the shape of the input is changing; it looks OK all the way down to the prediction step.
I want to compare the performance of my NN before and after loading the trained weights. I already have my weights saved
PS: I loosely followed the steps in this tutorial
Create the NN:
def create_model(sample_input): # numpy array so we can use .shape
model = tf.keras.Sequential([
layers.Dense(64, activation='relu',
input_shape=sample_input.shape),
layers.Dense(64, activation='relu'),
layers.Dense(9, activation='softmax')
])
model.compile(optimizer=tf.train.RMSPropOptimizer(0.01),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
return model
Predict:
right = 0
wrong = 0
for index, row in X_test.iterrows():
print(row.values.shape)
prediction = model.predict(row.values)
if prediction == y_test.values[index]:
right += 1
else:
wrong +=1
print("Correct prediction = ", right)
print("Wrong prediction = ", wrong)
However, I get this output (note: it breaks in the first iteration, notice when I print the shape of the sample to predict, it matches the expected input that Keras complained about):
(11,)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-30-7b432180b212> in <module>
3 for index, row in X_test.iterrows():
4 print(row.values.shape)
----> 5 prediction = model.predict(row.values)
6 if prediction == y_test.values[index]:
7 right += 1
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in predict(self, x, batch_size, verbose, steps, max_queue_size, workers, use_multiprocessing)
1094 # batch size.
1095 x, _, _ = self._standardize_user_data(
-> 1096 x, check_steps=True, steps_name='steps', steps=steps)
1097
1098 if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, batch_size, check_steps, steps_name, steps, validation_split, shuffle)
2380 feed_input_shapes,
2381 check_batch_axis=False, # Don't enforce the batch size.
-> 2382 exception_prefix='input')
2383
2384 if y is not None:
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_utils.py in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
360 'Error when checking ' + exception_prefix + ': expected ' +
361 names[i] + ' to have shape ' + str(shape) +
--> 362 ' but got array with shape ' + str(data_shape))
363 return data
364
ValueError: Error when checking input: expected dense_9_input to have shape (11,) but got array with shape (1,)
I tried wrapping it like model.predict([row.values]), thinking at some point keras accesses the inner elements, but no luck, same problem.
I expect the model to predict something, even if it's wrong.

Input Dimensions Tensorflow v1.8 ConvLSTMCell

ConvLSTMCell Official Docs
GitHub _conv where the error occurs
Issue
I'm experimenting with the ConvLSTMCell in tensorflow r1.8. The error I'm continuing to generate occurs in the __call__ method of ConvLSTMCell. The _conv method is invoked and the error is raised.
ValueError: Conv Linear Expects 3D, 4D, 5D
The error is raised from the unstacked inputs. unstacked (in this example) has dimensions of [BATCH_SIZE, N_INPUTS] = [2,5]. I am using tf.unstack to generate the required sequence that the ConvLSTMCell requires.
Why use tf.unstack?
If the input array is not unstacked, the TypeError below is raised.
TypeError: inputs must be a sequence
Question
What am I missing on the formatting? I've read through related issues but have not found anything that has guided me into a working implementation.
Are the placeholder dimensions correct?
Should I be unstacking or is there a better way?
Am I providing the proper input dimension into the ConvLSTMCell?
Code
# Parameters
TIME_STEPS = 28
N_INPUT = 5
N_HIDDEN = 128
LEARNING_RATE = 0.001
NUM_UNITS = 28
CHANNEL = 1
tf.reset_default_graph()
# Input placeholders
x = tf.placeholder(tf.float32, [BATCH_SIZE, TIME_STEPS, N_INPUT])
y = tf.placeholder(tf.float32, [None, 1])
# Format input as a sequence for LSTM Input
unstacked = tf.unstack(x, TIME_STEPS, 1) # shape=(timesteps, batch, inputs)
# Convolutional LSTM Layer
lstm_layer = tf.contrib.rnn.ConvLSTMCell(
conv_ndims=1,
input_shape=[BATCH_SIZE, N_INPUT],
output_channels=5,
kernel_shape=[7,5]
)
# Error is generated when the lstm_layer is invoked
outputs, _ = tf.contrib.rnn.static_rnn(
lstm_layer,
unstacked,
dtype=tf.float32)
Error Message
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-83-3568a097e4ea> in <module>()
10 lstm_layer,
11 unstacked,
---> 12 dtype=tf.float32)
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py in static_rnn(cell, inputs, initial_state, dtype, sequence_length, scope)
1322 state_size=cell.state_size)
1323 else:
-> 1324 (output, state) = call_cell()
1325
1326 outputs.append(output)
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py in <lambda>()
1309 varscope.reuse_variables()
1310 # pylint: disable=cell-var-from-loop
-> 1311 call_cell = lambda: cell(input_, state)
1312 # pylint: enable=cell-var-from-loop
1313 if sequence_length is not None:
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/ops/rnn_cell_impl.py in __call__(self, inputs, state, scope)
230 setattr(self, scope_attrname, scope)
231 with scope:
--> 232 return super(RNNCell, self).__call__(inputs, state)
233
234 def _rnn_get_variable(self, getter, *args, **kwargs):
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/layers/base.py in __call__(self, inputs, *args, **kwargs)
715
716 if not in_deferred_mode:
--> 717 outputs = self.call(inputs, *args, **kwargs)
718 if outputs is None:
719 raise ValueError('A layer\'s `call` method should return a Tensor '
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/contrib/rnn/python/ops/rnn_cell.py in call(self, inputs, state, scope)
2110 cell, hidden = state
2111 new_hidden = _conv([inputs, hidden], self._kernel_shape,
-> 2112 4 * self._output_channels, self._use_bias)
2113 gates = array_ops.split(
2114 value=new_hidden, num_or_size_splits=4, axis=self._conv_ndims + 1)
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/contrib/rnn/python/ops/rnn_cell.py in _conv(args, filter_size, num_features, bias, bias_start)
2184 if len(shape) not in [3, 4, 5]:
2185 raise ValueError("Conv Linear expects 3D, 4D "
-> 2186 "or 5D arguments: %s" % str(shapes))
2187 if len(shape) != len(shapes[0]):
2188 raise ValueError("Conv Linear expects all args "
ValueError: Conv Linear expects 3D, 4D or 5D arguments: [[2, 5], [2, 2, 5]]
Here's an example with a couple tweaks, which at least passes static shape checking:
import tensorflow as tf
# Parameters
TIME_STEPS = 28
N_INPUT = 5
N_HIDDEN = 128
LEARNING_RATE = 0.001
NUM_UNITS = 28
CHANNEL = 1
BATCH_SIZE = 16
# Input placeholders
x = tf.placeholder(tf.float32, [BATCH_SIZE, TIME_STEPS, N_INPUT])
y = tf.placeholder(tf.float32, [None, 1])
# Format input as a sequence for LSTM Input
unstacked = tf.unstack(x[..., None], TIME_STEPS, 1) # shape=(timesteps, batch, inputs)
# Convolutional LSTM Layer
lstm_layer = tf.contrib.rnn.ConvLSTMCell(
conv_ndims=1,
input_shape=[N_INPUT, 1],
output_channels=5,
kernel_shape=[7]
)
# Error is generated when the lstm_layer is invoked
outputs, _ = tf.contrib.rnn.static_rnn(
lstm_layer,
unstacked,
dtype=tf.float32)
Notes:
input_shape does not include the batch dimension (see docstring)
The input needs a channels dimension. Fine for it to be one in the input (that's what I've done).
Not sure what more than one dimension on kernel_shape would mean for a 1-D convolution.

Resources