Keras: setting an array element with a sequence error

I'm trying to build a very simple LSTM to classify text.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.preprocessing.text import one_hot

def encoded(texts):
    res = [one_hot(text, 100000, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', split=' ') for text in texts]
    return res

def train(X, y, X_t, y_t):
    X = encoded(X)
    X_t = encoded(X_t)
    model = Sequential()
    model.add(Embedding(100000, 100))
    model.add(Bidirectional(LSTM(20, return_sequences=True), merge_mode='ave'))
    model.add(TimeDistributed(Dense(1, activation='sigmoid')))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(np.array(X), np.array(y), batch_size=16, epochs=8)
    score = model.evaluate(np.array(X_t), np.array(y_t), batch_size=16)
    print(score)
However, I get this error:
ValueError: setting an array element with a sequence.
It seems like the embedding layer didn't create vectors of the right dimension, or something is wrong with the format of the input X (X_t).
Any idea?
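For context, this ValueError typically appears when np.array() is handed a ragged list of lists, i.e. encoded texts of different lengths. A minimal sketch of padding the encoded sequences to a fixed length before fitting (the maxlen of 100 here is an illustrative guess, not a value from the question):

from keras.preprocessing.sequence import pad_sequences

X = encoded(X)
X_t = encoded(X_t)
# pad/truncate every sequence to the same length so np.array() yields a rectangular 2D array
X = pad_sequences(X, maxlen=100, padding='post')
X_t = pad_sequences(X_t, maxlen=100, padding='post')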

Related

How do I decode the output of my seq-to-seq model if I'm using an embedding layer?

I have a seq-to-seq model trained on some clever bot data:
just_phrases_X is a list of sentences and just_phrases_y is a list of responses to those sentences.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, GRU, RepeatVector, Flatten, Dense
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

maxlen = 62
# low is a list of all the unique words.

def Convert_To_Encoding(just_phrases):
    encodings = []
    for sentence in just_phrases:
        onehotencoded = one_hot(sentence, len(low))
        encodings.append(np.array(onehotencoded))
    encodings_padded = pad_sequences(encodings, maxlen=maxlen, padding='post', value=0.0)
    return encodings_padded

encodings_X_padded = Convert_To_Encoding(just_phrases_X)
encodings_y_padded = Convert_To_Encoding(just_phrases_y)

model = Sequential()
embedding_layer = Embedding(len(low), output_dim=8, input_length=maxlen)
model.add(embedding_layer)
model.add(GRU(128))  # input_shape=(None, 496)
model.add(RepeatVector(numberofwordsoutput))  # number of characters?
model.add(GRU(128, return_sequences=True))
model.add(Flatten())
model.add(Dense(62, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(encodings_X_padded, encodings_y_padded, batch_size=1, epochs=1)  # , validation_data=(testX, testy)
model.save("cleverbottheseq-uel.h5")
When I use this model for prediction, the output will be between 0 and 1 because of my use of softmax. However, as I have around 3000 unique words, each with a separate integer assigned to it, how do I essentially repeat what the model did during training and convert the output back to an integer which has a word assigned to it?
I don't think it is possible to create a seq2seq model with the Sequential API. Try to create the encoder and decoder separately with the Functional API. You need two inputs: the first for the encoder, the second for the decoder.
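A minimal sketch of that layout, reusing the question's maxlen and vocabulary size len(low); the embedding size of 8 and latent dimension of 128 are illustrative assumptions, and the decoder input would be the response sequence shifted by one step:

from keras.models import Model
from keras.layers import Input, Embedding, GRU, Dense

latent_dim = 128  # assumed size of the encoder state

# Encoder: read the input phrase and keep only its final state
encoder_inputs = Input(shape=(maxlen,))
encoder_embedded = Embedding(len(low), 8)(encoder_inputs)
_, encoder_state = GRU(latent_dim, return_state=True)(encoder_embedded)

# Decoder: read the (shifted) response, initialised with the encoder state
decoder_inputs = Input(shape=(maxlen,))
decoder_embedded = Embedding(len(low), 8)(decoder_inputs)
decoder_outputs = GRU(latent_dim, return_sequences=True)(decoder_embedded, initial_state=encoder_state)
decoder_outputs = Dense(len(low), activation='softmax')(decoder_outputs)

seq2seq = Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
seq2seq.summary()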

With TensorFlow and Keras, how do I find the "category" of a given string?

Hello, ML/AI newbie here.
I'm asking this question because I have no idea about machine learning, AI, etc., and I have no idea how to continue or what questions to ask. Even if I accidentally found the solution I wouldn't know it.
OK, I followed this tutorial about "Text Classification" and it went pretty well; no problems up to here.
https://www.youtube.com/watch?v=6g4O5UOH304&list=WL&index=8&t=0s
It classifies IMDB comments and checks whether a review is "Positive" or "Negative", "0" or "1".
My question is: let's say I have my own dataset, similar to IMDB, but instead of "0" and "1" I have several categories as numbers like "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ..." for each string. So I need it to return one of these numbers (or, since it's learning, let's say two of them if it can't decide).
What should I do?
A link to a tutorial related to what I need would be great too.
import tensorflow as tf
from tensorflow import keras
import numpy as np

data = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=3000)

word_index = data.get_word_index()
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding="post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding="post", maxlen=250)

def decode_review(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

model = keras.Sequential()
model.add(keras.layers.Embedding(10000, 6))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))
#model.summary()
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

x_val = train_data[:10000]
x_train = train_data[10000:]
y_val = train_labels[:10000]
y_train = train_labels[10000:]

fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)

results = model.evaluate(test_data, test_labels)
print(results)

for index in range(20):
    test_review = test_data[index]
    predict = model.predict([test_review])
    if predict[0] > 0.8:
        print(decode_review(test_data[index]))
        print(str(predict[0]))
        print(str(test_labels[index]))
Your task is a multiclass classification problem, and for this reason you have to modify your output layer. You have two possibilities.
If you have a 1D integer-encoded target, you can use sparse_categorical_crossentropy as the loss function, softmax as the last activation, and set the dimension of the last Dense output equal to the number of classes to predict:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

X = np.random.randint(0, 10, (1000, 100))
y = np.random.randint(0, 3, 1000)

model = Sequential([
    Dense(128, input_dim=100),
    Dense(3, activation='softmax'),
])
model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X, y, epochs=3)
Otherwise, if you have one-hot encoded your target, you can use categorical_crossentropy, softmax as the last activation, and the dimension of the last Dense output equal to the number of classes to predict:
import pandas as pd

X = np.random.randint(0, 10, (1000, 100))
y = pd.get_dummies(np.random.randint(0, 3, 1000)).values

model = Sequential([
    Dense(128, input_dim=100),
    Dense(3, activation='softmax'),
])
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X, y, epochs=3)
Using softmax lets you interpret the outputs as probability scores that sum to 1.
When you compute the final prediction, you can obtain the predicted class simply with np.argmax(model.predict(X), axis=1).
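For example, to map the predicted class indices back to labels (the class_names list below is a hypothetical mapping, not part of the answer):

pred_classes = np.argmax(model.predict(X), axis=1)  # one integer class per sample
class_names = ['negative', 'neutral', 'positive']   # hypothetical names for classes 0..2
print([class_names[i] for i in pred_classes[:5]])   # first five predictions as labels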
These are some basic tutorials for multiclass text classification:
https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35
https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17

How to connect LSTM with Dense?

When I try to connect an LSTM with a Dense layer, I get an error during training:
input = Input(shape=(x_train.shape[1], None))
X = Embedding(num_words, max_article_len)(input)
X = LSTM(128, return_sequences=True, dropout = 0.5)(X)
X = LSTM(128)(X)
X = Dense(32, activation='softmax')(X)
model = Model(inputs=[input], outputs=[X])
...
>>> ValueError: Error when checking target: expected dense to have shape (32,) but got array with shape (1,)
I tried different connection options, but the error persists:
X, h, c = LSTM(128, return_sequences=False, return_state=True, dropout = 0.5)(X)
X = Dense(32, activation='softmax')(X)
>>> ValueError: Error when checking target: expected dense to have shape (32,) but got array with shape (1,)
Are there any solution options with the Functional API or Sequential?
Data conversion code:
train = pd.read_csv('train.csv')
articles = train['text']
y_train = train['lang']
num_words = 50000
max_article_len = 20
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(articles)
sequences = tokenizer.texts_to_sequences(articles)
x_train = pad_sequences(sequences, maxlen=max_article_len, padding='post')
x_train.shape
>>> (18974, 100)
y_train.shape
>>> (18974,)
return_sequences must be set to False in the last LSTM layer:
X = LSTM(128, return_sequences=True, dropout = 0.5)(X)
X = LSTM(128, return_sequences=False)(X)
If you still have issues, then the problem must be with your input shape.
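For completeness, a minimal sketch of the corrected stack; the embedding size of 128, the 32 output classes, and the sparse_categorical_crossentropy loss (assuming y_train holds integer class ids) are illustrative assumptions rather than values from the answer:

from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

inp = Input(shape=(max_article_len,))                 # padded token ids
X = Embedding(num_words, 128)(inp)                    # second argument is the embedding size
X = LSTM(128, return_sequences=True, dropout=0.5)(X)
X = LSTM(128, return_sequences=False)(X)              # last LSTM returns a single vector
out = Dense(32, activation='softmax')(X)              # 32 = assumed number of language classes

model = Model(inputs=inp, outputs=out)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])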

Why does concatenate cause a shape error in Keras even when the inputs are right?

I am new to Keras and I have code for the model part:
# make inputs
self.input_samples = Input(shape=(self.input_shape, ))
self.input_labels = Input(shape=(self.nClass, ))
# Encoder for samples
self.E = self.encoder()(self.input_samples)
# Encoder for labels
self.E_LBLs = self.encoder4lbls()(self.input_labels)
# Decoder for reconstruction
self.D = self.decoder()(self.E)
# Task network
task_net = self.taskOut()
self.T = task_net(self.E)
self.T_LBLS = task_net(self.E_LBLs)
# define GAN for prior matching for samples and labels
self.A = self.adversarial() # This is the discriminator for latent code matching
print(type(self.E))
self.Adv = self.A(concatenate([self.E, self.E_LBLs], axis=0)) # logits for samples and labels
self.A.compile('Adam', loss='binary_crossentropy', metrics=['acc'])
# define MMD loss
# self.merge_embeds = concatenate([self.E, self.E_LBLs], axis=0, name='mmd')
model = Model([self.input_samples, self.input_labels], [self.E, self.E_LBLs, self.Adv])
When I want to output self.Adv using model.predict([inputs1, inputs2]), the concat operation concatenate([self.E, self.E_LBLs], axis=0) always seems to go wrong.
The error message is:
res_list = model.predict([trainSamples, trainLabels])
File "/DB/rhome/xchen/anaconda2/envs/Conda_python3_5/lib/python3.5/site-packages/keras/engine/training.py", line 1835, in predict
verbose=verbose, steps=steps)
File "/DB/rhome/xchen/anaconda2/envs/Conda_python3_5/lib/python3.5/site-packages/keras/engine/training.py", line 1339, in _predict_loop
outs[i][batch_start:batch_end] = batch_out
ValueError: could not broadcast input array from shape (64,1) into shape (32,1)
I am sure that self.E and self.E_LBLs are right, and their shapes are [N1 x 2000] and [N2 x 2000] respectively.
Do you have any idea? I cannot solve it.
Thanks.
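One observation that a small shape check makes clear (this is a reading of the traceback, not a confirmed answer): concatenate(..., axis=0) stacks the two encodings along the batch axis, so the model's output has twice as many rows as each input batch, which matches the 64-vs-32 mismatch the predict loop reports:

import numpy as np

batch = 32
E = np.zeros((batch, 2000))        # encoded samples for one predict batch
E_LBLs = np.zeros((batch, 2000))   # encoded labels for the same batch

merged = np.concatenate([E, E_LBLs], axis=0)
print(merged.shape)                # (64, 2000): twice the batch size fed to predict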

LSTM autoencoder for variable-length text input in Keras

Here padded_docs.shape = (736, 50). As it is an autoencoder, the input and output are the same. The output of the last LSTM layer is 3-dimensional; however, padded_docs, which is used as the target, is 2-dimensional. How do I fix this?
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, LSTM, RepeatVector
from keras.preprocessing import sequence
from keras.preprocessing.text import one_hot

df1 = pd.read_csv('snapdeal_data.csv')
df1 = df1.head(1000)
df2 = df1['Review_Text']
labels = df1['B_Helpfulness']

# encode full sentence into vector
encoded_docs = [one_hot(d, vocab_size) for d in X_train]
print(encoded_docs)

##### Padding encoded sequence of words
max_length = 50
padded_docs = sequence.pad_sequences(encoded_docs, maxlen=max_length, padding='pre')
print(padded_docs)

model = Sequential()
timesteps = padded_docs.shape[1]
input_dim = max_length
#inputs = Input(shape=(input_dim,))
model.add(Embedding(vocab_size + 1, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(50))
model.add(RepeatVector(timesteps))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(input_dim, return_sequences=True))
model.compile(loss='mean_squared_error', optimizer='Adam')
model.summary()
model.fit(padded_docs, padded_docs, epochs=100, batch_size=1, shuffle=True, verbose=2)
ValueError: Error when checking target: expected lstm_6 to have 3 dimensions, but got array with shape (736, 50)
I think return_sequences=True should not be kept in the last LSTM layer of this architecture: without it, the final LSTM outputs a 2-dimensional tensor of shape (batch, 50), which matches the 2-dimensional padded_docs target.
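A minimal sketch of that change, keeping the rest of the question's model and still reconstructing the 50 padded token ids with MSE (vocab_size, embedding_matrix, max_length, timesteps, input_dim, and padded_docs are the question's variables):

model = Sequential()
model.add(Embedding(vocab_size + 1, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(50))                        # encoder output: (batch, 50)
model.add(RepeatVector(timesteps))         # (batch, 50, 50)
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(input_dim))                 # no return_sequences: output is (batch, 50)
model.compile(loss='mean_squared_error', optimizer='Adam')
model.fit(padded_docs, padded_docs, epochs=100, batch_size=1, shuffle=True, verbose=2)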
