Keras: Multiclass CNN Text Classifier predicts the same class for all input data - keras

I am trying to predict one out of 8 classes for some short text data with word embeddings and CNN. The occuring problem is that the CNN Classifier predicts everything in one class (the biggest one of the training data, distribution ~40%). The text data should be preprocesed quite well, because the classification works fine with other classifiers like SVM or NB.
'''
#first, build index mapping words in the embeddings set to their embedding vector
import os
embeddings_index = {}
f = open(os.path.join('', 'embedding_word2vec.txt'), encoding='utf-8')
for line in f:
values = line.split()
words = values[0]
coefs = np.asarray(values[1:])
embeddings_index[word] = coefs
f.close()
#vectorize text samples in 2D Integer Tensor
from tensorflow.python.keras.preprocessing.text import Tokenizer
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(stemmed_text)
sequences = tokenizer_obj.texts_to_sequences(stemmed_text)
#pad sequences
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
max_length = max([len(s.split(',')) for s in total_texts])
print('MaxLength :', max_length)
word_index = tokenizer_obj.word_index
print('Found %s unique Tokens.' % len(word_index))
text_pad = pad_sequences(sequences)
labels = to_categorical(np.asarray(labels))
print('Shape of label tensor:', labels.shape)
print('Shape of Text Tensor: ', text_pad.shape)
embedding_dim = 100
num_words = len(word_index) + 1
#prepare embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for index, word in enumerate(vocab):
embedding_vector = w2v.wv[word]
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
#train test split
validation_split = 0.2
indices = np.arange(text_pad.shape[0])
np.random.shuffle(indices)
text_pad = text_pad[indices]
labels = labels[indices]
num_validation_samples = int(validation_split * text_pad.shape[0])
x_train = text_pad[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_test = text_pad[-num_validation_samples:]
y_test = labels[-num_validation_samples:]
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length,
#embeddings_initializer = Constant(embedding_matrix),
weights=[embedding_matrix],
trainable=False))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
#model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
#model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dense(8, activation='sigmoid'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x_train, y_train,
epochs=25,
validation_data=(x_test, y_test))
print(model.summary())
'''
I am grateful for every hint.
Thank you.

Related

Why is there no improvement in a categorical data time series model?

I built a simple categorical time series model to predict the next number of a random sequence, but the accuracy hardly moved even I trained it for 10000 epochs. The validation loss started to take off after a few hundred epochs. Could anyone make suggestions for improvement? Here's the model:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
DEVICE = 'CPU'
if DEVICE == 'CPU':
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print(tf.test.gpu_device_name())
TOTAL_CATALOG=4
POSSIBLE_OUTCOME_COL=4
LOOK_BACK_WINDOW=1
TRAINING_DATA_RATIO=0.8
TRAINING_EPOCHS=10000
sys.path.insert(0, '/DataScience/MyModules')
from m6data import getDrawData, series_to_supervised, Split_data, get_all_categories
def get_all_categories_local(last_combination):
all_category = np.arange(1, last_combination+1)
return all_category.reshape(1,all_category.shape[0])
All_categories=get_all_categories_local(TOTAL_CATALOG)
data_sequence = [1,1,2,4,2,3,1,2,3,3,4,1,2,3,4,2,2,3,1,3]
raw_df = pd.DataFrame(data_sequence, columns=['NE'])
values = raw_df.values
# 05-Apr-2022: One-Hot Encoding
oh_encoder = OneHotEncoder(categories=All_categories, sparse=False)
encoded_input = oh_encoder.fit_transform(values)
FEATURES = encoded_input.shape[1]
POSSIBLE_OUTCOME_COL = FEATURES
draw_reframe = series_to_supervised(encoded_input, LOOK_BACK_WINDOW,1)
train, test = Split_data(draw_reframe, TRAINING_DATA_RATIO)
# Total input = all possible One-Hot Encoding outcome * number of look-back samples.
ALL_INPUT = POSSIBLE_OUTCOME_COL * LOOK_BACK_WINDOW
# split into input and outputs
train_X, train_y = train.iloc[:,:ALL_INPUT], train.iloc[:,ALL_INPUT:]
test_X, test_y = test.iloc[:,:ALL_INPUT], test.iloc[:,ALL_INPUT:]
train_X = train_X.values.reshape((train_X.shape[0], LOOK_BACK_WINDOW , FEATURES))
test_X = test_X.values.reshape((test_X.shape[0], LOOK_BACK_WINDOW, FEATURES))
print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)
def create_model():
model = Sequential()
model.add(LSTM(10,
return_sequences=False,
input_shape=(train_X.shape[1], train_X.shape[2]),
activation='relu'
)
)
#model.add(LSTM(20))
model.add(Dense(units=train_y.shape[1], activation='softmax'))
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.00005),
loss = 'categorical_crossentropy',
metrics=['accuracy'])
return model
model=create_model()
history = model.fit(
train_X, train_y,
epochs=TRAINING_EPOCHS,
batch_size=8,
validation_data=(test_X, test_y),
verbose=1,
)
Here are the plots of accuracies and losses (red=training, blue=validation).
Accuracies
Losses
Thank you in advance for any suggestions.
Update (13-Jun-2022)
I changed my model to the following
def create_model():
model = Sequential()
model.add(LSTM(50,
return_sequences=True,
input_shape=(train_X.shape[1], train_X.shape[2]),
activation='relu'
)
)
model.add(LSTM(units=1000, kernel_regularizer=regularizers.l1(0.05), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=1000, kernel_regularizer=regularizers.l1(0.05), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=1000, kernel_regularizer=regularizers.l1(0.05), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=1000, kernel_regularizer=regularizers.l1(0.05), activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(Dense(1000))
model.add(Dense(units=train_y.shape[1], activation='softmax'))
model.compile(optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2, nesterov=True),
#tf.keras.optimizers.Adam(learning_rate=0.001),
loss = 'categorical_crossentropy',
metrics=['accuracy'])
return model
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
patience=20,min_lr=1e-10)
early_stop = EarlyStopping(monitor='loss', patience=100)
history = model.fit(
train_X, train_y,
epochs=TRAINING_EPOCHS,
batch_size=16,
validation_split=0.1,
validation_data=(test_X, test_y),
verbose=1,
shuffle=False,
callbacks=([reduce_lr], [early_stop])
Accuracy was bouncing around and Val_accuracy was zero all the way. The loss and val_loss were almost the same and dropping together.
Can anyone advise what I can do in this scenario?

why is different between model.evaluate() and computed loss by myself based on model.predict()?

I am running neural network by keras. There is my code:
import numpy as np
from keras import Model
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
def mean_squared_error(y_true, y_pred):
return K.mean(K.square(y_pred - y_true),axis=-1)
np.random.seed(1)
Train_X = np.random.randint(low=0,high=100,size = (50,5))
Train_Y = np.matmul(Train_X,np.arange(10).reshape(5,2))+np.random.randint(low=0,high=10,size=(50,2))
Test_X = np.random.randint(low=0,high=100,size = (10,5))
Test_Y = np.matmul(Test_X,np.arange(10).reshape(5,2))+np.random.randint(low=0,high=10,size=(10,2))
model = Sequential()
model.add(Dense(4,activation = 'relu'))
model.add(Dense(2,activation='relu'))
model.add(Dense(2,activation='relu'))
model.add(Dense(2))
model.compile(loss=mean_squared_error, optimizer='adam', metrics=['mae'])
history = model.fit(Train_X, Train_Y, epochs=100, batch_size=5,validation_data = (Test_X, Test_Y))
loss1 = model.evaluate(Test_X,Test_Y)
loss2 = history.history['val_loss'][99]
y_pred = model.predict(Test_X)
y_true = Test_Y
loss3 = np.mean(np.square(y_pred-y_true))
I find that loss1 is the same as loss2 but is different with loss3. So i feel so confused. Could someone tell me why?
This is possibly due to different dtypes for Test_Y and y_pred. Keras tries to automatically take care of dtype mismatches for you, so it is possible that Test_Y is a float64 and y_pred is a float32. If that is indeed the case, try converting one of their dtypes for the loss3 calculation and see if the values match.
y_pred = model.predict(Test_X)
y_true = Test_Y.astype(np.float32)
loss3 = np.mean(np.square(y_pred-y_true))

Universal Sentence Encoder Error: Input 0 is incompatible with layer conv1d_6: expected ndim=3, found ndim=2

I'm worked on sentiment analysis task using universal sentence encoder embed_size=512 with CNN but have an error says: Input 0 is incompatible with layer conv1d_6: expected ndim=3, found ndim=2.
and wanna know if this is right to add universal sentence encoder with CNN in this way or not?
pickle_in=open("X.pickle","rb")
X=pickle.load(pickle_in)
X = X.tolist() #convert x to list as The embedding code works once I
converted
the pandas.series data type to list.
X = np.array(X, dtype=object)[:, np.newaxis]
pickle_in=open("Y.pickle","rb")
Y=pickle.load(pickle_in)
Y = np.asarray(pd.get_dummies(Y), dtype = np.int8)
import tensorflow as tf
import tensorflow_hub as hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.15,
random_state = 42)
X_train, X_Val, Y_train, Y_Val = train_test_split(X_train,Y_train, test_size
= 0.15, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(X_Val.shape,Y_Val.shape)
type(Y_test)
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
def UniversalEmbedding(x):
return embed(tf.squeeze(tf.cast(x, tf.string)),
signature="default", as_dict=True)["default"]
import keras
seed=7
np.random.seed(seed)
from keras.layers import Input, Dense, concatenate, Activation,
GlobalMaxPooling1D
from keras import layers
from keras.models import Model
input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding,
output_shape=(embed_size,))(input_text)
bigram_branch = Conv1D(filters=64, kernel_size=1, padding='same',
activation='relu', strides=1)(embedding)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=64, kernel_size=2, padding='same',
activation='relu', strides=1)(embedding)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=64, kernel_size=3, padding='same',
activation='relu', strides=1)(embedding)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch],
axis=1)
merged = Dense(512, activation='relu')(merged)
merged = Dropout(0.8)(merged)
merged = Dense(2)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
adam=keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None,
decay=0.0, amsgrad=False)
model.compile(loss='mean_squared_error',
optimizer= adam,
metrics=['accuracy'])
model.summary()
You can not directly pass Universal Sentence Encoder to Conv1D because Conv1D expected a tensor with shape [batch, sequence, feature] while the output of Universal Sentence Encoder is [batch, feature]. It is also stated in tfhub.dev:
The input is variable length English text and the output is a 512
dimensional vector.
How can I fix this?
In my view, the easiest possible solution is to use ELMo on Tensorhub. With ELMo you can map each sentence to [batch, sequence, feature] and then feed into the Conv1D.

InvalidArgumentError Sentiment analyser with keras

I have built a sentiment analyzer using Keras as a binary classification problem. I am using the Imdb dataset using GRU.
My code is:
# coding=utf-8
# ==========
# MODEL
# ==========
# imports
from __future__ import print_function
from timeit import default_timer as timer
from datetime import timedelta
from keras.models import Sequential
from keras.preprocessing import sequence
from keras import regularizers
from keras.layers import Dense, Embedding
from keras.layers import GRU, LeakyReLU, Bidirectional
from keras.datasets import imdb
#start a timer
start = timer()
# Hyperparameters
Model_Name = 'my_model.h5'
vocab_size = 5000
maxlen = 1000
batch_size = 512
hidden_layer_size = 2
test_split = 0.3
dropout = 0.1
num_epochs = 1
alpha = 0.2
validation_split = 0.25
l1 = 0.01
l2 = 0.01
# Dataset loading
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
maxlen=maxlen)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
# Data preprocessing
# Sequence padding
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
# Network building
print('Build model...')
model = Sequential()
model.add(Embedding(vocab_size, hidden_layer_size))
model.add(Bidirectional(GRU(hidden_layer_size, kernel_initializer='uniform', kernel_regularizer=regularizers.l1_l2(l1=l1,l2=l2), dropout=dropout, recurrent_dropout=dropout,return_sequences=True)))
model.add(LeakyReLU())
model.add(Bidirectional(GRU(hidden_layer_size, kernel_initializer='uniform', dropout=dropout, kernel_regularizer=regularizers.l1_l2(l1=l1,l2=l2), recurrent_dropout=dropout)))
model.add(LeakyReLU())
model.add(Dense(1, activation='softmax', kernel_initializer='uniform', kernel_regularizer=regularizers.l1_l2(l1=l1,l2=l2)))
# Compile my model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print('Train...')
# Fit the model
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, validation_split=validation_split)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
# Create a summary, a plot and print the scores of the model
model.summary()
print('Test score:', score)
print('Test accuracy:', acc)
# Save model architecture, weights, training configuration (loss,optimizer),
# and also the state of the optimizer, so you can resume where you stopped
model.save(Model_Name)
end = timer()
print('Running time: ' + str(timedelta(seconds=(end - start))) + ' in Hours:Minutes:Seconds')
I keep receiving an Error message which I don't completely understand:
InvalidArgumentError (see above for traceback): indices[502,665] = 5476 is not in [0, 5000)
[[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device=/job:localhost/replica:0/task:0/device:CPU:0](embedding_1/embeddings/read, embedding_1/Cast)]]
Can anyone help me understand what causes this error and how to solve it?
The error complains about a non-existent word index. That's because you are only limiting the number of Emedding features (i.e. there is a word with index 5476 which is not in the range [0, 5000), which 5000 refers to the vocab_size you have set). To resolve this, you also need to pass the vocab_size as num_words argument of load_data function, like this:
... = imdb.load_data(num_words=vocab_size, ...)
This way you are limiting the words to the most frequent words (i.e. top vocab_size words with the most frequency in the dataset) with their indices in range [0, vocab_size).

multi-label text classification. I have a text/label csv. Text is pure text, labels are alphanumeric

import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Activation
from keras.preprocessing.text import Tokenizer # for
tokenizing text
from keras.preprocessing.sequence import pad_sequences # for
padding sentences with zeros. To make the sentence length same
from keras.utils import to_categorical # for one-
hot encoding of the labels
from keras.layers import Dense, Input, Flatten, Dropout,
BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
from sklearn.model_selection import train_test_split
MAX_SEQUENCE_LENGTH = 300
MAX_NB_WORDS = 20000
#Reading the data
raw_data=pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
raw_data.head()
# create training and testing vars
train, test = train_test_split(raw_data, test_size=0.3)
train.head()
test.head()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Procedure)
train_sequences = tokenizer.texts_to_sequences(train.Procedure)
test_sequences = tokenizer.texts_to_sequences(test.Procedure)
word_index = tokenizer.word_index
containing words and their index
# print(tokenizer.word_index)
print('Found %s unique tokens.' % len(word_index))
train_data = pad_sequences(train_sequences,
maxlen=MAX_SEQUENCE_LENGTH)
train
test_data=pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH)
test
print(train_data.shape)
print(test_data.shape)
print (word_index)
train_labels = train['dxcode']
test_labels = test['dxcode']
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() # converts the character
array to numeric array.
Assigns levels to unique labels.
le.fit(train_labels)
le.fit(test_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))
le.inverse_transform(1)
labels_train = to_categorical(np.asanyarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)
print('Training model.')
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
EMBEDDING_DIM,
input_length=MAX_SEQUENCE_LENGTH
))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(23, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['acc'],)
model.fit(train_data, labels_train,
batch_size=32,
epochs=10,
validation_data=(test_data, labels_test))
model.evaluate(test_data, labels_test)
pred = model.predict(test_data)
pred
# print(model.layers)
for layer in model.layers:
print(layer)
import keras.backend as K
emd = K.function(inputs=[model.layers[0].input],
outputs=[model.layers[0].output])
rbind = np.concatenate((train_data, test_data), axis=0)
print(rbind.shape)
### Submissions file
test_results = model.predict_classes(rbind)
#print(test_results)
test_labels = le.inverse_transform(test_results)
#test_labels = [le.inverse_transform(i) for i in test_results]
submissions_CNN =
pd.DataFrame({'id':raw_data['Claimno'],"label":test_labels})
submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv",index=False)
The text document can be labelled with more than one label, so how can I do a multi-label classification on this dataset? I've read a lot of documentation from sklearn, but I can't seem to find the right way to do multi-label classification. Thanks in advance for any help.
Are you getting the error on this line:
train_labels = le.transform(train_labels)
If yes, then its because in the line just above it, you are doing this:
le.fit(test_labels)
What this does is it forgets the previous data (previous call to fit() on the line above it) and only remembers the data in the test_labels. So when a new label (which is present in train but not in test) comes, it will throw this error.
You need to replave the lines:
le.fit(train_labels)
le.fit(test_labels)
with this:
# I am using .tolist() because I observe that your
# train_labels, test_labels are pandas Series objects
le.fit(train_labels.tolist() + test_labels.tolist())

Resources