I have built a sentiment analyzer in Keras as a binary classification problem, using the IMDB dataset and GRU layers.
My code is:
# coding=utf-8
# ==========
# MODEL
# ==========
# imports
from __future__ import print_function
from timeit import default_timer as timer
from datetime import timedelta
from keras.models import Sequential
from keras.preprocessing import sequence
from keras import regularizers
from keras.layers import Dense, Embedding
from keras.layers import GRU, LeakyReLU, Bidirectional
from keras.datasets import imdb
#start a timer
start = timer()
# Hyperparameters
Model_Name = 'my_model.h5'
vocab_size = 5000
maxlen = 1000
batch_size = 512
hidden_layer_size = 2
test_split = 0.3
dropout = 0.1
num_epochs = 1
alpha = 0.2
validation_split = 0.25
l1 = 0.01
l2 = 0.01
# Dataset loading
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
maxlen=maxlen)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
# Data preprocessing
# Sequence padding
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
# Network building
print('Build model...')
model = Sequential()
model.add(Embedding(vocab_size, hidden_layer_size))
model.add(Bidirectional(GRU(hidden_layer_size, kernel_initializer='uniform', kernel_regularizer=regularizers.l1_l2(l1=l1,l2=l2), dropout=dropout, recurrent_dropout=dropout,return_sequences=True)))
model.add(LeakyReLU())
model.add(Bidirectional(GRU(hidden_layer_size, kernel_initializer='uniform', dropout=dropout, kernel_regularizer=regularizers.l1_l2(l1=l1,l2=l2), recurrent_dropout=dropout)))
model.add(LeakyReLU())
model.add(Dense(1, activation='softmax', kernel_initializer='uniform', kernel_regularizer=regularizers.l1_l2(l1=l1,l2=l2)))
# Compile my model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print('Train...')
# Fit the model
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, validation_split=validation_split)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
# Create a summary, a plot and print the scores of the model
model.summary()
print('Test score:', score)
print('Test accuracy:', acc)
# Save model architecture, weights, training configuration (loss,optimizer),
# and also the state of the optimizer, so you can resume where you stopped
model.save(Model_Name)
end = timer()
print('Running time: ' + str(timedelta(seconds=(end - start))) + ' in Hours:Minutes:Seconds')
I keep receiving an error message which I don't completely understand:
InvalidArgumentError (see above for traceback): indices[502,665] = 5476 is not in [0, 5000)
[[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device=/job:localhost/replica:0/task:0/device:CPU:0](embedding_1/embeddings/read, embedding_1/Cast)]]
Can anyone help me understand what causes this error and how to solve it?
The error complains about a non-existent word index: the Embedding layer only has vocab_size = 5000 rows, but the loaded data still contains a word with index 5476, which is not in the range [0, 5000). To resolve this, you also need to pass vocab_size as the num_words argument of the load_data function, like this:
... = imdb.load_data(num_words=vocab_size, ...)
This way the data is restricted to the most frequent words (i.e., the top vocab_size words by frequency in the dataset), so all word indices fall in the range [0, vocab_size).
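Applied to the code above, only the load_data call changes (a minimal sketch):
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                       num_words=vocab_size,
                                                       maxlen=maxlen)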
TF 2.x: just for the experience, I tried a simple experimental dataset to show the problem:
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.callbacks import LambdaCallback
import tensorflow_datasets as tfds
data, info = tfds.load('iris', split='train[:80%]',
as_supervised=True, with_info=True)
print(info)
features, labels = tuple(zip(*data))
# NB: the generator should yield a dictionary for the inputs, and the output as is.
def gen(x_train, y_train):
    print('generator initiated')
    (x_train, y_train)= tfds.load('iris', shuffle_files=True, as_supervised=True, with_info=True)
    idx = 0
    while True:
        yield tf.transpose([x_train[:32], tf.one_hot(y_train[:32])])
        print('generator yielded a batch %d' % idx)
        idx += 1
train_ds = tf.data.Dataset.from_generator(gen, args=(features, labels),
output_types=(tf.float32, tf.int32),
output_shapes=(tf.TensorShape([32,4]), tf.TensorShape([32,4 ])),
)
# OR
#output_signature=(
# tf.TensorSpec(shape=(4,), dtype=tf.float32),
# tf.TensorSpec(shape=(), dtype=tf.int32)),
#)
# datasetGen = iter(train_ds)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(32,4,))) # 4 fields
model.add(tf.keras.layers.Dense(4, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
train_ds= train_ds.batch(32).prefetch(32)
# callbacks=[LambdaCallback(on_epoch_end=generator.on_epoch_end)],
history= model.fit(train_ds, epochs = 7, verbose = 1)
print(history.history['accuracy'])
and I am getting:
In the line: yield tf.transpose([x_train[:32], tf.one_hot(y_train[:32])])
TypeError: unhashable type: 'slice'
The problem seems to be here: x_train[:32]?
Question: how do I correct the code (the generator function, the output_signature, the input_shape, or something else) so that the Dataset can be used in model.fit()?
(Sorry for the dummy example, but I'd like to test generator use with model.fit().)
Well, it was really a dummy example of generator use; moreover, tf.data usually wins in speed compared with a plain generator. Nevertheless, the following works (the code still needs refactoring, e.g. for organizing pipelines for big data):
import tensorflow as tf
import numpy as np
import pandas as pd
# LOAD DATA
df= pd.read_csv('https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv', dtype = 'float32', converters = {'variety' : str},
nrows=64, decimal='.')
# df.head()
_features=df.iloc[:,:4].copy()
_labels=df.iloc[:,-1:].copy()
_labels['variety1'] = pd.factorize(_labels['variety'])[0]
_target= _labels['variety1'].astype(np.int64).copy()
_targets= _target[:,np.newaxis]
#print(_features)
print(type(_targets))
# SPLIT for Train & Test
# https://www.kdnuggets.com/2020/07/getting-started-tensorflow2.html
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(_features,_targets, test_size=0.3)
# Typically, we normalize the data when we have a high amount of variance in it.
print(X_train.var())
print(X_test.var())
# Here we can see that both X_train and X_test have very low variance, so no need to normalize the data.
# PREPROCESSING
#
# to_categorical
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)
print(y_train[:5,:])
# convert our data to numpy arrays
X_train = X_train.values
X_test = X_test.values
#################################################
#################################################
def gen(_features, _labels):
    x_train = _features
    y_train = _labels
    #print('gen:\n', list(x_train))
    #print('gen:\n', list(y_train))
    idx = 0
    while idx < 64:
        yield x_train[:32], y_train[:32]
        print('generator yielded a batch %d' % idx)
        idx += 1
#################################################
# train_ds <<<<<<<<<<<<<<<<<<<<<<<
train_ds = tf.data.Dataset.from_generator(gen, args=(X_train, y_train),
output_types=(tf.float32, tf.int64),
output_shapes=(tf.TensorShape([32,4]), tf.TensorShape([32, 2 ])),
)
# OR
#output_signature=(
# tf.TensorSpec(shape=(4,), dtype=tf.float32),
# tf.TensorSpec(shape=(), dtype=tf.int32)),
#)
# datasetGen = iter(train_ds)
# print('train_ds:\n',list(train_ds.as_numpy_iterator()))
#################################################
# Model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense((512), activation='relu', input_shape=(32,4 ))) # 4 fields
model.add(tf.keras.layers.Dense((2), activation='softmax'))
# INSTEAD OF ONE-HOT CAN USE sparse_categorical_crossentropy HERE
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
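# (Sketch, not part of the original code: with sparse_categorical_crossentropy the integer
#  labels could be used directly, skipping to_categorical, e.g.
#  model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy']))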
train_ds= train_ds.batch(32).prefetch(32)
# callbacks=[LambdaCallback(on_epoch_end=generator.on_epoch_end)],
history= model.fit(train_ds, epochs = 7, verbose = 1)
A validation_ds built from X_test, y_test with tf.data.Dataset.from_tensor_slices() has elements of shape (4,) instead of the model's input shape (32, 4), but that is a consequence of the inappropriate generator design from the very beginning, I think. With train_ds, the evaluate() and predict() methods do work (though evaluating on the training data is not the point of ML).
##############################################
score = model.evaluate(train_ds, batch_size=32, verbose=1) # test_ds needed
print("Test Accuracy:", score[1])
y_pred = model.predict(train_ds)
print('PREDICTIONS:\n', y_pred)
##############################################
#https://medium.com/#nutanbhogendrasharma/tensorflow-deep-learning-model-with-iris-dataset-8ec344c49f91
#Print actual and predicted value
features, labels = tuple(zip(*train_ds)) # If you need the numpy array version, convert them using np.array(): # https://stackoverflow.com/a/65499385/15893581
actual = np.argmax(labels,axis=-1)
predicted = np.argmax(y_pred,axis=-1)
print(f"Actual: {actual}")
print(f"Predicted: {predicted}")
So an incoming test_ds, for example, still needs to be adapted (though it would be better to adapt the generator function here, I think), but the overall idea of using a generator in TF 2.x is clear now (worth it only for huge data)...
P.S.
Advice on improving the model here is welcome.
I apologize for this dummy question, as I'm still a novice in ML, but I needed to connect a generator with training somehow to gain the experience.
Finally I generated an iris-like dataset from a function (really not a quick operation)... the repeat() handling still needs some attention, but the code design in general works (for truly random data):
# Importing the tensorflow library
import tensorflow as tf
import numpy as np
import keras
#FeaturesDict({
# 'features': Tensor(shape=(4,), dtype=tf.float32),
# 'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=3),
#})
BATCH_SIZE= 12
EPOCHS = 7
QTY_BATCHES= 10 # to be generated
# The Dataset.from_generator constructor converts the python generator to a fully functional tf.data.Dataset.
def gen():
    for i in range(BATCH_SIZE):
        # should yield a pair Features - Label
        data = np.expand_dims(np.random.sample(4), axis=0)
        label = [np.random.randint(3)]
        yield data, label
train_ds = tf.data.Dataset.from_generator(gen,
(tf.float32, tf.int32),
(tf.TensorShape([None,4]),
tf.TensorShape([ 1])))
# Applying the Dataset.repeat() transformation with no arguments will repeat the input indefinitely.
# The Dataset.repeat transformation concatenates its arguments without signaling the end of one epoch and the beginning of the next epoch. Because of this a Dataset.batch applied after Dataset.repeat will yield batches that straddle epoch boundaries:
train_ds= train_ds.repeat(count= EPOCHS*BATCH_SIZE*QTY_BATCHES).batch(BATCH_SIZE, drop_remainder=True).prefetch(BATCH_SIZE)
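# (Sketch, not used above: if epoch boundaries should be preserved, batch first and repeat
#  afterwards, e.g. train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True).repeat(EPOCHS))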
NUM_CLASSES= 3
train_ds = train_ds.map(lambda x, y: (x, tf.one_hot(y, depth=NUM_CLASSES)))
for x, y in train_ds:
    print(x)
    print(y)
# Build a simple linear model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(None,4))) # unknown(variable) batch_size, 4 fields
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
# steps_per_epoch = int( np.ceil(x_train.shape[0] / batch_size) )
# The Steps per epoch denote the number of batches to be selected for one epoch. If 500 steps are selected then the network will train for 500 batches to complete one epoch.
history= model.fit(train_ds, batch_size=BATCH_SIZE, epochs= EPOCHS, \
steps_per_epoch= (QTY_BATCHES*BATCH_SIZE)//BATCH_SIZE, \
verbose = 1)
print(history.history['accuracy'])
print(history.history['loss'])
# Keras - Plot training, validation and test set accuracy
# https://stackoverflow.com/questions/41908379/keras-plot-training-validation-and-test-set-accuracy
import keras
from matplotlib import pyplot as plt
plt.plot(history.history['accuracy'])
#plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
#plt.legend(['train', 'val'], loc='upper left')
plt.legend(['train'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# plt.legend(['train', 'val'], loc='upper left')
plt.legend(['train'], loc='upper left')
plt.show()
OK, I've got a working case for the initial Dataset:
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.callbacks import LambdaCallback
import tensorflow_datasets as tfds
data, info = tfds.load('iris', split='train[:100%]', batch_size=10, as_supervised=True, with_info=True)
print(info)
NUM_CLASSES= info.features["label"].num_classes
data = data.map(lambda x, y: (x, tf.one_hot(y, depth=NUM_CLASSES)))
features, labels = tuple(zip(*data))
print(features)
print(labels)
# NB: the generator should yield a dictionary for the inputs, and the output as is.
def gen(x_train, y_train):
    print('generator initiated')
    print(x_train.shape)
    print(y_train.shape)
    idx = 0
    while True:
        yield x_train, y_train
        print('generator yielded a batch %d' % idx)
        idx += 1
train_ds = tf.data.Dataset.from_generator(gen, args=(features, labels),
output_types=(tf.float32, tf.int32),
output_shapes=(tf.TensorShape([None,10,4]), tf.TensorShape([ None, 10, 3 ])),
)
# OR (better! because prev. is Deprecated)
#output_signature=(
# tf.TensorSpec(shape=(4,), dtype=tf.float32),
# tf.TensorSpec(shape=(), dtype=tf.int32)),
#)
#it = iter(train_ds)
#print(it.get_next())
for feature, label in train_ds:
    print("shape of ds_generated: ", feature.shape, label.shape)
    break
#num_val = len(train_ds) # TypeError: The dataset length is unknown. BECAUSE it is FLOW
#print(num_val)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(None,10,4))) # 4 fields
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
train_ds= train_ds.batch(32).prefetch(32)
# callbacks=[LambdaCallback(on_epoch_end=generator.on_epoch_end)],
history= model.fit(train_ds, epochs = 2, steps_per_epoch= 120 // 10, verbose = 1)
print(history.history['accuracy'])
I moved the one-hot encoding out of the generator function's scope.
I split the dataset into features and labels.
I gave the model the correct input_shape (with matching shape changes in the generator function), following [variable_rows_count_in_batch, batch_size, columns_features].
verbose=1 gives readable debug output in a multithreaded environment.
The advice from here, to define a variable batch size with None and to set steps_per_epoch, still does not help when taking split='train[:50%]' and steps_per_epoch=60 // 10, because of the not fully filled LAST batch. The source of the problem in my code is the generator's output_shapes; that is clear here, because the generator really was a dummy for testing purposes...
For real cases, use logical outputs and appropriate shapes (see the sketch below).
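Here is a minimal sketch of that signature-based variant (my assumption, replacing the deprecated output_types/output_shapes pair; the per-batch row count is left as None so the dataset itself accepts a partially filled last batch, and the model's input_shape would have to be relaxed accordingly):
train_ds = tf.data.Dataset.from_generator(
    gen, args=(features, labels),
    output_signature=(
        tf.TensorSpec(shape=(None, None, 4), dtype=tf.float32),  # features
        tf.TensorSpec(shape=(None, None, 3), dtype=tf.int32)))   # one-hot labels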
P.S.
Though for 5 epochs I am getting:
Graph execution error: >> ZMQError: Too many open file
AttributeError: '_thread._local' object has no attribute 'event_pipe'
Probably there is not enough memory to finish training; decreasing the output size in Dense(512, ...) helps (as does decreasing the number of epochs).
I am confused as to which metric GridSearchCV is using in its parameter search. My understanding is that my model object feeds it a metric, and this is what is used to determine the best_params_. But this doesn't appear to be the case. I thought that scoring=None is the default and that, as a result, the first metric given in the metrics option of model.compile() would be used. So in my case the scoring function used should be the mean_squared_error. My explanation of this issue is described next.
Here is what I am doing. I simulated some regression data using sklearn with 10 features on 10,000 observations. I am playing around with Keras because I typically used PyTorch in the past and never really dabbled with Keras until now. I am noticing a discrepancy between the loss output from my GridSearchCV call and from the model.fit() call after I have my optimal set of parameters. Now I know I can just set refit=True and not re-fit the model again, but I am trying to get a feel for the output of the Keras and sklearn GridSearchCV functions.
To be explicit about the discrepancy, here is what I am seeing. I simulated the data using sklearn as follows:
# Setting some data basics
N = 10000
feats = 10
# generate regression dataset
X, y = make_regression(n_samples=N, n_features=feats, n_informative=2, noise=3)
# training data and testing data #
X_train = X[:int(N * 0.8)]
y_train = y[:int(N * 0.8)]
X_test = X[int(N * 0.8):]
y_test = y[int(N * 0.8):]
I have created a create_model function to tune which activation function I use (again, this is a simple example as a proof of concept).
def create_model(activation_fn):
    # create model
    model = Sequential()
    model.add(Dense(30, input_dim=feats, activation=activation_fn,
                    kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation=activation_fn))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    # Compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_squared_error','mae'])
    return model
Performing the grid search, I get the following output:
model = KerasRegressor(build_fn=create_model, epochs=50, batch_size=200, verbose=0)
activations = ['linear','relu']
param_grid = dict(activation_fn = activations)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)
grid_result = grid.fit(X_train, y_train, verbose=1)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
Best: -21.163454 using {'activation_fn': 'linear'}
OK, so the best metric is a mean squared error of 21.16 (I understand the sign is flipped to create a maximization problem). But when I fit the model using activation_fn = 'linear', the MSE I get is totally different.
best_model = create_model('linear')
history = best_model.fit(X_train, y_train, epochs=50, batch_size=200, verbose=1)
.....
.....
Epoch 49/50
8000/8000 [==============================] - 0s 48us/step - loss: 344.1636 - mean_squared_error: 344.1636 - mean_absolute_error: 12.2109
Epoch 50/50
8000/8000 [==============================] - 0s 48us/step - loss: 326.4524 - mean_squared_error: 326.4524 - mean_absolute_error: 11.9250
history.history['mean_squared_error']
Out[723]:
[10053.778002929688,
9826.66806640625,
......
......
344.16363830566405,
326.45237121582034]
The difference is 326.45 vs. 21.16. Any insight into what I am misunderstanding would be greatly appreciated. I would be more comfortable if they were within a reasonable neighborhood of each other, given that one is the error from one fold and the other from the entire training data set. But 21 is nowhere near 326. Thanks!
The entire code is shown here.
import pandas as pd
import numpy as np
from keras import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.constraints import maxnorm
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.datasets import make_regression
from matplotlib import pyplot as plt
# Setting some data basics
N = 10000
feats = 10
# generate regression dataset
X, y = make_regression(n_samples=N, n_features=feats, n_informative=2, noise=3)
# training data and testing data #
X_train = X[:int(N * 0.8)]
y_train = y[:int(N * 0.8)]
X_test = X[int(N * 0.8):]
y_test = y[int(N * 0.8):]
def create_model(activation_fn):
    # create model
    model = Sequential()
    model.add(Dense(30, input_dim=feats, activation=activation_fn,
                    kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation=activation_fn))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    # Compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_squared_error','mae'])
    return model
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# create model
model = KerasRegressor(build_fn=create_model, epochs=50, batch_size=200, verbose=0)
# define the grid search parameters
activations = ['linear','relu']
param_grid = dict(activation_fn = activations)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)
grid_result = grid.fit(X_train, y_train, verbose=1)
best_model = create_model('linear')
history = best_model.fit(X_train, y_train, epochs=50, batch_size=200, verbose=1)
history.history.keys()
plt.plot(history.history['mean_absolute_error'])
# summarize results
grid_result.cv_results_
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
The large loss reported in your output (326.45237121582034) is the training loss. If you need a metric that can be compared with grid_result.best_score_ (from GridSearchCV), you have to request the validation loss from best_model.fit (cf. code below).
Now to the question: why is the validation loss lower than the training loss? In your case it is essentially because of dropout (which is applied during training but not during validation/test) - that is why the difference between training and validation losses disappears when you remove dropout. You can find a detailed explanation here of the possible reasons for a lower validation loss.
In short, the performance (MSE) of your model is given by the grid_result.best_score_ (21.163454 in your example).
import numpy as np
from keras import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.datasets import make_regression
import tensorflow as tf
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
tf.random.set_seed(42)
# Setting some data basics
N = 10000
feats = 10
# generate regression dataset
X, y = make_regression(n_samples=N, n_features=feats, n_informative=2, noise=3)
# training data and testing data #
X_train = X[:int(N * 0.8)]
y_train = y[:int(N * 0.8)]
X_test = X[int(N * 0.8):]
y_test = y[int(N * 0.8):]
def create_model(activation_fn):
    # create model
    model = Sequential()
    model.add(Dense(30, input_dim=feats, activation=activation_fn,
                    kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation=activation_fn))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    # Compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_squared_error','mae'])
    return model
# create model
model = KerasRegressor(build_fn=create_model, epochs=50, batch_size=200, verbose=0)
# define the grid search parameters
activations = ['linear','relu']
param_grid = dict(activation_fn = activations)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)
grid_result = grid.fit(X_train, y_train, verbose=1, validation_data=(X_test, y_test))
best_model = create_model('linear')
history = best_model.fit(X_train, y_train, epochs=50, batch_size=200, verbose=1, validation_data=(X_test, y_test))
history.history.keys()
# plt.plot(history.history['mae'])
# summarize results
print(grid_result.cv_results_)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
I recently started taking advantage of Keras's flow_from_dataframe() feature for a project and decided to test it with the MNIST dataset. I have a directory full of the MNIST samples in png format, and a dataframe with the absolute path to each one in one column and the label in the other.
I'm also using transfer learning, importing VGG16 as a base and adding my own 512-node relu dense layer and 0.5 dropout before a softmax layer of 10 (for digits 0-9). I'm using rmsprop (lr=1e-4) as the optimizer.
When I launch my environment, it calls the latest version of keras_preprocessing from Git, which has support for absolute directories and capitalized file extensions.
My problem is that I have a very high training accuracy, and a terribly low validation accuracy. By my final epoch (10), I had a training accuracy of 0.94 and a validation accuracy of 0.01.
I'm wondering if there's something fundamentally wrong with my script? With another dataset, I'm even getting NaNs for both my training and validation loss values after epoch 4. (I checked the relevant columns; there aren't any null values!)
Here's my code. I'd be deeply appreciative if someone could glance through it and see if anything jumps out at them.
import pandas as pd
import numpy as np
import keras
from keras_preprocessing.image import ImageDataGenerator
from keras import applications
from keras import optimizers
from keras.models import Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import backend as k
from keras.callbacks import ModelCheckpoint, CSVLogger
from keras.applications.vgg16 import VGG16, preprocess_input
# INITIALIZE MODEL
img_width, img_height = 32, 32
model = VGG16(weights = 'imagenet', include_top=False, input_shape = (img_width, img_height, 3))
# freeze all layers
for layer in model.layers:
    layer.trainable = False
# Adding custom Layers
x = model.output
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(10, activation="softmax")(x)
# creating the final model
model_final = Model(input = model.input, output = predictions)
# compile the model
rms = optimizers.RMSprop(lr=1e-4)
#adadelta = optimizers.Adadelta(lr=0.001, rho=0.5, epsilon=None, decay=0.0)
model_final.compile(loss = "categorical_crossentropy", optimizer = rms, metrics=["accuracy"])
# LOAD AND DEFINE SOURCE DATA
train = pd.read_csv('MNIST_train.csv', index_col=0)
val = pd.read_csv('MNIST_test.csv', index_col=0)
nb_train_samples = 60000
nb_validation_samples = 10000
batch_size = 60
epochs = 10
# Initiate the train and test generators
train_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()
train_generator = train_datagen.flow_from_dataframe(dataframe=train,
directory=None,
x_col='train_samples',
y_col='train_labels',
has_ext=True,
target_size = (img_height,
img_width),
batch_size = batch_size,
class_mode = 'categorical',
color_mode = 'rgb')
validation_generator = test_datagen.flow_from_dataframe(dataframe=val,
directory=None,
x_col='test_samples',
y_col='test_labels',
has_ext=True,
target_size = (img_height,
img_width),
batch_size = batch_size,
class_mode = 'categorical',
color_mode = 'rgb')
# GET CLASS INDICES
print('****************')
for cls, idx in train_generator.class_indices.items():
    print('Class #{} = {}'.format(idx, cls))
print('****************')
# DEFINE CALLBACKS
path = './chk/epoch_{epoch:02d}-valLoss_{val_loss:.2f}-valAcc_{val_acc:.2f}.hdf5'
chk = ModelCheckpoint(path, monitor = 'val_acc', verbose = 1, save_best_only = True, mode = 'max')
logger = CSVLogger('./chk/training_log.csv', separator = ',', append=False)
nPlus = 1
samples_per_epoch = nb_train_samples * nPlus
# Train the model
model_final.fit_generator(train_generator,
steps_per_epoch = int(samples_per_epoch/batch_size),
epochs = epochs,
validation_data = validation_generator,
validation_steps = int(nb_validation_samples/batch_size),
callbacks = [chk, logger])
Have you tried explicitly defining the classes of the images? Like this:
train_generator=image.ImageDataGenerator().flow_from_dataframe(classes=[0,1,2,3,4,5,6,7,8,9])
in both the train and validation generators.
I have found that sometimes the train and validation generators create different correspondence dictionaries.
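For example, a sketch of what that could look like with the generators from the question (assuming the labels in the y_col columns are the strings '0' through '9'; use whatever values actually appear in your dataframe):
classes = [str(d) for d in range(10)]  # must match the values stored in the label columns
train_generator = train_datagen.flow_from_dataframe(dataframe=train, x_col='train_samples',
                                                    y_col='train_labels', classes=classes,
                                                    target_size=(img_height, img_width),
                                                    batch_size=batch_size,
                                                    class_mode='categorical', color_mode='rgb')
validation_generator = test_datagen.flow_from_dataframe(dataframe=val, x_col='test_samples',
                                                        y_col='test_labels', classes=classes,
                                                        target_size=(img_height, img_width),
                                                        batch_size=batch_size,
                                                        class_mode='categorical', color_mode='rgb')
Passing the same classes list to both generators guarantees that class_indices is identical for training and validation.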
I have programmed a convolutional neural network in Keras to predict whether an image is of a cat or a dog.
I got an accuracy of around 80%.
When I checked the predictions for many images, the model always predicted the same one of the two classes (dog or cat).
I will walk you through my code:
The first part of my code is Data Preparation:
#Labels
Y = np.concatenate([np.array([[1,0]]*len(cats)),
np.array([[0,1]]*len(dogs))])
print(Y)
#Features
import scipy.misc
from keras.models import model_from_json
X=[]
for name in cats:
    converted_image = scipy.misc.imresize(scipy.misc.imread(name), (64, 64))
    X.append(converted_image)
for name in dogs:
    converted_image = scipy.misc.imresize(scipy.misc.imread(name), (64, 64))
    X.append(converted_image)
X= np.array(X)
print(len(X))
print(X.shape)
Now I have divided the data into training and testing
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.10,random_state=10)
X_train=X_train.astype("float32")
X_test = X_test.astype("float32")
print("Shape Of the Training Features are :",X_train.shape)
print("Shape Of Testing Features are :",X_test.shape)
print("Number of Training Samples : ",X_train.shape[0])
print("Number of Testing Samples :",X_test.shape[0])
Now, I will show my complete CNN architecture below:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation,Lambda,Flatten
from keras.layers import Conv2D,MaxPooling2D
def layer(input_shape = (64,64,3)):
    model = Sequential()
    model.add(Lambda(lambda x: x/127.5 - 1., input_shape=input_shape, output_shape=input_shape))
    model.add(Conv2D(32, (3, 3), activation='relu', name='conv1', input_shape=input_shape, padding="valid"))
    model.add(Conv2D(64, (3, 3), activation='relu', name='conv2', padding='valid'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Conv2D(128, (3, 3), activation='relu', name='conv3', padding='valid'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    #model.add(Dense(25088, activation='relu'))
    #model.add(Dense(12000, activation='relu'))
    #model.add(Dense(6000, activation='relu'))
    model.add(Dense(500, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.summary()
    return model
model = layer()
model.compile(loss='mse',optimizer='adadelta',metrics=['accuracy'])
model.fit(X_train, Y_train, batch_size=128, epochs=5, verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
My accuracy and score from the output are as follows:
Test score: 0.1459069953918457
Test accuracy: 0.7876
Saving and loading the model:
#saving
model_json = model.to_json()
open('cd.json','w').write(model_json)
#Save Weights
model.save_weights('cd_weights.h5',overwrite=True)
#loading
model_architecture = 'cd.json'
model_weights = 'cd_weights.h5'
model = model_from_json(open(model_architecture).read())
model.load_weights(model_weights)
Now the prediction part:
img_names = ['d1.jpg','c1.jpg']
imgs = [np.transpose(scipy.misc.imresize(scipy.misc.imread(img_name), (64, 64)),
(1, 0, 2)).astype('float32')
for img_name in img_names]
imgs = np.array(imgs) / 255
# train
optim = SGD()
model.compile(loss='categorical_crossentropy', optimizer=optim,
metrics=['accuracy'])
predictions = model.predict_classes(imgs)
print(predictions)
Output : [0 0]
The images are taken from the training set (c1 = cat, d1 = dog); for every image it outputs zero as the prediction.
I think there is some problem with the model.predict_classes() function.
You can use the following code to do the same thing as model.predict_classes().
This gives the prediction in terms of probabilities:
predictions = model.predict(imgs)
For example:
1) [0.7865448 0.21910465] says there is about a 78% chance of the picture being the first class (cat) and about a 22% chance of it being the second class (dog)
2) [0.7856076 0.22000787] says the same: about a 78% chance of the first class (cat) and about a 22% chance of the second class (dog)
In order to convert these probabilities into classes, use the np.argmax function along the last axis:
print(np.argmax(predictions, axis=-1))
The output will be 0, indicating the first class (cat), or 1, indicating the second class (dog), for each image.
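Putting it together, a minimal sketch (the class order [cat, dog] is assumed here, matching how Y was built in the question):
class_names = ['cat', 'dog']               # index 0 -> cat, index 1 -> dog, as in Y above
probs = model.predict(imgs)                # shape (num_images, 2)
pred_idx = np.argmax(probs, axis=-1)       # one class index per image
print([class_names[i] for i in pred_idx])  # e.g. ['dog', 'cat'] for d1.jpg, c1.jpg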
My code is for training VGG16 on custom data with two classes, diseased and not diseased.
I have around 3,400 images, and the problem arises while loading the dataset into memory: the process uses 99% of RAM and gets stuck. I am using Spyder; when I followed another example with a smaller dataset, it worked fine. My question: can anyone suggest an efficient way to run this without loading all the images into memory? This eventually leads to the blue screen of death.
P.S. My system is capable of running deep-learning code.
import numpy as np
import os
import time
from vgg16 import VGG16
from keras.preprocessing import image
from imagenet_utils import preprocess_input, decode_predictions
from keras.layers import Dense, Activation, Flatten
from keras.layers import merge, Input
from keras.models import Model
from keras.utils import np_utils
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
# Loading the training data
PATH = os.getcwd()
# Define data path
data_path = PATH + '/data'
data_dir_list = os.listdir(data_path)
img_data_list=[]
for dataset in data_dir_list:
    img_list = os.listdir(data_path + '/' + dataset)
    print('Loaded the images of dataset-' + '{}\n'.format(dataset))
    for img in img_list:
        img_path = data_path + '/' + dataset + '/' + img
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        # x = x/255
        print('Input image shape:', x.shape)
        img_data_list.append(x)
img_data = np.array(img_data_list)
#img_data = img_data.astype('float32')
print (img_data.shape)
img_data=np.rollaxis(img_data,1,0)
print (img_data.shape)
img_data=img_data[0]
print (img_data.shape)
# Define the number of classes
num_classes = 2
num_of_samples = img_data.shape[0]
labels = np.ones((num_of_samples,),dtype='int64')
labels[0:2345]=0
labels[2245:3567]=1
names = ['YES','NO']
# convert class labels to on-hot encoding
Y = np_utils.to_categorical(labels, num_classes)
#Shuffle the dataset
x,y = shuffle(img_data,Y, random_state=2)
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
#########################################################################################
# Custom_vgg_model_1
#Training the classifier alone
image_input = Input(shape=(224, 224, 3))
model = VGG16(input_tensor=image_input, include_top=True,weights='imagenet')
model.summary()
last_layer = model.get_layer('fc2').output
#x= Flatten(name='flatten')(last_layer)
out = Dense(num_classes, activation='softmax', name='output')(last_layer)
custom_vgg_model = Model(image_input, out)
custom_vgg_model.summary()
for layer in custom_vgg_model.layers[:-1]:
    layer.trainable = False
custom_vgg_model.layers[3].trainable
custom_vgg_model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
t=time.time()
# t = now()
hist = custom_vgg_model.fit(X_train, y_train, batch_size=32, epochs=12, verbose=1, validation_data=(X_test, y_test))
print('Training time: %s' % (t - time.time()))
(loss, accuracy) = custom_vgg_model.evaluate(X_test, y_test, batch_size=10, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
####################################################################################################################
#Training the feature extraction also
image_input = Input(shape=(224, 224, 3))
model = VGG16(input_tensor=image_input, include_top=True,weights='imagenet')
model.summary()
last_layer = model.get_layer('block5_pool').output
x= Flatten(name='flatten')(last_layer)
x = Dense(128, activation='relu', name='fc1')(x)
x = Dense(128, activation='relu', name='fc2')(x)
out = Dense(num_classes, activation='softmax', name='output')(x)
custom_vgg_model2 = Model(image_input, out)
custom_vgg_model2.summary()
# freeze all the layers except the dense layers
for layer in custom_vgg_model2.layers[:-3]:
    layer.trainable = False
custom_vgg_model2.summary()
custom_vgg_model2.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
t=time.time()
# t = now()
hist = custom_vgg_model2.fit(X_train, y_train, batch_size=32, epochs=12, verbose=1, validation_data=(X_test, y_test))
print('Training time: %s' % (t - time.time()))
(loss, accuracy) = custom_vgg_model2.evaluate(X_test, y_test, batch_size=10, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
#%%
import matplotlib.pyplot as plt
# visualizing losses and accuracy
train_loss=hist.history['loss']
val_loss=hist.history['val_loss']
train_acc=hist.history['acc']
val_acc=hist.history['val_acc']
xc=range(12)
plt.figure(1,figsize=(7,5))
plt.plot(xc,train_loss)
plt.plot(xc,val_loss)
plt.xlabel('num of Epochs')
plt.ylabel('loss')
plt.title('train_loss vs val_loss')
plt.grid(True)
plt.legend(['train','val'])
#print plt.style.available # use bmh, classic,ggplot for big pictures
plt.style.use(['classic'])
plt.figure(2,figsize=(7,5))
plt.plot(xc,train_acc)
plt.plot(xc,val_acc)
plt.xlabel('num of Epochs')
plt.ylabel('accuracy')
plt.title('train_acc vs val_acc')
plt.grid(True)
plt.legend(['train','val'],loc=4)
#print plt.style.available # use bmh, classic,ggplot for big pictures
plt.style.use(['classic'])