I have a ResNet based CNN for classifying images of cats and dogs. I'm loading prelearned weights in order to speed up training. Only the last few fully connected layers are trained by me, the others are frozen. Training is quite fast, achieving 97% accuracy on the testing set after a minute or so. The data set in this case is broken up into 80% for training (20,000 images), 10% validation (2,500 images) and 10% testing (2,500 images).
The problem arises whe I try to implement cross validation on the training set. The accuracy on the training set is improving but not on the validation set. I'm new to ML but have spent a few hours over the past few days trying to get this sorted and haven't got anywhere. I'd be very appreciative of any input you'd have to offer.
Below is the code. The fist section is the code that works fine, with what I'm calling standard validation. The second section is the troublesome components.
1. Code that works fine:
Initialize data generators
train_datagen = ImageDataGenerator(zoom_range=0.15,width_shift_range=0.2,height_shift_range=0.2,shear_range=0.15)
test_datagen = ImageDataGenerator()
val_datagen = ImageDataGenerator()
Flow from directory
train_generator = train_datagen.flow_from_directory(train_path,target_size=(224, 224),batch_size=32,shuffle=True,class_mode='binary')
test_generator = test_datagen.flow_from_directory(test_path,target_size=(224,224),batch_size=32,shuffle=False,class_mode='binary')
val_generator = val_datagen.flow_from_directory(val_path,target_size=(224,224),batch_size=32,shuffle=False,class_mode='binary')
Define Identity block
def identity_block(X, f, filters, stage, block):
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
F1, F2, F3 = filters
X_shortcut = X
X = Conv2D(filters=F1, kernel_size=(1, 1), strides=(1, 1), padding='valid', name=conv_name_base + '2a', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name=bn_name_base + '2a')(X)
X = Activation('relu')(X)
X = Conv2D(filters=F2, kernel_size=(f, f), strides=(1, 1), padding='same', name=conv_name_base + '2b', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name=bn_name_base + '2b')(X)
X = Activation('relu')(X)
X = Conv2D(filters=F3, kernel_size=(1, 1), strides=(1, 1), padding='valid', name=conv_name_base + '2c', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name=bn_name_base + '2c')(X)
X = Add()([X, X_shortcut])# SKIP Connection
X = Activation('relu')(X)
return X
Define convolutional block
def convolutional_block(X, f, filters, stage, block, s=2):
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
F1, F2, F3 = filters
X_shortcut = X
X = Conv2D(filters=F1, kernel_size=(1, 1), strides=(s, s), padding='valid', name=conv_name_base + '2a', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name=bn_name_base + '2a')(X)
X = Activation('relu')(X)
X = Conv2D(filters=F2, kernel_size=(f, f), strides=(1, 1), padding='same', name=conv_name_base + '2b', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name=bn_name_base + '2b')(X)
X = Activation('relu')(X)
X = Conv2D(filters=F3, kernel_size=(1, 1), strides=(1, 1), padding='valid', name=conv_name_base + '2c', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name=bn_name_base + '2c')(X)
X_shortcut = Conv2D(filters=F3, kernel_size=(1, 1), strides=(s, s), padding='valid', name=conv_name_base + '1', kernel_initializer=glorot_uniform(seed=0))(X_shortcut)
X_shortcut = BatchNormalization(axis=3, name=bn_name_base + '1')(X_shortcut)
X = Add()([X, X_shortcut])
X = Activation('relu')(X)
return X
Define ResNet50
def ResNet50(input_shape=(224, 224, 3)):
X_input = Input(input_shape)
X = ZeroPadding2D((3, 3))(X_input)
X = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', kernel_initializer=glorot_uniform(seed=0))(X)
X = BatchNormalization(axis=3, name='bn_conv1')(X)
X = Activation('relu')(X)
X = MaxPooling2D((3, 3), strides=(2, 2))(X)
X = convolutional_block(X, f=3, filters=[64, 64, 256], stage=2, block='a', s=1)
X = identity_block(X, 3, [64, 64, 256], stage=2, block='b')
X = identity_block(X, 3, [64, 64, 256], stage=2, block='c')
X = convolutional_block(X, f=3, filters=[128, 128, 512], stage=3, block='a', s=2)
X = identity_block(X, 3, [128, 128, 512], stage=3, block='b')
X = identity_block(X, 3, [128, 128, 512], stage=3, block='c')
X = identity_block(X, 3, [128, 128, 512], stage=3, block='d')
X = convolutional_block(X, f=3, filters=[256, 256, 1024], stage=4, block='a', s=2)
X = identity_block(X, 3, [256, 256, 1024], stage=4, block='b')
X = identity_block(X, 3, [256, 256, 1024], stage=4, block='c')
X = identity_block(X, 3, [256, 256, 1024], stage=4, block='d')
X = identity_block(X, 3, [256, 256, 1024], stage=4, block='e')
X = identity_block(X, 3, [256, 256, 1024], stage=4, block='f')
X = X = convolutional_block(X, f=3, filters=[512, 512, 2048], stage=5, block='a', s=2)
X = identity_block(X, 3, [512, 512, 2048], stage=5, block='b')
X = identity_block(X, 3, [512, 512, 2048], stage=5, block='c')
X = AveragePooling2D(pool_size=(2, 2), padding='same')(X)
model = Model(inputs=X_input, outputs=X, name='ResNet50')
return model
Define base model - Prelearned weights are loaded to this model
base_model = ResNet50(input_shape=(224, 224, 3))
Define head model - prelearned weights are not loaded to this model
headModel = base_model.output
headModel = Flatten()(headModel)
headModel=Dense(256, activation='relu', name='fc1',kernel_initializer=glorot_uniform(seed=0))(headModel)
headModel=Dense(128, activation='relu', name='fc2',kernel_initializer=glorot_uniform(seed=0))(headModel)
headModel = Dense( 1,activation='sigmoid', name='fc3',kernel_initializer=glorot_uniform(seed=0)(headModel)
Create ResNet50 model
model = Model(inputs=base_model.input, outputs=headModel)
Load prelearned weights to base model
base_model.load_weights("/content/drive/MyDrive/dogs-vs-cats.zip (Unzipped Files)/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5")
Make sure prelearned weights are not trainable
for layer in base_model.layers:
layer.trainable = False
Compile
from keras import losses
from keras import optimizers
from keras import metrics
model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])
Train model
training_history = model.fit(train_generator,validation_data=val_generator,epochs=1,verbose=1,callbacks=[mc,es])
This works fine. val_accuracy = 0.9716
625/625 [==============================] - ETA: 0s - loss: 0.1199 - accuracy: 0.9547
Epoch 1:
val_accuracy improved from -inf to 0.97160, saving model to /content/drive/My Drive/best_model.h5
625/625 [==============================] - 275s 422ms/step - loss: 0.1199 - accuracy: 0.9547 - val_loss: 0.0729 - val_accuracy: 0.9716
Test Accuracy = 98.199
_, acc = model.evaluate(test_generator, verbose = 1)
print('Accuracy: %.3f' % (acc * 100.0))
79/79 [==============================] - 10s 120ms/step - loss: 0.0445 - accuracy: 0.9820
Accuracy: 98.199`
2. Cross validation, training and testing does not work:
Put training images and labels into an array
x=np.concatenate([train_generator.next()[0] for i in range(train_generator.__len__())])
y=np.concatenate([train_generator.next()[1] for i in range(train_generator.__len__())])
Use ski-learn for cross validation
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=None)
Split into train and test sets & train model
for train, test in kfold.split(x, y):
# Create model copy - same as model that works (as far as I'm aware...)
base_model_copy = ResNet50(input_shape=(224, 224, 3))
head_model_copy = base_model_copy.output
head_model_copy = Flatten()(head_model_copy)
head_model_copy=Dense(256, activation='relu', name='fc1',kernel_initializer=glorot_uniform(seed=0))(head_model_copy)
head_model_copy=Dense(128, activation='relu', name='fc2',kernel_initializer=glorot_uniform(seed=0))(head_model_copy)
head_model_copy = Dense( 1,activation='sigmoid', name='fc3',kernel_initializer=glorot_uniform(seed=0))(head_model_copy)
model_copy = Model(inputs=base_model_copy.input, outputs=head_model_copy)
# load prelearned weights again
base_model_copy.load_weights("/content/drive/MyDrive/dogs-vs-cats.zip (Unzipped Files)/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5")
# make sure the prelearned weight layers are not trainable
for layer in base_model_copy.layers:
layer.trainable = False
#compile
from keras import losses
from keras import optimizers
from keras import metrics
model_copy.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])
trainImgGen= ImageDataGenerator(zoom_range=0.15,width_shift_range=0.2,height_shift_range=0.2,shear_range=0.15)
testImgGen = ImageDataGenerator()
trainGen = trainImgGen.flow(x[train], y[train], batch_size=32, shuffle=True)
testGen = testImgGen.flow(x[test], y[test], batch_size=32, shuffle=True)
# training for cross validation
model_copy.fit(trainGen, validation_data=testGen, epochs=500, verbose=1)
# evaluate results on test set
_, acc = model.evaluate(testGen, verbose = 1)
print(acc)
Below are the results. I'd expect it to be closer to what I got using the standard validation technique prior. Any feedback would be appreciated.
Epoch 1/500
417/417 [==============================] - 139s 325ms/step - loss: 0.7239 - accuracy: 0.5001 - val_loss: 0.7060 - val_accuracy: 0.5025
Epoch 2/500
417/417 [==============================] - 134s 322ms/step - loss: 0.6985 - accuracy: 0.5166 - val_loss: 0.7049 - val_accuracy: 0.4936
Epoch 3/500
417/417 [==============================] - 135s 323ms/step - loss: 0.6923 - accuracy: 0.5251 - val_loss: 0.7034 - val_accuracy: 0.4971
Epoch 4/500
417/417 [==============================] - 134s 322ms/step - loss: 0.6898 - accuracy: 0.5376 - val_loss: 0.7043 - val_accuracy: 0.4867
Epoch 5/500
417/417 [==============================] - 134s 322ms/step - loss: 0.6861 - accuracy: 0.5431 - val_loss: 0.7110 - val_accuracy: 0.4980
Epoch 6/500
417/417 [==============================] - 135s 323ms/step - loss: 0.6813 - accuracy: 0.5572 - val_loss: 0.7078 - val_accuracy: 0.4980
Epoch 7/500
417/417 [==============================] - 135s 325ms/step - loss: 0.6769 - accuracy: 0.5680 - val_loss: 0.7222 - val_accuracy: 0.4908
Epoch 8/500
273/417 [==================>...........] - ETA: 44s - loss: 0.6716 - accuracy: 0.5768
Related
I'am beginner in deep learning, I created 3DCNN using Pytorch.
input image: 120 * 120 * 120
the problem that the accuracy and loss decrease and increase in the same interval [45,56].
Can you help me please ?
def __init__(self):
super(CNNModel, self).__init__()
self.conv_layer1 = self._conv_layer_set(3, 32)
self.conv_layer2 = self._conv_layer_set(32, 64)
self.conv_layer3 = self._conv_layer_set(64, 128)
self.conv_layer4 = self._conv_layer_set(128, 256)
self.conv_layer5 = self._conv_layer_set(256, 512)
self.fc1 = nn.Linear(512, 128)
self.fc2 = nn.Linear(128, num_classes)
self.relu = nn.LeakyReLU()
self.batch=nn.BatchNorm1d(128)
self.drop=nn.Dropout(p=0.5, inplace = True)
def _conv_layer_set(self, in_c, out_c):
conv_layer = nn.Sequential(
nn.Conv3d(in_c, out_c, kernel_size=(3, 3, 3), padding=0),
nn.LeakyReLU(),
nn.MaxPool3d((2, 2, 2)),
)
return conv_layer
def forward(self, x):
# Set 1
out = self.conv_layer1(x)
out = self.conv_layer2(out)
out = self.conv_layer3(out)
out = self.conv_layer4(out)
out = self.conv_layer5(out)
out = out.view(out.size(0), -1)
out = self.fc1(out)
out = self.relu(out)
out = self.batch(out)
out = self.drop(out)
out = self.fc2(out)
#out = F.softmax(out, dim=1)
return out
Result :
teration: 1/10 Loss: 0.8040086030960083 Accuracy: 47.023809523809526 %
Iteration: 2/10 Loss: 0.8323351740837097 Accuracy: 45.23809523809524 %
Iteration: 3/10 Loss: 0.8008261322975159 Accuracy: 50.595238095238095 %
Iteration: 4/10 Loss: 0.7527135610580444 Accuracy: 55.95238095238095 %
Iteration: 5/10 Loss: 0.7785584330558777 Accuracy: 51.19047619047619 %
Iteration: 6/10 Loss: 0.7463465929031372 Accuracy: 56.25 %
Iteration: 7/10 Loss: 0.8021382093429565 Accuracy: 52.083333333333336 %
Iteration: 8/10 Loss: 0.7705538868904114 Accuracy: 50.595238095238095 %
I am training a smaller VGG like model, and I set the pretrained weights of VGG16 to the first conv layers which are identical. My model acts pretty strange though, and does not learn anything at all - the loss stays the same, the accuracy stays the same. What is wrong and how can I fix it?
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras import backend as k
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping
from keras import layers
from keras import models
from keras import optimizers
from keras.layers import Dropout
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
import os
base_model = models.Sequential()
base_model.add(layers.Conv2D(64, (3, 3), activation='relu', name='block1_conv1', input_shape=(224, 224, 3)))
base_model.add(layers.Conv2D(64, (3, 3), activation='relu', name='block1_conv2'))
base_model.add(layers.MaxPooling2D((2, 2)))
#model.add(Dropout(0.2))
base_model.add(layers.Conv2D(128, (3, 3), activation='relu', name='block2_conv1'))
base_model.add(layers.Conv2D(128, (3, 3), activation='relu', name='block2_conv2'))
base_model.add(layers.MaxPooling2D((2, 2), name='block2_pool'))
#model.add(Dropout(0.2))
base_model.summary()
"""
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
_________________________________________________________________
block1_conv1 (Conv2D) (None, 256, 256, 64) 1792
_________________________________________________________________
block1_conv2 (Conv2D) (None, 256, 256, 64) 36928
_________________________________________________________________
block1_pool (MaxPooling2D) (None, 128, 128, 64) 0
_________________________________________________________________
block2_conv1 (Conv2D) (None, 128, 128, 128) 73856
_________________________________________________________________
block2_conv2 (Conv2D) (None, 128, 128, 128) 147584
_________________________________________________________________
block2_pool (MaxPooling2D) (None, 64, 64, 128) 0
=================================================================
Total params: 260,160.0
Trainable params: 260,160.0
Non-trainable params: 0.0
"""
base_model.add(layers.Flatten())
#base_model.add(layers.Dropout(0.5)) #Dropout for regularization
base_model.add(layers.Dense(256, activation='relu'))
base_model.add(layers.Dense(1, activation='sigmoid')) #Sigmoid function at the end because we have just two classes
epochs = 50
callbacks = []
#schedule = None
decay = 0.0
#earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
#mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')
#reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-5, mode='min')
# compile the model with a SGD/momentum optimizer
# and a very slow learning rate.
base_model.compile(loss='binary_crossentropy',
optimizer=optimizers.SGD(lr=1e-4,decay=1e-6, momentum=0.9, nesterov=True),
metrics=['accuracy'])
vgg = applications.VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
print('Model loaded.')
"""
['block1_conv1',
'block1_conv2',
'block1_pool',
'block2_conv1',
'block2_conv2',
'block2_pool',
'block3_conv1',
'block3_conv2',
'block3_conv3',
'block3_conv4',
'block3_pool',
'block4_conv1',
'block4_conv2',
'block4_conv3',
'block4_conv4',
'block4_pool',
'block5_conv1',
'block5_conv2',
'block5_conv3',
'block5_conv4',
'block5_pool',
'dense_1',
'dense_2',
'dense_3',
'dropout_1',
'global_average_pooling2d_1',
'input_1']
"""
for layer in vgg.layers:
if layer.name == 'block1_conv1':
base_model.layers[0].set_weights(layer.get_weights())
elif layer.name == 'block1_conv2':
base_model.layers[1].set_weights(layer.get_weights())
elif layer.name == 'block2_conv1':
base_model.layers[3].set_weights(layer.get_weights())
elif layer.name == 'block2_conv2':
base_model.layers[4].set_weights(layer.get_weights())
os.environ["CUDA_VISIBLE_DEVICES"]="0"
train_dir = '/home/d/Desktop/s/data/train'
eval_dir = '/home/d/Desktop/s/data/eval'
test_dir = '/home/d/Desktop/s/data/test'
# create a data generator
train_datagen = ImageDataGenerator(rescale=1./255, #Scale the image between 0 and 1
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,)
val_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale
test_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale
# load and iterate training dataset
train_generator = train_datagen.flow_from_directory(train_dir, target_size=(224,224),class_mode='binary', batch_size=16, shuffle='True', seed=42)
# load and iterate validation dataset
val_generator = val_datagen.flow_from_directory(eval_dir, target_size=(224,224),class_mode='binary', batch_size=16, shuffle='True', seed=42)
# load and iterate test dataset
test_generator = test_datagen.flow_from_directory(test_dir, target_size=(224,224), class_mode=None, batch_size=1, shuffle='False', seed=42)
#The training part
#We train for 64 epochs with about 100 steps per epoch
history = base_model.fit_generator(train_generator,
steps_per_epoch=train_generator.n // train_generator.batch_size,
epochs=epochs,
validation_data=val_generator,
validation_steps=val_generator.n // val_generator.batch_size) #,
#callbacks=[earlyStopping, mcp_save, reduce_lr_loss])
#Save the model
#base_model.save_weights('/home/d/Desktop/s/base_model_weights.h5')
#base_model.save('/home/d/Desktop/s/base_model_keras.h5')
#lets plot the train and val curve
#get the details form the history object
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
#Train and validation accuracy
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and Validation accurarcy')
plt.legend()
plt.figure()
#Train and validation loss
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
and the training just goes on forever like this (no changes from epoch to epoch either):
2625/4002 [==================>...........] - ETA: 3:49 - loss: 7.9723 - acc: 0.5053
2626/4002 [==================>...........] - ETA: 3:49 - loss: 7.9720 - acc: 0.5053
2627/4002 [==================>...........] - ETA: 3:49 - loss: 7.9735 - acc: 0.5052
2628/4002 [==================>...........] - ETA: 3:48 - loss: 7.9732 - acc: 0.5052
2629/4002 [==================>...........] - ETA: 3:48 - loss: 7.9732 - acc: 0.5052
2630/4002 [==================>...........] - ETA: 3:48 - loss: 7.9729 - acc: 0.5052
2631/4002 [==================>...........] - ETA: 3:48 - loss: 7.9725 - acc: 0.5052
2632/4002 [==================>...........] - ETA: 3:48 - loss: 7.9729 - acc: 0.5052
2633/4002 [==================>...........] - ETA: 3:48 - loss: 7.9733 - acc: 0.5052
2634/4002 [==================>...........] - ETA: 3:47 - loss: 7.9722 - acc: 0.5053
2635/4002 [==================>...........] - ETA: 3:47 - loss: 7.9730 - acc: 0.5052
2636/4002 [==================>...........] - ETA: 3:47 - loss: 7.9719 - acc: 0.5053
2637/4002 [==================>...........] - ETA: 3:47 - loss: 7.9727 - acc: 0.5052
2638/4002 [==================>...........] - ETA: 3:47 - loss: 7.9731 - acc: 0.5052
2639/4002 [==================>...........] - ETA: 3:47 - loss: 7.9732 - acc: 0.5052
my training variable shape is (264, 120, 120, 3)
trying to give numpy array of images as input
model = Sequential()
model.add(Conv2D(8, (3, 3), activation='relu', strides=2,input_shape=(image_height,image_width,channels)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.summary()
model.compile(optimizer='rmsprop', loss='mse')
model.fit(x=X_train, y=y_train, batch_size=1, epochs=1, verbose=1)
below is the error message
________________________________________________________________
Layer (type) Output Shape Param
=================================================================
conv2d_36 (Conv2D) (None, 59, 59, 8) 224
_________________________________________________________________
conv2d_37 (Conv2D) (None, 57, 57, 16) 1168
=================================================================
Total params: 1,392
Trainable params: 1,392
Non-trainable params: 0
ValueError: Error when checking target: expected conv2d_37 to have shape (57, 57, 16) but got array with shape (120, 120, 3)
This error was because of mismatch in shape between model output and training data.
Please refer sample code in below
#Import Dependencies
import keras
from keras.models import Model, Sequential
from keras.layers import Conv2D, Flatten, Dense
# Model Building
model = Sequential()
model.add(Conv2D(8, (3, 3), activation='relu', strides=2, input_shape=(28,28,1)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['mse'])
# Generate dummy data
import numpy as np
data = np.random.random((100, 28, 28, 1))
labels = np.random.randint(2, size=(100, 10))
# Train the model, iterating on the data in batches of 32 samples
model.fit(data, labels, epochs=5, batch_size=32)
Output:
Epoch 1/5
100/100 [==============================] - 0s 1ms/step - loss: 1.2342 - mse: 0.4195
Epoch 2/5
100/100 [==============================] - 0s 234us/step - loss: 1.2183 - mse: 0.4167
Epoch 3/5
100/100 [==============================] - 0s 222us/step - loss: 1.2104 - mse: 0.4151
Epoch 4/5
100/100 [==============================] - 0s 255us/step - loss: 1.2019 - mse: 0.4131
Epoch 5/5
100/100 [==============================] - 0s 239us/step - loss: 1.1938 - mse: 0.4120
I’m trying to train a dataset with AlexNet model. The task is multiclass classification (15 classes). I am wondering why I am getting very low accuracy.
I tried different learning rate but has not been improved.
Here is the snippet for the training.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
#optimizer = optim.Adam(model.parameters(), lr=1e-2) # 1e-3, 1e-8
def train_valid_model():
num_epochs=5
since = time.time()
out_loss = open("history_loss_AlexNet_exp1.txt", "w")
out_acc = open("history_acc_AlexNet_exp1.txt", "w")
losses=[]
ACCes =[]
#losses = {}
for epoch in range(num_epochs): # loop over the dataset multiple times
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 50)
if epoch % 10 == 9:
torch.save({
'epoch': epoch + 1,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'loss': loss
}, 'AlexNet_exp1_epoch{}.pth'.format(epoch+1))
for phase in ['train', 'valid', 'test']:
if phase == 'train':
model.train()
else:
model.eval()
train_loss = 0.0
total_train = 0
correct_train = 0
for t_image, target, image_path in dataLoaders[phase]:
#print(t_image.size())
#print(target)
t_image = t_image.to(device)
target = target.to(device)
optimizer.zero_grad()
with torch.set_grad_enabled(phase == 'train'):
outputs = model(t_image)
outputs = F.softmax(outputs, dim=1)
loss = criterion(outputs,target)
if phase == 'train':
loss.backward()
optimizer.step()
_, predicted = torch.max(outputs.data, 1)
train_loss += loss.item()* t_image.size(0)
correct_train += (predicted == target).sum().item()
epoch_loss = train_loss / len(dataLoaders[phase].dataset)
#losses[phase] = epoch_loss
losses.append(epoch_loss)
epoch_acc = 100 * correct_train / len(dataLoaders[phase].dataset)
ACCes.append(epoch_acc)
print('{} Loss: {:.4f} {} Acc: {:.4f}'.format(phase, epoch_loss, phase, epoch_acc))
This is the output for two epochs
Epoch 0/4
train Loss: 2.7026 train Acc: 17.2509
valid Loss: 2.6936 valid Acc: 28.7632
test Loss: 2.6936 test Acc: 28.7632
Epoch 1/4
train Loss: 2.6425 train Acc: 17.8019
valid Loss: 2.6357 valid Acc: 28.7632
test Loss: 2.6355 test Acc: 28.7632
Just a basic tip, it may help you started,
import torchvision.models as models
alexnet = models.alexnet(pretrained=True)
When using alexnet you may start with the pretrained model, I haven't saw that in your code.
If you need to have just 15 classes, make sure you remove the fully connected layer at the very end, and add your new fc layer with 15 outputs,
Your alexnet looks like this:
AlexNet(
(features): Sequential(
(0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
(1): ReLU(inplace)
(2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(4): ReLU(inplace)
(5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(7): ReLU(inplace)
(8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(9): ReLU(inplace)
(10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU(inplace)
(12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
(classifier): Sequential(
(0): Dropout(p=0.5)
(1): Linear(in_features=9216, out_features=4096, bias=True)
(2): ReLU(inplace)
(3): Dropout(p=0.5)
(4): Linear(in_features=4096, out_features=4096, bias=True)
(5): ReLU(inplace)
(6): Linear(in_features=4096, out_features=1000, bias=True)
)
)
So you are in a need removing just the classifier (6) layer.
I think here answered how to remove the fc6.
For multi-label classification, the last layer in the model should use a sigmoid function for label prediction, and the training process should use binary_crossentropy function or nn.BCELoss.
I have this CNN code for the MNIST data that divides the dataset into training set and test set for only 2's and 7's. On running it the code it gives about 98% Accuracy on the test set.
So, to increase the Accuracy I tried using KerasClassifier from keras.wrappers.scikit_learn. Using the Classifier with GridSearchCV I was thinking to find the optimal parameters but on running the code 1st Iteration goes all fine but throws an error from the next Iteration.
Here is the code:
# This is the normal CNN model without GridSearch
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import numpy as np
batch_size = 128
num_classes = 2
epochs = 12
# input image dimensions
img_rows, img_cols = 28, 28
# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#Only look at 3s and 8s
train_picks = np.logical_or(y_train==2,y_train==7)
test_picks = np.logical_or(y_test==2,y_test==7)
x_train = x_train[train_picks]
x_test = x_test[test_picks]
y_train = np.array(y_train[train_picks]==7,dtype=int)
y_test = np.array(y_test[test_picks]==7,dtype=int)
if K.image_data_format() == 'channels_first':
x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
input_shape = (1, img_rows, img_cols)
else:
x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
input_shape = (img_rows, img_cols, 1)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
model = Sequential()
model.add(Conv2D(4, kernel_size=(3, 3),activation='relu',input_shape=input_shape))
model.add(Conv2D(8, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=keras.optimizers.Adadelta(),
metrics=['accuracy'])
model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Improving the accuracy using GridSearch
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
def build_model(optimizer):
print(optimizer,batch_size,epochs)
model = Sequential()
model.add(Conv2D(4, kernel_size=(3, 3),activation='relu',input_shape=input_shape))
model.add(Conv2D(8, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=optimizer,
metrics=['accuracy'])
return model
model = KerasClassifier(build_fn = build_model)
parameters = {'batch_size': [128, 256],
'epochs': [10, 20],
'optimizer': ['rmsprop']}
grid_search = GridSearchCV(estimator = model,
param_grid = parameters,
scoring = 'accuracy',
cv = 10)
grid_search = grid_search.fit(x_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_
This is the Output of the code:
rmsprop 128 12
Epoch 1/10
11000/11000 [==============================] - 3s - loss: 0.1654 - acc: 0.9476
Epoch 2/10
11000/11000 [==============================] - 3s - loss: 0.0699 - acc: 0.9786
Epoch 3/10
11000/11000 [==============================] - 2s - loss: 0.0557 - acc: 0.9839
Epoch 4/10
11000/11000 [==============================] - 2s - loss: 0.0510 - acc: 0.9839
Epoch 5/10
11000/11000 [==============================] - 2s - loss: 0.0471 - acc: 0.9853
Epoch 6/10
11000/11000 [==============================] - 2s - loss: 0.0417 - acc: 0.9875
Epoch 7/10
11000/11000 [==============================] - 2s - loss: 0.0399 - acc: 0.9870
Epoch 8/10
11000/11000 [==============================] - 2s - loss: 0.0365 - acc: 0.9885
Epoch 9/10
11000/11000 [==============================] - 2s - loss: 0.0342 - acc: 0.9899
Epoch 10/10
11000/11000 [==============================] - 2s - loss: 0.0321 - acc: 0.9903
768/1223 [=================>............] - ETA: 0sTraceback (most recent call last):
File "<ipython-input-4-975b20661114>", line 30, in <module>
grid_search = grid_search.fit(x_train, y_train)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/metrics/scorer.py", line 98, in __call__
**self._kwargs)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py", line 172, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/home/thakkar_/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py", line 82, in _check_targets
"".format(type_true, type_pred))
ValueError: Can't handle mix of multilabel-indicator and binary
Please help!
The error seem to be in the way you parsing the dictionary parameters..
An example from here:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
# Function to create model, required for KerasClassifier
def create_model(learn_rate=0.01, momentum=0):
# create model
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
optimizer = SGD(lr=learn_rate, momentum=momentum)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
return model
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# load dataset
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]
# create model
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
# define the grid search parameters
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
param_grid = dict(learn_rate=learn_rate, momentum=momentum)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X, Y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
GridseachCV basically takes elements from the dictionary that matches with its input parameter, and train it. You are parsing the complete dictionary, but batch_size, and epochs aren't parameter within the function...
# Improving the accuracy using GridSearch
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
def build_model(optimizer = 'adam'):
model = Sequential()
model.add(Conv2D(4, kernel_size=(3, 3),activation='relu',input_shape=input_shape))
model.add(Conv2D(8, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=optimizer,
metrics=['accuracy'])
return model
model = KerasClassifier(build_fn = build_model)
parameters = {'batch_size': [128, 256],
'epochs': [10, 20],
'optimizer': ['rmsprop']}
grid_search = GridSearchCV(estimator = model,
param_grid = parameters,
scoring = 'accuracy',
cv = 10)
grid_search = grid_search.fit(x_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_
Maybe something like this would work.. have not tested it.