PyTorch model not converging

I'm training a binary classification model on a series of images.
The model is derived from resnet18 in torchvision, and I made the last FC layer nn.Linear(512, 1).
The loss function is BCELoss.
However, the model doesn't show any sign of converging even after 5000 iterations.
I suspect I'm doing something wrong in the training stage, but I can't find the bug.
Here's my code:
Model:
## Model
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
resnet18 = models.resnet18(pretrained=True)
resnet18.fc = nn.Linear(512, 1)
Parameters, loss, optimizer:
## parameters
import torch
import torch.optim as optim

# device was not defined in the post; the usual setup is implied
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 200
learning_rate = 0.1
momen = 0.9
batch = 8

criterion = nn.BCELoss()
resnet18.to(device)
opt = optim.SGD(resnet18.parameters(), lr=learning_rate, momentum=momen)
Dataloaders:
# Generators
training_set = Dataset(X_train)
training_generator = torch.utils.data.DataLoader(training_set, batch_size=batch, shuffle=True)
validation_set = Dataset(X_test)
validation_generator = torch.utils.data.DataLoader(validation_set, batch_size=1, shuffle=False)
Training:
# training
history = []
for t in range(epochs):
    for i, data in enumerate(training_generator, 0):
        inputs, labels = data
        # check if input size == batch size #
        if inputs.shape[0] < batch:
            break
        # print("labels", labels, labels.dtype)
        # move data to GPU #
        inputs, labels = inputs.to(device), labels.to(device)
        opt.zero_grad()
        # Prediction #
        y_pred = resnet18(inputs).view(batch,)
        y_pred = (y_pred > 0).float().requires_grad_()
        # print("y_pred", y_pred, y_pred.dtype)
        # Calculating loss #
        loss = criterion(y_pred, labels.view(batch,))
        loss.backward()
        opt.step()
        if i % 10 == 0:
            history.append(loss.item())
            print("Epoch: {}, iter: {}, loss: {}".format(t, i, loss.item()))
torch.save(resnet18, 'trained_resnet18.pt')
Edit:
The loss values are like this:
Epoch: 3, iter: 310, loss: 0.0
Epoch: 3, iter: 320, loss: 37.5
Epoch: 3, iter: 330, loss: 37.5
Epoch: 3, iter: 340, loss: 0.0
Epoch: 3, iter: 350, loss: 37.5
Epoch: 3, iter: 360, loss: 50.0
Epoch: 3, iter: 370, loss: 37.5
Epoch: 3, iter: 380, loss: 25.0
Epoch: 3, iter: 390, loss: 12.5

I believe the error lies in the following line:
y_pred = (y_pred > 0).float().requires_grad_()
You are trying to binarize the model prediction in a weird way; I suggest doing the following instead:
y_pred = torch.sigmoid(y_pred)
And pass this to the loss function.
Explanation
The output of the model can be any value, but we want to normalize those values to lie in the [0, 1] range; this is exactly what the sigmoid function does. Once the values are in [0, 1], the comparison with the binary labels makes sense: the closer to 1, the more the prediction means "1", and vice versa. Just as importantly, (y_pred > 0) is not differentiable, and calling .requires_grad_() on its result only creates a new leaf tensor detached from the model, so no gradient ever reaches the network's weights; torch.sigmoid keeps the computation differentiable.
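For completeness, here is a minimal sketch of the corrected training step, reusing the variables from the question (an alternative is nn.BCEWithLogitsLoss, which applies the sigmoid internally and is more numerically stable):

logits = resnet18(inputs).view(-1)                 # raw outputs, any real value
y_pred = torch.sigmoid(logits)                     # squashed into (0, 1), still differentiable
loss = criterion(y_pred, labels.view(-1).float())  # BCELoss expects float targets in [0, 1]
loss.backward()                                    # gradients now reach the resnet weights
opt.step()

# alternative: criterion = nn.BCEWithLogitsLoss() and pass the raw logits directly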
You can refer to the following link: https://www.youtube.com/watch?v=WsFasV46KgQ

Related

Nearly Constant training and validation accuracy

I'm new to PyTorch and my problem may be a little naive.
I'm training a pretrained VGG16 network on my dataset, which contains about 33,000 images in 8 classes with labels [1,2,…,8], and the classes are imbalanced. My problem is that during training, the validation and training accuracy are low and don't increase. Is there any problem in my code?
If not, what do you suggest to improve training?
import torch
import time
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import cv2
import torchvision.models as models
from classify_dataset import Classification_dataset
from torchvision import transforms
transform = transforms.Compose([transforms.Resize((224,224)),
                                transforms.RandomHorizontalFlip(p=0.5),
                                transforms.RandomVerticalFlip(p=0.5),
                                transforms.RandomRotation(degrees=45),
                                transforms.ToTensor(),
                                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
                                ])
dataset = Classification_dataset(root_dir=r'//home/arisa/Desktop/Hamid/IQA/Hamid_Dataset',
                                 csv_file=r'/home/arisa/Desktop/Hamid/IQA/new_label.csv', transform=transform)
target = dataset.labels - 1
train_indices, test_indices = train_test_split(np.arange(target.shape[0]), stratify=target)
test_dataset = torch.utils.data.Subset(dataset, indices=test_indices)
train_dataset = torch.utils.data.Subset(dataset, indices=train_indices)
class_sample_count = np.array([len(np.where(target[train_indices] == t)[0]) for t in np.unique(target)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in target[train_indices]])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight), replacement=True)
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=64,
                                           sampler=sampler)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=64,
                                          shuffle=False)
# the model definition was omitted from the post; a pretrained VGG16 is implied by the question
model = models.vgg16(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
num_ftrs = model.classifier[0].in_features
model.classifier = nn.Linear(num_ftrs, 8)
optimizer = Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.01)
path = '/home/arisa/Desktop/Hamid/IQA/'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
def train_model(model, train_loader, valid_loader, optimizer, criterion, scheduler=None, num_epochs=10):
    min_valid_loss = np.inf
    model.train()
    start = time.time()
    TrainLoss = []
    model = model.to(device)
    for epoch in range(num_epochs):
        total = 0
        correct = 0
        train_loss = 0
        #lr_scheduler.step()
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)
        train_loss = 0.0
        for x, y in train_loader:
            x = x.to(device)
            #print(y.shape)
            y = y.view(y.shape[0],).to(device)
            y = y.to(device)
            y -= 1
            out = model(x)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            TrainLoss.append(loss.item() * y.shape[0])
            train_loss += loss.item() * y.shape[0]
            _, predicted = torch.max(out.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
            optimizer.step()
        lr_scheduler.step()
        accuracy = 100 * correct / total
        valid_loss = 0.0
        val_loss = []
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for x_val, y_val in test_loader:
                x_val = x_val.to(device)
                y_val = y_val.view(y_val.shape[0],).to(device)
                y_val -= 1
                target = model(x_val)
                loss = criterion(target, y_val)
                valid_loss += loss.item() * y_val.shape[0]
                _, predicted = torch.max(target.data, 1)
                val_total += y_val.size(0)
                val_correct += (predicted == y_val).sum().item()
                val_loss.append(loss.item() * y_val.shape[0])
        val_acc = 100 * val_correct / val_total
        print(f'Epoch {epoch + 1} \t\t Training Loss: {train_loss / len(train_loader)} \t\t Validation Loss: {valid_loss / len(test_loader)} \t\t Train Acc:{accuracy} \t\t Validation Acc:{val_acc}')
        if min_valid_loss > (valid_loss / len(test_loader)):
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss / len(test_loader):.6f}) \t Saving The Model')
            min_valid_loss = valid_loss / len(test_loader)
            state = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),}
            torch.save(state, '/home/arisa/Desktop/Hamid/IQA/checkpoint.t7')
    end = time.time()
    print('TRAIN TIME:')
    print('%.2gs' % (end - start))

train_model(model=model, train_loader=train_loader, optimizer=optimizer, criterion=criterion, valid_loader=test_loader, num_epochs=500)
Thanks in advance
Here is the result of the first 15 epochs:
Epoch 1/500
----------
Epoch 1 Training Loss: 205.63448420514916 Validation Loss: 233.89266112356475 Train Acc:39.36360386127994 Validation Acc:24.142040038131555
Epoch 2/500
----------
Epoch 2 Training Loss: 199.05699240435197 Validation Loss: 235.08799531243065 Train Acc:41.90998291820601 Validation Acc:24.27311725452812
Epoch 3/500
----------
Epoch 3 Training Loss: 199.15626737127448 Validation Loss: 236.00033430619672 Train Acc:41.1035633416756 Validation Acc:23.677311725452814
Epoch 4/500
----------
Epoch 4 Training Loss: 199.02581041173886 Validation Loss: 233.60767459869385 Train Acc:41.86628530568466 Validation Acc:24.606768350810295
Epoch 5/500
----------
Epoch 5 Training Loss: 198.61493769454472 Validation Loss: 233.7503859202067 Train Acc:41.53656695665991 Validation Acc:25.0
Epoch 6/500
----------
Epoch 6 Training Loss: 198.71323942956585 Validation Loss: 234.17176149830675 Train Acc:41.639852222619474 Validation Acc:25.369399428026693
Epoch 7/500
----------
Epoch 7 Training Loss: 199.9395153770592 Validation Loss: 234.1744423635078 Train Acc:40.98041552456998 Validation Acc:24.84509056244042
Epoch 8/500
----------
Epoch 8 Training Loss: 199.3533399020355 Validation Loss: 235.4645173188412 Train Acc:41.26643626107337 Validation Acc:24.165872259294567
Epoch 9/500
----------
Epoch 9 Training Loss: 199.6451746921249 Validation Loss: 233.33387595956975 Train Acc:40.96452548365312 Validation Acc:24.59485224022879
Epoch 10/500
----------
Epoch 10 Training Loss: 197.9305159737011 Validation Loss: 233.76405122063377 Train Acc:41.8782028363723 Validation Acc:24.6186844613918
Epoch 11/500
----------
Epoch 11 Training Loss: 199.33247244055502 Validation Loss: 234.41085289463854 Train Acc:41.59218209986891 Validation Acc:25.119161105815063
Epoch 12/500
----------
Epoch 12 Training Loss: 199.87399289874256 Validation Loss: 234.23621463775635 Train Acc:41.028085647320545 Validation Acc:24.49952335557674
Epoch 13/500
----------
Epoch 13 Training Loss: 198.85540591944292 Validation Loss: 234.33149099349976 Train Acc:41.206848607635166 Validation Acc:24.857006673021925
Epoch 14/500
----------
Epoch 14 Training Loss: 199.92641723337513 Validation Loss: 233.37722391070741 Train Acc:41.15520597465539 Validation Acc:24.988083889418494
Epoch 15/500
----------
Epoch 15 Training Loss: 197.82172771698328 Validation Loss: 234.4943131533536 Train Acc:41.69943987605768 Validation Acc:24.380362249761678
You froze your model through

for param in model.parameters():
    param.requires_grad = False

which basically says "do not calculate any gradient for any weight", which is equivalent to not updating the weights, hence no optimization.
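If freezing the backbone is intentional, a common pattern is to replace the head after freezing (the new layer's parameters require gradients by default) and hand the optimizer only the trainable parameters. A minimal sketch, reusing the names from the question:

for param in model.parameters():
    param.requires_grad = False               # freeze the pretrained backbone

model.classifier = nn.Linear(num_ftrs, 8)     # new head, trainable by default

# optimize only the parameters that still require gradients
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0001)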
My problem was with model.train(). This call should be inside the training loop, but in my case I put it outside, so after the first call to model.eval() the model stayed in eval mode for every later epoch.
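For reference, a minimal sketch of the corrected placement, keeping only the skeleton of the loop above:

for epoch in range(num_epochs):
    model.train()                     # back to training mode at the start of every epoch
    for x, y in train_loader:
        ...                           # training step as before

    model.eval()                      # evaluation mode only for the validation pass
    with torch.no_grad():
        for x_val, y_val in test_loader:
            ...                       # validation step as before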

Image classification using PyTorch

This is the code I was working on for image classification using PyTorch, and I'm not able to get the accuracy right.
The accuracy exceeds 100; can anyone help me find the error?
def trained_model(criterion, optimizer, epochs=5):
    epoch_loss = 0.0
    epoch_accuracy = 0
    running_loss = 0
    running_accuracy = 0
    total = 0
    for epoch in range(epochs):
        print('epoch : {}/{}'.format(epoch+1, epochs))
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            _, predictions = torch.max(outputs, dim=1)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            running_accuracy += torch.sum(predictions == labels.data)
        epoch_loss = running_loss / len(train_dataset)
        epoch_accuracy = running_accuracy / len(train_dataset)
        print('Loss:{:.4f} , Accuracy : {:.4f} '.format(epoch_loss, epoch_accuracy))
    return model
You should probably use torch.argmax over the class dimension (dim=1) to get the class predictions from your model output, instead of taking the values returned by torch.max.
Assuming you are working with indices as labels, something like the following will get you the average accuracy of the current batch:

>>> outputs = torch.rand(16, 5)             # (batch, num_classes)
>>> pred = torch.argmax(outputs, dim=1)     # shape (16,): one class index per sample
>>> labels = torch.randint(0, 5, (16,))
>>> accuracy = (pred == labels).float().mean()
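Separately, the accuracy can exceed 100 because running_loss, running_accuracy and total are initialized once, outside the epoch loop, so they keep accumulating across epochs while the denominator len(train_dataset) stays fixed. Resetting them at the top of each epoch keeps the figure bounded; a minimal sketch:

for epoch in range(epochs):
    running_loss = 0.0
    running_accuracy = 0              # reset per epoch so the ratio stays within [0, 1]
    for images, labels in train_loader:
        ...                           # training step as in the question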

Training a forward dynamics model of a physical system using neural network

I am trying to predict the forward dynamics of a physical system using neural networks. The equation is
f(X_t, A_t) = (X_{t+1} - X_t) / dt, where X is the state vector composed of angular positions and angular velocities, A_t is the control input, and dt is the small increment in time t. I have 10k data points.
My code:
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
np.set_printoptions(precision=3, suppress=True)
print(tf.__version__)
"""
Trying to find the increment in states, f_(theta), from the equation
x_{t+1} = x_t + dt * f_(theta)(x_t, a_t)
x: [q1, q2, q3, q4, q5, q6, q7, q1d, q2d, q3d, q4d, q5d, q6d, q7d] # q : angular positions, qd: ang velocities
a: [a1, a2, a3, a4, a5, a6, a7] # control input
"""
data = np.load('state_actions_10k.npy', allow_pickle=True)
X = np.hstack((data[:, :7], data[:, 7:14]))
U = data[:, 14:]
dX = np.diff(X, axis=0) # state residual
dt = 1
scalarX = StandardScaler() # MinMaxScaler(feature_range=(-1,1))#StandardScaler()# RobustScaler()
scalarU = MinMaxScaler(feature_range=(-1, 1))
scalardX = MinMaxScaler(feature_range=(-1, 1))
scalarX.fit(X)
scalarU.fit(U)
scalardX.fit(dX)
normX = scalarX.transform(X)
normU = scalarU.transform(U)
normdX = scalardX.transform(dX)
inputs = np.hstack((normX, normU))
inputs = inputs[:-1]
outputs = normdX
n, test_frac, train_frac = len(X), 0.2, 0.7
val_frac = 1 - test_frac - train_frac
X_train = inputs[:int(n*train_frac)]
X_test = inputs[int(n*train_frac):int(n*(train_frac+test_frac))]
X_val = inputs[int(n*(train_frac+test_frac)):]
Y_train = outputs[:int(n*train_frac)]
Y_test = outputs[int(n*train_frac):int(n*(train_frac+test_frac))]
Y_val = outputs[int(n*(train_frac+test_frac)):]
model = tf.keras.Sequential([
    tf.keras.Input(shape=(21,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(14),
])
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
train = True
if train:
    history = model.fit(
        X_train,
        Y_train,
        batch_size=64,
        epochs=100,
        # pass validation data to monitor validation loss and metrics
        # at the end of each epoch
        validation_data=(X_val, Y_val),
    )
    model.save_weights('save_weights/trained_weights')
else:
    model.load_weights('save_weights/trained_weights')
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test, batch_size=128)
print("test loss, test acc:", results)
print("Generate predictions for 3 samples")
predictions = model.predict(X_test[:3])
print("predictions shape:", predictions.shape)
I see that the validation accuracy is 0.9 at the end, which I think is quite low. The state_actions.npy file and the saved network weights are available here.
Epoch 100/100
1094/1094 [==============================] - 1s 1ms/step - loss: 9.5670e-05 - accuracy: 0.9777 - val_loss: 0.0014 - val_accuracy: 0.9055
Evaluate on test data
157/157 [==============================] - 0s 759us/step - loss: 2.5453e-04 - accuracy: 0.9586
test loss, test acc: [0.0002545334573369473, 0.9585979580879211]
Can someone suggest a method to improve the accuracy?

PyTorch: Getting the same result in each epoch

I'm trying to train an RBF network. I used the MNIST database and the PyTorch framework.
The results are the same in each epoch.
The results:
Epoch: 1
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
Epoch: 2
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
Epoch: 3
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
Epoch: 4
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
My code is below. I think the problem is somewhere in the linear layer: the model shows no improvement after the training epoch. It seems like the weights don't change, but I don't know why. (A quick check for this is sketched after the code.)
class RBF(nn.Module):
    def __init__(self, in_layers, centers, sigmas):
        super(RBF, self).__init__()
        self.in_layers = in_layers
        self.centers = nn.Parameter(centers)
        self.sigmas = nn.Parameter(torch.Tensor(self.centers.size(0)))
        torch.nn.init.constant_(self.sigmas, sigmas)

    def forward(self, x):
        x = x.view(-1, self.in_layers)
        size = [self.centers.size(0), x.size(0)]
        sigma = self.sigmas.view(-1).to(device)**2
        dists = torch.empty(size).to(device)
        for i, c in enumerate(self.centers):
            c = c.reshape(-1, c.size(0))
            temp = (x - c).pow(2).sum(-1).pow(0.5)
            dists[i] = temp
        dists = dists.permute(1, 0)
        phi = torch.exp(-1 * (dists / (2 * sigma)))  # gaussian
        return phi

class Net(nn.Module):
    def __init__(self, in_layers, centers, sigmas):
        super(Net, self).__init__()
        self.rbf_layers = nn.ModuleList()
        self.linear_layers = nn.ModuleList()
        for i in range(len(in_layers) - 1):
            self.rbf_layers.append(RBF(in_layers[i], centers, sigmas))
            self.linear_layers.append(nn.Linear(centers.size(0), in_layers[i+1], bias=True))

    def forward(self, x):
        out = x
        for i in range(len(self.rbf_layers)):
            out = self.rbf_layers[i](out)
            out = F.sigmoid(self.linear_layers[i](out.float()))
        return out
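As an aside, the Python loop over centers in RBF.forward computes pairwise Euclidean distances, which torch.cdist can do in a single call; a minimal sketch of an equivalent forward:

def forward(self, x):
    x = x.view(-1, self.in_layers)
    sigma = self.sigmas.view(-1) ** 2
    dists = torch.cdist(x, self.centers)         # (batch, n_centers) pairwise distances
    phi = torch.exp(-1 * (dists / (2 * sigma)))  # same gaussian as the loop version
    return phi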
Of course the code continues, but I think this is enough to locate the problem (if you want something extra, I'm here). Do you have any ideas?
And the training part of the code:
def training(engine, batch, device, model, criterion, optimizer):
    inputs, labels = batch[0].to(device), batch[1].to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    return outputs, labels

def nn_run1(batch, classes, dim, learning_rate, epochs, clusters):
    # ---Load Model's Parameters---
    train_loader, test_loader = data_loading(batch, shuffle=False)
    kmeans_input = train_loader.dataset.train_data
    kmeans_input = torch.reshape(kmeans_input.double(), (kmeans_input.size(0), -1))
    _, centers = Kmeans(kmeans_input, clusters)
    centers = centers.to(device)
    sigma = Sigmas(centers)
    layers = in_layers(dim, len(classes), layers=1)
    # ---Model Setup---
    model = Net(layers, centers, sigma)
    model.cuda()
    criterion = nn.CrossEntropyLoss()
    print(model.parameters)
    optimizer = torch.optim.SGD(model.parameters(), learning_rate)
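A quick way to test the suspicion above that the linear layer's weights never change is to snapshot them around one optimizer step; a minimal sketch using the model and training function above:

before = model.linear_layers[0].weight.detach().clone()
training(None, next(iter(train_loader)), device, model, criterion, optimizer)  # one step
after = model.linear_layers[0].weight
print('weights changed:', not torch.equal(before, after))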

Huge increase in loss function after loading model in Keras, custom data, heavy agumentation

I've got a quite simple problem. After I train my model in Keras, I use the save(filepath) method to save it. Afterwards, when I want to continue training, I load the model and start fitting, and the loss jumps to 420 (from around 5); I can't figure out why. According to the Keras docs, the save() method should save everything: architecture, optimizer state, and weights.
#preprocessing function
def get_random_eraser(p=0.5, s_l=0.02, s_h=0.4, r_1=0.3, r_2=1/0.3, v_l=0, v_h=255, pixel_level=False):
    def eraser(input_img):
        img_h, img_w, img_c = input_img.shape
        p_1 = np.random.rand()
        if p_1 > p:
            return norm(input_img)
        while True:
            s = np.random.uniform(s_l, s_h) * img_h * img_w
            r = np.random.uniform(r_1, r_2)
            w = int(np.sqrt(s / r))
            h = int(np.sqrt(s * r))
            left = np.random.randint(0, img_w)
            top = np.random.randint(0, img_h)
            if left + w <= img_w and top + h <= img_h:
                break
        if pixel_level:
            c = np.random.uniform(v_l, v_h, (h, w, img_c))
        else:
            c = np.random.uniform(v_l, v_h)
        input_img[top:top + h, left:left + w, :] = c
        input_img = norm(input_img)
        input_img = random_crop(input_img, (50, 50))
        return input_img
    return eraser

def norm(img):
    return img / 127.5 - 1.

def random_crop(img, random_crop_size):
    # Note: image_data_format is 'channels_last'
    assert img.shape[2] == 3
    height, width = img.shape[0], img.shape[1]
    dy, dx = random_crop_size
    x = np.random.randint(0, width - dx + 1)
    y = np.random.randint(0, height - dy + 1)
    crop = img[y:(y+dy), x:(x+dx), :]
    return cv2.resize(crop, (height, width), cv2.INTER_LANCZOS4)
model = mn.MobileNetV2(input_shape=None, alpha=1.0, include_top=False, weights='imagenet', classes=179)
model.summary()
l = model.layers
for layer in l:
    print(layer.get_config(), '\n')
    if 'kernel_regularizer' in layer.get_config():
        print('found kernel regularizer')
        layer.kernel_regularizer = l2(l=0.1)
        print('kernel regularizer', layer.kernel_regularizer)
    if 'bias_regularizer' in layer.get_config():
        print('found bias regularizer')
        layer.bias_regularizer = l2(l=0.1)
        print('bias regularizer', layer.bias_regularizer)

x = Dropout(0.7)(l[-1].output)
x = Conv2D(179, (1,1), activation='linear')(x)
x = ReLU()(x)
x = GlobalAveragePooling2D()(x)
x = Softmax()(x)
model_mod = Model(inputs=model.input, outputs=x)
gen_t = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    preprocessing_function=get_random_eraser(s_l=0, s_h=0.8),
    validation_split=0.1
)
gen_v = ImageDataGenerator(
    preprocessing_function=norm,
    validation_split=0.1
)
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=True)
tb = TensorBoard(batch_size=32)
mc = ModelCheckpoint('mobilenetv2_combined.hdf5', monitor='val_loss', save_best_only=True, verbose=True)
train_generator = gen_t.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="training")
validation_generator = gen_v.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="validation")
class_weights = class_weight.compute_class_weight('balanced', np.unique(train_generator.classes), train_generator.classes)
model_mod.compile(k.optimizers.sgd(lr=0.001, momentum=0.9, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy', 'top_k_categorical_accuracy'])
hist = model_mod.fit_generator(train_generator, validation_data=validation_generator, epochs=1, initial_epoch=0, callbacks=[early_stop, tb, mc], class_weight=class_weights)
model_mod.save('mobilenet_model_save.h5')
Found 17924 images belonging to 179 classes.
Found 1910 images belonging to 179 classes.
Epoch 1/1
561/561 [==============================] - 415s 741ms/step - loss: 4.9594 - acc: 0.0322 - top_k_categorical_accuracy: 0.1134 - val_loss: 4.4137 - val_acc: 0.0921 - val_top_k_categorical_accuracy: 0.2644
Epoch 00001: val_loss improved from inf to 4.41366, saving model to mobilenetv2_combined.hdf5
So this is the code I'm running for training. Now basically the same code for continuing training (this is just for illustration):
gen_t = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    preprocessing_function=get_random_eraser(s_l=0, s_h=0.8),
    validation_split=0.1
)
gen_v = ImageDataGenerator(
    preprocessing_function=norm,
    validation_split=0.1
)
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=True)
tb = TensorBoard(batch_size=32)
mc = ModelCheckpoint('mobilenetv2_combined.hdf5', monitor='val_loss', save_best_only=True, verbose=True)
train_generator = gen_t.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="training")
validation_generator = gen_v.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="validation")
model_mod = load_model('mobilenet_model_save.h5')
class_weights = class_weight.compute_class_weight('balanced', np.unique(train_generator.classes), train_generator.classes)
#model_mod.compile(adam(lr=0.0001, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', 'top_k_categorical_accuracy'])
model_mod.compile(k.optimizers.sgd(lr=0.001, momentum=0.9, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy', 'top_k_categorical_accuracy'])
hist = model_mod.fit_generator(train_generator, validation_data=validation_generator, epochs=2, initial_epoch=1, callbacks=[early_stop, tb, mc], class_weight=class_weights)
model_mod.save('mobilenet_model_save.h5')
Found 17924 images belonging to 179 classes.
Found 1910 images belonging to 179 classes.
Epoch 2/2
561/561 [==============================] - 373s 665ms/step - loss: 174.3220 - acc: 0.0815 - top_k_categorical_accuracy: 0.2320 - val_loss: 49.8441 - val_acc: 0.0110 - val_top_k_categorical_accuracy: 0.0455
Epoch 00002: val_loss improved from inf to 49.84411, saving model to mobilenetv2_combined.hdf5
Does anybody have any idea what's going on? I tried a very simple toy example with MNIST and everything seemed to work fine, so I'd be happy for any suggestion. One more interesting thing: it's only the value of the loss function that jumps; the accuracy of the network stays the same as after training. E.g., if the network finishes training with an accuracy of 40%, then when I resume training (with the huge loss jump) the accuracy is still 40%.
Can't comment, so posting as an answer:
isn't the problem here that model.save() does not save your optimizer's state?
I.e. the learning rate might be very high, and that's why your loss jumps after restarting the training.
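One way to check this, assuming the Keras 2.x API used in the question, is to print the restored optimizer's learning rate right after load_model. Note also that the continuation script above recompiles with a fresh SGD optimizer, which by itself discards whatever optimizer state was restored:

from keras import backend as K
from keras.models import load_model

model_mod = load_model('mobilenet_model_save.h5')
print('restored lr:', K.get_value(model_mod.optimizer.lr))
# re-running model_mod.compile(...) replaces this optimizer with a new one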
So I haven't figured this out, but my guess is that it is either a problem with saving a somewhat "custom" network (one built from the applications module), or due to using the older version 2.2.0 (because of the squeezenet bug).
I doubt this question is going to get more attention than it got in the last 10 days, so I'm closing it.
My "solution" was to train the network in a single go, without interruption.
