I'm writing a code example to do a simple linear projection (like PCA) in PyTorch. Everything appears to be OK except that the loss does not change as training progresses. Changing the learning rate doesn't affect this, and it's a simple one-dimensional problem so the loss should certainly be changing. What am I missing here?
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as nnF
class PCArot2D(nn.Module):
"2D PCA rotation, expressed as a gradient-descent problem"
def __init__(self):
super(PCArot2D, self).__init__()
self.theta = nn.Parameter(torch.tensor(np.random.random() * 2 * np.pi))
def getrotation(self):
sintheta = torch.sin(self.theta)
costheta = torch.cos(self.theta)
return torch.tensor([[costheta, -sintheta], [sintheta, costheta]], requires_grad=True, dtype=torch.double)
def forward(self, x):
xmeans = torch.mean(x, dim=1, keepdim=True)
rot = self.getrotation()
return torch.mm(rot, x - xmeans)
def covariance(y):
"Calculates the covariance matrix of its input (as torch variables)"
ymeans = torch.mean(y, dim=1, keepdim=True)
ycentred = y - ymeans
return torch.mm(ycentred, ycentred.T) / ycentred.shape[1]
net = PCArot2D()
example2 = torch.tensor(np.random.randn(2, 33))
# define a loss function and an optimiser
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.1)
# train the network
num_epochs = 1000
for epoch in range(num_epochs):
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(torch.DoubleTensor(example2))
# the covariance between output channels is the measure that we wish to minimise
covariance = (outputs[0, :] * outputs[1, :]).mean()
loss = criterion(covariance, torch.tensor(0, dtype=torch.double))
loss.backward()
optimizer.step()
running_loss = loss.item()
if ((epoch & (epoch - 1)) == 0) or epoch==(num_epochs-1): # don't print on all epochs
# print statistics
print('[%d] loss: %.8f' %
(epoch, running_loss))
print('Finished Training')
Output:
[0] loss: 0.00629047
[1] loss: 0.00629047
[2] loss: 0.00629047
[4] loss: 0.00629047
[8] loss: 0.00629047
etc
It seems the problem is in your getrotation function. When you create a new tensor from other tensors with torch.tensor(...), the result is detached from the computation graph, so it is no longer backpropagable:
def getrotation(self):
sintheta = torch.sin(self.theta)
costheta = torch.cos(self.theta)
return torch.tensor([[costheta, -sintheta], [sintheta, costheta]], requires_grad=True, dtype=torch.double)
So you need to find some other way to construct your return tensor.
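You can see the cut directly: a tensor built with torch.tensor has no grad_fn, so nothing can flow back to self.theta. A small standalone check (torch.tensor copies the values, much as if you converted them to plain floats first):
import torch

theta = torch.tensor(0.5, requires_grad=True)
costheta, sintheta = torch.cos(theta), torch.sin(theta)
attached = torch.stack([costheta, sintheta])                 # built inside the graph
detached = torch.tensor([float(costheta), float(sintheta)])  # built from copied values
print(attached.grad_fn)  # <StackBackward0 ...>: gradients can reach theta
print(detached.grad_fn)  # None: a fresh leaf, cut off from theta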
Here is one suggestion that seems to work using torch.cat:
def getrotation(self):
sintheta = torch.sin(self.theta)
costheta = torch.cos(self.theta)
#return torch.tensor([[costheta, -sintheta], [sintheta, costheta]], requires_grad=True, dtype=torch.double)
A = torch.cat([costheta.unsqueeze(0), -sintheta.unsqueeze(0)], dim=0)
B = torch.cat([sintheta.unsqueeze(0), costheta.unsqueeze(0)], dim=0)
return torch.cat([A.unsqueeze(0), B.unsqueeze(0)], dim=0).double()
After implementing this change the loss changes:
[0] loss: 0.00765365
[1] loss: 0.00764726
[2] loss: 0.00764023
[4] loss: 0.00762607
[8] loss: 0.00759777
[16] loss: 0.00754148
[32] loss: 0.00742997
[64] loss: 0.00721117
[128] loss: 0.00679025
[256] loss: 0.00601233
[512] loss: 0.00469085
[999] loss: 0.00288501
Finished Training
I hope this helps!
Edit:
A simpler and prettier version by @DanStowell:
def getrotation(self):
sintheta = torch.sin(net.theta).double().unsqueeze(0)
costheta = torch.cos(net.theta).double().unsqueeze(0)
return torch.cat([costheta, -sintheta, sintheta, costheta]).reshape((2,2))
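For what it's worth, torch.stack expresses the same construction without the reshape; an equivalent sketch:
def getrotation(self):
    sintheta = torch.sin(self.theta)
    costheta = torch.cos(self.theta)
    return torch.stack([torch.stack([costheta, -sintheta]),
                        torch.stack([sintheta, costheta])]).double()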
Related
Running the following code with device = 'cpu', the loss decreases as expected. However, with device = 'cuda' the loss won't decrease and model training fails. How should I correct this?
device = 'cpu'   # loss decreases as expected
device = 'cuda'  # overrides the line above; loss won't decrease
import sys
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.nn import Node2Vec
class Node2VecV2(Node2Vec):
def __init__(
self,
edge_index: Tensor,
embedding_dim: int,
walk_length: int,
context_size: int,
walks_per_node: int = 1,
p: float = 1.0,
q: float = 1.0,
num_negative_samples: int = 1,
num_nodes: Optional[int] = None,
sparse: bool = False,
batch_size: int = 128,
shuffle: bool = True,
num_workers: int = 0,
device: str = 'cuda'
):
super().__init__(edge_index, embedding_dim, walk_length, context_size, walks_per_node,
p, q, num_negative_samples, num_nodes, sparse)
self.device = device
self.batch_size = batch_size
self.shuffle = shuffle
self.num_workers = 0 if sys.platform.startswith('win') else num_workers
self.loader = self.loader(batch_size=128, shuffle=True,
num_workers=num_workers)
# def make_loader(
# self,
# batch_size: int = 128,
# shuffle: bool = True,
# num_workers: int = 0,
# device: str = 'cpu'
# ):
# self.device = device
# self.batch_size = batch_size
# self.shuffle = shuffle
# self.num_workers = 0 if sys.platform.startswith('win') else num_workers
# self.loader = super().loader(batch_size=128, shuffle=True,
# num_workers=num_workers)
def __repr__(self) -> str:
return (f'{self.__class__.__name__}({self.embedding.weight.size(0)}, '
f'{self.embedding.weight.size(1)})')
def train_(self):
self.train()
total_loss = 0
for pos_rw, neg_rw in self.loader:
self.optimizer.zero_grad()
loss = self.loss(pos_rw.to(device), neg_rw.to(device))
loss.backward()
self.optimizer.step()
total_loss += loss.item()
#print(total_loss / len(self.loader))
return total_loss / len(self.loader)
def train_dw_model(dw_model, dataset):
data = dataset[0]
data = data.to(device)
best_val = 0
for epoch in range(1, 101):
loss = dw_model.train_()
with torch.no_grad():
dw_model.eval()
z = dw_model()
val_acc = dw_model.test(
z[data.train_mask], data.y[data.train_mask],
z[data.val_mask], data.y[data.val_mask],
max_iter=150)
if val_acc> best_val:
best_val = val_acc
torch.save(dw_model.state_dict(), 'dw.pt')
print(f'{dataset}_{dw_model} Epoch: {epoch:02d}, Loss: {loss:.4f},'
f' Val: {val_acc*100:.2f} best Val: {best_val*100:.2f} ')
if __name__ == '__main__':
import os.path as osp
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Planetoid')
dataset = Planetoid(path, 'Cora', transform=T.NormalizeFeatures())
data = dataset[0].to(device)
dw_model = Node2VecV2(data.edge_index, embedding_dim=128, walk_length=20,
context_size=10, walks_per_node=10,
num_negative_samples=1, p=1, q=1, sparse=True,batch_size=128, shuffle=True, num_workers=4, device=device).to(device)
#dw_model.make_loader(batch_size=128, shuffle=True, num_workers=4, device=args.device)
dw_model.optimizer = torch.optim.SparseAdam(list(dw_model.parameters()), lr=0.01)
for i in range(10):
print(f'Epoch {i} loss: ',dw_model.train_())
CPU output:
Epoch 0 loss: 8.111482880332254
Epoch 1 loss: 6.081473242152821
Epoch 2 loss: 4.976185473528775
Epoch 3 loss: 4.138110041618347
Epoch 4 loss: 3.4765207875858652
Epoch 5 loss: 2.960351337086071
Epoch 6 loss: 2.5505979494615034
Epoch 7 loss: 2.2174546501853247
Epoch 8 loss: 1.955638435753909
Epoch 9 loss: 1.7383252869952808
Process finished with exit code 0
CUDA output:
Epoch 0 loss: 1.3862942511385137
Epoch 1 loss: 1.3862942511385137
Epoch 2 loss: 1.3862942511385137
Epoch 3 loss: 1.3862942511385137
Epoch 4 loss: 1.3862942511385137
Epoch 5 loss: 1.3862942511385137
Epoch 6 loss: 1.3862942511385137
Epoch 7 loss: 1.3862942511385137
Epoch 8 loss: 1.3862942511385137
Epoch 9 loss: 1.3862942511385137
Process finished with exit code 0
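A constant loss of 1.38629 is almost exactly 2·ln 2, the value of Node2Vec's objective when the positive and negative scores are still near zero, which suggests the embedding weights never move on the GPU. A generic way to confirm that (a diagnostic sketch using the names above, not a fix) is to snapshot the parameters and compare after one epoch:
before = {name: p.detach().clone() for name, p in dw_model.named_parameters()}
dw_model.train_()
for name, p in dw_model.named_parameters():
    moved = not torch.equal(before[name], p.detach())
    print(name, 'updated' if moved else 'UNCHANGED')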
I'm new to PyTorch and my problem may be a little naive.
I'm training a pretrained VGG16 network on my dataset, which has around 33,000 images in 8 classes with labels [1,2,…,8], and my classes are imbalanced. My problem is that during training, the validation and training accuracy are low and don't increase. Is there any problem in my code?
If not, what do you suggest to improve training?
import torch
import time
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import cv2
import torchvision.models as models
from classify_dataset import Classification_dataset
from torchvision import transforms
transform = transforms.Compose([transforms.Resize((224,224)),
transforms.RandomHorizontalFlip(p=0.5),
transforms.RandomVerticalFlip(p=0.5),
transforms.RandomRotation(degrees=45),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
dataset = Classification_dataset(root_dir=r'//home/arisa/Desktop/Hamid/IQA/Hamid_Dataset',
csv_file=r'/home/arisa/Desktop/Hamid/IQA/new_label.csv',transform=transform)
target = dataset.labels - 1
train_indices, test_indices = train_test_split(np.arange(target.shape[0]), stratify=target)
test_dataset = torch.utils.data.Subset(dataset, indices=test_indices)
train_dataset = torch.utils.data.Subset(dataset, indices=train_indices)
class_sample_count = np.array([len(np.where(target[train_indices] == t)[0]) for t in np.unique(target)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in target[train_indices]])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight), replacement = True)
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=64,
sampler=sampler)
test_loader = torch.utils.data.DataLoader(test_dataset,
batch_size=64,
shuffle=False)
model = models.vgg16(pretrained=True)  # assumed: the snippet omits the model definition, but VGG16 is named above
for param in model.parameters():
param.requires_grad = False
num_ftrs = model.classifier[0].in_features
model.classifier = nn.Linear(num_ftrs,8)
optimizer = Adam(model.parameters(), lr = 0.0001 )
criterion = nn.CrossEntropyLoss()
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.01)
path = '/home/arisa/Desktop/Hamid/IQA/'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
def train_model(model, train_loader,valid_loader, optimizer, criterion, scheduler=None, num_epochs=10 ):
min_valid_loss = np.inf
model.train()
start = time.time()
TrainLoss = []
model = model.to(device)
for epoch in range(num_epochs):
total = 0
correct = 0
train_loss = 0
#lr_scheduler.step()
print('Epoch {}/{}'.format(epoch+1, num_epochs))
print('-' * 10)
train_loss = 0.0
for x,y in train_loader:
x = x.to(device)
#print(y.shape)
y = y.view(y.shape[0],).to(device)
y = y.to(device)
y -= 1
out = model(x)
loss = criterion(out, y)
optimizer.zero_grad()
loss.backward()
TrainLoss.append(loss.item()* y.shape[0])
train_loss += loss.item() * y.shape[0]
_,predicted = torch.max(out.data,1)
total += y.size(0)
correct += (predicted == y).sum().item()
optimizer.step()
lr_scheduler.step()
accuracy = 100*correct/total
valid_loss = 0.0
val_loss = []
model.eval()
val_correct = 0
val_total = 0
with torch.no_grad():
for x_val, y_val in test_loader:
x_val = x_val.to(device)
y_val = y_val.view(y_val.shape[0],).to(device)
y_val -= 1
target = model(x_val)
loss = criterion(target, y_val)
valid_loss += loss.item() * y_val.shape[0]
_,predicted = torch.max(target.data,1)
val_total += y_val.size(0)
val_correct += (predicted == y_val).sum().item()
val_loss.append(loss.item()* y_val.shape[0])
val_acc = 100*val_correct / val_total
print(f'Epoch {epoch + 1} \t\t Training Loss: {train_loss / len(train_loader)} \t\t Validation Loss: {valid_loss / len(test_loader)} \t\t Train Acc:{accuracy} \t\t Validation Acc:{val_acc}')
if min_valid_loss > (valid_loss / len(test_loader)):
print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss / len(test_loader):.6f}) \t Saving The Model')
min_valid_loss = valid_loss / len(test_loader)
state = {'state_dict': model.state_dict(),'optimizer': optimizer.state_dict(),}
torch.save(state,'/home/arisa/Desktop/Hamid/IQA/checkpoint.t7')
end = time.time()
print('TRAIN TIME:')
print('%.2gs'%(end-start))
train_model(model=model, train_loader=train_loader, optimizer=optimizer, criterion=criterion, valid_loader= test_loader,num_epochs=500 )
Thanks in advance
here is the result of 15 epoch
Epoch 1/500
----------
Epoch 1 Training Loss: 205.63448420514916 Validation Loss: 233.89266112356475 Train Acc:39.36360386127994 Validation Acc:24.142040038131555
Epoch 2/500
----------
Epoch 2 Training Loss: 199.05699240435197 Validation Loss: 235.08799531243065 Train Acc:41.90998291820601 Validation Acc:24.27311725452812
Epoch 3/500
----------
Epoch 3 Training Loss: 199.15626737127448 Validation Loss: 236.00033430619672 Train Acc:41.1035633416756 Validation Acc:23.677311725452814
Epoch 4/500
----------
Epoch 4 Training Loss: 199.02581041173886 Validation Loss: 233.60767459869385 Train Acc:41.86628530568466 Validation Acc:24.606768350810295
Epoch 5/500
----------
Epoch 5 Training Loss: 198.61493769454472 Validation Loss: 233.7503859202067 Train Acc:41.53656695665991 Validation Acc:25.0
Epoch 6/500
----------
Epoch 6 Training Loss: 198.71323942956585 Validation Loss: 234.17176149830675 Train Acc:41.639852222619474 Validation Acc:25.369399428026693
Epoch 7/500
----------
Epoch 7 Training Loss: 199.9395153770592 Validation Loss: 234.1744423635078 Train Acc:40.98041552456998 Validation Acc:24.84509056244042
Epoch 8/500
----------
Epoch 8 Training Loss: 199.3533399020355 Validation Loss: 235.4645173188412 Train Acc:41.26643626107337 Validation Acc:24.165872259294567
Epoch 9/500
----------
Epoch 9 Training Loss: 199.6451746921249 Validation Loss: 233.33387595956975 Train Acc:40.96452548365312 Validation Acc:24.59485224022879
Epoch 10/500
----------
Epoch 10 Training Loss: 197.9305159737011 Validation Loss: 233.76405122063377 Train Acc:41.8782028363723 Validation Acc:24.6186844613918
Epoch 11/500
----------
Epoch 11 Training Loss: 199.33247244055502 Validation Loss: 234.41085289463854 Train Acc:41.59218209986891 Validation Acc:25.119161105815063
Epoch 12/500
----------
Epoch 12 Training Loss: 199.87399289874256 Validation Loss: 234.23621463775635 Train Acc:41.028085647320545 Validation Acc:24.49952335557674
Epoch 13/500
----------
Epoch 13 Training Loss: 198.85540591944292 Validation Loss: 234.33149099349976 Train Acc:41.206848607635166 Validation Acc:24.857006673021925
Epoch 14/500
----------
Epoch 14 Training Loss: 199.92641723337513 Validation Loss: 233.37722391070741 Train Acc:41.15520597465539 Validation Acc:24.988083889418494
Epoch 15/500
----------
Epoch 15 Training Loss: 197.82172771698328 Validation Loss: 234.4943131533536 Train Acc:41.69943987605768 Validation Acc:24.380362249761678
You froze your model through
for param in model.parameters():
param.requires_grad = False
which basically says "do not calculate any gradient for any weight", which is equivalent to not updating the weights; hence no optimization happens.
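A quick way to see what the optimizer can actually update is to list the parameters that still require gradients. A minimal sketch, assuming torchvision's VGG16 as in the question (note that the replaced classifier head is trainable again by default):
import torch.nn as nn
import torchvision.models as models

model = models.vgg16(pretrained=True)
for param in model.parameters():   # freezes every pretrained weight, as in the question
    param.requires_grad = False
model.classifier = nn.Linear(model.classifier[0].in_features, 8)  # fresh layer: requires_grad=True
print([name for name, p in model.named_parameters() if p.requires_grad])
# prints only classifier.weight and classifier.bias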
My problem was with model.train(). This call should be inside the training loop, but in my case I put it outside; so once model.eval() was reached, the model stayed in evaluation mode.
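For reference, a minimal sketch of that placement, using the loop structure from the question:
for epoch in range(num_epochs):
    model.train()        # reset to training mode at the top of every epoch
    for x, y in train_loader:
        ...              # forward / backward / step as above
    model.eval()         # switch to evaluation mode only for the validation pass
    with torch.no_grad():
        for x_val, y_val in test_loader:
            ...          # validation as above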
I am relatively new to PyTorch and Huggingface-transformers and experimented with DistilBertForSequenceClassification on this Kaggle dataset.
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch.optim as optim
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup
n_epochs = 5 # or whatever
batch_size = 32 # or whatever
bert_distil = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
#bert_distil.classifier = nn.Sequential(nn.Linear(in_features=768, out_features=1), nn.Sigmoid())
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert_distil.parameters(), lr=0.1)
X_train = []
Y_train = []
for row in train_df.iterrows():
seq = tokenizer.encode(preprocess_text(row[1]['text']), add_special_tokens=True, pad_to_max_length=True)
X_train.append(torch.tensor(seq).unsqueeze(0))
Y_train.append(torch.tensor([row[1]['target']]).unsqueeze(0))
X_train = torch.cat(X_train)
Y_train = torch.cat(Y_train)
running_loss = 0.0
bert_distil.cuda()
bert_distil.train(True)
for epoch in range(n_epochs):
permutation = torch.randperm(len(X_train))
j = 0
for i in range(0,len(X_train), batch_size):
optimizer.zero_grad()
indices = permutation[i:i+batch_size]
batch_x, batch_y = X_train[indices], Y_train[indices]
batch_x.cuda()
batch_y.cuda()
outputs = bert_distil.forward(batch_x.cuda())
loss = criterion(outputs[0],batch_y.squeeze().cuda())
loss.requires_grad = True
loss.backward()
optimizer.step()
running_loss += loss.item()
j+=1
if j == 20:
#print(outputs[0])
print('[%d, %5d] running loss: %.3f loss: %.3f ' %
(epoch + 1, i*1, running_loss / 20, loss.item()))
running_loss = 0.0
j = 0
[1, 608] running loss: 0.689 loss: 0.687
[1, 1248] running loss: 0.693 loss: 0.694
[1, 1888] running loss: 0.693 loss: 0.683
[1, 2528] running loss: 0.689 loss: 0.701
[1, 3168] running loss: 0.690 loss: 0.684
[1, 3808] running loss: 0.689 loss: 0.688
[1, 4448] running loss: 0.689 loss: 0.692 etc...
Regardless of what I tried, the loss never decreased (or even increased), nor did the predictions get better. It seems to me that I forgot something so that the weights are actually not updated. Does anyone have an idea?
What I tried:
Different loss functions
BCE
CrossEntropy
even MSE-loss
One-Hot Encoding vs A single neuron output
Different learning rates, and optimizers
I even changed all the targets to one single label, but even then the network didn't converge.
Looking at the running loss and the minibatch loss is easily misleading. You should look at the epoch loss, because the inputs it averages over are the same for every epoch.
Besides, there are some problems in your code; after fixing all of them the behavior is as expected: the loss slowly decreases after each epoch, and the model can also overfit to a small minibatch. Please look at the code; changes include using model(x) instead of model.forward(x), calling cuda() only once, and a smaller learning rate.
Tuning and fine-tuning ML models is difficult work.
n_epochs = 5
batch_size = 1
bert_distil = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert_distil.parameters(), lr=1e-3)
X_train = []
Y_train = []
for row in train_df.iterrows():
seq = tokenizer.encode(row[1]['text'], add_special_tokens=True, pad_to_max_length=True)[:100]
X_train.append(torch.tensor(seq).unsqueeze(0))
Y_train.append(torch.tensor([row[1]['target']]))
X_train = torch.cat(X_train)
Y_train = torch.cat(Y_train)
running_loss = 0.0
bert_distil.cuda()
bert_distil.train(True)
for epoch in range(n_epochs):
permutation = torch.randperm(len(X_train))
for i in range(0,len(X_train), batch_size):
optimizer.zero_grad()
indices = permutation[i:i+batch_size]
batch_x, batch_y = X_train[indices].cuda(), Y_train[indices].cuda()
outputs = bert_distil(batch_x)
loss = criterion(outputs[0], batch_y)
loss.backward()
optimizer.step()
running_loss += loss.item()
print('[%d] epoch loss: %.3f' %
(epoch + 1, running_loss / len(X_train) * batch_size))
running_loss = 0.0
Output:
[1] epoch loss: 0.695
[2] epoch loss: 0.690
[3] epoch loss: 0.687
[4] epoch loss: 0.685
[5] epoch loss: 0.684
I would highlight two possible reasons for your "stable" results:
I agree that the learning rate is surely too high, which prevents the model from making any significant updates.
But what is important to know is that, based on state-of-the-art papers, finetuning has a very marginal effect on the core NLP abilities of Transformers. For example, one paper says that finetuning only applies really small weight changes, citing it: "Finetuning barely affects accuracy on NEL, COREF and REL indicating that those tasks are already sufficiently covered by pre-training". Several papers suggest that finetuning for classification tasks is basically a waste of time. Thus, considering that DistilBert is actually a student model of BERT, maybe you won't get better results. Try pre-training with your data first; generally, pre-training has a more significant impact.
I ran into a similar problem when I tried to use xxxForSequenceClassification to fine-tune my downstream task.
In the end, I changed xxxForSequenceClassification to xxxModel and added Dropout - FC - Softmax. Magically it was solved; the loss decreased as expected.
I'm still trying to find out why.
I hope it may help you.
FYI, transformers version: 3.5.0
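For concreteness, here is a hedged sketch of that "xxxModel plus Dropout - FC" idea; the class name, dropout rate, and first-token pooling are my assumptions, not the answerer's exact code, and with nn.CrossEntropyLoss the softmax stays implicit in the loss:
import torch.nn as nn
from transformers import DistilBertModel

class DistilBertWithHead(nn.Module):
    def __init__(self, n_classes=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.dim, n_classes)

    def forward(self, input_ids, attention_mask=None):
        hidden = self.bert(input_ids, attention_mask=attention_mask)[0]  # (batch, seq_len, dim)
        return self.fc(self.drop(hidden[:, 0]))  # pool the first-token embedding, emit logits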
Maybe the poor performance is due to gradients being applied to the BERT backbone. Validate it like so:
print([p.requires_grad for p in bert_distil.distilbert.parameters()])
As an alternative solution, try freezing the weights of your trained model:
for param in bert_distil.distilbert.parameters():
param.requires_grad = False
As you are trying to optimize the weights of a trained model during fine-tuning on your data, you face the issues described, among other sources, in the ULMFiT paper (https://arxiv.org/abs/1801.06146).
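One technique from that paper which carries over directly is discriminative learning rates: tiny steps for the pretrained body, larger ones for the fresh head. A hedged sketch with assumed values, reusing bert_distil from the question:
import torch.optim as optim

optimizer = optim.Adam([
    {'params': bert_distil.distilbert.parameters(), 'lr': 2e-5},      # pretrained body
    {'params': bert_distil.pre_classifier.parameters(), 'lr': 1e-3},  # fresh layers
    {'params': bert_distil.classifier.parameters(), 'lr': 1e-3},
])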
I'm trying to train an RBF network on the MNIST database, using the PyTorch framework.
The results are the same in each epoch.
The results:
Epoch: 1
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
Epoch: 2
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
Epoch: 3
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
Epoch: 4
Accuracy: 0.785 Loss: 2.435 Recall: 0.386 Precision: 0.258
My code... I think the problem is somewhere in the linear layer: the model does not improve after training. It seems like the weights don't change, but I don't know why.
class RBF(nn.Module):
def __init__(self, in_layers, centers, sigmas):
super(RBF, self).__init__()
self.in_layers = in_layers
self.centers = nn.Parameter(centers)
self.sigmas = nn.Parameter(torch.Tensor(self.centers.size(0)))
torch.nn.init.constant_(self.sigmas, sigmas)
def forward(self, x):
x = x.view(-1, self.in_layers)
size = [self.centers.size(0), x.size(0)]
sigma = self.sigmas.view(-1).to(device)**2
dists = torch.empty(size).to(device)
for i,c in enumerate(self.centers):
c = c.reshape(-1,c.size(0))
temp = (x-c).pow(2).sum(-1).pow(0.5)
dists[i] = temp
dists = dists.permute(1,0)
phi = torch.exp(-1*(dists/(2*sigma))) #gaussian
return phi
class Net(nn.Module):
def __init__(self, in_layers, centers, sigmas):
super(Net, self).__init__()
self.rbf_layers = nn.ModuleList()
self.linear_layers = nn.ModuleList()
for i in range(len(in_layers) - 1):
self.rbf_layers.append(RBF(in_layers[i], centers, sigmas))
self.linear_layers.append(nn.Linear(centers.size(0), in_layers[i+1], bias = True))
def forward(self, x):
out = x
for i in range(len(self.rbf_layers)):
out = self.rbf_layers[i](out)
out = F.sigmoid( self.linear_layers[i](out.float()) )
return out
Of course the code continues, but I think these are enough to locate the problem (if you want something extra, I'm here). Do you have any ideas?
And the training part of the code:
def training(engine, batch, device, model, criterion, optimizer):
inputs, labels = batch[0].to(device), batch[1].to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
return outputs, labels
def nn_run1(batch, classes, dim, learning_rate, epochs, clusters):
# ---Load Model's Parameters---
train_loader, test_loader = data_loading(batch, shuffle=False)
kmeans_input = train_loader.dataset.train_data
kmeans_input = torch.reshape(kmeans_input.double(), (kmeans_input.size(0), -1))
_, centers = Kmeans(kmeans_input, clusters)
centers = centers.to(device)
sigma = Sigmas(centers)
layers = in_layers(dim, len(classes), layers = 1)
# ---Model Setup---
model = Net(layers, centers, sigma)
model.cuda()
criterion = nn.CrossEntropyLoss()
print(model.parameters)
optimizer = torch.optim.SGD(model.parameters(), learning_rate)
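A quick way to test the "weights don't change" suspicion is to inspect the gradient magnitudes right after loss.backward(); a generic diagnostic sketch using the names above:
# inside training(), immediately after loss.backward():
for name, p in model.named_parameters():
    print(name, 'no grad' if p.grad is None else p.grad.abs().max().item())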
I am doing some image classification using the inception_v3 model in Keras; however, my train accuracy is lower than my validation accuracy during the whole training process, and my validation accuracy is above 0.95 from the first epoch. I also find that the train loss is much higher than the validation loss. In the end, the test accuracy is 0.5, which is pretty bad.
At first my optimizer was Adam with a learning rate of 0.00001, and the result was bad. Then I changed it to SGD with a learning rate of 0.00001, which didn't change the bad result. I also tried increasing the learning rate to 0.1, but the test accuracy was still around 0.5.
import numpy as np
import pandas as pd
import keras
from keras import layers
from keras.applications.inception_v3 import preprocess_input
from keras.models import Model
from keras.layers.core import Dense
from keras.layers import GlobalAveragePooling2D
from keras.optimizers import Adam, SGD, RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.utils import plot_model
from keras.models import model_from_json
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
import math
import copy
import pydotplus
train_path = 'data/train'
valid_path = 'data/validation'
test_path = 'data/test'
top_model_weights_path = 'model_weigh.h5'
# number of epochs to train top model
epochs = 100
# batch size used by flow_from_directory and predict_generator
batch_size = 2
img_width, img_height = 299, 299
fc_size = 1024
nb_iv3_layers_to_freeze = 172
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
rotation_range=30,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True)
# this is the augmentation configuration we will use for testing:
# only rescaling
valid_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
rotation_range=30,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True)
train_batches = train_datagen.flow_from_directory(train_path,
target_size=(img_width, img_height),
classes=None,
class_mode='categorical',
batch_size=batch_size,
shuffle=True)
valid_batches = valid_datagen.flow_from_directory(valid_path,
target_size=(img_width,img_height),
classes=None,
class_mode='categorical',
batch_size=batch_size,
shuffle=True)
test_batches = ImageDataGenerator().flow_from_directory(test_path,
                                                        target_size=(img_width, img_height),
classes=None,
class_mode='categorical',
batch_size=batch_size,
shuffle=False)
nb_train_samples = len(train_batches.filenames)
# get the size of the training set
nb_classes_train = len(train_batches.class_indices)
# get the number of classes
predict_size_train = int(math.ceil(nb_train_samples / batch_size))
nb_valid_samples = len(valid_batches.filenames)
nb_classes_valid = len(valid_batches.class_indices)
predict_size_validation = int(math.ceil(nb_valid_samples / batch_size))
nb_test_samples = len(test_batches.filenames)
nb_classes_test = len(test_batches.class_indices)
predict_size_test = int(math.ceil(nb_test_samples / batch_size))
def add_new_last_layer(base_model, nb_classes):
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(fc_size, activation='relu')(x)
pred = Dense(nb_classes, activation='softmax')(x)
model = Model(input=base_model.input, output=pred)
return model
# freeze base_model layer in order to get the bottleneck feature
def setup_to_transfer_learn(model, base_model):
for layer in base_model.layers:
layer.trainable = False
model.compile(optimizer=Adam(lr=0.00001),
loss='categorical_crossentropy',
metrics=['accuracy'])
base_model = keras.applications.inception_v3.InceptionV3(weights='imagenet', include_top=False)
model = add_new_last_layer(base_model, nb_classes_train)
setup_to_transfer_learn(model, base_model)
model.summary()
train_labels = train_batches.classes
train_labels = to_categorical(train_labels, num_classes=nb_classes_train)
validation_labels = valid_batches.classes
validation_labels = to_categorical(validation_labels, num_classes=nb_classes_train)
history = model.fit_generator(train_batches,
epochs=epochs,
steps_per_epoch=nb_train_samples // batch_size,
validation_data=valid_batches,
validation_steps=nb_valid_samples // batch_size,
class_weight='auto')
# save model to json
model_json = model.to_json()
with open("model.json", "w") as json_file:
json_file.write(model_json)
# serialize model to HDF5
model.save_weights(top_model_weights_path)
print("Saved model to disk")
# model visualization
plot_model(model,
show_shapes=True,
show_layer_names=True,
to_file='model.png')
(eval_loss, eval_accuracy) = model.evaluate_generator(
valid_batches,
steps=nb_valid_samples // batch_size,
verbose=1)
print("[INFO] evaluate accuracy: {:.2f}%".format(eval_accuracy * 100))
print("[INFO] evaluate loss: {}".format(eval_loss))
test_batches.reset()
predictions = model.predict_generator(test_batches,
steps=nb_test_samples / batch_size,
verbose=0)
# print(predictions)
predicted_class_indices = np.argmax(predictions, axis=1)
# print(predicted_class_indices)
labels = train_batches.class_indices
labels = dict((v, k) for k, v in labels.items())
final_predictions = [labels[k] for k in predicted_class_indices]
# print(final_predictions)
# save as csv file
filenames = test_batches.filenames
results = pd.DataFrame({"Filename": filenames,
"Predictions": final_predictions})
results.to_csv("results.csv", index=False)
# evaluation test result
(test_loss, test_accuracy) = model.evaluate_generator(
test_batches,
steps=nb_train_samples // batch_size,
verbose=1)
print("[INFO] test accuracy: {:.2f}%".format(test_accuracy * 100))
print("[INFO] test loss: {}".format(test_loss))
Here is a brief summary of training process:
Epoch 1/100
2000/2000 [==============================] - 146s 73ms/step - loss: 0.4941 - acc: 0.7465 - val_loss: 0.1612 - val_acc: 0.9770
Epoch 2/100
2000/2000 [==============================] - 140s 70ms/step - loss: 0.4505 - acc: 0.7725 - val_loss: 0.1394 - val_acc: 0.9765
Epoch 3/100
2000/2000 [==============================] - 139s 70ms/step - loss: 0.4505 - acc: 0.7605 - val_loss: 0.1643 - val_acc: 0.9560
......
Epoch 98/100
2000/2000 [==============================] - 141s 71ms/step - loss: 0.1348 - acc: 0.9467 - val_loss: 0.0639 - val_acc: 0.9820
Epoch 99/100
2000/2000 [==============================] - 140s 70ms/step - loss: 0.1495 - acc: 0.9365 - val_loss: 0.0780 - val_acc: 0.9770
Epoch 100/100
2000/2000 [==============================] - 138s 69ms/step - loss: 0.1401 - acc: 0.9458 - val_loss: 0.0471 - val_acc: 0.9890
Here is the result that I get:
[INFO] evaluate accuracy: 98.55%
[INFO] evaluate loss: 0.05201659869024259
2000/2000 [==============================] - 47s 23ms/step
[INFO] test accuracy: 51.70%
[INFO] test loss: 7.737395915810134
I hope someone can help me deal with this problem.
As the code is now, you're not freezing the layers of the model for transfer learning. In setup_to_transfer_learn you're freezing the layers in base_model and then compiling the new model (containing layers from the base model), but not actually freezing them on the new model. Just change setup_to_transfer_learn:
def setup_to_transfer_learn(model):
for layer in model.layers[:-3]: # the three layers you added should not be frozen
layer.trainable = False
model.compile(optimizer=Adam(lr=0.00001),
loss='categorical_crossentropy',
metrics=['accuracy'])
Then call the function like this:
model = add_new_last_layer(base_model, nb_classes_train)
setup_to_transfer_learn(model)
You should see a large difference in the number of trainable parameters when calling model.summary()
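A hedged way to quantify that difference with the Keras API used here (model.summary() also prints the trainable/non-trainable totals at the bottom):
from keras import backend as K

trainable = sum(K.count_params(w) for w in model.trainable_weights)
frozen = sum(K.count_params(w) for w in model.non_trainable_weights)
print('trainable:', trainable, 'frozen:', frozen)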
Finally, I solved the problem. I forgot to apply image preprocessing to my test data. After I added it, everything works really fine.
I changed this:
test_batches = ImageDataGenerator().flow_from_directory(test_path,
target_size=(img_width, img_height),
classes=None,
class_mode='categorical',
batch_size=batch_size,
shuffle=False)
to this:
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_batches = test_datagen.flow_from_directory(test_path,
target_size=(img_width, img_height),
classes=None,
class_mode='categorical',
batch_size=batch_size,
shuffle=False)
And the test accuracy is 0.98, test loss is 0.06.
What actually happens is that the model learns features of the preprocessed inputs, so the same preprocessing must be applied at test time. One way to check whether your model is learning good features is using Grad-CAM.