Related
So I'm studiying pytorch coming from a background with tensorflow.
I'm trying to replicate a simple convnet, that I've developed with success in tensorflow, to classify cat vs dogs images.
In pytorch I see some strange behaviors:
Using a Learning Rate of 0.001 make the CNet predicting only 0 after the first batch (might be exploding gradients?)
Using a Learning Rate of 0.0005 gives a smooth learning curve and the CNet converge
Can anyone help me to understand what I'm doing wrong? that the code:
import pathlib
import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data.dataloader import DataLoader
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CNet(torch.nn.Module):
def __init__(self):
super(CNet, self).__init__() #input is 180x180 image
self.conv1 = torch.nn.Conv2d(3, 32, 3) # out -> 178x178x32
self.conv2 = torch.nn.Conv2d(32, 64, 3)
self.conv3 = torch.nn.Conv2d(64, 128, 3)
self.conv4 = torch.nn.Conv2d(128, 256, 3)
self.conv5 = torch.nn.Conv2d(256, 256, 3)
self.flatten = torch.nn.Flatten()
#self.fc = torch.nn.LazyLinear(1)
self.fc = torch.nn.Linear(7*7*256, 1)
def forward(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv4(x)), (2, 2))
x = F.relu(self.conv5(x))
x = self.flatten(x)
o = torch.sigmoid(self.fc(x))
return o
def train(model : CNet, train_data : DataLoader, criterion, optimizer : torch.optim.Optimizer, epochs = 10, validation_data : DataLoader = None):
losses = []
for epoch in range(epochs):
epoch_loss = 0.0
running_loss = 0.0
for i, data in enumerate(train_data, 0):
imgs, labels = data
imgs, labels = imgs.to(device), labels.to(device, dtype=torch.float)
labels = labels.unsqueeze(-1)
# run
output = net(imgs)
# zero out accumulated grads
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
epoch_loss += loss.item()
#if i % 50 == 49:
# print(f'[{epoch+1}, {i:5d}] loss: {running_loss / 50.0:.3f}')
# running_loss = 0.0
losses.append(epoch_loss / len(train_data.dataset))
print(f'[{epoch+1}, {epochs:5d}] loss: {losses[-1]:.3f}')
return losses
if __name__=="__main__":
transforms = torchvision.transforms.Compose([
torchvision.transforms.Resize((180, 180)),
torchvision.transforms.ToTensor(),
])
dataset_dir = pathlib.Path("E:\Datasets\\torch\Cat_Dog\cats_vs_dogs_small")
train_data = torchvision.datasets.ImageFolder(dataset_dir / "train", transform=transforms)
validation_data = torchvision.datasets.ImageFolder(dataset_dir / "validation", transform=transforms)
test_data = torchvision.datasets.ImageFolder(dataset_dir / "test", transform=transforms)
train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2, persistent_workers=True, pin_memory=True)
validation_data_loader = DataLoader(validation_data, batch_size=32, num_workers=2, shuffle=True, pin_memory=True)
test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True, pin_memory=True, num_workers=2)
import matplotlib.pyplot as plt
#plt.figure()
#for i in range(1, 10):
# plt.subplot(3, 3, i)
# plt.axis('off')
# rand_idx = np.random.random_integers(0, len(train_data))
# plt.imshow(np.moveaxis(test_data[rand_idx][0].numpy(), 0, 2))
#plt.show()
net = CNet()
net = net.to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.RMSprop(net.parameters(), 0.001)
net.train()
# TODO save best model
losses = train(net, train_data_loader, criterion, optimizer, epochs=30)
epochs = range(1, len(losses) + 1)
plt.plot(epochs, losses, 'bo', label='Training Loss')
plt.show()
print('Training Finished')
correct_count, all_count = 0, 0
for images,labels in test_data_loader:
images,labels = images.to(device), labels.to(device, dtype=torch.float)
with torch.no_grad():
ps = net(images)
pred_label = (ps > 0.5).to(torch.float)
true_label = labels.unsqueeze(1)
correct_count += (pred_label == true_label).sum().item()
all_count += len(labels)
print("Number Of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))
and here some screenshot of the loss for each point:
LR=0.001 (not convering on pytorch, converging on tensorflow)
LR=0.0005 (converging in 30 epochs) [I know that the validation loss is not 0, accuracy is ~70% but is expected]
As you can see the loss on the two experiment are very different in scale. What might cause that such a weird behavior? I call it 'wierd' cause I never seen that happen on tensorflow.
Is typicall such different behavior between those 2 framework? or am I loosing something?
I am really new to Machine Learning and I am not so well versed in coding in general. However there is need to look through the customers feedback at our store, that average quite a lot each year, yet we cannot tell % of positive, negative and neutral.
Currently I am trying to train a Bert Model to do simple multi labeled sentiment analysis. The input is our store's customers feedback. The customers feedback is not always so clearly defined since customers do tend to tell long and long about their experience and their sentiment is not always so clear. However we managed to get positive, negative and neutral, each set 2247 samples.
But when I try to train it the training accuracy is around 0.4% which is super low. Validation score is around 60%. F1-score is around 60% for each of the label. I wonder what can be done to improve this training accuracy. I have been stuck for a while. Please take a look at my codes and help me out with this.
I have tried changing learning rate (tried all learning rate Bert suggested and 1e-5),changing Max LEN, changing amount of EPOCH, changing drop out rate (0.1, 0.2, 0.3, 0.4, 0.5), but so far nothing yielded results.
#read dataset
df = pd.read_csv("data.csv",header=None, names=['content', 'sentiment'], sep='\;', lineterminator='\r',encoding = "ISO-8859-1",engine="python")
from sklearn.utils import shuffle
df = shuffle(df)
df['sentiment'] = df['sentiment'].replace(to_replace = [-1, 0, 1], value = [0, 1, 2])
df.head()
#Load pretrained FinBert model and get bert tokenizer from it
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#Choose sequence Length
token_lens = []
for txt in df.content:
tokens = tokenizer.encode(txt, max_length=512)
token_lens.append(len(tokens))
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count');
MAX_LEN = 260
#Make a PyTorch dataset
class FIDataset(Dataset):
def __init__(self, texts, targets, tokenizer, max_len):
self.texts = texts
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, item):
text = str(self.texts[item])
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=self.max_len,
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
return_tensors='pt',
)
return {
'text': text,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
#split test and train
df_train, df_test = train_test_split(
df,
test_size=0.1,
random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
df_test,
test_size=0.5,
random_state=RANDOM_SEED
)
df_train.shape, df_val.shape, df_test.shape
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
ds = FIDataset(
texts=df.content.to_numpy(),
targets=df.sentiment.to_numpy(),
tokenizer=tokenizer,
max_len=max_len
)
return DataLoader(
ds,
batch_size=batch_size,
num_workers=4
)
#Load data into train, test, val
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1)
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def forward(self, input_ids, attention_mask):
returned = self.bert(
input_ids=input_ids,
attention_mask=attention_mask
)
pooled_output = returned["pooler_output"]
output = self.drop(pooled_output)
return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr= 2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
model,
data_loader,
loss_fn,
optimizer,
device,
scheduler,
n_examples
):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
#Eval model function
def eval_model(model, data_loader, loss_fn, device, n_examples):
model = model.eval()
losses = []
correct_predictions = 0
with torch.no_grad():
torch.cuda.empty_cache()
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
return correct_predictions.double() / n_examples, np.mean(losses)
#training loop through each epochs
import torch
torch.cuda.empty_cache()
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(
model,
train_data_loader,
loss_fn,
optimizer,
device,
scheduler,
len(df_train)
)
print(f'Train loss {train_loss} accuracy {train_acc}')
val_acc, val_loss = eval_model(
model,
val_data_loader,
loss_fn,
device,
len(df_val)
)
print(f'Val loss {val_loss} accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_accuracy:
torch.save(model.state_dict(), 'best_model_state.bin')
best_accuracy = val_acc
-- Edit: I have printed out preds and targets as well as train and val accuracy
Here _, preds = torch.max(outputs, dim=1), you probably want argmax, not max?
Print out preds and targets to better see what's going on.
Edit after preds and targets printed out. For epochs 4 and 5, preds matches targets exactly, so train accuracy should be 1. I think the issue is that the accuracy is divided by n_examples, which is a number of examples in the whole train dataset, while it should be divided by the number of examples in the epoch.
For the following training program, training and validation are all ok.
Once reach to Test method, I have CUDA out of memory. What should I change so that I have enough memory to test as well.
import torch
from torchvision import datasets, transforms
import torch.nn.functional as f
class CnnLstm(nn.Module):
def __init__(self):
super(CnnLstm, self).__init__()
self.cnn = CNN()
self.rnn = nn.LSTM(input_size=180000, hidden_size=256, num_layers=2, batch_first=True)#stacked LSTM with 2 layers
#print(num_classes)
self.linear = nn.Linear(256, num_classes)
#print('after num_classes')
def forward(self, x):
#print(x.shape)
batch_size, time_steps, channels, height, width = x.size()
c_in = x.view(batch_size * time_steps, channels, height, width)
_, c_out = self.cnn(c_in)
r_in = c_out.view(batch_size, time_steps, -1)
r_out, (_, _) = self.rnn(r_in)
r_out2 = self.linear(r_out[:, -1, :])
return f.log_softmax(r_out2, dim=1)
class TrainCNNLSTM:
def __init__(self):
self.seed = 1
self.batch_size = 8
self.validate_batch_size = 8
self.test_batch_size = 1
self.epoch = 20
self.learning_rate = 0.01
self.step = 100
self.train_loader = None
self.validate_loader = None
self.test_loader = None
#print('before')
self.model = CnnLstm().to(device)
#print('after')
self.criterion = nn.CrossEntropyLoss()
def load_data(self):
data_loader = DataLoader()
self.train_loader = data_loader.get_train_data(self.batch_size)
self.validate_loader = data_loader.get_validate_data(self.validate_batch_size)
self.test_loader = data_loader.get_test_data(self.test_batch_size)
def train(self):
optimizer = torch.optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.9)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=self.learning_rate/100.0, max_lr=self.learning_rate, step_size_up=13)
#optimizer = torch.optim.SGD(self.model.parameters(), lr=self.learning_rate)
for epoch in range(self.epoch):
t_losses=[]
for iteration, (data, target) in enumerate(self.train_loader):
data = np.expand_dims(data, axis=1)
data = torch.FloatTensor(data)
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = self.model(data)
loss = self.criterion(output, target)
#loss = f.nll_loss(output, target)
t_losses.append(loss)
loss.backward()
optimizer.step()
scheduler.step()
if iteration % self.step == 0:
print('Epoch: {} | train loss: {:.4f}'.format(epoch, loss.item()))
avgd_trainloss = sum(t_losses)/len(t_losses)
self.validate(epoch, avgd_trainloss)
def validate(self, epoch, avg_tloss):
v_losses=[]
with torch.no_grad():
for iteration, (data, target) in enumerate(self.validate_loader):
data = np.expand_dims(data, axis=1)
data = torch.FloatTensor(data)
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
output = self.model(data)
loss = self.criterion(output, target)
#loss = f.nll_loss(output, target)
v_losses.append(loss)
avgd_validloss = sum(v_losses)/len(v_losses)
print('Epoch: {} | train loss: {:.4f} | validate loss: {:.4f}'.format(epoch, avg_tloss, avgd_validloss))
def test(self):
test_loss = []
correct = 0
for data, target in self.test_loader:
data = np.expand_dims(data, axis=1)
data = torch.FloatTensor(data)
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output = self.model(data)
loss = self.criterion(output, target)
#f.nll_loss(output, target, size_average=False).item() # sum up batch loss
test_loss.append(loss)
pred = torch.max(output, 1)[1].data.squeeze()
correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()
test_loss = sum(test_loss)/len(test_loss)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, len(self.test_loader.dataset),
100. * correct / len(self.test_loader.dataset)))
train = TrainCNNLSTM()
train.load_data()
train.train()
train.test()
You should call .item() on your loss when appending it to the list of losses:
loss = self.criterion(output, target)
test_loss.append(loss.item())
This avoids accumulating tensors in a list which are still attached to the computational graph. I would say the same for your accuracy.
Here are my code by using Pysft
class Arguments:
def __init__(self):
# self.cuda = False
self.no_cuda = True
self.seed = 1
self.batch_size = 50
self.test_batch_size = 1000
self.epochs = 10
self.lr = 0.01
self.momentum = 0.5
self.log_interval = 10
hook = sy.TorchHook(torch)
bob = sy.VirtualWorker(hook, id="bob")
alice = sy.VirtualWorker(hook, id="alice")
Here is my LSTM model, in can run successfully by only use pytorch, but it can't run with pysyft
class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
self.rnn = torch.nn.RNN(input_size=28,
hidden_size=16,
num_layers=2,
batch_first=True,
bidirectional=True)
self.fc = torch.nn.Linear(32, 10)
def forward(self, x):
print(np.shape(x))
x = x.squeeze()
x, _ = self.rnn(x)
x = self.fc(x[:, -1, :])
return x.view(-1, 10)
def train(args, model, device, federated_train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader):
model.send(data.location) # <-- NEW: send the model to the right location
data, target = data.to(device), target.to(device)
# data, target = data.cuda(), target.cuda()
optimizer.zero_grad()
output = model(data.to(device))
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
model.get() # <-- NEW: get the model back
if batch_idx % args.log_interval == 0:
loss = loss.get() # <-- NEW: get the loss back
losses.append(loss.item())
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
100. * batch_idx / len(federated_train_loader), loss.item()))
When I use Pysyft to run my LSTM model,there is a mistakes.But if I use my model without Pysyft,it an run scuccessfully.I don't know how to resolve it?
import torch
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
import torch.nn.functional as F
import time
import numpy as np
import syft as sy
class Arguments:
def __init__(self):
self.cuda = False
self.no_cuda = True
self.seed = 1
self.batch_size = 50
self.test_batch_size = 1000
self.epochs = 10
self.lr = 0.01
self.momentum = 0.5
self.log_interval = 10
hook = sy.TorchHook(torch) # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob") # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice") # <-- NEW: and alice
class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
self.rnn = torch.nn.RNN(input_size=28,
hidden_size=16,
num_layers=2,
batch_first=True,
bidirectional=True)
self.fc = torch.nn.Linear(32, 10)
def forward(self, x):
print(np.shape(x))
x = x.squeeze()
x, _ = self.rnn(x)
x = self.fc(x[:, -1, :])
return x.view(-1, 10)
def train(args, model, device, federated_train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
model.send(data.location) # <-- NEW: send the model to the right location
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data.to(device))
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
model.get() # <-- NEW: get the model back
if batch_idx % args.log_interval == 0:
loss = loss.get() # <-- NEW: get the loss back
losses.append(loss.item())
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
100. * batch_idx / len(federated_train_loader), loss.item()))
if __name__ == '__main__':
args = Arguments()
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
losses = []
federated_train_loader = sy.FederatedDataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
.federate((bob, alice)), # <-- NEW: we distribute the dataset across all the workers, it's now a FederatedDataset
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
model = Model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
t = time.time()
for epoch in range(1, args.epochs + 1):
train(args, model, device, federated_train_loader, optimizer, epoch)
test(args, model, device, test_loader)
plt.plot(range(0,160),losses,marker='o')
plt.xlabel("iterator")
plt.ylabel("loss")
plt.show()
total_time = time.time() - t
print(total_time)
Here are the whole codes
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import syft as sy
hook = sy.TorchHook(torch)
bob = sy.VirtualWorker(hook, id="bob")
alice = sy.VirtualWorker(hook, id="alice")
class Arguments():
def __init__(self):
self.batch_size = 64
self.test_batch_size = 1000
self.epochs = 10
self.lr = 0.01
self.momentum = 0.5
self.no_cuda = False
self.seed = 1
self.log_interval = 10
self.save_model = False
args = Arguments()
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
.federate((bob, alice)),
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, 500)
self.fc2 = nn.Linear(500, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
model = Net()
model = model.to(device) #pushing the model into available device.
optimizer = optim.SGD(model.parameters(), lr=0.01)
for epoch in range(1, args.epochs + 1):
# Train the model
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader): # iterate through each worker's dataset
model.send(data.location) #send the model to the right location ; data.location returns the worker name in which the data is present
data, target = data.to(device), target.to(device) # pushing both the data and target labels onto the available device.
optimizer.zero_grad() # 1) erase previous gradients (if they exist)
output = model(data) # 2) make a prediction
loss = F.nll_loss(output, target) # 3) calculate how much we missed
loss.backward() # 4) figure out which weights caused us to miss
optimizer.step() # 5) change those weights
model.get() # get the model back (with gradients)
if batch_idx % args.log_interval == 0:
loss = loss.get() #get the loss back
print('Epoch: {} [Training: {:.0f}%]\tLoss: {:.6f}'.format(epoch, 100. * batch_idx / len(federated_train_loader), loss.item()))
# Test the model
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data) # Getting a prediction
test_loss += F.nll_loss(output, target, reduction='sum').item() #updating test loss
pred = output.argmax(1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item() #correct pred in the current test set.
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))
torch.save(model.state_dict(), "mnist_cnn.pt")
I hav tested the above code in torch 1.x and pysyft 0.2.5,And its working. (but with cnn model)...
just change the dataloader and model here.
I’m trying to solve a VQA classification problem. my training loss is not changing at all while training the model.
I put in comment the CNN model and try to run it with the text only, but still, no change in loss value.
I pass through those models:
class question_lstm(nn.Module):
def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, output_dim, que_size):
super(question_lstm, self).__init__()
self.hid_dim = hid_dim
self.n_layers = n_layers
self.embedding = nn.Embedding(input_dim, emb_dim)
self.tanh = nn.Tanh()
self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
self.dropout = nn.Dropout(dropout)
#self.fc1=nn.Linear(n_layers*hid_dim,que_size)
self.fc1=nn.Linear(n_layers*output_dim,que_size)
def forward(self, question):
emb_question=self.embedding(question) #(batchsize, input_dim, emb_dim=256)
emb_question=self.dropout(emb_question)
emb_question=self.tanh(emb_question)
emb_question = emb_question.transpose(0, 1) #(input_dim, batchsize, emb_dim)
output, (hidden, cell) = self.lstm(emb_question)
qu_feature = torch.cat((hidden, cell), dim=2)
qu_feature = qu_feature.transpose(0, 1) #(batchsize=100, num_layer=2, hid_dim=2048)
question_output =self.fc1(qu_feature)
return question_output
class vqamodel(nn.Module):
def __init__(self, output_dim,input_dim, emb_dim, hid_dim, n_layers, dropout, answer_len, que_size,):
super(vqamodel,self).__init__()
#self.image=img_CNN(img_size,image_feature)
self.question=question_lstm(input_dim, emb_dim, hid_dim, n_layers, dropout,output_dim,que_size)
self.tanh=nn.Tanh()
self.relu=nn.ReLU()
self.dropout=nn.Dropout(dropout)
self.fc1=nn.Linear(que_size,output_dim)
self.fc2=nn.Linear(output_dim,answer_len)
def forward(self, image, question):
question_emb=self.question(question)
combine =question_emb #*img_emb
out_feature=self.fc1(combine) #(batchsize=100, output_dim=2048)
out_feature=self.relu(out_feature)
out_feature=self.dropout(out_feature)
out_feature=self.fc2(out_feature) #(batchsize=100, answer_len=1000)
return (out_feature)
I’m using cross entropy loss and Adam:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vqa_model.parameters(),lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
any idea what can cause this constant loss value?
the train loop:
def train(model,criterion,optimizer,scheduler):
start_time = time.time() #the time we start the train
for epoch in range(num_epochs):
train_loss = 0
#test_loss = 0
train_correct = 0
#test_correct = 0
vqa_model.train()
for i,sample in enumerate(train_VQAdataset_loader):
#image = sample['image'].to(device=device)
question = sample['question'].to(torch.int64).to(device=device)
label = sample['answer'].to(device=device)
output = vqa_model(image, question) # forward
loss = criterion(output, label)
optimizer.zero_grad() # Zero the gradients
loss.backward() # backprop
optimizer.step() # Update weights
scheduler.step()
# Statitcs
train_loss += loss.item() # save the loss for the entire epoch
_, predictions = torch.max(output, 1)
train_correct += (predictions == label).sum() #number of success - cumulative
train_losses.append(train_loss / len(train_VQAdataset_loader))