PyTorch: Add validation error in training - python-3.x

I am using PyTorch to train a CNN model. Here is my network architecture:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as I
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1_bn = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.conv2_drop = nn.Dropout2d()
        self.conv2_bn = nn.BatchNorm2d(64)
        self.fc1 = torch.nn.Linear(53*53*64, 256)
        self.fc2 = nn.Linear(256, 136)

    def forward(self, x):
        x = F.relu(self.conv1_bn(self.pool(self.conv1(x))))
        x = F.relu(self.conv2_bn(self.pool(self.conv2_drop(self.conv2(x)))))
        x = x.view(-1, 53*53*64)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return x
Then I train the model like below:
# prepare the net for training
net.train()

for epoch in range(n_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    # train on batches of data, assumes you already have train_loader
    for batch_i, data in enumerate(train_loader):
        # get the input images and their corresponding labels
        images = data['image']
        key_pts = data['keypoints']
        # flatten pts
        key_pts = key_pts.view(key_pts.size(0), -1)
        # wrap them in a torch Variable
        images, key_pts = Variable(images), Variable(key_pts)
        # convert variables to floats for regression loss
        key_pts = key_pts.type(torch.FloatTensor)
        images = images.type(torch.FloatTensor)
        # forward pass to get outputs
        output_pts = net(images)
        # calculate the loss between predicted and target keypoints
        loss = criterion(output_pts, key_pts)
        # zero the parameter (weight) gradients
        optimizer.zero_grad()
        # backward pass to calculate the weight gradients
        loss.backward()
        # update the weights
        optimizer.step()
        # print loss statistics
        running_loss += loss.data[0]
I am wondering if it is possible to add the validation error during training, i.e. something like this (validation split) in Keras:
myModel.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2, validation_split = 0.1)

Here is an example of how to split your dataset into training and validation sets, then switch between the two phases every epoch:
import numpy as np
import torch
from torchvision import datasets
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler
# Examples:
my_dataset = datasets.MNIST(root="/home/benjamin/datasets/mnist", train=True, download=True)
validation_split = 0.1
dataset_len = len(my_dataset)
indices = list(range(dataset_len))
# Randomly splitting indices:
val_len = int(np.floor(validation_split * dataset_len))
validation_idx = np.random.choice(indices, size=val_len, replace=False)
train_idx = list(set(indices) - set(validation_idx))
# Contiguous split
# train_idx, validation_idx = indices[split:], indices[:split]
## Defining the samplers for each phase based on the random indices:
train_sampler = SubsetRandomSampler(train_idx)
validation_sampler = SubsetRandomSampler(validation_idx)
train_loader = torch.utils.data.DataLoader(my_dataset, sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(my_dataset, sampler=validation_sampler)
data_loaders = {"train": train_loader, "val": validation_loader}
data_lengths = {"train": len(train_idx), "val": val_len}
# Training with Validation (your code + code from Pytorch tutorial: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)
n_epochs = 40
net = ...

for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)

    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            optimizer = scheduler(optimizer, epoch)  # scheduler is assumed to be an lr-adjusting function, as in the linked tutorial
            net.train(True)  # Set model to training mode
        else:
            net.train(False)  # Set model to evaluate mode

        running_loss = 0.0

        # Iterate over data.
        for data in data_loaders[phase]:
            # get the input images and their corresponding labels
            images = data['image']
            key_pts = data['keypoints']
            # flatten pts
            key_pts = key_pts.view(key_pts.size(0), -1)
            # wrap them in a torch Variable
            images, key_pts = Variable(images), Variable(key_pts)
            # convert variables to floats for regression loss
            key_pts = key_pts.type(torch.FloatTensor)
            images = images.type(torch.FloatTensor)
            # forward pass to get outputs
            output_pts = net(images)
            # calculate the loss between predicted and target keypoints
            loss = criterion(output_pts, key_pts)
            # zero the parameter (weight) gradients
            optimizer.zero_grad()
            # backward + optimize only if in training phase
            if phase == 'train':
                loss.backward()
                # update the weights
                optimizer.step()
            # print loss statistics (use loss.item() on PyTorch >= 0.4)
            running_loss += loss.data[0]
        epoch_loss = running_loss / data_lengths[phase]
        print('{} Loss: {:.4f}'.format(phase, epoch_loss))
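If you prefer not to define samplers, a shorter variant is torch.utils.data.random_split. This is only a sketch: it reuses the my_dataset name from the example above, and the batch size is an arbitrary choice.

from torch.utils.data import DataLoader, random_split

# split the same dataset 90/10 and build one loader per phase
val_len = int(0.1 * len(my_dataset))
train_len = len(my_dataset) - val_len
train_set, val_set = random_split(my_dataset, [train_len, val_len])

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
validation_loader = DataLoader(val_set, batch_size=32, shuffle=False)
data_loaders = {"train": train_loader, "val": validation_loader}
data_lengths = {"train": train_len, "val": val_len}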

Related

PyTorch simple ConvNet diverges so easily

So I'm studying PyTorch, coming from a TensorFlow background.
I'm trying to replicate a simple ConvNet, which I've built successfully in TensorFlow, to classify cat vs dog images.
In PyTorch I see some strange behaviors:
Using a learning rate of 0.001 makes the CNet predict only 0 after the first batch (might be exploding gradients?)
Using a learning rate of 0.0005 gives a smooth learning curve and the CNet converges
Can anyone help me understand what I'm doing wrong? Here is the code:
import pathlib
import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data.dataloader import DataLoader
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CNet(torch.nn.Module):
    def __init__(self):
        super(CNet, self).__init__() #input is 180x180 image
        self.conv1 = torch.nn.Conv2d(3, 32, 3) # out -> 178x178x32
        self.conv2 = torch.nn.Conv2d(32, 64, 3)
        self.conv3 = torch.nn.Conv2d(64, 128, 3)
        self.conv4 = torch.nn.Conv2d(128, 256, 3)
        self.conv5 = torch.nn.Conv2d(256, 256, 3)
        self.flatten = torch.nn.Flatten()
        #self.fc = torch.nn.LazyLinear(1)
        self.fc = torch.nn.Linear(7*7*256, 1)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv4(x)), (2, 2))
        x = F.relu(self.conv5(x))
        x = self.flatten(x)
        o = torch.sigmoid(self.fc(x))
        return o

def train(model : CNet, train_data : DataLoader, criterion, optimizer : torch.optim.Optimizer, epochs = 10, validation_data : DataLoader = None):
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0.0
        running_loss = 0.0
        for i, data in enumerate(train_data, 0):
            imgs, labels = data
            imgs, labels = imgs.to(device), labels.to(device, dtype=torch.float)
            labels = labels.unsqueeze(-1)
            # run
            output = net(imgs)
            # zero out accumulated grads
            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            epoch_loss += loss.item()
            #if i % 50 == 49:
            #    print(f'[{epoch+1}, {i:5d}] loss: {running_loss / 50.0:.3f}')
            #    running_loss = 0.0
        losses.append(epoch_loss / len(train_data.dataset))
        print(f'[{epoch+1}, {epochs:5d}] loss: {losses[-1]:.3f}')
    return losses

if __name__=="__main__":
    transforms = torchvision.transforms.Compose([
        torchvision.transforms.Resize((180, 180)),
        torchvision.transforms.ToTensor(),
    ])
    dataset_dir = pathlib.Path("E:\Datasets\\torch\Cat_Dog\cats_vs_dogs_small")
    train_data = torchvision.datasets.ImageFolder(dataset_dir / "train", transform=transforms)
    validation_data = torchvision.datasets.ImageFolder(dataset_dir / "validation", transform=transforms)
    test_data = torchvision.datasets.ImageFolder(dataset_dir / "test", transform=transforms)
    train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2, persistent_workers=True, pin_memory=True)
    validation_data_loader = DataLoader(validation_data, batch_size=32, num_workers=2, shuffle=True, pin_memory=True)
    test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True, pin_memory=True, num_workers=2)

    import matplotlib.pyplot as plt
    #plt.figure()
    #for i in range(1, 10):
    #    plt.subplot(3, 3, i)
    #    plt.axis('off')
    #    rand_idx = np.random.random_integers(0, len(train_data))
    #    plt.imshow(np.moveaxis(test_data[rand_idx][0].numpy(), 0, 2))
    #plt.show()

    net = CNet()
    net = net.to(device)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.RMSprop(net.parameters(), 0.001)
    net.train()
    # TODO save best model
    losses = train(net, train_data_loader, criterion, optimizer, epochs=30)

    epochs = range(1, len(losses) + 1)
    plt.plot(epochs, losses, 'bo', label='Training Loss')
    plt.show()
    print('Training Finished')

    correct_count, all_count = 0, 0
    for images, labels in test_data_loader:
        images, labels = images.to(device), labels.to(device, dtype=torch.float)
        with torch.no_grad():
            ps = net(images)
        pred_label = (ps > 0.5).to(torch.float)
        true_label = labels.unsqueeze(1)
        correct_count += (pred_label == true_label).sum().item()
        all_count += len(labels)
    print("Number Of Images Tested =", all_count)
    print("\nModel Accuracy =", (correct_count/all_count))
and here are some screenshots of the loss for each run:
LR=0.001 (not converging in PyTorch, converging in TensorFlow)
LR=0.0005 (converging in 30 epochs) [I know that the validation loss is not 0 and accuracy is ~70%, but that is expected]
As you can see, the losses of the two experiments are on very different scales. What might cause such weird behavior? I call it 'weird' because I have never seen it happen in TensorFlow.
Is such a difference in behavior between the two frameworks typical, or am I missing something?
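One way to test the exploding-gradient hypothesis raised above is a quick check (a sketch, not part of the original post; the max_norm value of 5.0 is arbitrary): log the global gradient norm right after loss.backward() and clip it if it blows up.

# inside the training loop, between loss.backward() and optimizer.step():
total_norm = float(torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=5.0))
if total_norm > 5.0:
    print(f"gradient norm {total_norm:.1f} exceeded 5.0 and was clipped")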

Can the output labels follow the order of the input text?

I am new to NLP.
In PyTorch, I'm training BertForSequenceClassification for a multi-label task.
Is it possible for the output labels to match the order of the text?
Input is text like: name, phone, address, and the output is: label_name, label_phone, label_address.
Likewise, for input: phone, address, name, the output is still: label_name, label_phone, label_address.
However, I'd like the output to look something like this: label_phone, label_address, label_name.
Is there anything I can change?
## Package
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import torch.utils.data as data
# BERT Related Libraries
from transformers import BertTokenizer, BertForSequenceClassification
# Python
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
# ML Parameters
epoch = 8
device = 'cuda'
num_labels = 9
batch_size = 32
epsilon = 1e-8
learning_rate = 5e-5
## Define model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
optimizer = optim.AdamW(model.parameters(), lr = learning_rate, eps = epsilon, weight_decay = 1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 1, gamma=0.1, last_epoch=-1)
criterion = nn.BCEWithLogitsLoss()
train_losses = []
train_acces = []
val_losses = []
val_acces = []
## Train with training set
def train(model, iterator, optimizer, criterion, device):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (sentences, labels) in enumerate(iterator):
        # tokenize the sentences
        encoding = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        # generate prediction
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)  # NOT USING INTERNAL CrossEntropyLoss
        # compute gradients and update weights
        loss = criterion(outputs.logits, labels)  # BCEWithLogitsLoss has sigmoid
        loss.backward()
        optimizer.step()
        # accumulate train loss
        train_loss += loss.item()
        # record processed data count
        prob = outputs.logits.sigmoid()
        total += (labels.size(0)*labels.size(1))
        # take the index of the highest prob as prediction output
        THRESHOLD = 0.7
        prediction = prob.detach().clone()
        prediction[prediction > THRESHOLD] = 1
        prediction[prediction <= THRESHOLD] = 0
        correct += prediction.eq(labels).sum().item()
    train_acc = correct/total
    print('train_loss: %f\t' % (train_loss))
    print('train_acc: %f\t' % (train_acc))
    return train_loss, train_acc

## Validate with testing set
def test(model, iterator, optimizer, criterion, device):
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (sentences, labels) in enumerate(iterator):
            # tokenize the sentences
            encoding = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']
            # generate prediction
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()
            # record processed data count
            prob = outputs.logits.sigmoid()
            total += (labels.size(0)*labels.size(1))
            # take the index of the highest prob as prediction output
            THRESHOLD = 0.7
            prediction = prob.detach().clone()
            prediction[prediction > THRESHOLD] = 1
            prediction[prediction <= THRESHOLD] = 0
            correct += prediction.eq(labels).sum().item()
    val_acc = correct/total
    print('val_loss: %f\t' % (val_loss))
    print('val_acc: %f\t' % (val_acc))
    print('correct: %i , total: %i' % (correct, total))
    return val_loss, val_acc

for e in range(epoch):
    print("===== Epoch %i =====" % e)
    print("Training started ...")
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    train_losses.append(train_loss/len(train_loader))
    train_acces.append(train_acc/len(train_loader))
    # validation testing
    print("Testing started ...")
    val_loss, val_acc = test(model, test_loader, optimizer, criterion, device)
    val_losses.append(val_loss/len(test_loader))
    val_acces.append(val_acc/len(test_loader))
    scheduler.step()
If you need more information, please let me know.
Thank you all.

Function AddmmBackward returned an invalid gradient

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

def UploadData(path, train):
    #set up transforms for train and test datasets
    train_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224), transforms.RandomRotation(30),
                                           transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    valid_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224), transforms.RandomRotation(30),
                                           transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    test_transforms = transforms.Compose([transforms.Grayscale(num_output_channels=1), transforms.Resize(255), transforms.CenterCrop(224), transforms.ToTensor()])
    #set up datasets from Image Folders
    train_dataset = datasets.ImageFolder(path + '/train', transform=train_transforms)
    valid_dataset = datasets.ImageFolder(path + '/validation', transform=valid_transforms)
    test_dataset = datasets.ImageFolder(path + '/test', transform=test_transforms)
    #set up dataloaders with batch size of 32
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=True)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
    return trainloader, validloader, testloader

trainloader, validloader, testloader = UploadData("/home/lns/research/dataset", True)

epochs = 5
min_valid_loss = np.inf
for e in range(epochs):
    train_loss = 0.0
    for data, labels in trainloader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            print("using GPU for data")
            data, labels = data.cuda(), labels.cuda()
        # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate Loss
        train_loss += loss.item()

    valid_loss = 0.0
    model.eval() # Optional when not using Model Specific layer
    for data, labels in validloader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            print("using GPU for data")
            data, labels = data.cuda(), labels.cuda()
        # Forward Pass
        target = net(data)
        # Find the Loss
        loss = criterion(target, labels)
        # Calculate Loss
        valid_loss += loss.item()

    print('Epoch ', e+1, '\t\t Training Loss: ', train_loss / len(trainloader), ' \t\t Validation Loss: ', valid_loss / len(validloader))
    if min_valid_loss > valid_loss:
        print("Validation Loss Decreased(", min_valid_loss, "--->", valid_loss, ") \t Saving The Model")
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(net.state_dict(), '/home/lns/research/MODEL.pth')
After searching a lot, I am asking for help. Can someone help me
understand why this error occurs during backpropagation?
I followed the PyTorch CNN tutorial and a GeeksforGeeks tutorial.
The dataset consists of X-ray images converted to grayscale and resized to 255.
Is my neural network wrong, or is the data not processed correctly?
This is a size mismatch between the output of your CNN and the number of neurons in your first fully-connected layer. Because of the missing padding, the number of elements when flattened is 16*4*4, i.e. 256 (and not 16*5*5):
self.fc1 = nn.Linear(256, 120)
Once modified, the model will run correctly:
>>> model = NeuralNetwork()
>>> model(torch.rand(1, 1, 28, 28)).shape
torch.Size([1, 3])
Alternatively, you can use nn.LazyLinear, which will deduce the in_features argument during the very first inference based on its input shape.
self.fc1 = nn.LazyLinear(120)
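If your real inputs are not 28x28 (the transforms in this question crop to 224x224), the same shape bookkeeping applies; a quick way to read off the required in_features is a sketch like the following, reusing the layer definitions from the question and a dummy tensor of the actual input size:
import torch
import torch.nn as nn
import torch.nn.functional as F

conv1 = nn.Conv2d(1, 6, 5)
pool = nn.MaxPool2d(2, 2)
conv2 = nn.Conv2d(6, 16, 5)

with torch.no_grad():
    dummy = torch.zeros(1, 1, 224, 224)  # one grayscale 224x224 image
    out = pool(F.relu(conv2(pool(F.relu(conv1(dummy))))))
print(out.flatten(1).shape[1])  # the in_features fc1 must expect (16*53*53 = 44944 here)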

VGG probabilities don't add up to 1, PyTorch

I've trained a vgg16 model to predict 102 classes of flowers.
It works, however now that I'm trying to understand one of its predictions, I feel it's not acting normally.
model layout
# Imports here
import os
import numpy as np
import torch
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import json
from pprint import pprint
from scipy import misc
%matplotlib inline
data_dir = 'flower_data'
train_dir = data_dir + '/train'
test_dir = data_dir + '/valid'
json_data=open('cat_to_name.json').read()
main_classes = json.loads(json_data)
main_classes = {int(k):v for k,v in classes.items()}
train_transform_2 = transforms.Compose([transforms.RandomResizedCrop(224),
transforms.RandomRotation(30),
transforms.RandomHorizontalFlip(),
transforms.ToTensor()])
test_transform_2= transforms.Compose([transforms.RandomResizedCrop(224),
transforms.ToTensor()])
# TODO: Load the datasets with ImageFolder
train_data = datasets.ImageFolder(train_dir, transform=train_transform_2)
test_data = datasets.ImageFolder(test_dir, transform=test_transform_2)
# define dataloader parameters
batch_size = 20
num_workers=0
# TODO: Using the image datasets and the trainforms, define the dataloaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
num_workers=num_workers, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
num_workers=num_workers, shuffle=True)
vgg16 = models.vgg16(pretrained=True)
# Freeze training for all "features" layers
for param in vgg16.features.parameters():
    param.requires_grad = False
import torch.nn as nn
n_inputs = vgg16.classifier[6].in_features
# add last linear layer (n_inputs -> 102 flower classes)
# new layers automatically have requires_grad = True
last_layer = nn.Linear(n_inputs, len(classes))
vgg16.classifier[6] = last_layer
import torch.optim as optim
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
# specify optimizer (stochastic gradient descent) and learning rate = 0.001
optimizer = optim.SGD(vgg16.classifier.parameters(), lr=0.001)
pre_trained_model=torch.load("model.pt")
new=list(pre_trained_model.items())
my_model_kvpair=vgg16.state_dict()
count=0
for key, value in my_model_kvpair.items():
    layer_name, weights = new[count]
    my_model_kvpair[key] = weights
    count += 1
# number of epochs to train the model
n_epochs = 6
# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
for epoch in range(1, n_epochs+1):
    # keep track of training and validation loss
    train_loss = 0.0
    valid_loss = 0.0

    ###################
    # train the model #
    ###################
    # model by default is set to train
    vgg16.train()
    for batch_i, (data, target) in enumerate(train_loader):
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = vgg16(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()
        if batch_i % 20 == 19:  # print training loss every specified number of mini-batches
            print('Epoch %d, Batch %d loss: %.16f' %
                  (epoch, batch_i + 1, train_loss / 20))
            train_loss = 0.0

    ######################
    # validate the model #
    ######################
    vgg16.eval() # prep model for evaluation
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = vgg16(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss
        valid_loss += loss.item()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1,
        train_loss,
        valid_loss
        ))

    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(vgg16.state_dict(), 'model.pt')
        valid_loss_min = valid_loss
testing on a single image
tensor = torch.from_numpy(test_image)
reshaped = tensor.permute(2, 0, 1).unsqueeze(0)
floatified = reshaped.to(torch.float32) / 255
vgg16(floatified)
>>>
tensor([[ 2.5686, -1.1964, -0.0872, -1.7010, -1.6669, -1.0638, 0.4515, 0.1124,
0.0166, 0.3156, 1.1699, 1.5374, 1.8720, 2.5184, 2.9046, -0.8241,
-1.1949, -0.5700, 0.8692, -1.0485, 0.0390, -1.3783, -3.4632, -0.0143,
1.0986, 0.2667, -1.1127, -0.8515, 0.7759, -0.7528, 1.6366, -0.1170,
-0.4983, -2.6970, 0.7545, 0.0188, 0.1094, 0.5002, 0.8838, -0.0006,
-1.7993, -1.3706, 0.4964, -0.3251, -1.7313, 1.8731, 2.4963, 1.1713,
-1.5726, 1.5476, 3.9576, 0.7388, 0.0228, 0.3947, -1.7237, -1.8350,
-2.0297, 1.4088, -1.3469, 1.6128, -1.0851, 2.0257, 0.5881, 0.7498,
0.0738, 2.0592, 1.8034, -0.5468, 1.9512, 0.4534, 0.7746, -1.0465,
-0.7254, 0.3333, -1.6506, -0.4242, 1.9529, -0.4542, 0.2396, -1.6804,
-2.7987, -0.6367, -0.3599, 1.0102, 2.6319, 0.8305, -1.4333, 3.3043,
-0.4021, -0.4877, 0.9125, 0.0607, -1.0326, 1.3186, -2.5861, 0.1211,
-2.3177, -1.5040, 1.0416, 1.4008, 1.4225, -2.7291]],
grad_fn=<ThAddmmBackward>)
sum([ 2.5686, -1.1964, -0.0872, -1.7010, -1.6669, -1.0638, 0.4515, 0.1124,
0.0166, 0.3156, 1.1699, 1.5374, 1.8720, 2.5184, 2.9046, -0.8241,
-1.1949, -0.5700, 0.8692, -1.0485, 0.0390, -1.3783, -3.4632, -0.0143,
1.0986, 0.2667, -1.1127, -0.8515, 0.7759, -0.7528, 1.6366, -0.1170,
-0.4983, -2.6970, 0.7545, 0.0188, 0.1094, 0.5002, 0.8838, -0.0006,
-1.7993, -1.3706, 0.4964, -0.3251, -1.7313, 1.8731, 2.4963, 1.1713,
-1.5726, 1.5476, 3.9576, 0.7388, 0.0228, 0.3947, -1.7237, -1.8350,
-2.0297, 1.4088, -1.3469, 1.6128, -1.0851, 2.0257, 0.5881, 0.7498,
0.0738, 2.0592, 1.8034, -0.5468, 1.9512, 0.4534, 0.7746, -1.0465,
-0.7254, 0.3333, -1.6506, -0.4242, 1.9529, -0.4542, 0.2396, -1.6804,
-2.7987, -0.6367, -0.3599, 1.0102, 2.6319, 0.8305, -1.4333, 3.3043,
-0.4021, -0.4877, 0.9125, 0.0607, -1.0326, 1.3186, -2.5861, 0.1211,
-2.3177, -1.5040, 1.0416, 1.4008, 1.4225, -2.7291])
>>>
5.325799999999998
Given that this is how I test it on a single image (and the model, as usual, is trained and tested on batches), it returns a prediction vector that doesn't seem to be normalized and doesn't add up to 1.
Is this normal?
Yes, official network implementations in PyTorch don't apply softmax to the last linear layer. Check the code for VGG. You can use nn.Softmax to achieve what you want:
m = nn.Softmax(dim=1)
out = vgg16(floatified)
out = m(out)
You can also use nn.functional.softmax:
out = nn.functional.softmax(vgg16(floatified), dim=1)
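As a quick sanity check (a sketch reusing floatified from above), the softmaxed output should now sum to 1 per image:
import torch.nn.functional as F

probs = F.softmax(vgg16(floatified), dim=1)  # shape [1, 102]
print(probs.sum(dim=1))     # tensor([1.]) up to rounding
print(probs.argmax(dim=1))  # index of the most likely class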

Failing to train SkipGram word embedding in Pytorch

I am training skipgram word embeddings using the famous model described in https://arxiv.org/abs/1310.4546. I want to train it in PyTorch, but I am getting errors and I can't figure out where they are coming from. Below I have provided my model class, training loop, and batching method. Does anyone have any insight into what's going on?
I am getting an error on the output = loss(data, target) line. It is having a problem with <class 'torch.LongTensor'>, which is weird because CrossEntropyLoss takes a long tensor. The output shape might be wrong; it is torch.Size([1000, 100, 1000]) after the feedforward.
I have my model defined as:
import torch
import torch.nn as nn
torch.manual_seed(1)
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_layer = nn.Linear(embedding_dim, vocab_size)

    # Loss needs to be input: (minibatch (N), C) target: (minibatch, 1), each label is a class
    # Calculate loss in training
    def forward(self, x):
        embeds = self.embeddings(x)
        x = self.hidden_layer(embeds)
        return x
My training is defined as:
import torch.optim as optim
from torch.autograd import Variable

net = SkipGram(1000, 300)
optimizer = optim.SGD(net.parameters(), lr=0.01)
batch_size = 100
size = len(train_ints)
batches = batch_index_gen(batch_size, size)
inputs, targets = build_tensor_from_batch_index(batches[0], train_ints)

for i in range(100):
    running_loss = 0.0
    for batch_idx, batch in enumerate(batches):
        data, target = build_tensor_from_batch_index(batch, train_ints)
        # if (torch.cuda.is_available()):
        #     data, target = data.cuda(), target.cuda()
        #     net = net.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = net.forward(data)
        loss = nn.CrossEntropyLoss()
        output = loss(data, target)
        output.backward()
        optimizer.step()
        running_loss += loss.data[0]
        optimizer.step()
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            i, batch_idx * len(batch_size), len(size),
            100. * (batch_idx * len(batch_size)) / len(size), loss.data[0]))
If useful my batching is:
def build_tensor_from_batch_index(index, train_ints):
    minibatch = []
    for i in range(index[0], index[1]):
        input_arr = np.zeros( (1000,1), dtype=np.int )
        target_arr = np.zeros( (1000,1), dtype=np.int )
        input_index, target_index = train_ints[i]
        input_arr[input_index] = 1
        target_arr[input_index] = 1
        input_tensor = torch.from_numpy(input_arr)
        target_tensor = torch.from_numpy(target_arr)
        minibatch.append( (input_tensor, target_tensor) )
    # Concatenate all tensors into a minibatch
    #x = [tensor[0] for tensor in minibatch]
    #print(x)
    input_minibatch = torch.cat([tensor[0] for tensor in minibatch], 1)
    target_minibatch = torch.cat([tensor[1] for tensor in minibatch], 1)
    #target_minibatch = minibatch[0][1]
    return input_minibatch, target_minibatch
I'm not sure about that since I did not read the paper, but it seems weird that you are computing the loss with the original data and the targets:
output = loss(data, target)
Considering that the output of the network is output = net.forward(data), I think you should compute your loss as:
error = loss(output, target)
If this doesn't help, briefly point out what the paper says about the loss function.
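For reference, here is a minimal sketch of how the forward pass and CrossEntropyLoss are usually wired together for this kind of model. It assumes integer word indices (rather than one-hot columns) and dummy data, since CrossEntropyLoss expects raw logits of shape (N, C) and class-index targets of shape (N,):
import torch
import torch.nn as nn

net = SkipGram(1000, 300)                       # the model class from the question
criterion = nn.CrossEntropyLoss()

center_words = torch.randint(0, 1000, (100,))   # dummy batch of 100 word indices
context_words = torch.randint(0, 1000, (100,))  # dummy targets, also word indices

logits = net(center_words)                      # shape [100, 1000]
error = criterion(logits, context_words)        # loss(output, target), not loss(data, target)
error.backward()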
