Training not speeding up even after using GPU - pytorch

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

transform = transforms.Compose([transforms.Resize(IMG_SIZE),
                                transforms.CenterCrop(CROP_SIZE),
                                transforms.ToTensor()])

class LandmarksDatasetTrain(Dataset):
    """Landmarks dataset."""

    def __init__(self, landmarks_frame, root_dir, transform=None):
        """
        Args:
            landmarks_frame (DataFrame): DataFrame with the annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = landmarks_frame
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Images live in nested directories named after the first three
        # characters of the image id.
        img_id = self.landmarks_frame.loc[idx, 'id']
        img_name = os.path.join(self.root_dir, img_id[0], img_id[1], img_id[2], img_id) + ".jpg"
        image = Image.open(img_name)
        landmarks = self.landmarks_frame.loc[idx, 'landmark_id']
        sample = {'image': image, 'landmarks': landmarks}
        if self.transform:
            sample['image'] = self.transform(sample['image'])
        sample['landmarks'] = torch.tensor(sample['landmarks'])
        return sample

dataset_train = LandmarksDatasetTrain(landmarks_frame=frame,
                                      root_dir='/kaggle/input/landmark-recognition-2020/train',
                                      transform=transform)
train_loader = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=4, drop_last=False)

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(CROP_SIZE*CROP_SIZE*3, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, frame['landmark_id'].nunique())

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)

net = Net()
net.to(device)

for epoch in range(3):
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for data in tqdm(train_loader):
        X = data['image'].to(device)
        y = data['landmarks'].to(device)
        net.zero_grad()
        output = net(X.view(-1, CROP_SIZE*CROP_SIZE*3))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
    print(loss)
The batch size is 4. data['image'] and data['landmarks'] are tensors, device = torch.device("cuda:0"), and the deep learning library I am using is PyTorch, but the GPU is still barely used: its utilization shows 5%, and one epoch takes 3.5 to 4 hours.
It would be really helpful if someone pointed out my mistake.
Attaching an image of resource usage and GPU config, and an image showing that the GPU is on.
Here is the link to my notebook:
https://www.kaggle.com/hiteshsom/google-landmark-recognition
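For reference, a minimal sanity check (not from the original post, and assuming net, train_loader, device, and CROP_SIZE as defined above) that can show whether the model really sits on the GPU and whether data loading, rather than compute, dominates each step:

import time
import torch

# The model's parameters should report a CUDA device.
print(next(net.parameters()).device)  # expect: cuda:0

# Time fetching one batch versus running one forward pass.
start = time.time()
data = next(iter(train_loader))
print(f"data loading: {time.time() - start:.3f}s")

X = data['image'].to(device)
start = time.time()
out = net(X.view(-1, CROP_SIZE*CROP_SIZE*3))
torch.cuda.synchronize()  # wait for the GPU to finish before reading the clock
print(f"forward pass: {time.time() - start:.3f}s")

With a batch size of 4 and a small fully connected network, most of each epoch is typically spent decoding JPEGs on the CPU, so 5% GPU utilization is plausible; a larger batch size and more num_workers usually shift the balance.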

Related

Reshape data to be usable for training GCN in PyTorch

I am trying to build a Graph Convolutional Network. I converted my dataframe to the format PyTorch requires using the code below.
class S_Dataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        x = torch.tensor([row.date.to_pydatetime().timestamp(), row.s1, row.s2, row.s3, row.s4, row.temp, row.rh, row.Location, row.Node], dtype=torch.float)
        y = torch.tensor([row.Location], dtype=torch.long)
        weight1 = torch.tensor([row.neighbor1_distance], dtype=torch.float)
        weight2 = torch.tensor([row.neighbor2_distance], dtype=torch.float)
        weight3 = torch.tensor([row.neighbor3_distance], dtype=torch.float)
        edge_index1 = torch.tensor([[row.Location, row.neighbor1_name]], dtype=torch.long).t()
        edge_index2 = torch.tensor([[row.Location, row.neighbor2_name]], dtype=torch.long).t()
        edge_index3 = torch.tensor([[row.Location, row.neighbor3_name]], dtype=torch.long).t()
        edge_index = torch.cat([edge_index1, edge_index2, edge_index3], dim=1)
        weight = torch.cat([weight1, weight2, weight3], dim=0)
        if self.transform:
            x, y, edge_index, weight = self.transform(x, y, edge_index, weight)
        return x, y, edge_index, weight

Process_Data = S_Dataset(df)
Next I divided the data into train and test sets:
train_size = int(len(Process_Data) * 0.8)
test_size = len(Process_Data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(Process_Data, [train_size, test_size])

# Create dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
I designed a simple model:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv

# Create the model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(9, 128)
        self.conv2 = GCNConv(128, 64)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, len(location_to_id))

    def forward(self, x, edge_index, weight):
        x = self.conv1(x, edge_index, weight)
        x = torch.relu(x)
        x = self.conv2(x, edge_index, weight)
        x = torch.relu(x)
        x = x.view(-1, 64)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x
Finally, to train the model:
model = Net()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for epoch in range(100):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        x, y, edge_index, weight = batch
        y_pred = model(x, edge_index, weight)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('Epoch: {} Loss: {:.4f}'.format(epoch, total_loss / len(train_loader)))
I am facing the following error:
IndexError: The shape of the mask [2, 3] at index 0 does not match the shape of the indexed tensor [32, 3] at index 0
The line causing the error is:
x, y, edge_index, weight = batch
How can I reshape my data so I can train my model?
The batch size is set to 32, but there might not be enough samples left to fill a batch of 32.
I am assuming this error occurs after the code has run for some time; I would appreciate more context on the problem.
A general solution could be decreasing the batch size and trying the code again, while making sure all samples are still covered in the epoch.
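As a hedged illustration of that suggestion (an assumption on my part, not code from the answer above), the last, possibly smaller, batch can be dropped, or the batch size simply reduced:

# Drop the final incomplete batch so every batch collates to the same shape...
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
# ...or use a smaller batch size.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

Separately, and not part of the answer above: because each sample here carries its own edge_index, the default collate tries to stack graph tensors of differing shapes; torch_geometric.loader.DataLoader, which batches graphs by concatenation instead of stacking, is often the more robust fix for graph datasets.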

Stack expects tensor to be equal size, but got [66, 67, 4] at entry 0 and [66, 68, 4] at entry 7

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from skimage import io

class customDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        #self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = io.imread(img_path)
        y_label = torch.tensor(int(self.annotations.iloc[index, 1]))
        #if self.transform:
        #    image = self.transform(image)
        return (image, y_label)

device = torch.device("cuda")
in_channel = 1
num_classes = 1
learning_rate = 0.001
batch_size = 32
num_epochs = 1

dataset = customDataset(csv_file="biomass.csv", root_dir="biomassMerged", transform=transforms.ToTensor())
train_set, test_set = torch.utils.data.random_split(dataset, [len(dataset) - 10000, 10000])
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True)

model = torchvision.models.googlenet(pretrained=True)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    losses = []
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(device=device)
        targets = targets.to(device=device)
        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Cost at each {epoch} is {sum(losses)/len(losses)}")
I have created a customDataset class since I need to iterate over a lot of image data that I have gathered. The issue is that every time I run it, there is a new entry whose tensor does not match the rest. How do I fix this? The images should all be the same size.
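No answer is attached to this question, but one hedged observation (my assumption, not a confirmed fix): the transform is commented out in both __init__ and __getitem__ above, so images come back at whatever size they have on disk, and the default collate's torch.stack fails as soon as one image differs (here 67 vs. 68 pixels wide). A minimal sketch of a transform that forces a fixed shape, with a hypothetical target size:

from torchvision import transforms

transform = transforms.Compose([
    transforms.ToPILImage(),      # io.imread returns a numpy array
    transforms.Resize((66, 66)),  # hypothetical fixed size; pick what suits the data
    transforms.ToTensor(),
])

Re-enabling self.transform in __init__ and applying it in __getitem__ would then make every returned tensor the same shape.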

How to extract the encoded features after running a PyTorch LSTM autoencoder model?

I am very new to PyTorch and Python in general, and I am now struggling to get the encoded features from my pre-trained LSTM autoencoder, which can be seen below:
import torch
import torch.nn as nn
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building an LSTM autoencoder
class Encoder(nn.Module):
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super(Encoder, self).__init__()
        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim, self.hidden_dim1, self.hidden_dim2 = embedding_dim, 4 * embedding_dim, 2 * embedding_dim
        self.rnn1 = nn.LSTM(
            input_size=n_features,
            hidden_size=self.hidden_dim1,  # 128
            num_layers=1,
            batch_first=True
        )
        self.rnn2 = nn.LSTM(
            input_size=self.hidden_dim1,
            hidden_size=self.hidden_dim2,  # 64
            num_layers=1,
            batch_first=True
        )
        self.rnn3 = nn.LSTM(
            input_size=self.hidden_dim2,
            hidden_size=embedding_dim,  # 32
            num_layers=1,
            batch_first=True
        )

    def forward(self, x):
        x = x.reshape((1, self.seq_len, self.n_features))
        x, (_, _) = self.rnn1(x)
        x, (_, _) = self.rnn2(x)
        x, (hidden_n, _) = self.rnn3(x)
        return hidden_n.reshape((self.n_features, self.embedding_dim))

class Decoder(nn.Module):
    def __init__(self, seq_len, input_dim=32, n_features=1):
        super(Decoder, self).__init__()
        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim2, self.hidden_dim1, self.n_features = 4 * input_dim, 2 * input_dim, n_features
        self.rnn1 = nn.LSTM(
            input_size=input_dim,
            hidden_size=input_dim,
            num_layers=1,
            batch_first=True
        )
        self.rnn2 = nn.LSTM(
            input_size=input_dim,
            hidden_size=self.hidden_dim1,
            num_layers=1,
            batch_first=True
        )
        self.rnn3 = nn.LSTM(
            input_size=self.hidden_dim1,
            hidden_size=self.hidden_dim2,
            num_layers=1,
            batch_first=True
        )
        self.output_layer = nn.Linear(self.hidden_dim2, n_features)

    def forward(self, x):
        x = x.repeat(self.seq_len, self.n_features)
        x = x.reshape((self.n_features, self.seq_len, self.input_dim))
        x, (hidden_n, cell_n) = self.rnn1(x)
        x, (hidden_n, cell_n) = self.rnn2(x)
        x, (hidden_n, cell_n) = self.rnn3(x)
        x = x.reshape((self.seq_len, self.hidden_dim2))
        return self.output_layer(x)

class RAE(nn.Module):
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super(RAE, self).__init__()
        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim = embedding_dim
        self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
        self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

### TRAINING
def train_model(model, train_dataset, val_dataset, n_epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss(reduction='mean').to(device)  # nn.L1Loss sum
    history = dict(train=[], val=[])
    for epoch in range(1, n_epochs + 1):
        model = model.train()
        train_losses = []
        for seq_true in train_dataset:
            optimizer.zero_grad()
            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
            loss = criterion(seq_pred, seq_true)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_dataset:
                seq_true = seq_true.to(device)
                seq_pred = model(seq_true)
                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())
        # add accuracy
        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        history['train'].append(train_loss)
        history['val'].append(val_loss)
        print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')
    return model.eval(), history
Once I had trained my model, I followed the advice given by ptrblck here and implemented it as follows:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

model.encoder.register_forward_hook(get_activation('encoder'))
x = test_dataset_SR[1]  # instead of using his random example I used one example from my training set
x = x.cuda()
output = model(x)
print(activation['encoder'])
but this gives me this error:
2 def get_activation(name):
3 def hook(model, input, output):
----> 4 activation[name] = output.detach()
5 return hook
AttributeError: 'tuple' object has no attribute 'detach'
Can you please help me solve this issue? I want to take these encoded features, store them, and use them as input to another network. I know I could probably train the encoder separately (not sure), but I will need both the encoder and the decoder, so I thought hooks would be my salvation.
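No accepted fix is quoted here, but a hedged sketch of one way around the AttributeError (an assumption on my part): make the hook tolerate modules whose forward returns a tuple, which is what nn.LSTM layers return:

def get_activation(name):
    def hook(model, input, output):
        # nn.LSTM returns (output, (h_n, c_n)); .detach() only exists on tensors,
        # so unwrap tuples before storing the activation.
        if isinstance(output, tuple):
            output = output[0]
        activation[name] = output.detach()
    return hook

Alternatively, since Encoder is an ordinary nn.Module, calling model.encoder(x) directly after training returns the embedding without any hook at all.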

While running PyTorch I got the error 'TypeError: object of type 'CatsAndDogsDataset' has no len()' and I want to know how to fix it

I got an error while running PyTorch.
I am training a model with ResNet, and I wrote my own custom dataset class for the data. After loading the dataset, the training and test data were split and training began, but when I run it an error occurs and I don't know what kind of problem it is.
Below I attach my ResNet.py code and my custom dataset CustomDataset.py code.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, models
from customDataset import CatsAndDogsDataset

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
EPOCHS = 3
BATCH_SIZE = 10

dataset = CatsAndDogsDataset(csv_file='cats_dogs.csv', root_dir='cats_dogs_resized', transform=transforms.ToTensor())
train_set, test_set = torch.utils.data.random_split(dataset, [28, 4])
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True)
class BasicBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                                          nn.BatchNorm2d(planes))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 16
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(16, 2, stride=1)
        self.layer2 = self._make_layer(32, 2, stride=2)
        self.layer3 = self._make_layer(64, 2, stride=2)
        self.linear = nn.Linear(64, num_classes)

    def _make_layer(self, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(BasicBlock(self.in_planes, planes, stride))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.avg_pool2d(out, 8)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

model = ResNet().to(DEVICE)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0005)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
print(model)
def train(model, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

def evaluate(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= length(test_loader.dataset)
    test_accuracy = 100. * correct / length(test_loader.dataset)
    return test_loss, test_accuracy

for epoch in range(1, EPOCHS + 1):
    scheduler.step()
    train(model, train_loader, optimizer, epoch)
    test_loss, test_accuracy = evaluate(model, test_loader)
    print('[{}] Test Loss: {:.4f}, Accuracy: {:.2f}%'.format(epoch, test_loss, test_accuracy))
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from skimage import io

class CatsAndDogsDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __length__(self):
        return length(self.annotations)

    def __getitem__(self, index):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = io.imread(img_path)
        y_label = torch.tensor(int(self.annotations.iloc[index, 1]))
        if self.transform:
            image = self.transform(image)
        return (image, ylabel)
When I write the code this way and run it, I get:
TypeError: object of type 'CatsAndDogsDataset' has no len()
I wonder why it can't have len(). In addition, when running on Backend.ai instead of PyCharm, an error also occurred, with the content:
Cannot verify that dataset is Sized
if sum(lengths) != len(dataset):
raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
Is there a workaround? Please help me.
You need to define the method __len__ for your custom dataset (which you currently seem to have incorrectly defined as __length__).
This documentation provides details. Relevant excerpt:
torch.utils.data.Dataset is an abstract class representing a dataset. Your custom dataset should inherit Dataset and override the following methods:
__len__ so that len(dataset) returns the size of the dataset.
__getitem__ to support indexing such that dataset[i] can be used to get the i-th sample.
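For concreteness, a minimal corrected sketch of the dataset class (assuming the same imports and CSV layout as in the question; it also fixes the ylabel / y_label name mismatch in __getitem__):

class CatsAndDogsDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):                  # was __length__
        return len(self.annotations)    # was length(...)

    def __getitem__(self, index):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = io.imread(img_path)
        y_label = torch.tensor(int(self.annotations.iloc[index, 1]))
        if self.transform:
            image = self.transform(image)
        return (image, y_label)         # was ylabel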

5 fold cross validation using pytorch

I need to perform 5-fold cross validation on my dataset. I was able to find two examples of doing this but could not integrate them into my current pipeline. Could anyone please help me with this?
###############################################################################################
class leukemiaClassifier(Dataset):
    def __init__(self, csv_file, transform):
        self.data = pd.read_csv(csv_file)
        self.data = self.data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = self.data.loc[idx][0]
        img = Image.open(img_name).convert('RGB')
        img = cv2.imread(img_name)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        image = self.transform(image=img)
        image = image['image']
        labels = torch.tensor(self.data.loc[idx][1])
        return image, labels

train_file = 'train.csv'
val_file = 'test.csv'
batch_size = 28

train_dataset = leukemiaClassifier(csv_file=train_file, transform=data_transforms)
val_dataset = leukemiaClassifier(csv_file=val_file, transform=data_transforms_test)
read_target = pd.read_csv('train.csv')
target = read_target['label'].values
data_loader_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=64)
data_loader_val = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=64)
###############################################################################################
# Model, utils, hyperparameters, etc.
###############################################################################################
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def efficientnet(version, num_classes):
    model = EfficientNet.from_pretrained('efficientnet-b{}'.format(version), num_classes=num_classes)
    num_ftrs = model._fc.in_features
    model._fc = nn.Linear(num_ftrs, num_classes)
    return model.cuda()

target_names = ['Lymphocyte(atypical)', 'Monoblast', 'Promyelocyte(bilobed)', 'Metamyelocyte', 'Erythroblast', 'Neutrophil(segmented)', 'Myeloblast', 'Promyelocyte', 'Monocyte', 'Lymphocyte(typical)', 'Neutrophil(band)', 'Smudge cell', 'Eosinophil', 'Myelocyte', 'Basophil']
model = efficientnet(5, 15)
model = nn.DataParallel(model)
wandb.watch(model)
# criterion = torch.nn.CrossEntropyLoss()
criterion = FocalLoss()
labels = torch.tensor((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)).cuda()
no_of_classes = 15
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.01, patience=5, verbose=True)
global_validation_loss = 100
###############################################################################################
# Training and validation loop
###############################################################################################
for epoch in range(300000):  # loop over the dataset multiple times
    running_loss = 0.0
    label_list = []
    predicted_list = []
    model = model.train()
    for batch_idx, data in enumerate(tqdm.tqdm(data_loader_train)):
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda().long()
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # print(loss)
        loss.backward()
        optimizer.step()
        for item in torch.argmax(outputs, 1).detach().cpu().numpy():
            predicted_list.append(item)
        for item in labels.detach().cpu().numpy():
            label_list.append(item)
        # print statistics
        running_loss += loss.item()
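No answer is attached to this question; as a hedged sketch (assuming scikit-learn is available and train_dataset and target are as defined above), 5-fold cross validation can be bolted onto this pipeline with StratifiedKFold and Subset, re-initializing the model for each fold:

import numpy as np
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset, DataLoader

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(target)), target)):
    # Wrap index subsets of the same dataset for this fold.
    fold_train = Subset(train_dataset, train_idx)
    fold_val = Subset(train_dataset, val_idx)
    data_loader_train = DataLoader(fold_train, batch_size=batch_size, shuffle=True, num_workers=4)
    data_loader_val = DataLoader(fold_val, batch_size=batch_size, shuffle=False, num_workers=4)

    # Re-initialize the model and optimizer so folds do not leak into each other.
    model = efficientnet(5, 15)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # ...run the existing training and validation loop here for this fold...
    print(f'Fold {fold}: {len(fold_train)} train / {len(fold_val)} val samples')

StratifiedKFold (rather than plain KFold) keeps the class proportions similar across folds, which matters for an imbalanced 15-class problem like this one.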
