PyTorch: unable to use the Gloo backend - pytorch

I am trying to use PyTorch to train a model in a (multi-CPU) distributed manner. However, when I ran the code, I got the following error message
RuntimeError: the Gloo backend is not available; try to recompile the THD package with Gloo support at /opt/conda/conda-bld/pytorch-cpu_1544218188686/work/torch/lib/THD/process_group/General.cpp:20
I tried this both on my own Macbook and the university's slurm cluster (Red Hat 4.8.5-16). The Python version is 3.6. I installed PyTorch using the command conda install pytorch-cpu torchvision-cpu -c pytorch (the version without CUDA support).
I was wondering if I have to re-install PyTorch from the source or install Gloo manually. I was a little confused since according to PyTorch's documentation,
Since version 0.2.0, the Gloo backend is automatically included with the pre-compiled binaries of PyTorch
The code is exactly the example code available at https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py
import os
import torch
import torch.distributed.deprecated as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from math import ceil
from random import Random
from torch.multiprocessing import Process
from torch.autograd import Variable
from torchvision import datasets, transforms
class Partition(object):
""" Dataset-like object, but only access a subset of it. """
def __init__(self, data, index):
self.data = data
self.index = index
def __len__(self):
return len(self.index)
def __getitem__(self, index):
data_idx = self.index[index]
return self.data[data_idx]
class DataPartitioner(object):
""" Partitions a dataset into different chuncks. """
def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234):
self.data = data
self.partitions = []
rng = Random()
rng.seed(seed)
data_len = len(data)
indexes = [x for x in range(0, data_len)]
rng.shuffle(indexes)
for frac in sizes:
part_len = int(frac * data_len)
self.partitions.append(indexes[0:part_len])
indexes = indexes[part_len:]
def use(self, partition):
return Partition(self.data, self.partitions[partition])
class Net(nn.Module):
""" Network architecture. """
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
def partition_dataset():
""" Partitioning MNIST """
dataset = datasets.MNIST(
'./data',
train=True,
download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307, ), (0.3081, ))
]))
size = dist.get_world_size()
bsz = int(128 / float(size))
partition_sizes = [1.0 / size for _ in range(size)]
partition = DataPartitioner(dataset, partition_sizes)
partition = partition.use(dist.get_rank())
train_set = torch.utils.data.DataLoader(
partition, batch_size=bsz, shuffle=True)
return train_set, bsz
def average_gradients(model):
""" Gradient averaging. """
size = float(dist.get_world_size())
for param in model.parameters():
dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
param.grad.data /= size
def run(rank, size):
""" Distributed Synchronous SGD Example """
torch.manual_seed(1234)
train_set, bsz = partition_dataset()
model = Net()
model = model
# model = model.cuda(rank)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
num_batches = ceil(len(train_set.dataset) / float(bsz))
for epoch in range(10):
epoch_loss = 0.0
for data, target in train_set:
data, target = Variable(data), Variable(target)
# data, target = Variable(data.cuda(rank)), Variable(target.cuda(rank))
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
epoch_loss += loss.data.item()
loss.backward()
average_gradients(model)
optimizer.step()
print('Rank ',
dist.get_rank(), ', epoch ', epoch, ': ',
epoch_loss / num_batches)
def init_processes(rank, size, fn, backend='gloo'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend, rank=rank, world_size=size)
fn(rank, size)
if __name__ == "__main__":
size = 2
processes = []
for rank in range(size):
p = Process(target=init_processes, args=(rank, size, run))
p.start()
processes.append(p)
for p in processes:
p.join()

Related

Custom layer caused the batch dimension mismatch in pytorch. How to fix this problem?

I have tried to train a GCN model.I defined the custom layer I needed. However, It cause some dimension mismatch when I do some batch training.
the codes are as following :
import math
import numpy as np
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
# =============================================================================
# model define
# =============================================================================
class GraphConvolution(Module):
"""
Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
"""
def __init__(self, in_features, out_features, bias=True):
super(GraphConvolution, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.weight = Parameter(torch.FloatTensor(in_features, out_features))
if bias:
self.bias = Parameter(torch.FloatTensor(out_features))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
stdv = 1. / math.sqrt(self.weight.size(1))
self.weight.data.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.data.uniform_(-stdv, stdv)
def forward(self, input, adj):
support = torch.mm(input, self.weight)
output = torch.spmm(adj, support)
if self.bias is not None:
return output + self.bias
else:
return output
def __repr__(self):
return self.__class__.__name__ + ' (' \
+ str(self.in_features) + ' -> ' \
+ str(self.out_features) + ')'
class GCN(nn.Module):
def __init__(self, nfeat, nhid, nclass):
super(GCN, self).__init__()
self.gc1 = GraphConvolution(nfeat, nhid)
self.gc2 = GraphConvolution(nhid, nclass)
self.linear = nn.Linear(nclass, 1)
# self.dropout = dropout
def forward(self, x, adj):
x = F.relu(self.gc1(x, adj))
# x = F.dropout(x, self.dropout, training=self.training)
x = F.relu(self.gc2(x, adj))
x = self.linear(x)
return x
def train(dataloader, model, loss_fn, optimizer,adj):
size = len(dataloader.dataset)
model.train()
for batch, (X, y) in enumerate(dataloader):
X, y = X.to(device), y.to(device)
# Compute prediction error
pred = model(X,adj)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model, loss_fn,adj):
size = len(dataloader.dataset)
num_batches = len(dataloader)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X,adj)
test_loss += loss_fn(pred, y).item()
# correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
# correct /= size
# Accuracy: {(100*correct):>0.1f}%,
print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")
when I run the code :
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model = GCN(1,1,1).to(device)
print(model)
# model(X).shape
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
epochs = 10
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer,Adjacency_matrix)
test(test_dataloader, model, loss_fn,Adjacency_matrix)
print("Done!")
I got the error :
when I looking inside this ,I find the model is working well when I drop the dimension of batch-size. How I need to do to tell the model that this dimension is the batch-size which don't need to compute?
the error you're seeing is due to you trying to matrix multiple a 3d tensor (your input) by your 2D weights.
To get around this you can simply reshape your data, as we only really care about the last dim when doing matmuls:
def forward(self, input, adj):
b_size = input.size(0)
input = input.view(-1, input.shape[-1])
support = torch.mm(input, self.weight)
output = torch.spmm(adj, support)
output = output.view(b_size,-1,output.shape[-1])
if self.bias is not None:
return output + self.bias
else:
return output

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

I'm trying to implement a code for sentiment analysis( positive or negative labels) using BERT and i want to add a BiLSTM layer to see if I can increase the accuracy of the pretrained model from HuggingFace. I have the below code and a few questions :
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
import re
import torch.nn as nn
device = 'cuda' if cuda.is_available() else 'cpu'
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05 #5e-5, 3e-5 or 2e-5
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class CustomDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_len):
self.tokenizer = tokenizer
self.data = dataframe
self.comment_text = dataframe.review
self.targets = self.data.sentiment
self.max_len = max_len
def __len__(self):
return len(self.comment_text)
def __getitem__(self, index):
comment_text = str(self.comment_text[index])
comment_text = " ".join(comment_text.split())
inputs = self.tokenizer.encode_plus(comment_text,None,add_special_tokens=True,max_length=self.max_len,
pad_to_max_length=True,return_token_type_ids=True)
ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]
return {
'ids': torch.tensor(ids, dtype=torch.long),
'mask': torch.tensor(mask, dtype=torch.long),
'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
'targets': torch.tensor(self.targets[index], dtype=torch.float)
}
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
train_params = {'batch_size': TRAIN_BATCH_SIZE,'shuffle': True,'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE,'shuffle': True,'num_workers': 0}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
class BERTClass(torch.nn.Module):
def __init__(self):
super(BERTClass, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-uncased',return_dict=False, num_labels =2)
self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
self.linear = nn.Linear(256*2,2)
def forward(self, ids , mask,token_type_ids):
sequence_output, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids = token_type_ids)
lstm_output, (h, c) = self.lstm(sequence_output) ## extract the 1st token's embeddings
hidden = torch.cat((lstm_output[:, -1, :256], lstm_output[:, 0, 256:]), dim=-1)
linear_output = self.linear(lstm_output[:, -1].view(-1, 256 * 2))
return linear_output
model = BERTClass()
model.to(device)
print(model)
def loss_fn(outputs, targets):
return torch.nn.BCEWithLogitsLoss()(outputs, targets)
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
def train(epoch):
model.train()
for _, data in enumerate(training_loader, 0):
ids = data['ids'].to(device, dtype=torch.long)
mask = data['mask'].to(device, dtype=torch.long)
token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
targets = data['targets'].to(device, dtype=torch.float)
outputs = model(ids, mask, token_type_ids)
optimizer.zero_grad()
loss = loss_fn(outputs, targets)
if _ % 5000 == 0:
print(f'Epoch: {epoch}, Loss: {loss.item()}')
optimizer.zero_grad()
loss.backward()
optimizer.step()
for epoch in range(EPOCHS):
train(epoch)
So on the above code I ran into the error : Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2])) . Checked online and tried to use targets = targets.unsqueeze(2) but then I get another error that I must use values from [-2,1] for unsqueeze. I also tried to modify the loss function to
def loss_fn(outputs, targets):
return torch.nn.BCELoss()(outputs, targets)
but I still receive the same error. Can someone advise if there is a solution to this problem? Or what can I do to make this work fine? Many thanks in advance.

While running pytorch i got an error 'TypeError: object of type'CatsAndDogsDataset' has no len()' and I want to know how to fix it

I got an error while running pytorch
I train artificial intelligence with ResNet, and I wrote my own custom dataset for the dataset. After loading the data set from ResnNet, training data and test data were set separately by learning with artificial intelligence. But even though I ran it, an error occurred, but I don't know what kind of problem occurred.
Next, I will attach my ResNet.py code and my own data set CustomDataset.py code.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, datasets, models
from customDataset import CatsAndDogsDataset
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
EPOCHS = 3
BATCH_SIZE = 10
dataset = CatsAndDogsDataset(csv_file = 'cats_dogs.csv', root_dir = 'cats_dogs_resized',transform = transforms.ToTensor())
train_set, test_set = torch.utils.data.random_split(dataset, [28,4])
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True)
class BasicBlock(nn.Module):
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != planes:
self.shortcut = nn.Sequential(nn.Conv2d(in_planes, planes,kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes))
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, num_classes=10):
super(ResNet, self).__init__()
self.in_planes = 16
self.conv1 = nn.Conv2d(3, 16, kernel_size=3,stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(16)
self.layer1 = self._make_layer(16, 2, stride=1)
self.layer2 = self._make_layer(32, 2, stride=2)
self.layer3 = self._make_layer(64, 2, stride=2)
self.linear = nn.Linear(64, num_classes)
def _make_layer(self, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(BasicBlock(self.in_planes, planes, stride))
self.in_planes = planes
return nn.Sequential(*layers)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = F.avg_pool2d(out, 8)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out
model = ResNet().to(DEVICE)
optimizer = optim.SGD(model.parameters(), lr=0.1,momentum=0.9, weight_decay=0.0005)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
print(model)
def train(model, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(DEVICE), target.to(DEVICE)
optimizer.zero_grad()
output = model(data)
loss = F.cross_entropy(output, target)
loss.backward()
optimizer.step()
def evaluate(model, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(DEVICE), target.to(DEVICE)
output = model(data)
test_loss += F.cross_entropy(output, target,reduction='sum').item()
pred = output.max(1, keepdim=True)[1]
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= length(test_loader.dataset)
test_accuracy = 100. * correct / length(test_loader.dataset)
return test_loss, test_accuracy
for epoch in range(1, EPOCHS + 1):
scheduler.step()
train(model, train_loader, optimizer, epoch)
test_loss, test_accuracy = evaluate(model, test_loader)
print('[{}] Test Loss: {:.4f}, Accuracy: {:.2f}%'.format(
epoch, test_loss, test_accuracy))
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from skimage import io
class CatsAndDogsDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __length__(self):
return length(self.annotations)
def __getitem__(self, index):
img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
image = io.imread(img_path)
y_label = torch.tensor(int(self.annotations.iloc[index, 1]))
if self.transform:
image = self.transform(image)
return (image, ylabel)
In this way, if I write the code and then run it
TypeError: object of type'CatsAndDogsDataset' has no len()
I wonder why I can't have len(). In addition, an error occurred in the result of running Backend.ai instead of pycharm, but the error content is
Cannot verify that dataset is Sized
if sum(lengths) != len(dataset):
It is raise ValueError("sum of input lengths does not equal the length of the input dataset!").
Error appears. Is there a workaround? help me plz
You need to define the function __len__ for your custom dataset (which you seem to have currently incorrectly defined as __length__).
This documentation provides details. Relevant excerpt:
torch.utils.data.Dataset is an abstract class representing a dataset.
Your custom dataset should inherit Dataset and override the following
methods:
__len__ so that len(dataset) returns the size of the dataset.
__getitem__ to support the indexing such that dataset[i] can be used to get i th sample.

Training not speeding up even after using GPU

transform = transforms.Compose([transforms.Resize(IMG_SIZE),
transforms.CenterCrop(CROP_SIZE),
transforms.ToTensor()])
class LandmarksDatasetTrain(Dataset):
"""Landmarks dataset."""
def __init__(self, landmarks_frame, root_dir, transform=None):
"""
Args:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.landmarks_frame = landmarks_frame
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.landmarks_frame)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,self.landmarks_frame.loc[idx, 'id'][0],self.landmarks_frame.loc[idx, 'id'][1], self.landmarks_frame.loc[idx, 'id'][2], self.landmarks_frame.loc[idx, 'id'])
img_name += ".jpg"
image = Image.open(img_name)
landmarks = self.landmarks_frame.loc[idx, 'landmark_id']
sample = {'image': image, 'landmarks': landmarks}
if self.transform:
sample['image'] = self.transform(sample['image'])
sample['landmarks'] = torch.tensor(sample['landmarks'])
return sample
dataset_train = LandmarksDatasetTrain(landmarks_frame = frame,
root_dir='/kaggle/input/landmark-recognition-2020/train',
transform=transform)
train_loader = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=4, drop_last=False)
class Net(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(CROP_SIZE*CROP_SIZE*3, 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, 64)
self.fc4 = nn.Linear(64, frame['landmark_id'].nunique())
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = self.fc4(x)
return F.log_softmax(x, dim=1)
net = Net()
net.to(device)
for epoch in range(3):
optimizer = optim.Adam(net.parameters(), lr=0.001)
for data in tqdm(train_loader):
X = data['image'].to(device)
y = data['landmarks'].to(device)
net.zero_grad()
output = net(X.view(-1,CROP_SIZE*CROP_SIZE*3))
loss = F.nll_loss(output, y)
loss.backward()
optimizer.step()
print(loss)
batch size is 4
data['image'] and data['landmarks'] are tensors device = torch.device("cuda:0") and deep learning library I am using is Pytorch but GPU is still not working for me. Its usage shows 5% and total time for 1 epoch 3.5 to 4 hours
Will be really helpful if someone points out my mistake.
Attaching an image of resource usage and GPU config.
Attaching image for showing that GPU is on
Here is the link to my notebook
https://www.kaggle.com/hiteshsom/google-landmark-recognition

Simple Data recall RNN in Pytorch

I am learning Pytorch and am trying to make a network that can remember previous inputs.
I have tried 2 different input/output structures(see below) but haven't gotten anything to work the way I would like.
input 1:
in:[4,2,7,8]
output [[0,0,4],[0,4,2],[4,2,7],[2,7,8]]
code:
def histroy(num_samples=4,look_back=3):
data=np.random.randint(10,size=(num_samples)).tolist()
lab=[[0]*look_back]
for i in data:
lab.append(lab[-1][1:]+[i])
return data,lab[1:]
input2:
in:[4,2,7,8]
out:[0,4,2,7]
def histroy(num_samples=4):
data=np.random.randint(10,size=(num_samples)).tolist()
lab=[0]
for i in data:
lab.append(i)
return data,lab
I have tried a number of different network structures and training methods but nothing seems to stick.
The only things I think I have right are net.hidden = net.init_hidden()
should go outside of each epoch and loss.backward(retain_graph=True) but that doesnt seem to do anything
Currently, it can learn the last number in the sequence but never seems to learn any of the others
My last attempt:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
def histroy(num_samples=4,look_back=3):
data=np.random.randint(10,size=(num_samples)).tolist()
lab=[[0]*look_back]
for i in data:
lab.append(lab[-1][1:]+[i])
return data,lab[1:]
class Net(nn.Module):
def __init__(self, input_dim, hidden_dim, batch_size, output_dim=10, num_layers=1):
super(Net, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.num_layers = num_layers
self.memory = nn.RNN(self.input_dim,self.hidden_dim,self.num_layers)
self.linear = nn.Linear(self.hidden_dim, output_dim)
self.first=True
def init_hidden(self):
# This is what we'll initialise our hidden state as
return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
def forward(self, input):
self.memory_out, self.hidden = self.memory(input.view(len(input), self.batch_size, -1))
y_pred = self.linear(self.memory_out[-1].view(self.batch_size, -1))
return y_pred.view(-1)
if __name__ == '__main__':
data_amount = 10000
batch_size = 1 # default is 32
data_amount-=data_amount%batch_size
number_of_times_on_the_same_data = 250
look_back=5
net=Net(input_dim=1,hidden_dim=25,batch_size=batch_size,output_dim=look_back)
data,labs=histroy(data_amount,look_back)
data = torch.Tensor(data).float()
labs = torch.Tensor(labs).float()
optimizer = optim.Adam(net.parameters())
criterion = torch.nn.MSELoss(size_average=False)
for epoch in range(number_of_times_on_the_same_data): # loop over the dataset multiple times
running_loss = 0.0
data, labs = histroy(data_amount, look_back)
data = torch.Tensor(data).float()
labs = torch.Tensor(labs).float()
net.hidden = net.init_hidden()
print("epoch",epoch)
for i in range(0, data_amount, batch_size):
inputs = data[i:i + batch_size]
labels = labs[i:i + batch_size]
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward(retain_graph=True)
optimizer.step()
running_loss += loss.item()
if i >= data_amount-batch_size:
print("loss",loss)
net.hidden = net.init_hidden()
print("Outputs",outputs)
print("Input", data[-1*look_back:])
print("labels",labels)
The problem that your network presents, it's the fact that your input is of shape 1:
for i in range(0, data_amount, batch_size):
inputs = data[i:i + batch_size]
labels = labs[i:i + batch_size]
print(inputs.shape,labels.shape)
>>>torch.Size([1]) torch.Size([1, 5])
>>>torch.Size([1]) torch.Size([1, 5])...
That's the reason your RNN is predicting only your last number, because in this case you're not using your look_back attribute. You have to fix your code in order to have inputs of size [1,5]. Your code should look something like this:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
def histroy(num_samples=4,look_back=3):
data=np.random.randint(10,size=(num_samples)).tolist()
lab=[[0]*look_back]
for i in data:
lab.append(lab[-1][1:]+[i])
return lab[:-1],lab[1:]
class Net(nn.Module):
def __init__(self, input_dim, hidden_dim, batch_size, output_dim=10, num_layers=1):
super(Net, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.num_layers = num_layers
self.memory = nn.RNN(self.input_dim,self.hidden_dim,self.num_layers)
self.linear = nn.Linear(self.hidden_dim, output_dim)
self.first=True
def init_hidden(self):
# This is what we'll initialise our hidden state as
return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
def forward(self, input):
self.memory_out, self.hidden = self.memory(input.view(len(input), self.batch_size, -1))
y_pred = self.linear(self.memory_out[-1].view(self.batch_size, -1))
return y_pred.view(-1)
if __name__ == '__main__':
data_amount = 10000
batch_size = 1 # default is 32
data_amount-=data_amount%batch_size
number_of_times_on_the_same_data = 250
look_back=5
net=Net(input_dim=1,hidden_dim=25,batch_size=batch_size,output_dim=look_back)
data,labs=histroy(data_amount,look_back)
data = torch.Tensor(data).float()
labs = torch.Tensor(labs).float()
optimizer = optim.Adam(net.parameters())
criterion = torch.nn.MSELoss(size_average=False)
for epoch in range(number_of_times_on_the_same_data): # loop over the dataset multiple times
running_loss = 0.0
data, labs = histroy(data_amount, look_back)
data = torch.Tensor(data).float()
labs = torch.Tensor(labs).float()
net.hidden = net.init_hidden()
print("epoch",epoch)
for i in range(0, data_amount, batch_size):
inputs = data[i:i + batch_size].view(-1)
labels = labs[i:i + batch_size]
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward(retain_graph=True)
optimizer.step()
running_loss += loss.item()
if i >= data_amount-batch_size:
print("loss",loss)
net.hidden = net.init_hidden()
print("Outputs",outputs)
print("Input", data[i:i + batch_size][-1])
print("labels",labels)
Output:
>>>epoch 0
>>>loss tensor(17.7415, grad_fn=<MseLossBackward>)
>>>Outputs tensor([2.0897, 3.1410, 4.7382, 1.0532, 4.2003], grad_fn=<ViewBackward>)
>>>Input tensor([8., 2., 3., 5., 1.])
>>>labels tensor([[2., 3., 5., 1., 0.]])...

Resources