This is a toy example as I'm learning PyTorch and using it on one-dimensional time series, in this case a sine wave.
I'm trying to use Conv1d, but I get the following error:
RuntimeError: Given groups=1, weight of size [5, 1, 2], expected input[1, 994, 5] to have 1 channels, but got 994 channels instead
My 'lookback' is 5 time steps, and the shape of my data batch is [994, 5].
What am I doing wrong?
import torch;from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F;import pytorch_lightning as pl
from torch import nn, tensor
class TsDs(torch.utils.data.Dataset):
    """Sliding-window dataset over a 1-D series.

    Item ``i`` is ``(window, target)``: ``window`` is ``s[i:i+l]`` and
    ``target`` is the log-return ``log(s[i+l+1] / s[i+l])``.

    Args:
        s: the series to slice windows from.
        l: window (lookback) length.
    """

    def __init__(self, s, l=5):
        super().__init__()
        self.l, self.s = l, s

    def __len__(self):
        # The largest valid index must keep ``i + l + 1`` inside the series.
        # Clamp at zero so a too-short series yields an empty dataset instead
        # of a negative length (which makes ``len()`` raise ValueError).
        return max(0, self.s.shape[0] - 1 - self.l)

    def __getitem__(self, i):
        window = self.s[i:i + self.l]
        # NOTE(review): the window ends at s[i+l-1] but the target is the
        # return from s[i+l] to s[i+l+1]; confirm the one-step gap is intended.
        target = torch.log(self.s[i + self.l + 1] / self.s[i + self.l])
        return window, target

    def plt(self):
        """Plot the full underlying series."""
        # Imported locally: this snippet has no module-level matplotlib import.
        import matplotlib.pyplot as plt
        plt.plot(self.s)
class TsDm(pl.LightningDataModule):
    """Lightning data module serving ``TsDs`` windows of an offset sine wave."""

    def __init__(self, length=5000, batch_size=1000):
        super().__init__()
        self.batch_size = batch_size
        # sin(0.2 * t) shifted up by 5, so values lie in [4, 6].
        steps = torch.arange(length)
        self.s = torch.sin(steps * 0.2) + 5

    def train_dataloader(self):
        # Training windows come from the first 3999 points, in order.
        train_ds = TsDs(self.s[:3999])
        return DataLoader(train_ds, batch_size=self.batch_size, shuffle=False)

    def val_dataloader(self):
        # Validation windows come from index 4000 onwards.
        val_ds = TsDs(self.s[4000:])
        return DataLoader(val_ds, batch_size=self.batch_size)
# Build the data module with its defaults (length=5000, batch_size=1000).
dm = TsDm()
class MyModel(pl.LightningModule):
    """Toy Conv1d + MLP model trained to maximise ``mean(prediction * target)``.

    Args:
        learning_rate: step size handed to the SGD optimizer.
    """

    def __init__(self, learning_rate=0.01):
        # Initialise the module once; the posted code repeated this line.
        super().__init__()
        self.learning_rate = learning_rate
        # NOTE(review): architecture kept exactly as posted. Conv1d(1, 5, 2)
        # needs a channel axis on its input and Linear(5, 3) does not match
        # the conv output; this is the shape error the question asks about.
        self.network = nn.Sequential(nn.Conv1d(1,5,2),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
        # self.network = nn.Sequential(nn.Linear(5,5),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())

    def forward(self, x):
        return self.network(x)

    def step(self, batch, batch_idx, stage):
        """Shared train/val step: negative mean of prediction times target."""
        x, y = batch
        # Minimising the negative product maximises mean(prediction * target).
        loss = -torch.mean(self(x)*y)
        print(loss)
        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "train")

    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "val")

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate)
# Model with learning rate 0.01, trained for ten epochs on the data module.
mm = MyModel(0.01)
trainer = pl.Trainer(max_epochs=10)
trainer.fit(mm, datamodule=dm)
There are two issues in your code:
Looking at the documentation of nn.Conv1d, your input shape should be (B, C, L). In your default case, you have L=5, the sequence length, but you need to create that extra dimension representing the feature size of a sequence element, here C=1. You can do so by changing TsDs's __getitem__ function to:
def __getitem__(self, i):
    # Add a leading channel axis so one sample is (C=1, L); the DataLoader's
    # default batching then produces the (B, 1, L) input nn.Conv1d expects.
    x = self.s[i:i+self.l].unsqueeze(0)  # single sample, shaped (1, self.l)
    y = torch.log(self.s[i+self.l+1]/self.s[i+self.l])  # scalar log-return for a 1-D series
    return x, y
Your convolutional layer has a stride of 1 and a kernel size of 2, which means its output will be shaped (B, 5, L-1=4). The following layer is a fully connected layer instantiated as nn.Linear(5, 3), which means it expects (*, H_in=5) and will output (*, H_out=3), so the shapes do not match. Here are a few options:
You can flatten the conv1d output with nn.Flatten and feed it to a bigger fully connected layer (for instance nn.Linear(20, 3)).
You can use a convolutional layer with a wider kernel: with a kernel of 5 (your sequence length) you will end up with a tensor of (B, 5, 1), which (after dropping the trailing singleton dimension) you can feed to nn.Linear(5, 3). However, this approach doesn't really scale when L is changed.
You could apply a nn.AvgPool1d to get an average representation of the sequence after the convolutional layers have been applied.
Those are just a few directions...
Related
I'm trying to get my toy network to learn a sine wave.
I output (via tanh) a number between -1 and 1, and I want the network to minimise the following loss, where self(x) are the predictions.
loss = -torch.mean(self(x)*y)
This should be equivalent to trading a stock with a sinusoidal price, where self(x) is our desired position, and y are the returns of the next time step.
The issue I'm having is that the network doesn't learn anything. It does work if I change the loss function to be torch.mean((self(x)-y)**2) (MSE), but this isn't what I want. I'm trying to focus the network on 'making a profit', not making a prediction.
I think the issue may be related to the convexity of the loss function, but I'm not sure, and I'm not certain how to proceed. I've experimented with differing learning rates, but alas nothing works.
What should I be thinking about?
Actual code:
%load_ext tensorboard
import matplotlib.pyplot as plt; plt.rcParams["figure.figsize"] = (30,8)
import torch;from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F;import pytorch_lightning as pl
from torch import nn, tensor
def piecewise(x):
    """Map strictly positive inputs to +1 and everything else to -1."""
    is_positive = x > 0
    return 2 * is_positive - 1
class TsDs(torch.utils.data.Dataset):
    """Sliding-window dataset over a 1-D series.

    Item ``i`` is ``(window, target)``: ``window`` is ``s[i:i+l]`` and
    ``target`` is the log-return ``log(s[i+l+1] / s[i+l])``.

    Args:
        s: the series to slice windows from.
        l: window (lookback) length.
    """

    def __init__(self, s, l=5):
        super().__init__()
        self.l, self.s = l, s

    def __len__(self):
        # The largest valid index must keep ``i + l + 1`` inside the series.
        # Clamp at zero so a too-short series yields an empty dataset instead
        # of a negative length (which makes ``len()`` raise ValueError).
        return max(0, self.s.shape[0] - 1 - self.l)

    def __getitem__(self, i):
        window = self.s[i:i + self.l]
        # NOTE(review): the window ends at s[i+l-1] but the target is the
        # return from s[i+l] to s[i+l+1]; confirm the one-step gap is intended.
        target = torch.log(self.s[i + self.l + 1] / self.s[i + self.l])
        return window, target

    def plt(self):
        """Plot the full underlying series."""
        plt.plot(self.s)
class TsDm(pl.LightningDataModule):
    """Lightning data module serving ``TsDs`` windows of an offset sine wave."""

    def __init__(self, length=5000, batch_size=1000):
        super().__init__()
        self.batch_size = batch_size
        steps = torch.arange(length)
        # The noise term is multiplied by zero, so it is currently switched off.
        noise = 0*torch.rand(length)
        self.s = torch.sin(steps*0.2) + 5 + noise

    def train_dataloader(self):
        # Training windows come from the first 3999 points, shuffled.
        train_ds = TsDs(self.s[:3999])
        return DataLoader(train_ds, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        # Validation windows come from index 4000 onwards.
        val_ds = TsDs(self.s[4000:])
        return DataLoader(val_ds, batch_size=self.batch_size)
# Build the data module with its defaults (length=5000, batch_size=1000).
dm = TsDm()
class MyModel(pl.LightningModule):
    """Conv1d + two-layer MLP with a tanh output in (-1, 1).

    Trained to maximise ``mean(prediction * target)``.

    Args:
        learning_rate: step size handed to the SGD optimizer.
    """

    def __init__(self, learning_rate=0.01):
        # Initialise the module once; the posted code repeated this line.
        super().__init__()
        self.learning_rate = learning_rate
        self.conv1 = nn.Conv1d(1,5,2)
        self.lin1 = nn.Linear(20,3)
        self.lin2 = nn.Linear(3,1)
        # self.network = nn.Sequential(nn.Conv1d(1,5,2),nn.ReLU(),nn.Linear(20,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())
        # self.network = nn.Sequential(nn.Linear(5,5),nn.ReLU(),nn.Linear(5,3),nn.ReLU(),nn.Linear(3,1), nn.Tanh())

    def forward(self, x):
        # assumes x is (batch, lookback) with lookback=5 -- TODO confirm
        out = x.unsqueeze(1)  # add the channel axis Conv1d requires
        out = self.conv1(out)
        # Flatten per sample. Unlike reshape(-1, 20), this can never regroup
        # values across batch rows; a wrong feature count fails loudly in lin1.
        out = out.flatten(start_dim=1)
        # Functional activations avoid building new nn modules on every call.
        out = F.relu(out)
        out = F.relu(self.lin1(out))
        return torch.tanh(self.lin2(out))

    def step(self, batch, batch_idx, stage):
        """Shared train/val step; ``stage`` ("train"/"val") prefixes the logged metric."""
        x, y = batch
        # Minimising the negative product maximises mean(prediction * target).
        loss = -torch.mean(self(x)*y)
        # loss = torch.mean((self(x)-y)**2)
        print(loss)
        # Log per stage so train and validation losses do not share one key.
        self.log(f"{stage}_loss", loss, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "train")

    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "val")

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate)
#logger = pl.loggers.TensorBoardLogger(save_dir="/content/")
# Model with learning rate 0.1, trained for ten epochs on the data module.
mm = MyModel(0.1)
trainer = pl.Trainer(max_epochs=10)
# trainer.tune(mm, dm)
trainer.fit(mm, datamodule=dm)
#
If I understand you correctly, I think that you were trying to maximize the unnormalized correlation between the network's prediction, self(x), and the target value y.
As you mention, the problem is the convexity of the loss wrt the model weights. One way to see the problem is to consider that the model is a simple linear predictor w'*x, where w is the model weights, w' its transpose, and x the input feature vector (assume a scalar prediction for now). Then, if you look at the derivative of the loss wrt the weight vector (i.e., the gradient), you'll find that it no longer depends on w!
One way to fix this is change the loss to,
loss = -torch.mean(torch.square(self(x)*y))
or
loss = -torch.mean(torch.abs(self(x)*y))
You will have another big problem, however: these loss functions encourage unbounded growth of the model weights. In the linear case, one solves this by a Lagrangian relaxation of a hard constraint on, for example, the norm of the model weight vector. I'm not sure how this would be done with neural networks, as each layer would need its own Lagrangian parameter...
I wrote a short snippet to train a classification model, and learn the learning rate of its optimization algorithm. In my example I tried to update weights of a network in an inner optimization loop and to learn the learning rate of the weight updates using an outer optimization loop (meta-optimization). I'm getting the error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 10]], which is output 0 of AsStridedBackward0, is at version 12; expected version 2 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
My code snippet is as follows (NOTE: I'm using _stateless, an experimental functional API for nn. You need to run with the nightly build of PyTorch.)
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import _stateless
class MyDataset(Dataset):
    """Synthetic classification data: N random 10-dim inputs with labels in {0, 1, 2}."""

    def __init__(self, N):
        self.N = N
        n_samples = self.N
        # Inputs are uniform in [0, 1); labels are drawn independently of them.
        self.x = torch.rand(n_samples, 10)
        self.y = torch.randint(0, 3, (n_samples,))

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        features = self.x[idx]
        label = self.y[idx]
        return features, label
class MyModel(nn.Module):
    """Two-layer MLP (10 -> 10 -> 3) that also owns two scalar parameters."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 3)
        self.relu = nn.ReLU()
        # Extra learnable scalars; forward() does not use them.
        self.alpha = nn.Parameter(torch.randn(1))
        self.beta = nn.Parameter(torch.randn(1))

    def forward(self, x):
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)
epochs = 20
N = 100
dataset = DataLoader(dataset=MyDataset(N), batch_size=10)
model = MyModel()
loss_func = nn.CrossEntropyLoss()
# Named distinctly so the imported ``optim`` module is not shadowed.
# Only ``alpha`` (the learned learning rate) is updated by the outer loop.
meta_optimizer = optim.Adam([model.alpha], lr=1e-3)
params = dict(model.named_parameters())
# NOTE(review): loop nesting reconstructed from a paste without indentation;
# confirm the meta-update is meant to run once per epoch.
for i in range(epochs):
    model.train()
    train_loss = 0
    for batch_idx, (x, y) in enumerate(dataset):
        logits = _stateless.functional_call(model, params, x)  # predict
        loss_inner = loss_func(logits, y)  # loss
        meta_optimizer.zero_grad()  # reset grad
        loss_inner.backward(create_graph=True, inputs=params.values())  # compute grad
        train_loss += loss_inner.item()  # store loss
        for k, p in params.items():
            # Compare names by value; ``is not`` tests object identity and
            # only works on string literals by accident.
            if k not in ('alpha', 'beta'):
                p.update = - model.alpha * p.grad
                params[k] = p + p.update  # update weight
    # loss_inner is already a per-batch mean, so average over the number of
    # batches rather than the number of samples.
    print('Train Epoch: {}\tLoss: {:.6f}'.format(i, train_loss / len(dataset)))
    logits = _stateless.functional_call(model, params, x)  # predict
    loss_meta = loss_func(logits, y)
    loss_meta.backward()
    # Step the optimizer; a loss tensor has no ``step`` method.
    meta_optimizer.step()
From the error message, I understand that the issue comes from weight update for the weights of the second layer of the network, which points to an error in my inner loop optimization. Any suggestions would be appreciated.
See the linked discussion: re-create params at the start of each epoch, and clone them at the start of each inner batch:
https://discuss.pytorch.org/t/issue-using-parameters-internal-method/134549/11
for i in range(epochs):
model.train()
train_loss = 0
params = dict(model.named_parameters()) # add this
for batch_idx, (x, y) in enumerate(dataset):
params = {k: v.clone() for k,v in params.items()} # add this
logits = _stateless.functional_call(model, params, x) # predict
loss_inner = loss_func(logits, y)
..................
You should be updating params[k].data instead of params[k]
(Deleted the example to avoid distraction)
Let me enter in a kind of fundamental discussion (not an answer to your question).
If I understand correctly, you want to compute loss(f(w[i], x)), and then compute w[i+1,j] = w[i,j] + g(v[j], w[i,j].grad(w.r.t. loss)). Then in the end you want to compute v[j+1] = v[j] + v[j].grad(w.r.t. loss).
The gradient of v[j] is computed using backpropagation, as a function of the gradient of w[i,j]. So what you are trying to do is to choose a v[j] that results in a good w[i,j]. I would ask: why bother with v[j] if you can control w[i,j] directly? That is what the standard approach does.
I'm working with two tensors, inputs and labels, and I want to combine them to train a model. I'm using torch 1.7, but I can't use TensorDataset() and then apply DataLoader(), due to some incompatibilities with other packages when I use TensorDataset(). Is there another solution to my problem?
Summary:
2 Tensors --> DataLoader without using TensorDataset()
You can construct your own custom DataSet:
class MyDataSet(torch.utils.data.Dataset):
    """Pair two tensors sample-by-sample along their first dimension.

    Args:
        x: input tensor; dimension 0 indexes samples.
        y: label tensor with the same size along dimension 0 (may be 1-D).

    Raises:
        ValueError: if ``x`` and ``y`` hold a different number of samples.
    """

    def __init__(self, x, y):
        super(MyDataSet, self).__init__()
        if x.shape[0] != y.shape[0]:
            raise ValueError(
                "x and y must have the same number of samples, got {} and {}".format(
                    x.shape[0], y.shape[0]))
        # store the raw tensors
        self._x = x
        self._y = y

    def __len__(self):
        # a DataSet must know its size
        return self._x.shape[0]

    def __getitem__(self, index):
        # Index only the first dimension so 1-D tensors (e.g. class labels)
        # work as well as multi-dimensional ones.
        x = self._x[index]
        y = self._y[index]
        return x, y
I remember in the past, nn.Linear only accepts 2D tensors.
But today, I discover that nn.Linear now accepts 3D, or even tensors with arbitrary dimensions.
# A 5-D input whose last dimension (10) matches the layer's in_features.
X = torch.randn((20,20,20,20,10))
linear_layer = nn.Linear(10,5)
# Only the last dimension changes (10 -> 5); the leading ones are kept.
output = linear_layer(X)
print(output.shape)
>>> torch.Size([20, 20, 20, 20, 5])
When I check the documentation for Pytorch, it does say that it now takes
Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
additional dimensions and :math:`H_{in} = \text{in\_features}`
So it seems to me that Pytorch nn.Linear now reshape the input by x.view(-1, input_dim) automatically.
But I cannot find any x.shape or x.view in the source code:
class Linear(Module):
    """Quoted ``nn.Linear``: holds a weight (and optional bias) and applies ``F.linear``."""
    __constants__ = ['bias']

    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # The weight is allocated as (out_features, in_features).
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            # Register the name with no tensor so the ``self.bias is not None``
            # checks below still work.
            self.register_parameter('bias', None)
        # The tensors above are uninitialised until this call fills them.
        self.reset_parameters()

    def reset_parameters(self):
        """Fill weight (Kaiming uniform) and bias (uniform in +/- 1/sqrt(fan_in))."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    # presumably the ``@weak_script_method`` decorator in the upstream source,
    # shown here as a comment -- TODO confirm
    def forward(self, input):
        # No reshape or view here: the input is handed to F.linear unchanged.
        return F.linear(input, self.weight, self.bias)

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
Can anyone confirm this?
torch.nn.Linear uses torch.nn.functional.linear function under the hood, that's where the operations are taking places (see documentation).
It looks like this (removed docstrings and decorators for brevity):
def linear(input, weight, bias=None):
    """Multiply ``input`` by ``weight.t()`` and add ``bias`` when one is given."""
    if input.dim() == 2 and bias is not None:
        # fused op is marginally faster
        ret = torch.addmm(bias, input, weight.t())
    else:
        # Any other rank (or no bias): matmul on the input as-is; no
        # view/reshape is applied in this function.
        output = input.matmul(weight.t())
        if bias is not None:
            # In-place add on the freshly computed product.
            output += bias
        ret = output
    return ret
The first case is addmm, which implements beta*mat + alpha*(mat1 @ mat2) and is supposedly faster (see here for example).
Second operation is matmul, and as one can read in their docs it performs various operations based on the shape of tensors provided (five cases, not going to copy them blatantly here).
In summary it preserves dimensions between first batch and last features dimension. No view() is used whatsoever, especially not this x.view(-1, input_dim), check the code below:
import torch
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(10, 4, 5)
# Batched matmul: (10, 3, 4) @ (10, 4, 5) -> (10, 3, 5).
print(torch.matmul(tensor1, tensor2).shape)
# A view(-1, 3) afterwards regroups the 150 values into (50, 3).
print(torch.matmul(tensor1, tensor2).view(-1, tensor1.shape[1]).shape)
which gives:
torch.Size([10, 3, 5]) # preserves input's 3
torch.Size([50, 3]) # destroys the batch even
Following the "Temporal Encoding" section on page 5 of https://arxiv.org/pdf/1503.08895.pdf (an excellent paper by the way), I have say N many embedded vectors of dimension M. So my Keras tensor is (batch size, N, M) and I want to add an N by M matrix of weights to each of the batch-size-many samples. To that end I've created my own Keras layer:
from constants import BATCH_SIZE
class Added_Weights(Layer):
    """Keras layer that adds a trainable kernel to its input.

    As posted, the kernel has one (input_dim[0], input_dim[1]) slice per
    batch position, so each sample in the batch gets different weights.

    Args:
        input_dim: pair giving the per-sample matrix shape.
        output_dim: stored on the layer; not used by the methods shown here.
    """

    def __init__(self, input_dim, output_dim, **kwargs):
        self.output_dim = output_dim
        self.input_dim = input_dim
        super(Added_Weights, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(BATCH_SIZE, self.input_dim[0], self.input_dim[1]),
                                      initializer=RandomNormal(mean=0., stddev=0.05, seed=None),
                                      trainable=True)
        # Format the shapes; concatenating them to a str raises TypeError.
        print("kernel has shape {} or {}".format(self.kernel.shape, K.int_shape(self.kernel)))
        super(Added_Weights, self).build(input_shape)

    def call(self, x, **kwargs):
        return Add()([x, self.kernel])

    def compute_output_shape(self, input_shape):
        return (BATCH_SIZE, self.input_dim[0], self.input_dim[1])
And this WORKS, but the problem is that each of the BATCH_SIZE many matrices has different weights. I need to be adding the same weights to each of the samples in the batch.
So I've tried a couple things. Keras has a built in RepeatVector layer, so I tried giving the kernel shape (N, M) and doing RepeatVector (BATCH_SIZE)(kernel), but for some reason that ends up with shape (N, BATCH_SIZE, M). I'd like to use a Reshape there, but Reshape() treats the first dimension as the batch_size and won't allow me to modify it. Permute() has the same problem.
Another thought was to make the initial shape as it is in the code, and then loop over the tensor to set slices 1 through BATCH_SIZE-1 equal to slice 0, so they're all holding the same weights, but I'm not allowed to assign values to Keras tensors that way.
The only other thought I had was to just try it with shape (N, M) and hope Keras is smart enough to add it to each slice of the input, but after the Add() is applied to my (?, N, M) and the (N, M) kernel, somehow I end up with an (N, N, M) tensor, at which point we're dead.
I think you are overcomplicating things. Just define the weights as a N x M tensor in build and perform a sum with the input tensor in call. I tweaked your code as follows:
from keras.engine.topology import Layer
from keras.models import Model
from keras.layers import Input
import numpy as np
N = 3  # rows of each sample matrix
M = 4  # columns of each sample matrix
BATCH_SIZE = 1  # number of samples in the demo batch
class Added_Weights(Layer):
    """Adds one trainable kernel, shaped like a single sample, to every sample."""

    def __init__(self, **kwargs):
        super(Added_Weights, self).__init__(**kwargs)

    def build(self, input_shape):
        # The kernel spans the two axes that follow the batch axis.
        rows = input_shape[1]
        cols = input_shape[2]
        self.kernel = self.add_weight(
            name='kernel',
            shape=(rows, cols),
            initializer='ones',  # TODO: Choose your initializer
            trainable=True,
        )
        super(Added_Weights, self).build(input_shape)

    def call(self, x, **kwargs):
        # Broadcasting stretches the (N, M) kernel across the batch axis of
        # the (BATCH_SIZE, N, M) input, so all samples share the same weights.
        shifted = x + self.kernel
        return shifted

    def compute_output_shape(self, input_shape):
        # Element-wise addition leaves the shape unchanged.
        return input_shape
# Symbolic input: each sample is an (N, M) matrix.
a = Input(shape=(N, M))
layer = Added_Weights()(a)
model = Model(inputs=a,
              outputs=layer)
# ``a`` is rebound here to a concrete all-zeros batch, so the printed
# prediction shows the kernel that was added to it.
a = np.zeros(shape=(BATCH_SIZE, N, M))
pred = model.predict(a)
print(pred)
Note that self.kernel is being implicitly broadcast in call to match the shape of x, so the same weights are being added to each sample in the batch.