How to fix 'Expected object of scalar type Float but got scalar type Double for argument #4 'mat1''? - pytorch

I am trying to build an LSTM model. My model code is below.
My input has 4 features, a sequence length of 5, and a batch size of 32.
class RNN(nn.Module):
    def __init__(self, feature_dim, output_size, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param feature_dim: The number of input dimensions of the neural network
        :param output_size: The number of output dimensions of the neural network
        :param hidden_dim: The size of the hidden layer outputs
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(RNN, self).__init__()
        # set class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # define model layers
        self.lstm = nn.LSTM(feature_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network
        :param hidden: The hidden state
        :return: Two Tensors, the output of the neural network and the latest hidden state
        """
        # Get batch size
        batch_size = nn_input.size(0)
        # Pass through LSTM layer
        lstm_out, hidden = self.lstm(nn_input, hidden)
        # Stack up LSTM outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # Add dropout and pass through fully connected layer
        x = self.dropout(lstm_out)
        x = self.fc(x)
        # reshape to be batch_size first
        output = x.view(batch_size, -1, self.output_size)
        # take the output of the last time step
        out = output[:, -1]
        # return one batch of output word scores and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU
        :param batch_size: The batch_size of the hidden state
        :return: hidden state of dims (n_layers, batch_size, hidden_dim)
        '''
        # initialize state with zero weights, and move to GPU if available
        weight = next(self.parameters()).data
        if is_gpu_available:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
When I train, I get the following error:
RuntimeError Traceback (most recent call last)
/usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py in <module>
3
4 # training the model
----> 5 trained_rnn = train_rnn(rnn, batch_size, optimizer, num_epochs, show_every_n_batches)
6
7 # saving the trained model
/usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py in train_rnn(rnn, batch_size, optimizer, n_epochs, show_every_n_batches)
18
19 # forward, back prop
---> 20 loss, hidden = forward_back_prop(rnn, optimizer, inputs, labels, hidden)
21 # record loss
22 batch_losses.append(loss)
/usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py in forward_back_prop(rnn, optimizer, inp, target, hidden)
22
23 # get the output from the model
---> 24 output, h = rnn(inp, h)
25
26 # calculate the loss and perform backprop
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
/usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py in forward(self, nn_input, hidden)
36
37 # Pass through LSTM layer
---> 38 lstm_out, hidden = self.lstm(nn_input, hidden)
39 # Stack up LSTM outputs
40 lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
557 return self.forward_packed(input, hx)
558 else:
--> 559 return self.forward_tensor(input, hx)
560
561
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in forward_tensor(self, input, hx)
537 unsorted_indices = None
538
--> 539 output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
540
541 return output, self.permute_hidden(hidden, unsorted_indices)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices)
520 if batch_sizes is None:
521 result = _VF.lstm(input, hx, self._get_flat_weights(), self.bias, self.num_layers,
--> 522 self.dropout, self.training, self.bidirectional, self.batch_first)
523 else:
524 result = _VF.lstm(input, batch_sizes, hx, self._get_flat_weights(), self.bias,
RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #4 'mat1'
I am not able to figure out the cause of this error. How can I fix it? Please help.
Also, is this the correct way of implementing an LSTM, or is there a better way to achieve the same thing?

torch.nn.LSTM does not need any hidden-state initialization: if you don't pass one, it defaults to zeros (see the documentation).
Furthermore, torch.nn.Module already has a predefined cuda() method, so a module can be moved to the GPU in one call; you can therefore safely delete init_hidden(self, batch_size).
You get this error because your input is of type torch.double, while module parameters are torch.float by default (float32 is usually accurate enough, and it is faster and smaller than torch.double).
You can cast your input tensors by calling .float(); in your case it could look like this:

def forward(self, nn_input, hidden):
    nn_input = nn_input.float()
    ...  # rest of your code
Finally, there is no need for the hidden argument if it is always zeros; you can simply use:

lstm_out, hidden = self.lstm(nn_input)  # no explicit hidden state

since the hidden state defaults to zeros as well.
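Alternatively, you can cast the data once when you build the tensors rather than on every forward pass. A small sketch, with a hypothetical features array standing in for your data:

import numpy as np
import torch

# hypothetical stand-in for the data: 32 sequences of length 5 with 4 features
features = np.random.randn(32, 5, 4)

# torch.from_numpy keeps NumPy's default float64, so cast to float32 once here
nn_input = torch.from_numpy(features).float()
print(nn_input.dtype)  # torch.float32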

Related

Pytorch : Expected all tensors on same device

I have my model and inputs on the same device, but I still get the runtime error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Here is my code. First, my model implementation:
import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_hiddens, n_feature=2, n_output=1):
        super().__init__()
        self.hiddens = []
        n_hidden_in = n_feature
        for n_hidden in n_hiddens:
            self.hiddens.append(torch.nn.Linear(n_hidden_in, n_hidden))  # hidden layer
            n_hidden_in = n_hidden
        self.predict = torch.nn.Linear(n_hidden, n_output)  # output layer

    def forward(self, x):
        for hidden in self.hiddens:
            x = F.relu(hidden(x))  # activation function for hidden layer
        x = self.predict(x)  # linear output
        return x
Then I define my dataloaders. Here, X and y are NumPy arrays:
from torch.utils.data import TensorDataset, DataLoader

# Split training/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.from_numpy(X_train)
y_train_tensor = torch.from_numpy(y_train)
X_test_tensor = torch.from_numpy(X_test)
y_test_tensor = torch.from_numpy(y_test)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  # create your dataset
train_dataloader = DataLoader(train_dataset, batch_size=1000)  # create your dataloader
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)  # create your dataset
test_dataloader = DataLoader(test_dataset, batch_size=1000)  # create your dataloader
Here I train my model. The error occurs during the line "outputs = regressor(inputs)"
NUM_EPOCHS = 2000
BATCH_SIZE = 1000
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device used : {device}")

# 1 hidden layer
total_num_nodes = 256
regressor = Net(n_hiddens=[total_num_nodes]).to(device)
optimizer = torch.optim.SGD(regressor.parameters(), lr=0.2, momentum=0.1, nesterov=True)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        inputs, values = data
        inputs = inputs.float().to(device)
        values = values.float().to(device)
        optimizer.zero_grad()  # clear gradients for next train
        print(f"Input device is : cuda:{inputs.get_device()}")
        print(f"Target value device is : cuda:{values.get_device()}")
        print(f"Is model on cuda ? : {next(regressor.parameters()).is_cuda}")
        outputs = regressor(inputs)  # <-- This is where I have the error
        loss = loss_func(outputs, values)
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()  # apply gradients
Here are the outputs of my print statements :
Device used : cuda:0
Input device is : cuda:0
Target value device is : cuda:0
Is model on cuda ? :True
This should mean that my model and my tensors are all on the same device, so why do I still get this error?
The error log is :
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-6-5234b830bebc> in <module>()
24 print(f"Target value device is : cuda:{values.get_device()}")
25 print(f"Is model on cuda ? : {next(regressor.parameters()).is_cuda}")
---> 26 outputs = regressor(inputs)
27 loss = loss_func(outputs, values)
28 loss.backward() # backpropagation, compute gradients
4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
<ipython-input-4-56c54b30b771> in forward(self, x)
16 def forward(self, x):
17 for hidden in self.hiddens :
---> 18 x = F.relu(hidden(x)) # activation function for hidden layer
19 x = self.predict(x) # linear output
20 return x
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/linear.py in forward(self, input)
101
102 def forward(self, input: Tensor) -> Tensor:
--> 103 return F.linear(input, self.weight, self.bias)
104
105 def extra_repr(self) -> str:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Thank you very much
TL;DR: use nn.ModuleList instead of a plain Python list to store the hidden layers in Net.
All your hidden layers are stored in a simple Python list self.hiddens in Net. When you move your model to the GPU using .to(device), PyTorch has no way to tell that the elements of this plain list should also be moved to the same device.
However, if you make self.hiddens = nn.ModuleList(), PyTorch knows to treat every element of this special list as an nn.Module and recursively moves them to the same device as Net.
See these answers 1, 2, 3 for more details.
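A minimal sketch of that fix, keeping the question's Net and swapping only the container:

import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_hiddens, n_feature=2, n_output=1):
        super().__init__()
        # nn.ModuleList registers each layer, so .to(device) moves them as well
        self.hiddens = torch.nn.ModuleList()
        n_hidden_in = n_feature
        for n_hidden in n_hiddens:
            self.hiddens.append(torch.nn.Linear(n_hidden_in, n_hidden))
            n_hidden_in = n_hidden
        self.predict = torch.nn.Linear(n_hidden_in, n_output)

    def forward(self, x):
        for hidden in self.hiddens:
            x = F.relu(hidden(x))
        return self.predict(x)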

Ragged Tensors of non-text data as input for LSTM

I am learning about ragged tensors with applications for particle tracking. I have the following minimal example, which reproduces the error I keep experiencing.
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Input, TimeDistributed
from tensorflow.keras.models import Sequential

n = 10
data_n = 32
batch_size = 8
window_length = 8
splits = [n] * data_n

#### Create a ragged tensor with shape (32, None, 8)
t0 = tf.zeros([data_n * n, window_length])
t1 = tf.RaggedTensor.from_row_lengths(t0, splits)
max_seq = t1.bounding_shape()[-1]

#### Define Model
def create_model(batch_size, window_length, max_seq):
    lstm_model = Sequential([
        Input(
            batch_shape=[batch_size, None, window_length],
            batch_size=batch_size,
            dtype=tf.float32,
            ragged=True
        ),
        LSTM(
            max_seq,
            return_sequences=True,
            input_shape=(window_length, None)
        ),
        TimeDistributed(Dense(units=1))
    ])
    return lstm_model

lstm_model = create_model(batch_size=batch_size, window_length=window_length, max_seq=max_seq)
lstm_model(t1[0:8])
When I execute the above code, I get the following error:
---------------------------------------------------------------------------
_FallbackException Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py in cudnn_rnnv3(input, input_h, input_c, params, sequence_lengths, rnn_mode, input_mode, direction, dropout, seed, seed2, num_proj, is_training, time_major, name)
1889 try:
-> 1890 _result = pywrap_tfe.TFE_Py_FastPathExecute(
1891 _ctx._context_handle, tld.device_name, "CudnnRNNV3", name,
_FallbackException: Expecting float value for attr dropout, got int
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-19-7609e2877e20> in <module>
1 lstm_model = create_model(batch_size=batch_size, window_length=window_length, max_seq=max_seq)
----> 2 lstm_model(t1[0:8])
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
966 with base_layer_utils.autocast_context_manager(
967 self._compute_dtype):
--> 968 outputs = self.call(cast_inputs, *args, **kwargs)
969 self._handle_activity_regularization(inputs, outputs)
970 self._set_mask_metadata(inputs, outputs, input_masks)
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py in call(self, inputs, training, mask)
275 if not self.built:
276 self._init_graph_network(self.inputs, self.outputs, name=self.name)
--> 277 return super(Sequential, self).call(inputs, training=training, mask=mask)
278
279 outputs = inputs # handle the corner case where self.layers is empty
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py in call(self, inputs, training, mask)
715 ' implement a `call` method.')
716
--> 717 return self._run_internal_graph(
718 inputs, training=training, mask=mask,
719 convert_kwargs_to_constants=base_layer_utils.call_context().saving)
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/network.py in _run_internal_graph(self, inputs, training, mask, convert_kwargs_to_constants)
886
887 # Compute outputs.
--> 888 output_tensors = layer(computed_tensors, **kwargs)
889
890 # Update tensor_dict.
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in __call__(self, inputs, initial_state, constants, **kwargs)
652
653 if initial_state is None and constants is None:
--> 654 return super(RNN, self).__call__(inputs, **kwargs)
655
656 # If any of `initial_state` or `constants` are specified and are Keras
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
966 with base_layer_utils.autocast_context_manager(
967 self._compute_dtype):
--> 968 outputs = self.call(cast_inputs, *args, **kwargs)
969 self._handle_activity_regularization(inputs, outputs)
970 self._set_mask_metadata(inputs, outputs, input_masks)
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent_v2.py in call(self, inputs, mask, training, initial_state)
1178 # GPU implementation when GPU is available.
1179 if can_use_gpu:
-> 1180 last_output, outputs, new_h, new_c, runtime = gpu_lstm(
1181 **gpu_lstm_kwargs)
1182 else:
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent_v2.py in gpu_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, time_major, go_backwards, sequence_lengths)
1404 inputs = array_ops.reverse_sequence_v2(
1405 inputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis)
-> 1406 outputs, h, c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(
1407 inputs,
1408 input_h=init_h,
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py in cudnn_rnnv3(input, input_h, input_c, params, sequence_lengths, rnn_mode, input_mode, direction, dropout, seed, seed2, num_proj, is_training, time_major, name)
1899 except _core._FallbackException:
1900 try:
-> 1901 return cudnn_rnnv3_eager_fallback(
1902 input, input_h, input_c, params, sequence_lengths,
1903 rnn_mode=rnn_mode, input_mode=input_mode, direction=direction,
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py in cudnn_rnnv3_eager_fallback(input, input_h, input_c, params, sequence_lengths, rnn_mode, input_mode, direction, dropout, seed, seed2, num_proj, is_training, time_major, name, ctx)
1999 "direction", direction, "dropout", dropout, "seed", seed, "seed2", seed2,
2000 "num_proj", num_proj, "is_training", is_training, "time_major", time_major)
-> 2001 _result = _execute.execute(b"CudnnRNNV3", 5, inputs=_inputs_flat,
2002 attrs=_attrs, ctx=ctx, name=name)
2003 if _execute.must_record_gradient():
~/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
InvalidArgumentError: Invalid input_h shape: [1,8,8] [10,8,8] [Op:CudnnRNNV3]
The 10 refers to the number of units in the LSTM layer, which is equal to the bounding shape of t1. The two 8's refer to batch_size and window_length. I thought the 1 referred to the output shape, but that is not the case: the number remains the same when I add more units to the Dense layer.
When working with tf.RaggedTensor inputs of variable sequence length, you want to set batch_size = 1 and ensure that the sequence length seen by the LSTM is None.
This is because even though tf.RaggedTensor is a great way to store variable-sized sequences, the LSTM still expects the same sequence length within each batch; the length can only vary across batches.
Making those changes should fix the issue you are facing.
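A rough sketch of those changes, reusing the question's t1, window_length, and max_seq (hypothetical, keeping the question's layer sizes):

def create_model(window_length, max_seq):
    # batch of exactly one sequence; the time dimension stays None,
    # so each single-sequence batch may have any length
    lstm_model = Sequential([
        Input(
            batch_shape=[1, None, window_length],
            dtype=tf.float32,
            ragged=True
        ),
        LSTM(max_seq, return_sequences=True),
        TimeDistributed(Dense(units=1))
    ])
    return lstm_model

lstm_model = create_model(window_length=window_length, max_seq=max_seq)
lstm_model(t1[0:1])  # one ragged sequence per call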
This is a bug I encountered as well. It is specific to the combination of:
- the Keras implementation of RNN / LSTM,
- the TensorFlow version,
- using a GPU, and
- using a RaggedTensor.
The erroneous source code compares the shape of the ragged input to the RNN cell with the shape of its hidden state; the ragged dimension is shown as 1 in the error message.
I was able to replicate the same error using your code and TensorFlow 2.2.0. Changing the code to remove the ragged tensor, running on CPU, or using a different TensorFlow version all got this code to work.
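For instance, to try the CPU path without otherwise changing the model (a quick workaround sketch, reusing the question's lstm_model and t1):

import tensorflow as tf

# pin the forward pass to the CPU so the cuDNN GPU kernel is never selected
with tf.device('/CPU:0'):
    out = lstm_model(t1[0:8])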

Removing last 2 layers from a BERT classifier results in " 'tuple' object has no attribute 'dim' " error. Why?

I fine-tuned a Hugging Face transformer using Keras (with ktrain) and then reloaded the model in PyTorch.
I want to access the third-to-last layer (pre_classifier), so I removed the last two layers:
BERT2 = torch.nn.Sequential(*(list(BERT.children())[:-2]))
Running an encoded sentence through this yields the following error message:
AttributeError Traceback (most recent call last)
<ipython-input-38-640702475573> in <module>
----> 1 ans2=BERT2(torch.tensor([e1]))
2 print (ans2)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\container.py in forward(self, input)
90 def forward(self, input):
91 for module in self._modules.values():
---> 92 input = module(input)
93 return input
94
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\linear.py in forward(self, input)
85
86 def forward(self, input):
---> 87 return F.linear(input, self.weight, self.bias)
88
89 def extra_repr(self):
C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\functional.py in linear(input, weight, bias)
1366 - Output: :math:`(N, *, out\_features)`
1367 """
-> 1368 if input.dim() == 2 and bias is not None:
1369 # fused op is marginally faster
1370 ret = torch.addmm(bias, input, weight.t())
AttributeError: 'tuple' object has no attribute 'dim'
Meanwhile, deleting the classifier entirely (all three layers)
BERT3 = torch.nn.Sequential(*(list(BERT.children())[:-3]))
yields the expected tensor (within a size-1 tuple) with the expected shape ([sentence_num, token_num, 768]).
Why does the removal of two (but not three) layers break the model?
And how can I access the pre_classifier results?
They are not accessible by setting the config with output_hidden_states=True, as this flag returns the hidden values of the BERT transformer stack, not those of the classifier layers downstream of it.
--
PS
The code used to initialize the BERT model:
def collect_data_for_FT():
    from sklearn.datasets import fetch_20newsgroups
    train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
    test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
    print('size of training set: %s' % (len(train_data['data'])))
    print('size of validation set: %s' % (len(test_data['data'])))
    print('classes: %s' % (train_data.target_names))
    x_train = train_data.data
    y_train = train_data.target
    x_test = test_data.data
    y_test = test_data.target
    return (x_train, y_train, x_test, y_test)

bert_name = 'distilbert-base-uncased'
from transformers import DistilBertForSequenceClassification, AutoConfig, AutoTokenizer
import os
dir_path = os.getcwd()
dir_path = os.path.join(dir_path, 'models')
config = AutoConfig.from_pretrained(bert_name, num_labels=20)  # change model configuration to access hidden values.
try:
    BERT = DistilBertForSequenceClassification.from_pretrained(dir_path, config=config)
    print("Finetuned predictor loaded")
except:
    import tensorflow.keras as keras
    print("No finetuned predictor found.\nTraining.")
    (x_train, y_train, x_test, y_test) = collect_data_for_FT()
    ####
    # prework:
    import ktrain
    from ktrain import text
    t = text.Transformer(bert_name, maxlen=500, classes=train_b.target_names)
    trn = t.preprocess_train(x_train, y_train)
    val = t.preprocess_test(x_test, y_test)
    pre_trained_model = t.get_classifier()
    learner = ktrain.get_learner(pre_trained_model, train_data=trn, val_data=val, batch_size=6)
    ####
    ####
    # Find best learning rate
    learner.lr_find()
    learner.lr_plot()
    ####
    learner.fit_onecycle(2e-4, 4)  # chosen based on the learning rate/loss plot.
    ####
    # prepare and save:
    predictor = ktrain.get_predictor(learner.model, preproc=t)
    predictor.save('my_distilbertbase_predictor')
    predictor.model.save_pretrained(dir_path)
    ####
    BERT = DistilBertForSequenceClassification.from_pretrained(os.path.join(dir_path), from_tf=True, config=config)  # re-load tensorflow to pytorch
    BERT.save_pretrained(dir_path)  # save as a "full blooded" pytorch model
    BERT = DistilBertForSequenceClassification.from_pretrained(dir_path, config=config)  # re-load
    from tensorflow.keras import backend as K
    K.clear_session()  # loading from tensorflow takes up space and the GPU. This releases it.

Pytorch: ValueError: Expected input batch_size (32) to match target batch_size (64)

I tried to run the CNN examples on the MNIST dataset with batch size = 64, channels = 1, n_h = 28, n_w = 28, n_iters = 1000. The program runs for the first 500 iterations and then gives the above-mentioned error.
Similar topics have already been discussed on the forum, such as topic 1
and topic 2, but none of them helped me identify the mistake in the following code:
class CNN_MNIST(nn.Module):
    def __init__(self):
        super(CNN_MNIST, self).__init__()
        # convolution layer 1
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5,
                              stride=1, padding=2)
        # ReLU activation
        self.relu1 = nn.ReLU()
        # maxpool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # convolution 2
        self.cnn2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5,
                              stride=1, padding=2)
        # ReLU activation
        self.relu2 = nn.ReLU()
        # maxpool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # fully connected 1
        self.fc1 = nn.Linear(7*7*64, 1000)
        # fully connected 2
        self.fc2 = nn.Linear(1000, 10)

    def forward(self, x):
        # convolution 1
        out = self.cnn1(x)
        # activation function
        out = self.relu1(out)
        # maxpool 1
        out = self.maxpool1(out)
        # convolution 2
        out = self.cnn2(out)
        # activation function
        out = self.relu2(out)
        # maxpool 2
        out = self.maxpool2(out)
        # flatten the output
        out = out.view(out.size(0), -1)
        # fully connected layers
        out = self.fc1(out)
        out = self.fc2(out)
        return out
# model training
count = 0
loss_list = []
iteration_list = []
accuracy_list = []

for epoch in range(int(n_epochs)):
    for i, (image, labels) in enumerate(train_loader):
        train = Variable(image)
        labels = Variable(labels)
        # clear gradient
        optimizer.zero_grad()
        # forward propagation
        output = cnn_model(train)
        # calculate softmax and cross entropy loss
        loss = error(output, label)
        # calculate gradients
        loss.backward()
        # update the optimizer
        optimizer.step()
        count += 1
        if count % 50 == 0:
            # calculate the accuracy
            correct = 0
            total = 0
            # iterate through the test data
            for image, labels in test_loader:
                test = Variable(image)
                # forward propagation
                output = cnn_model(test)
                # get prediction
                predict = torch.max(output.data, 1)[1]
                # total number of labels
                total += len(labels)
                # correct prediction
                correct += (predict == labels).sum()
            # accuracy
            accuracy = 100 * correct / float(total)
            # store loss, number of iterations, and accuracy
            loss_list.append(loss.data)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
            # print loss and accuracy as the algorithm progresses
            if count % 500 == 0:
                print('Iteration :{} Loss :{} Accuracy : {}'.format(count, loss.item(), accuracy))
The error is as follows:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-9e93a242961b> in <module>
18
19 # calculate softmax and cross entropy loss
---> 20 loss = error(output,label)
21
22 # calculate gradients
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
545 result = self._slow_forward(*input, **kwargs)
546 else:
--> 547 result = self.forward(*input, **kwargs)
548 for hook in self._forward_hooks.values():
549 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
914 def forward(self, input, target):
915 return F.cross_entropy(input, target, weight=self.weight,
--> 916 ignore_index=self.ignore_index, reduction=self.reduction)
917
918
~\Anaconda3\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
1993 if size_average is not None or reduce is not None:
1994 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 1995 return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
1996
1997
~\Anaconda3\lib\site-packages\torch\nn\functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1820 if input.size(0) != target.size(0):
1821 raise ValueError('Expected input batch_size ({}) to match target batch_size ({}).'
-> 1822 .format(input.size(0), target.size(0)))
1823 if dim == 2:
1824 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
ValueError: Expected input batch_size (32) to match target batch_size (64).
You are providing the wrong target to your loss:

loss = error(output, label)

while your loader gives you

for i, (image, labels) in enumerate(train_loader):
    train = Variable(image)
    labels = Variable(labels)

So you have a variable named labels (with an s) from the loader, yet you feed label (without the s) to your loss.
Batch size is the least of your worries.
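In other words, the fix is a one-liner: feed the loader's labels to the loss (the stale label presumably survives from an earlier cell with a different batch size):

loss = error(output, labels)  # use the current batch's targets, not a stale variable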

Input Dimensions Tensorflow v1.8 ConvLSTMCell

ConvLSTMCell Official Docs
GitHub _conv where the error occurs
Issue
I'm experimenting with ConvLSTMCell in TensorFlow r1.8. The error I keep generating occurs in the __call__ method of ConvLSTMCell, when the _conv method is invoked:
ValueError: Conv Linear Expects 3D, 4D, 5D
The error is raised from the unstacked inputs. unstacked (in this example) has dimensions of [BATCH_SIZE, N_INPUTS] = [2,5]. I am using tf.unstack to generate the sequence that ConvLSTMCell requires.
Why use tf.unstack?
If the input array is not unstacked, the TypeError below is raised.
TypeError: inputs must be a sequence
Question
What am I missing on the formatting? I've read through related issues but have not found anything that has guided me into a working implementation.
Are the placeholder dimensions correct?
Should I be unstacking or is there a better way?
Am I providing the proper input dimension into the ConvLSTMCell?
Code
# Parameters
TIME_STEPS = 28
N_INPUT = 5
N_HIDDEN = 128
LEARNING_RATE = 0.001
NUM_UNITS = 28
CHANNEL = 1

tf.reset_default_graph()

# Input placeholders
x = tf.placeholder(tf.float32, [BATCH_SIZE, TIME_STEPS, N_INPUT])
y = tf.placeholder(tf.float32, [None, 1])

# Format input as a sequence for LSTM Input
unstacked = tf.unstack(x, TIME_STEPS, 1)  # shape=(timesteps, batch, inputs)

# Convolutional LSTM Layer
lstm_layer = tf.contrib.rnn.ConvLSTMCell(
    conv_ndims=1,
    input_shape=[BATCH_SIZE, N_INPUT],
    output_channels=5,
    kernel_shape=[7,5]
)

# Error is generated when the lstm_layer is invoked
outputs, _ = tf.contrib.rnn.static_rnn(
    lstm_layer,
    unstacked,
    dtype=tf.float32)
Error Message
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-83-3568a097e4ea> in <module>()
10 lstm_layer,
11 unstacked,
---> 12 dtype=tf.float32)
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py in static_rnn(cell, inputs, initial_state, dtype, sequence_length, scope)
1322 state_size=cell.state_size)
1323 else:
-> 1324 (output, state) = call_cell()
1325
1326 outputs.append(output)
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py in <lambda>()
1309 varscope.reuse_variables()
1310 # pylint: disable=cell-var-from-loop
-> 1311 call_cell = lambda: cell(input_, state)
1312 # pylint: enable=cell-var-from-loop
1313 if sequence_length is not None:
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/ops/rnn_cell_impl.py in __call__(self, inputs, state, scope)
230 setattr(self, scope_attrname, scope)
231 with scope:
--> 232 return super(RNNCell, self).__call__(inputs, state)
233
234 def _rnn_get_variable(self, getter, *args, **kwargs):
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/python/layers/base.py in __call__(self, inputs, *args, **kwargs)
715
716 if not in_deferred_mode:
--> 717 outputs = self.call(inputs, *args, **kwargs)
718 if outputs is None:
719 raise ValueError('A layer\'s `call` method should return a Tensor '
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/contrib/rnn/python/ops/rnn_cell.py in call(self, inputs, state, scope)
2110 cell, hidden = state
2111 new_hidden = _conv([inputs, hidden], self._kernel_shape,
-> 2112 4 * self._output_channels, self._use_bias)
2113 gates = array_ops.split(
2114 value=new_hidden, num_or_size_splits=4, axis=self._conv_ndims + 1)
~/miniconda3/envs/MultivariateTimeSeries/lib/python3.6/site-packages/tensorflow/contrib/rnn/python/ops/rnn_cell.py in _conv(args, filter_size, num_features, bias, bias_start)
2184 if len(shape) not in [3, 4, 5]:
2185 raise ValueError("Conv Linear expects 3D, 4D "
-> 2186 "or 5D arguments: %s" % str(shapes))
2187 if len(shape) != len(shapes[0]):
2188 raise ValueError("Conv Linear expects all args "
ValueError: Conv Linear expects 3D, 4D or 5D arguments: [[2, 5], [2, 2, 5]]
Here's an example with a couple of tweaks, which at least passes static shape checking:
import tensorflow as tf

# Parameters
TIME_STEPS = 28
N_INPUT = 5
N_HIDDEN = 128
LEARNING_RATE = 0.001
NUM_UNITS = 28
CHANNEL = 1
BATCH_SIZE = 16

# Input placeholders
x = tf.placeholder(tf.float32, [BATCH_SIZE, TIME_STEPS, N_INPUT])
y = tf.placeholder(tf.float32, [None, 1])

# Format input as a sequence for LSTM Input
unstacked = tf.unstack(x[..., None], TIME_STEPS, 1)  # shape=(timesteps, batch, inputs)

# Convolutional LSTM Layer
lstm_layer = tf.contrib.rnn.ConvLSTMCell(
    conv_ndims=1,
    input_shape=[N_INPUT, 1],
    output_channels=5,
    kernel_shape=[7]
)

# With these shapes, invoking the lstm_layer passes static shape checking
outputs, _ = tf.contrib.rnn.static_rnn(
    lstm_layer,
    unstacked,
    dtype=tf.float32)
Notes:
input_shape does not include the batch dimension (see docstring)
The input needs a channels dimension; it's fine for it to be 1 (that's what I've done here).
Not sure what more than one dimension on kernel_shape would mean for a 1-D convolution.
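Beyond static shape checking, one could smoke-test the graph with a zero-filled batch (a hypothetical check, reusing the placeholders above):

import numpy as np

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # static_rnn returns one output tensor per time step; feed an all-zero batch
    out_vals = sess.run(outputs,
                        feed_dict={x: np.zeros([BATCH_SIZE, TIME_STEPS, N_INPUT], np.float32)})
    print(len(out_vals), out_vals[0].shape)  # 28 steps, each (16, 5, 5)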
