Conversion of facebook/nllb-200-3.3B to AWS neuron - nlp
I am trying to convert the new translation model developed by Facebook (Meta), No Language Left Behind, to an AWS Neuron model that can be used with AWS SageMaker Inference on the Inferentia chips. However, I cannot figure out how to trace the model without errors. This post shows exactly what I am trying to do, and I am working on it with the AWS developers. I will copy my code here as well for clarity:
import copy
import itertools
from typing import List, Optional, Tuple
import torch
import torch.nn.functional as F
from transformers import M2M100Config
from transformers.generation_utils import GenerationMixin
def _convert_past_list_to_tuple(past_key_values):
"""
In Bart model, the type of past_key_values is tuple(tuple(torch.FloatTensor)) which is not
TorchScript-compatible. To support this, we have to convert it during the export process.
This function will convert past values from a list to tuple(tuple(torch.FloatTensor)) for
the inner decoder.
According to the definition of past_key_values, each inner tuple(torch.FloatTensor) has 4 tensors,
so we convert every 4 elements in the list as a tuple(torch.FloatTensor).
"""
count_of_each_inner_tuple = 4
results = ()
temp_result = ()
count_n = len(past_key_values) // count_of_each_inner_tuple
for idx in range(count_n):
real_idx = idx * count_of_each_inner_tuple
temp_result = tuple(past_key_values[real_idx : real_idx + count_of_each_inner_tuple])
results += ((temp_result),)
return results
class EncoderForONNX(torch.nn.Module):
    """Thin wrapper that gives the encoder a positional, tuple-returning forward for tracing."""

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, input_ids, attention_mask):
        """Call the wrapped encoder with ``return_dict=False`` and return its output unchanged."""
        return self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
class DecoderForONNX(torch.nn.Module):
    """Wrapper that lets the decoder exchange its key/value cache as a flat list."""

    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder

    def forward(self, input_ids, encoder_state, attention_mask, past=None):
        """
        Run the wrapped decoder and flatten the cache it returns.

        Args:
            input_ids: decoder input ids; when ``past`` is given only the last
                position (``input_ids[:, -1:]``) is fed to the decoder.
            encoder_state: passed to the decoder as ``encoder_hidden_states``.
            attention_mask: passed to the decoder as ``encoder_attention_mask``.
            past: optional flat list of cached tensors, regrouped in fours before
                being handed to the decoder.

        Returns:
            ``(last_hidden_state, past_values)`` where ``past_values`` is the
            decoder's nested cache flattened into a single list.
        """
        all_results = None
        if past is not None:
            all_results = _convert_past_list_to_tuple(past)
            input_ids = input_ids[:, -1:]

        last_hidden_state, past_key_values = self.decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_state,
            encoder_attention_mask=attention_mask,
            past_key_values=all_results,
            return_dict=False,
        )

        # Flatten tuple(tuple(tensor)) into one list; a distinct loop name avoids
        # shadowing the `past` parameter.
        past_values = [tensor for layer_past in past_key_values for tensor in layer_past]
        return last_hidden_state, past_values
def _create_traced_encoder(encoder, input_ids, attention_mask):
    """
    Trace a deep copy of ``encoder`` with ``torch_neuron.trace``.

    Args:
        encoder: encoder module; it is deep-copied so the original is not touched.
        input_ids: example input ids used for tracing.
        attention_mask: example attention mask used for tracing.

    Returns:
        Whatever ``torch_neuron.trace`` returns for the wrapped encoder.
    """
    encoder_c = copy.deepcopy(encoder)
    print("shapes",input_ids.shape, attention_mask.shape)
    encoder_for_onnx = EncoderForONNX(encoder_c)
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    inputs = (
        input_ids,
        attention_mask,
    )
    return torch_neuron.trace(encoder_for_onnx, inputs, compiler_args=compiler_args)
def _create_traced_decoder(decoder, input_ids, encoder_state, attention_mask, past=None):
    """
    Trace a deep copy of ``decoder`` with ``torch_neuron.trace``.

    This is called twice by the generator, once without and once with ``past``,
    so that two different traced decoders are available.

    Args:
        decoder: decoder module; it is deep-copied so the original is not touched.
        input_ids: example decoder input ids.
        encoder_state: example encoder hidden states.
        attention_mask: example encoder attention mask.
        past: optional nested cache (iterable of iterables); it is flattened into
            a single list and appended to the example inputs when non-empty.

    Returns:
        Whatever ``torch_neuron.trace`` returns for the wrapped decoder.
    """
    decoder_c = copy.deepcopy(decoder)
    print(input_ids.shape,encoder_state.shape,attention_mask.shape)
    decoder_for_onnx = DecoderForONNX(decoder_c)
    past_values = list(itertools.chain.from_iterable(past or ()))
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    print(past_values)

    # The two variants differ only in whether the flattened cache is a 4th input.
    inputs = (input_ids, encoder_state, attention_mask)
    if past_values:
        inputs += (past_values,)
    return torch_neuron.trace(decoder_for_onnx, inputs, compiler_args=compiler_args)
class M2M100ConfigTS(M2M100Config, torch.nn.Module):
    """
    M2M100ConfigTS is a TorchScript-compatible M2M100Config.
    TorchScript only supports sub-classes of torch.nn.Module.
    """

    def __init__(self, config):
        # Copy every field of the source config. Passing `config` positionally
        # would bind the whole object to the first positional parameter of
        # M2M100Config.__init__ and leave all other fields at their defaults.
        M2M100Config.__init__(self, **config.to_dict())
        torch.nn.Module.__init__(self)
class MinLengthLogitsProcessorTS(torch.nn.Module):
    r"""
    :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.

    Args:
        min_length (:obj:`int`):
            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.

    Raises:
        ValueError: if either argument is not an ``int`` or is negative.
    """

    def __init__(self, min_length: int, eos_token_id: int):
        super().__init__()

        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")

        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")

        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def forward(self, input_ids, scores) -> torch.Tensor:
        """
        Mask the EOS column of ``scores`` while the sequence is shorter than ``min_length``.

        The current length is the last dimension of ``input_ids``. ``scores`` is
        modified in place and also returned.
        """
        cur_len = input_ids.shape[-1]
        if cur_len < self.min_length:
            scores[:, self.eos_token_id] = -float("inf")
        return scores
class NLLBGenerator(torch.nn.Module, GenerationMixin):
    """
    Greedy-search generator built from traced copies of a seq2seq model's encoder and decoder.

    On construction the encoder is traced once and the decoder twice (without
    and with a key/value cache). ``forward`` then encodes the input and runs
    a greedy decoding loop over the traced decoders.
    """

    def __init__(self, model):
        super().__init__()
        self.config = M2M100ConfigTS(model.config)
        self.config.force_bos_token_to_be_generated = False
        self._trace_modules(model)
        self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
        self.final_logits_weight = model.model.shared.weight
        # `final_logits_bias` is a BART attribute and may be absent on M2M100/NLLB
        # models; F.linear accepts bias=None, so fall back to no bias.
        self.final_logits_bias = getattr(model, "final_logits_bias", None)
        self.decoder_layers = model.config.decoder_layers
        self.d_model = model.config.d_model

    def _trace_modules(self, model):
        """Build example inputs and trace the encoder plus both decoder variants."""
        sample_text = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires."
        # NOTE(review): this tokenizer belongs to a BART checkpoint, not to the
        # model being traced; it is only used to build fixed-length (32) example
        # inputs. Confirm this is intended.
        model_name = "sshleifer/distilbart-cnn-12-6"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        inputs = tokenizer(sample_text, return_tensors="pt", max_length=32, truncation=True, padding='max_length')
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
        decoder = model.model.decoder
        # Eager decoder run; element [1] of its output is used below as the
        # example cache for the "with past" trace.
        decoder_outputs = decoder(input_ids, attention_mask, encoder_outputs["last_hidden_state"], None, None, None)

        self.decoder_no_past = _create_traced_decoder(
            decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask
        )
        self.decoder_with_past = _create_traced_decoder(
            decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask, decoder_outputs[1]
        )

    def _encoder_forward(self, input_ids, attention_mask):
        """Return the first element of the traced encoder's output."""
        return self.encoder(input_ids, attention_mask)[0]

    @staticmethod
    def _init_sequence_length_for_generation(
        input_ids: torch.LongTensor, max_length: int
    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
        """
        Create the bookkeeping tensors for the generation loop.

        Returns:
            ``(sequence_lengths, unfinished_sequences, cur_len)``: one entry per
            row of ``input_ids``, filled with ``max_length`` and ``1``
            respectively, and the current length ``input_ids.shape[-1]``.
        """
        unfinished_sequences = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + 1
        sequence_lengths = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + max_length

        cur_len = input_ids.shape[-1]
        return sequence_lengths, unfinished_sequences, cur_len

    def _decoder_forward(self, input_ids, encoder_output, attention_mask, past: List[torch.Tensor]):
        """
        Run the appropriate traced decoder and project its output to vocabulary logits.

        The decoder traced without a cache is used when ``past`` is ``None`` or
        empty; otherwise the one traced with a cache is used.

        Returns:
            ``(lm_logits, past)`` with the updated flat cache.
        """
        # Update here to use different decoder for different values of past.
        if past is None or len(past) == 0:
            decoder_output, past = self.decoder_no_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask
            )
        else:
            decoder_output, past = self.decoder_with_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask, past=past
            )

        lm_logits = F.linear(decoder_output, self.final_logits_weight, bias=self.final_logits_bias)

        return lm_logits, past

    def greedy_search(
        self, input_ids, encoder_output, attention_mask, max_length, pad_token_id: int, eos_token_id: int
    ):
        """
        Greedily extend ``input_ids`` one token at a time.

        The loop stops when ``max_length`` is reached or every sequence is
        marked finished. Returns the extended ``input_ids``.
        """
        # init sequence length tensors
        sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation(
            input_ids, max_length
        )

        past: List[torch.Tensor] = []
        while cur_len < max_length:
            logits, past = self._decoder_forward(input_ids, encoder_output, attention_mask, past)
            next_token_logits = logits[:, -1, :]

            # pre-process distribution
            scores = self.logits_processor(input_ids, next_token_logits)

            # argmax
            next_tokens = torch.argmax(scores, dim=-1)

            # finished sequences keep receiving pad_token_id instead of new tokens
            if eos_token_id is not None:
                assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined."
                next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences)

            # add token and increase length by one
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)

            # update sequence length
            if eos_token_id is not None:
                # NOTE(review): `_update_seq_length_for_generation` is not defined in
                # this class; presumably it is inherited from GenerationMixin.
                # Confirm the installed transformers version provides it.
                sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation(
                    sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id
                )

            # stop when there is a </s> in each sentence, or if we exceed the maximum length
            if unfinished_sequences.max() == 0:
                break

            # increase cur_len
            cur_len = cur_len + 1

        return input_ids

    def _prepare_decoder_input_ids_for_generation(
        self,
        input_ids: torch.LongTensor,
        decoder_start_token_id,
        bos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:
        """
        Build the initial decoder input: one column filled with ``decoder_start_token_id``.

        The result has one row per row of ``input_ids`` and shares its dtype and
        device. ``bos_token_id`` is accepted but not used.
        """
        decoder_input_ids = (
            torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device)
            * decoder_start_token_id
        )
        return decoder_input_ids

    def forward(self, input_ids, attention_mask, max_length, decoder_start_token_id):
        """
        Encode ``input_ids`` and generate output token ids with greedy search.

        Args:
            input_ids: source token ids.
            attention_mask: mask for ``input_ids``; also passed to the decoder
                as the encoder attention mask.
            max_length: upper bound on the generated sequence length.
            decoder_start_token_id: token id that starts every generated sequence.

        Returns:
            The generated token ids, as returned by :meth:`greedy_search`.
        """
        pad_token_id = self.config.pad_token_id
        bos_token_id = self.config.bos_token_id
        eos_token_id = self.config.eos_token_id

        # special case if pad_token_id is not defined
        if pad_token_id is None and eos_token_id is not None:
            # Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.
            pad_token_id = eos_token_id

        encoder_output = self._encoder_forward(input_ids, attention_mask)

        input_ids = self._prepare_decoder_input_ids_for_generation(
            input_ids,
            decoder_start_token_id=decoder_start_token_id,
            bos_token_id=bos_token_id,
        )

        return self.greedy_search(
            input_ids,
            encoder_output,
            attention_mask,
            max_length=max_length,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
        )
# Driver: imports are grouped ahead of the statements that use them.
import torch
import torch_neuron
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the checkpoint, then wrap it; constructing NLLBGenerator calls
# _trace_modules, which performs the Neuron tracing.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
neuron_model = NLLBGenerator(model)
And the current error I am receiving:
/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/transformers/models/m2m_100/modeling_m2m_100.py:326: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
INFO:Neuron:There are 1 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://github.com/aws/aws-neuron-sdk/blob/master/release-notes/neuron-cc-ops/neuron-cc-ops-pytorch.md)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 1479, fused = 1456, percent fused = 98.44%
INFO:Neuron:Number of neuron graph operations 3581 did not match traced graph 3283 - using heuristic matching of hierarchical information
WARNING:Neuron:torch.neuron.trace failed on _NeuronGraph$1631; falling back to native python function call
ERROR:Neuron:Error parsing message with type 'tensorflow.GraphDef'
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py", line 382, in op_converter
item, inputs, compiler_workdir=sg_workdir, **kwargs)
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/decorators.py", line 82, in trace
graph_def = graph.as_graph_def()
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3238, in as_graph_def
result, _ = self._as_graph_def(from_version, add_shapes)
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3166, in _as_graph_def
graph.ParseFromString(compat.as_bytes(data))
google.protobuf.message.DecodeError: Error parsing message with type 'tensorflow.GraphDef'
INFO:Neuron:Number of arithmetic operators (post-compilation) before = 1479, compiled = 0, percent compiled = 0.0%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 0 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 0.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron:Not compiled operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 414 [supported]
INFO:Neuron: => aten::add: 75 [supported]
INFO:Neuron: => aten::bmm: 48 [supported]
INFO:Neuron: => aten::contiguous: 72 [supported]
INFO:Neuron: => aten::cumsum: 1 [supported]
INFO:Neuron: => aten::detach: 1 [supported]
INFO:Neuron: => aten::dropout: 97 [supported]
INFO:Neuron: => aten::embedding: 1 [not supported]
INFO:Neuron: => aten::expand: 1 [supported]
INFO:Neuron: => aten::index_select: 1 [supported]
INFO:Neuron: => aten::layer_norm: 49 [supported]
INFO:Neuron: => aten::linear: 144 [supported]
INFO:Neuron: => aten::masked_fill: 1 [supported]
INFO:Neuron: => aten::mul: 74 [supported]
INFO:Neuron: => aten::ne: 1 [supported]
INFO:Neuron: => aten::relu: 24 [supported]
INFO:Neuron: => aten::reshape: 24 [supported]
INFO:Neuron: => aten::rsub: 1 [supported]
INFO:Neuron: => aten::size: 77 [supported]
INFO:Neuron: => aten::slice: 2 [supported]
INFO:Neuron: => aten::softmax: 24 [supported]
INFO:Neuron: => aten::to: 5 [supported]
INFO:Neuron: => aten::transpose: 120 [supported]
INFO:Neuron: => aten::type_as: 1 [supported]
INFO:Neuron: => aten::unsqueeze: 2 [supported]
INFO:Neuron: => aten::view: 219 [supported]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_4519/3952284984.py in <module>
314
315
--> 316 neuron_model = NLLBGenerator(model)
/tmp/ipykernel_4519/3952284984.py in __init__(self, model)
154 self.config = M2M100ConfigTS(model.config)
155 self.config.force_bos_token_to_be_generated = False
--> 156 self._trace_modules(model)
157 self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
158 self.final_logits_weight = model.model.shared.weight
/tmp/ipykernel_4519/3952284984.py in _trace_modules(self, model)
185 attention_mask = inputs["attention_mask"]
186
--> 187 self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
188 encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
189 decoder = model.model.decoder
/tmp/ipykernel_4519/3952284984.py in _create_traced_encoder(encoder, input_ids, attention_mask)
80 )
81
---> 82 return torch_neuron.trace(encoder_for_onnx, inputs,compiler_args=compiler_args)
83
84
~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in trace(func, example_inputs, fallback, op_whitelist, minimum_segment_size, subgraph_builder_function, subgraph_inputs_pruning, skip_compiler, debug_must_trace, allow_no_ops_on_neuron, compiler_workdir, dynamic_batch_size, compiler_timeout, _neuron_trace, compiler_args, optimizations, verbose, **kwargs)
182 logger.debug("skip_inference_context - trace with fallback at {}".format(get_file_and_line()))
183 neuron_graph = cu.compile_fused_operators(neuron_graph, **compile_kwargs)
--> 184 cu.stats_post_compiler(neuron_graph)
185
186 # Wrap the compiled version of the model in a script module. Note that this is
~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in stats_post_compiler(self, neuron_graph)
491 if succesful_compilations == 0 and not self.allow_no_ops_on_neuron:
492 raise RuntimeError(
--> 493 "No operations were successfully partitioned and compiled to neuron for this model - aborting trace!")
494
495 if percent_operations_compiled < 50.0:
RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace!
Any help would be appreciated.
A response to your question has been posted on the original Github issue - https://github.com/aws-neuron/aws-neuron-sdk/issues/420#issuecomment-1220885577
-Taylor
Related
'Evaluator' object has no attribute 'loss_value'
I'm trying to make a simple style copy net, combining two images. I'm a newbie and doing it from the example to get some experience in programming. The main idea is to copy style on the target image. Here's the code I wrote: def preprocess(image_path): img = load_img(image_path, target_size = (img_height, img_width)) img = img_to_array(img) img = np.expand_dims(img, axis = 0) img = vgg19.preprocess_input(img) return img def deprocess(x): x[:, :, 0] += 103.939 x[:, :, 1] += 116.779 x[:, :, 2] += 123.68 x = x[:, :, ::-1] x = np.clip(x, 0, 255).astype('unit8') return x target_image = backend.variable(preprocess(target_image_path)) sr_image = backend.variable(preprocess(sr_image_path)) if backend.image_data_format() == 'channels_first': combination_image = backend.placeholder((1,3,img_height, img_width)) else: combination_image = backend.placeholder((1,img_height, img_width,3)) input_tensor = backend.concatenate([target_image, sr_image, combination_image], axis = 0) model = vgg19.VGG19(input_tensor = input_tensor, weights = 'imagenet', include_top = False) print('Model loaded successfully') def content_loss(base, combination): return backend.sum(backend.square(combination - base)) def gram_matrix(x): features = backend.batch_flatten(backend.permute_dimensions(x, (2, 0, 1))) gram = backend.dot(features, backend.transpose(features)) return gram def style_loss(style, combination): S = gram_matrix(style) C = gram_matrix(combination) channels = 3 size = img_height * img_width return backend.sum(backend.square(S - C)) / (4.0 * (channels ** 2) * (size ** 2)) def total_variation_loss(x): a = backend.square( x[:, :img_height - 1, :img_width - 1, :] - x[:, 1:, :img_width - 1, :]) b = backend.square( x[:, :img_height - 1, :img_width - 1, :] - x[:, :img_height - 1, 1:, :]) return backend.sum(backend.pow(a + b, 1.25)) outputs_dict = dict([(layer.name, layer.output) for layer in model.layers]) content_layer = 'block5_conv2' style_layers = ['block1_conv1', 'block2_conv1', 
'block3_conv1', 'block4_conv1', 'block5_conv1'] total_variation_weight = 1e-4 style_weight = 1. content_weight = 0.025 loss = backend.variable(0.0) layer_features = outputs_dict[content_layer] target_image_features = layer_features[0, :, :, :] combination_features = layer_features[2, :, :, :] loss = loss + content_weight * content_loss(target_image_features, combination_features) for layer_name in style_layers: layer_features = outputs_dict[layer_name] style_reference_features = layer_features[1, :, :, :] combination_features = layer_features[2, :, :, :] sl = style_loss(style_reference_features, combination_features) loss = loss + (style_weight / len(style_layers)) * sl loss += total_variation_weight * total_variation_loss(combination_image) grads = backend.gradients(loss, combination_image) outputs = [loss] if isinstance(grads, (list,tuple)): outputs += grads else: outputs.append(grads) f_outputs = backend.function([combination_image], outputs) def eval_loss_and_grads(x): if backend.image_data_format() == 'channels_first': x = x.reshape((1, 3, img_height, img_width)) else: x = x.reshape((1, img_height, img_width, 3)) outs = f_outputs([x]) loss_value = outs[0] if len(outs[1:]) == 1: grad_values = outs[1].flatten().astype('float64') else: grad_values = np.array(outs[1:]).flatten().astype('float64') return loss_value, grad_values class Evaluator(object): def _unit_(self): self.loss_value = None self.grads_values = None def loss(self, x): assert self.loss_value is None loss_value, grad_values = eval_loss_and_grads(x) self.loss_value = loss_value self.grads_values = grad_values return self.loss_value def grads(self, x): assert self.loss_value is not None grad_values = np.copy(self.grad_values) self.loss_value = None self.grad_values = None return grad_values evaluator = Evaluator() result_prefix = 'result' iterations = 20 x = preprocess(target_image_path) x = x.flatten() for i in range(iterations): print('Start of iterations', i) start_time = time.time() x, min_val, 
info = fmin_l_bfgs_b(evaluator.loss, x, fprime = evaluator.grads, maxfun = 20) print('Current loss value:', min_val) img = x.copy().reshape((img_height, img_width, 3)) img = deprocess(img) fname = result_prefix + '_at_iteration_%d.png' % i save_img('D:\study\Stylecopy\fmane', img) print('image saved as:', fname) end_time = time.time() print(' Iteration %d completed in %ds' % (i , end_time - start_time)) Here's the error I got: AttributeError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_12264/556996678.py in <module> 7 print('Start of iterations', i) 8 start_time = time.time() ----> 9 x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x, 10 fprime = evaluator.grads, maxfun = 20) 11 print('Current loss value:', min_val) ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\lbfgsb.py in fmin_l_bfgs_b(func, x0, fprime, args, approx_grad, bounds, m, factr, pgtol, epsilon, iprint, maxfun, maxiter, disp, callback, maxls) 195 'maxls': maxls} 196 --> 197 res = _minimize_lbfgsb(fun, x0, args=args, jac=jac, bounds=bounds, 198 **opts) 199 d = {'grad': res['jac'], ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options) 304 iprint = disp 305 --> 306 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps, 307 bounds=new_bounds, 308 finite_diff_rel_step=finite_diff_rel_step) ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\optimize.py in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess) 259 # ScalarFunction caches. Reuse of fun(x) during grad 260 # calculation reduces overall function evaluations. 
--> 261 sf = ScalarFunction(fun, x0, args, grad, hess, 262 finite_diff_rel_step, bounds, epsilon=epsilon) 263 ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\_differentiable_functions.py in __init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon) 138 139 self._update_fun_impl = update_fun --> 140 self._update_fun() 141 142 # Gradient evaluation ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\_differentiable_functions.py in _update_fun(self) 231 def _update_fun(self): 232 if not self.f_updated: --> 233 self._update_fun_impl() 234 self.f_updated = True 235 ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\_differentiable_functions.py in update_fun() 135 136 def update_fun(): --> 137 self.f = fun_wrapped(self.x) 138 139 self._update_fun_impl = update_fun ~\anaconda3\envs\CNN_base\lib\site-packages\scipy\optimize\_differentiable_functions.py in fun_wrapped(x) 132 # Overwriting results in undefined behaviour because 133 # fun(self.x) will change self.x, with the two no longer linked. --> 134 return fun(np.copy(x), *args) 135 136 def update_fun(): ~\AppData\Local\Temp/ipykernel_12264/3220866978.py in loss(self, x) 29 30 def loss(self, x): ---> 31 assert self.loss_value is None 32 loss_value, grad_values = eval_loss_and_grads(x) 33 self.loss_value = loss_value AttributeError: 'Evaluator' object has no attribute 'loss_value' I'm dealing with this problem and don't know how to solve it. I've double checked the code with the example (https://www.kaggle.com/code/gabrieltangzy/p2p-gan-newver-slice-cezanne), but haven't found any mistakes. I suppose it may occur because of difference in python versions. I use 3.9.7
PyTorch random_split() is returning wrong sized loader
I have a custom dataset loader for my dataset. I want to split the dataset into 70% train data, 20% validation data, and 10% test data. I have 16,488 data. So, my train data is supposed to be 11,542. But it's becoming 770 train data, 220 validation data, and 110 test data. I've tried but couldn't figure out the problem. class Dataset(Dataset): def __init__(self, directory, transform, preload=False, device: torch.device = torch.device('cpu'), **kwargs): self.device = device self.directory = directory self.transform = transform self.labels = [] self.images = [] self.preload = preload for i, file in enumerate(os.listdir(self.directory)): file_labels = parse('{}_{}_{age}_{gender}.jpg', file) if file_labels is None: continue if self.preload: image = Image.open(os.path.join(self.directory, file)).convert('RGB') if self.transform is not None: image = self.transform(image).to(self.device) else: image = os.path.join(self.directory, file) self.images.append(image) gender_to_class_id = { 'm': 0, 'f': 1 } gender = gender_to_class_id[file_labels['gender']] age = int(file_labels['age']) self.labels.append({ 'age': age, 'gender': gender }) pass def __len__(self): return len(self.labels) def __getitem__(self, idx): if torch.is_tensor(idx): idx = idx.tolist() image = self.images[idx] if not self.preload: image = Image.open(image).convert('RGB') if self.transform is not None: image = self.transform(image).to(self.device) labels = { 'age': self.labels[idx]['age'], 'gender': self.labels[idx]['gender'], } return image.to(self.device), labels def get_loaders(self, transform, train_size=0.7, validate_size=0.2, test_size=0.1, batch_size=15, **kwargs): if round(train_size + validate_size + test_size, 1) > 1.0: sys.exit("Sum of the percentages should be less than 1. 
it's " + str( train_size + validate_size + test_size) + " now!") train_len = int(len(self) * train_size) validate_len = int(len(self) * validate_size) test_len = int(len(self) * test_size) others_len = len(self) - train_len - validate_len - test_len self.trainDataset, self.validateDataset, self.testDataset, _ = torch.utils.data.random_split( self, [train_len, validate_len, test_len, others_len] ) train_loader = DataLoader(self.trainDataset, batch_size=batch_size) validate_loader = DataLoader(self.validateDataset, batch_size=batch_size) test_loader = DataLoader(self.testDataset, batch_size=batch_size) return train_loader, validate_loader, test_loader
It seems that you are passing batch_size=15. Because a DataLoader iterates over batches, its len() is the number of batches, not the number of samples. That explains why you are getting 770 for the train data where you expected 11,542: 16488 / 15 * 0.7 = 769.44 ≈ 770. Assigning batch_size = 1 should do the trick: 16488 / 1 * 0.7 = 11541.6 ≈ 11542.
Training step not executing in pytorch lightning
I am working to finetune a t5 model to summarize Amazon reviews. I am following this tutorial here: https://towardsdatascience.com/fine-tuning-a-t5-transformer-for-any-summarization-task-82334c64c81 I noticed that the training_step in my code is never being executed as the training loss remains "NaN" throughout the epoch. However, the validation_step is computed fine. I already confirmed that there are no empty strings in the data and have tried multiple batch sizes. This is the error RuntimeError Traceback (most recent call last) <ipython-input-53-45d4afebefac> in <module>() ----> 1 trainer.fit(model) 8 frames <ipython-input-46-00fddffa2209> in training_epoch_end(self, outputs) 134 print("OUTPUTS") 135 print(outputs) --> 136 avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean() 137 tensorboard_logs = {"avg_train_loss": avg_train_loss} 138 return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs} RuntimeError: stack expects a non-empty TensorList I found that the training_step function is never being executed by adding print statements inside the training_step function. 
# Below is my code for the T5FineTuner class (sorry I can't be any more concise):
class T5FineTuner(pl.LightningModule):
    """LightningModule wrapping a pretrained T5 model for seq2seq fine-tuning.

    The module loads ``T5ForConditionalGeneration`` and ``T5Tokenizer`` from
    the names given in ``hparams``, trains on the loss returned by the model,
    and accumulates ROUGE during validation via ``load_metric('rouge')``.

    ``train_dataset``, ``validation_dataset`` and ``test_dataset`` are read
    from module scope by the ``*_dataloader`` methods.
    """

    def __init__(self, hparams):
        """Load model, tokenizer and metric; optionally freeze parts of the model.

        Args:
            hparams: namespace-like object; this constructor reads
                ``model_name_or_path``, ``tokenizer_name_or_path``,
                ``freeze_embeds``, ``freeze_encoder``, ``n_train``, ``n_val``
                and ``n_test`` from it.
        """
        super(T5FineTuner, self).__init__()
        # NOTE(review): some pytorch-lightning releases expose `hparams` as a
        # read-only property; if this assignment raises AttributeError there,
        # `self.save_hyperparameters(hparams)` is the likely replacement —
        # confirm against the installed version.
        self.hparams = hparams
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
        self.rouge_metric = load_metric('rouge')

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            self.freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        n_observations_per_split = {
            "train": self.hparams.n_train,
            "validation": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        # A negative count is stored as None.
        self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

    def freeze_params(self, model):
        """Set ``requires_grad = False`` on every parameter of ``model``."""
        for par in model.parameters():
            par.requires_grad = False

    def freeze_embeds(self):
        """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
        try:
            self.freeze_params(self.model.model.shared)
            for d in [self.model.model.encoder, self.model.model.decoder]:
                # Fixed: these two calls used a bare `freeze_params`, which is
                # not defined as a free function in this class and raised
                # NameError on this (BART-style) path.
                self.freeze_params(d.embed_positions)
                self.freeze_params(d.embed_tokens)
        except AttributeError:
            # Fallback for models without a nested `.model` attribute.
            self.freeze_params(self.model.shared)
            for d in [self.model.encoder, self.model.decoder]:
                self.freeze_params(d.embed_tokens)

    def lmap(self, f, x):
        """list(map(f, x))"""
        return list(map(f, x))

    def is_logger(self):
        """Always report this process as the logging process."""
        return True

    def parse_score(self, result):
        """Turn each ROUGE entry into its mid F-measure as a percentage (4 decimals)."""
        return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Delegate to the wrapped model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Run one forward pass on ``batch`` and return the loss (``outputs[0]``).

        Padding positions in the targets are replaced by -100 before being
        passed as ``labels``.
        """
        # Work on a copy: the original overwrote batch["target_ids"] in place,
        # so any later use of the batch (e.g. decoding the targets) would see
        # -100 instead of the pad token id.
        labels = batch["target_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100
        # print(labels)

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )
        # print(outputs)

        loss = outputs[0]
        return loss

    def ids_to_clean_text(self, generated_ids):
        """Decode token ids to text (special tokens skipped) and strip each string."""
        gen_text = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return self.lmap(str.strip, gen_text)

    def _generative_step(self, batch):
        """Generate predictions for ``batch``, feed them to ROUGE and return metrics.

        Returns:
            dict with ``val_loss``, ``gen_time`` (seconds per source example),
            ``gen_len`` (mean generated length), ``preds`` and ``target``.
        """
        t0 = time.time()

        # NOTE(review): `decoder_attention_mask` describes the reference
        # targets, not the sequence being generated; some transformers
        # versions may reject or misuse it in generate() — confirm.
        generated_ids = self.model.generate(
            batch["source_ids"],
            attention_mask=batch["source_mask"],
            use_cache=True,
            decoder_attention_mask=batch['target_mask'],
            max_length=150,
            num_beams=2,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=False,
        )
        preds = self.ids_to_clean_text(generated_ids)
        target = self.ids_to_clean_text(batch["target_ids"])

        gen_time = (time.time() - t0) / batch["source_ids"].shape[0]

        loss = self._step(batch)
        # print("LOSS _generative_step")
        # print(loss)
        base_metrics = {'val_loss': loss}
        # rouge: Dict = self.calc_generative_metrics(preds, target)
        summ_len = np.mean(self.lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target)
        # ROUGE is only accumulated here; it is computed once per epoch in
        # validation_epoch_end.
        self.rouge_metric.add_batch(preds, target)

        # rouge_results = self.rouge_metric.compute()
        # rouge_dict = self.parse_score(rouge_results)
        # base_metrics.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])

        return base_metrics

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch (with debug prints)."""
        print("training_step")
        print(batch)
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        print("LOSS")
        print(loss)
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        """Average the per-step ``loss`` values collected over the epoch."""
        print("OUTPUTS")
        print(outputs)
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Run the generative evaluation step for one validation batch."""
        print("validation_step")
        return self._generative_step(batch)

    def validation_epoch_end(self, outputs):
        """Average ``val_loss`` and compute ROUGE over everything added this epoch."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}

        rouge_results = self.rouge_metric.compute()
        rouge_dict = self.parse_score(rouge_results)

        tensorboard_logs.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])

        ## Clear out the lists for next epoch
        self.target_gen = []
        self.prediction_gen = []
        return {"avg_val_loss": avg_loss,
                "rouge1": rouge_results['rouge1'],
                "rougeL": rouge_results['rougeL'],
                "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # Kept on the instance because train_dataloader builds the scheduler from it.
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None,
                       using_native_amp=False, optimizer_closure=None, on_tpu=None, using_lbfgs=None):
        """Apply one optimizer update, clear gradients and advance the LR schedule.

        Fixed: the original called ``optimizer.step()`` and silently dropped
        ``optimizer_closure``. Lightning versions that pass this argument run
        ``training_step`` and the backward pass inside the closure, so it must
        be handed to the optimizer. When no closure is supplied the plain
        ``optimizer.step()`` call is kept, so older call sites behave as before.
        """
        # if self.trainer.use_tpu:
        #     xm.optimizer_step(optimizer)
        # else:
        if optimizer_closure is not None:
            optimizer.step(closure=optimizer_closure)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Return the running loss and the last learning rate for the progress bar."""
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training DataLoader and, as a side effect, ``self.lr_scheduler``.

        The scheduler is created from ``self.opt``, so ``configure_optimizers``
        must have run before this method is called.
        """
        print("train_dataloader")
        n_samples = self.n_obs['train']
        print(n_samples)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, num_workers=4)
        print(len(dataloader.dataset))
        print(self.hparams.train_batch_size * max(1, self.hparams.n_gpu))
        print(self.hparams.gradient_accumulation_steps)
        print(float(self.hparams.num_train_epochs))
        # Total scheduler steps = batches per epoch * epochs (the division by
        # gradient_accumulation_steps is deliberately left commented out).
        t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            # // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        print(t_total)
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Return a DataLoader over the module-level ``validation_dataset``."""
        n_samples = self.n_obs['validation']
        # validation_dataset = get_dataset(tokenizer=self.tokenizer, type_path="validation", num_samples=n_samples, args=self.hparams)

        return DataLoader(validation_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

    def test_dataloader(self):
        """Return a DataLoader over the module-level ``test_dataset``."""
        n_samples = self.n_obs['test']
        # test_dataset = get_dataset(tokenizer=self.tokenizer, type_path="test", num_samples=n_samples, args=self.hparams)

        return DataLoader(test_dataset, batch_size=self.hparams.test_batch_size, num_workers=4)


# Below are my parameters:
args_dict = dict(
    output_dir="",  # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_input_length=512,
    max_output_length=150,
    freeze_encoder=False,
    freeze_embeds=False,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=20,
    eval_batch_size=20,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    n_gpu=1,
    resume_from_checkpoint=None,
    val_check_interval=0.05,
    n_val=1000,
    n_train=-1,
    n_test=-1,
    early_stop_callback=False,
    fp_16=False,  # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1',  # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0,  # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)
# It seems that this code is quite outdated. The conflict is caused by the
# optimizer_step() method. I just commented out the whole segment below and it
# worked for me. If you want to do any custom logic in this function, it is
# better to consult the latest code on GitHub.
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None,
                   using_native_amp=False, on_tpu=None, using_lbfgs=None, optimizer_closure=None):
    """Apply one optimizer update, then reset gradients and step the LR scheduler.

    On the non-TPU path the update is ``optimizer.step(closure=optimizer_closure)``;
    when ``self.trainer.use_tpu`` is truthy it is delegated to
    ``xm.optimizer_step(optimizer)`` instead.
    """
    running_on_tpu = self.trainer.use_tpu
    if not running_on_tpu:
        optimizer.step(closure=optimizer_closure)
    else:
        xm.optimizer_step(optimizer)
    # Both paths finish the same way: clear gradients, advance the schedule.
    optimizer.zero_grad()
    self.lr_scheduler.step()
How to pass model input to loss function in tensorflow keras?
# I am training a neural network with three different output predictions. To
# compute the loss of one output I need one of the inputs that is passed into
# the network. I am not able to access it because the training data is fed
# into the network by a Keras data generator object. Is there any workaround
# for this problem?
#
# This is the generator class that feeds data into the model:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding image batches and three 128x128 target maps.

    Each item is ``(X, targets)`` where ``targets`` is a dict with the keys
    ``'center_output'``, ``'size_output'`` and ``'offset_output'``. Images are
    read with ``cv2.imread`` from the module-level ``path`` directory.
    """

    def __init__(self,list_ID,centers,sizes,batch_size=2,dims=(512,512),n_channels=3,n_classes=10,shuffle=True) -> None:
        """Store the sample lists and batch settings and build the first index order.

        Args:
            list_ID: image file names, joined onto the module-level ``path``.
            centers: per-image centre lists; must have the same length as ``list_ID``.
            sizes: per-image sizes, indexed in parallel with ``list_ID``.
            batch_size: number of samples per batch.
            dims: (height, width) of each image in ``X``.
            n_channels: number of channels allocated per image in ``X``.
            n_classes: stored on the instance; not used by this class.
            shuffle: whether to reshuffle the sample order at every epoch end.
        """
        # Safe on every Keras version; newer ones warn when it is missing.
        super().__init__()
        assert len(list_ID) == len(centers)
        self.dims = dims
        self.batch_size = batch_size
        self.list_ID = list_ID
        self.centers = centers
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.sizes = sizes
        self.on_epoch_end()
        # Replaced by a (batch_size, 128, 128, 1) array each time a batch is built.
        # NOTE(review): a loss built with `final_loss(gen.mask)` before any
        # batch has been generated captures this None, which matches the
        # ValueError quoted at the bottom. Packing the mask into the target
        # tensors so the loss can read it from `y_true` is presumably the way
        # to get it into the loss — confirm.
        self.mask = None

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_ID) / self.batch_size))

    def on_epoch_end(self):
        """Rebuild the sample index order, shuffling it when ``self.shuffle`` is set."""
        self.indexes = np.arange(len(self.list_ID))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, targets)``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        list_ID_temp = [self.list_ID[k] for k in indexes]
        centers_temp = [self.centers[k] for k in indexes]
        sizes_temp = [self.sizes[k] for k in indexes]

        X, y = self.__datageneration(list_ID_temp, centers_temp, sizes_temp)
        return X, y

    def __datageneration(self, list_ID_temp,centers_temp,sizes_temp):
        """Load the images of one batch and build their target maps.

        Side effect: ``self.mask`` is replaced by this batch's centre mask.

        Raises:
            FileNotFoundError: when ``cv2.imread`` cannot read an image.
        """
        X = np.empty((self.batch_size,*self.dims,self.n_channels))
        Y_center = np.empty((self.batch_size,128,128,1))
        Y_dimension = np.empty((self.batch_size,128,128,2))
        Y_offset = np.empty((self.batch_size,128,128,2))
        self.mask = np.empty((self.batch_size,128,128,1))
        for i,ID in enumerate(list_ID_temp):
            raw = cv2.imread(path+'/'+ID)
            if raw is None:
                # cv2.imread returns None instead of raising; fail with the
                # file name rather than an opaque TypeError on the division.
                raise FileNotFoundError("could not read image: " + path+'/'+ID)
            image = raw / 255.0
            heat_center, self.mask[i,] = gaussian_2d(centers_temp[i],image.shape)
            # Here I tried to save mask which is what I need, as an attribute
            # to data generator but when accessed by loss function the value
            # is just None which is what I initialized it as in init method
            heat_size,heat_off = size_off_heatmap(sizes_temp[i], centers_temp[i],image.shape)
            # cv2.resize takes (width, height). The original hard-coded
            # (512, 512), which only matched X for the default `dims`.
            image = cv2.resize(image,(self.dims[1],self.dims[0]))
            X[i,] = image
            Y_center[i,] = heat_center
            Y_dimension[i,] = heat_size
            Y_offset[i,] = heat_off

        return (X,{'center_output':Y_center,'size_output':Y_dimension,'offset_output':Y_offset})


# This is the generator class I implemented. I need the mask, which I tried to
# store as an attribute of the data generator object (see the comment in the
# code). For reference I also include the function that returns the mask and
# the loss function that requires the mask.
#
# Function returning mask:
def gaussian_2d(centers, img_shape):
    """Build a 128x128 heatmap and a binary centre mask for one image.

    Args:
        centers: iterable of ``(x, y)`` pixel coordinates in the original image.
        img_shape: shape of the original image; index 0 is height, index 1 width.

    Returns:
        ``(heatmap, mask)``, both of shape ``(128, 128, 1)``. ``heatmap`` is
        the element-wise maximum of one Gaussian bump per centre; ``mask`` is
        1 at each centre cell and 0 elsewhere. With no centres both are zero.
    """
    col_index = np.tile(np.arange(128), (128, 1))  # col_index[r, c] == c
    row_index = col_index.T                        # row_index[r, c] == r
    mask = np.zeros((128,128,1))
    # Starting from zeros and taking a running maximum gives the same result
    # as stacking all bumps (every bump is > 0) and also covers the
    # empty-centres case, where the original reshape of an empty array raised.
    heatmap = np.zeros((128, 128))
    width = img_shape[1]
    height = img_shape[0]
    for x_o, y_o in centers:
        # Clamp to the grid: a centre on the right/bottom border mapped to
        # index 128 (IndexError) and a negative one silently wrapped around.
        x = min(max(int(x_o / width * 128), 0), 127)
        y = min(max(int(y_o / height * 128), 0), 127)
        mask[y,x] = 1
        # NOTE(review): by operator precedence this is d2 / 2 * 0.04, not
        # d2 / (2 * 0.2 ** 2); kept as written — confirm the intended sigma.
        gauss = np.exp(-((row_index - y) ** 2 + (col_index - x) ** 2) / 2 * 0.2 ** 2)
        heatmap = np.maximum(heatmap, gauss)
    heatmap = heatmap.reshape((128, 128,1))
    return heatmap,mask


# Loss function:
def final_loss(mask):
    """Return an L1 loss closure normalised by the number of ones in ``mask``.

    ``mask`` is captured when this factory is called, so the returned loss
    always uses that value, not whatever the generator stores later.
    """
    def l1_loss(y_true, y_pred):
        """Sum of absolute errors, divided by the mask count when it is positive."""
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        n = tf.reduce_sum(tf.cast(tf.equal(mask, 1.0),dtype=tf.float32))
        tot_loss = tf.reduce_sum(tf.abs(y_pred - y_true))
        if tf.greater(n,0):
            loss = tot_loss / (n)
        else:
            loss = tot_loss
        return loss
    return l1_loss


# The error shown is as below:
#
# Epoch 1/10
# ---------------------------------------------------------------------------
# ValueError                                Traceback (most recent call last)
# <ipython-input-27-74a28b075f52> in <module>()
# ----> 1 model.fit(gen,epochs=10,verbose=1,callbacks=Callback(patience=4))
#
# 9 frames
# /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
#     975           except Exception as e:  # pylint:disable=broad-except
#     976             if hasattr(e, "ag_error_metadata"):
# --> 977               raise e.ag_error_metadata.to_exception(e)
#     978             else:
#     979               raise
#
# ValueError: in user code:
#
#     /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
#         return step_function(self, iterator)
#     <ipython-input-24-c45fe131feb7>:5 l1_loss  *
#         n = tf.reduce_sum(tf.cast(tf.equal(mask, 1.0),dtype=tf.float32))
#     /usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
#         return target(*args, **kwargs)
#     /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:1679 equal
#         return gen_math_ops.equal(x, y, name=name)
#     /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_math_ops.py:3179 equal
#         name=name)
#     /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:540 _apply_op_helper
#         (input_name, err))
#
#     ValueError: Tried to convert 'x' to a tensor and failed. Error: None values not supported.
Encountering the error "one of the variables needed for gradient computation has been modified by an inplace operation"
I have encountered the following error. This is a different RNN structure, which I implemented for use in graph convolution. The problem is that the hidden state is updated by an in-place operation. However, I have to update its value in each forward call. How can I do that? Thanks in advance.
RuntimeError                              Traceback (most recent call last)
<ipython-input-110-b4425651d544> in <module>()
      8     out = model(x[i])
      9     loss = mael(out, x[i+1])
---> 10     loss.backward(retain_graph=True)
     11     optimizer.step()
     12     print(loss.item())
1 frames
/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    130     Variable._execution_engine.run_backward(
    131         tensors, grad_tensors_, retain_graph, create_graph,
--> 132         allow_unreachable=True)  # allow_unreachable flag
    133
    134
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 100]], which is output 0 of SelectBackward, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
This is a different RNN structure, which I implemented for use in graph convolution. The problem is that the hidden state is updated by an in-place operation. However, I have to update its value in each forward call. How can I do that? Thanks in advance.
class RNN(nn.Module):
    """Recurrent module with ten parameter slots and two passes over them.

    Each ``forward`` call uses the slot selected by ``self.iteration``. The
    first ten calls compute a state from the input and the fixed
    ``hidden_init``; once ten slots have been visited, every later call
    combines the input with the state previously stored in the same slot
    (the "second layer").

    The stored states are kept detached from autograd, so each call builds a
    graph that depends only on the current parameters. Writing
    graph-attached results into ``self.hidden`` in place chained every call's
    graph to all earlier ones; after ``optimizer.step()`` modified the
    parameters, the next ``backward`` failed with "one of the variables
    needed for gradient computation has been modified by an inplace
    operation".

    NOTE(review): ``hidden_init`` has shape (1, 1) and is multiplied with
    ``weight_h[slot]`` of shape (input_dim, hidden_dim), so the first pass
    appears to require ``input_dim == 1`` — confirm.
    """

    def __init__(self, input_dim, hidden_dim):
        """Create randomly initialised parameters and state for ten slots."""
        super(RNN, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # First-pass parameters, one set per slot.
        self.weight = Parameter(torch.rand(10, input_dim, hidden_dim, requires_grad=True))
        self.weight_h = Parameter(torch.rand(10, input_dim, hidden_dim, requires_grad=True))
        self.bias = Parameter(torch.rand(10, input_dim, hidden_dim, requires_grad=True))
        # Per-slot state storage; plain tensor, never part of an autograd graph.
        self.hidden = torch.rand(10, input_dim, hidden_dim)
        # Second-pass parameters, one set per slot.
        self.weight_2 = Parameter(torch.rand(10, input_dim, hidden_dim, requires_grad=True))
        self.weight_h_2 = Parameter(torch.rand(10, hidden_dim, hidden_dim, requires_grad=True))
        self.bias_2 = Parameter(torch.rand(10, input_dim, hidden_dim, requires_grad=True))
        self.tanh = Tanh()
        self.iteration = 0
        self.next_layer = False
        self.hidden_init = torch.rand(1, 1)

    def set_hidden(self, x):
        """First-pass state for the current slot: tanh(x W + h_init W_h + b)."""
        y = self.tanh(mm(x, self.weight[self.iteration])
                      + mm(self.hidden_init, self.weight_h[self.iteration])
                      + self.bias[self.iteration])
        return y

    def set_hidden_state_layer_2(self, x, hidden):
        """Second-pass state for the current slot: tanh(x W2 + hidden W_h2 + b2)."""
        y = self.tanh(mm(x, self.weight_2[self.iteration])
                      + mm(hidden, self.weight_h_2[self.iteration])
                      + self.bias_2[self.iteration])
        return y

    def forward(self, x):
        """Compute, store and return the state for the current slot.

        Args:
            x: input tensor; a 1-D tensor is promoted to a single-row matrix,
                a 2-D tensor is used as given.

        Returns:
            The freshly computed state, attached to the autograd graph of
            this call only.
        """
        # Explicit rank check instead of a bare `except` around shape unpacking.
        if x.dim() == 1:
            x = torch.unsqueeze(x, 0)

        # After ten slots, wrap around and switch to the second pass for good.
        if self.iteration == 10:
            self.next_layer = True
            self.iteration = 0

        slot = self.iteration
        if self.next_layer:
            # The stored state is a plain tensor, so gradients stop here
            # (truncated back-propagation through time).
            new_hidden = self.set_hidden_state_layer_2(x, self.hidden[slot].clone())
        else:
            new_hidden = self.set_hidden(x)

        # Store a detached copy: the buffer must never carry graph history,
        # otherwise later backward passes reach through it into graphs whose
        # parameters have already been updated in place by the optimizer.
        with torch.no_grad():
            self.hidden[slot] = new_hidden.detach()

        self.iteration = slot + 1
        return new_hidden


model = RNN(1, 100)
mael = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
x = torch.rand(11, 1)
x_2 = torch.rand(11, 1)
for i in range(10):
    optimizer.zero_grad()
    out = model(x[i])
    loss = mael(out, x[i + 1])
    # retain_graph=True is no longer needed: every step has its own graph.
    loss.backward()
    optimizer.step()
    print(loss.item())