ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' - nlp

I am trying to evaluate facebook/hubert-base-ls9601 Huggingface pre-trained model after fine-tuning on a private dataset.
I am using facebook/hubert-base-ls9601 pre-trained model, and Wav2vec2 feature extractor, and pooling mode set to mean.
Here's the evaluation code:
test_dataset = load_dataset("csv", data_files={"test": "/content/drive/MyDrive/freelancing/test.csv"}, delimiter="\t")["test"]
def speech_file_to_array_fn(batch):
speech_array, sampling_rate = torchaudio.load(batch["path"])
resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
speech = resampler(speech_array).squeeze().numpy()
batch["speech"] = speech_array
return batch
def predict(batch):
features = feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt", padding=True)
input_values = features.input_values.to(device)
with torch.no_grad():
logits = model(input_values).logits
pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
batch["predicted"] = pred_ids
return batch
test_dataset = test_dataset.map(speech_file_to_array_fn)
result = test_dataset.map(predict, batched=True, batch_size=2)
On the last line of code, I encounter the following error block:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py in convert_to_tensors(self, tensor_type)
168 if not is_tensor(value):
--> 169 tensor = as_tensor(value)
170
ValueError: could not broadcast input array from shape (2,220683) into shape (2,)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
12 frames
<ipython-input-73-7bd88adad349> in <module>()
----> 1 result = test_dataset.map(predict, batched=True, batch_size=2)
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
1970 new_fingerprint=new_fingerprint,
1971 disable_tqdm=disable_tqdm,
-> 1972 desc=desc,
1973 )
1974 else:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
517 self: "Dataset" = kwargs.pop("self")
518 # apply actual function
--> 519 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
520 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
521 for dataset in datasets:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
484 }
485 # apply actual function
--> 486 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
487 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
488 # re-apply format to the output
/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
456 # Call actual function
457
--> 458 out = func(self, *args, **kwargs)
459
460 # Update fingerprint of in-place transforms + update in-place history of transforms
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _map_single(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)
2340 indices,
2341 check_same_num_examples=len(input_dataset.list_indexes()) > 0,
-> 2342 offset=offset,
2343 )
2344 except NumExamplesMismatchError:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples, offset)
2217 if with_rank:
2218 additional_args += (rank,)
-> 2219 processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
2220 if update_data is None:
2221 # Check if the function returns updated examples
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in decorated(item, *args, **kwargs)
1912 )
1913 # Use the LazyDict internally, while mapping the function
-> 1914 result = f(decorated_item, *args, **kwargs)
1915 # Return a standard dict
1916 return result.data if isinstance(result, LazyDict) else result
<ipython-input-71-6f845da29c00> in predict(batch)
11
12 def predict(batch):
---> 13 features = feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt", padding=True)
14
15 input_values = features.input_values.to(device)
/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in __call__(self, raw_speech, padding, max_length, truncation, pad_to_multiple_of, return_attention_mask, return_tensors, sampling_rate, **kwargs)
200 truncation=truncation,
201 pad_to_multiple_of=pad_to_multiple_of,
--> 202 return_attention_mask=return_attention_mask,
203 )
204
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_sequence_utils.py in pad(self, processed_features, padding, max_length, truncation, pad_to_multiple_of, return_attention_mask, return_tensors)
230 batch_outputs[key].append(value)
231
--> 232 return BatchFeature(batch_outputs, tensor_type=return_tensors)
233
234 def _pad(
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py in __init__(self, data, tensor_type)
78 def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
79 super().__init__(data)
---> 80 self.convert_to_tensors(tensor_type=tensor_type)
81
82 def __getitem__(self, item: str) -> Union[Any]:
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py in convert_to_tensors(self, tensor_type)
174 raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
175 raise ValueError(
--> 176 "Unable to create tensor, you should probably activate padding "
177 "with 'padding=True' to have batched tensors with the same length."
178 )
ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.
I am working on Google Colab. Those are the environment variables:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1
The padding is already activated in the predict function.
Can you please help me fix it?

Related

FGSM attack in Foolbox

I am using Foolbox 3.3.1 to perform some adversarial attacks on resnet50 network. The code is as follows:
import torch
from torchvision import models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True).to(device)
model.eval()
mean = [0.485, 0.456, 0.406]
std=[0.229, 0.224, 0.225]
preprocessing = dict(mean=mean, std=std, axis=-3)
bounds = (0, 1)
fmodel = fb.models.PyTorchModel(model, bounds=bounds, preprocessing=preprocessing)
images, labels = fb.utils.samples(fmodel, dataset='imagenet', batchsize=8)
labels_float = labels.to(torch.float32)
def perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack):
print(f'Performing attack with {type(attack).__name__}...', end='')
raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)
print('done')
logits_after_attacks = fmodel(clipped)
labels_after_attack = logits_after_attacks.max(dim=1)[1].cpu().numpy()
for image, predicted_label_before_attack, label, label_after_attack in zip(images, predicted_labels_before_attack, labels.cpu().numpy(), labels_after_attack):
label_imshow = type(attack).__name__
if predicted_label_before_attack == label and label != label_after_attack:
label_imshow += '; successful attack'
label_imshow += f'\nTrue class: {lab_dict[label]}\nClassified before attack as: {lab_dict[predicted_label_before_attack]}\nClassified after attack as: {lab_dict[label_after_attack]}'
imshow(image, label_imshow)
for attack in (
fb.attacks.FGSM(), # "nll_loss_forward_no_reduce_cuda_kernel_index" not implemented for 'Int'
):
perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack)
I get the error:
RuntimeError: "nll_loss_forward_no_reduce_cuda_kernel_index" not implemented for 'Int'
with full stack:
Performing attack with LinfFastGradientAttack...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_1736/3238714708.py in <module>
28 # fb.attacks.BoundaryAttack(), # very slow
29 ):
---> 30 perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack)
~\AppData\Local\Temp/ipykernel_1736/3978727835.py in perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack)
1 def perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack):
2 print(f'Performing attack with {type(attack).__name__}...', end='')
----> 3 raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)
4 print('done')
5 logits_after_attacks = fmodel(clipped)
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\base.py in __call__(***failed resolving arguments***)
277 success = []
278 for epsilon in real_epsilons:
--> 279 xp = self.run(model, x, criterion, epsilon=epsilon, **kwargs)
280
281 # clip to epsilon because we don't really know what the attack returns;
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\fast_gradient_method.py in run(self, model, inputs, criterion, epsilon, **kwargs)
90 raise ValueError("unsupported criterion")
91
---> 92 return super().run(
93 model=model, inputs=inputs, criterion=criterion, epsilon=epsilon, **kwargs
94 )
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\gradient_descent_base.py in run(***failed resolving arguments***)
90
91 for _ in range(self.steps):
---> 92 _, gradients = self.value_and_grad(loss_fn, x)
93 gradients = self.normalize(gradients, x=x, bounds=model.bounds)
94 x = x + gradient_step_sign * stepsize * gradients
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\gradient_descent_base.py in value_and_grad(self, loss_fn, x)
50 x: ep.Tensor,
51 ) -> Tuple[ep.Tensor, ep.Tensor]:
---> 52 return ep.value_and_grad(loss_fn, x)
53
54 def run(
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\framework.py in value_and_grad(f, t, *args, **kwargs)
350 f: Callable[..., TensorType], t: TensorType, *args: Any, **kwargs: Any
351 ) -> Tuple[TensorType, TensorType]:
--> 352 return t.value_and_grad(f, *args, **kwargs)
353
354
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\tensor\tensor.py in value_and_grad(self, f, *args, **kwargs)
541 self: TensorType, f: Callable[..., TensorType], *args: Any, **kwargs: Any
542 ) -> Tuple[TensorType, TensorType]:
--> 543 return self._value_and_grad_fn(f, has_aux=False)(self, *args, **kwargs)
544
545 #final
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\tensor\pytorch.py in value_and_grad(x, *args, **kwargs)
493 loss, aux = f(x, *args, **kwargs)
494 else:
--> 495 loss = f(x, *args, **kwargs)
496 loss = loss.raw
497 loss.backward()
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\gradient_descent_base.py in loss_fn(inputs)
40 def loss_fn(inputs: ep.Tensor) -> ep.Tensor:
41 logits = model(inputs)
---> 42 return ep.crossentropy(logits, labels).sum()
43
44 return loss_fn
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\framework.py in crossentropy(logits, labels)
319
320 def crossentropy(logits: TensorType, labels: TensorType) -> TensorType:
--> 321 return logits.crossentropy(labels)
322
323
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\tensor\pytorch.py in crossentropy(self, labels)
462 raise ValueError("labels must be 1D and must match the length of logits")
463 return type(self)(
--> 464 torch.nn.functional.cross_entropy(self.raw, labels.raw, reduction="none")
465 )
466
~\anaconda3\envs\adversarial\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
2844 if size_average is not None or reduce is not None:
2845 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2846 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
2847
2848
RuntimeError: "nll_loss_forward_no_reduce_cuda_kernel_index" not implemented for 'Int'
Any clue?
i think the problem is on the 3978727835.py file try to change the argument labels to 'labels.to(DEVICE).long()' if you use CUDA unless change it to 'labels.long()'

How to reconstruct the decoder from an LSTM-AE?

I have a trained LSTM-AE, of which the architecture is as follows:
In brief, I have an LSTM-AE of depth 3, the number of cells on the LSTM layers on the encoder side are [120, 80, 50] (and symmetric for the decoder). I built the model using the code shown on this page. For information, because I want to train the LSTM-AT directly on variable-length time series, so I didn't specify the timestamps in the input layer, which means the model is trained on batches of size 1 (one time series per batch).
I can extract the encoder just fine, but I cannot do the same for the decoder :-(... My goal is to check, given a vector of 50 features (which are extracted by the encoder), whether the decoder can reconstruct the input series.
Here's my attempt so far:
# load the full autoencoder
model = load_model(path_to_model)
# reconstruct the decoder
in_layer = Input(shape=(None, 50))
time_dist = model.layers[-1]
dec_1 = model.layers[-2]
dec_2 = model.layers[-3]
dec_3 = model.layers[-4]
rep_vec = model.layers[-5]
out_layer = time_dist(dec_1(dec_2(dec_3(rep_vec(in_layer)))))
decoder = Model(in_layer, out_layer, name='decoder')
res = decoder(input_feature) # input_feature has shape (50,)
I obtained this error:
InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: decoder/repeat/strided_slice/
If you are interested in the full error log...
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
Input In [86], in <module>
13 out_layer = time_dist(dec_1(dec_2(dec_3(rep_vec(in_layer)))))
14 decoder = Model(in_layer, out_layer, name='decoder')
---> 15 res = decoder(input_feature)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1030, in Layer.__call__(self, *args, **kwargs)
1026 inputs = self._maybe_cast_inputs(inputs, input_list)
1028 with autocast_variable.enable_auto_cast_variables(
1029 self._compute_dtype_object):
-> 1030 outputs = call_fn(inputs, *args, **kwargs)
1032 if self._activity_regularizer:
1033 self._handle_activity_regularization(inputs, outputs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py:420, in Functional.call(self, inputs, training, mask)
401 #doc_controls.do_not_doc_inheritable
402 def call(self, inputs, training=None, mask=None):
403 """Calls the model on new inputs.
404
405 In this case `call` just reapplies
(...)
418 a list of tensors if there are more than one outputs.
419 """
--> 420 return self._run_internal_graph(
421 inputs, training=training, mask=mask)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py:556, in Functional._run_internal_graph(self, inputs, training, mask)
553 continue # Node is not computable, try skipping.
555 args, kwargs = node.map_arguments(tensor_dict)
--> 556 outputs = node.layer(*args, **kwargs)
558 # Update tensor_dict.
559 for x_id, y in zip(node.flat_output_ids, nest.flatten(outputs)):
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1030, in Layer.__call__(self, *args, **kwargs)
1026 inputs = self._maybe_cast_inputs(inputs, input_list)
1028 with autocast_variable.enable_auto_cast_variables(
1029 self._compute_dtype_object):
-> 1030 outputs = call_fn(inputs, *args, **kwargs)
1032 if self._activity_regularizer:
1033 self._handle_activity_regularization(inputs, outputs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/keras/layers/core.py:919, in Lambda.call(self, inputs, mask, training)
915 return var
917 with backprop.GradientTape(watch_accessed_variables=True) as tape,\
918 variable_scope.variable_creator_scope(_variable_creator):
--> 919 result = self.function(inputs, **kwargs)
920 self._check_variables(created_variables, tape.watched_variables())
921 return result
File D:/PhD/Code/feature_learning/train_models/train_lstmae.py:30, in repeat_vector(args)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206, in add_dispatch_support.<locals>.wrapper(*args, **kwargs)
204 """Call target, and fall back on dispatchers if there is a TypeError."""
205 try:
--> 206 return target(*args, **kwargs)
207 except (TypeError, ValueError):
208 # Note: convert_to_eager_tensor currently raises a ValueError, not a
209 # TypeError, when given unexpected types. So we need to catch both.
210 result = dispatch(wrapper, args, kwargs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:1040, in _slice_helper(tensor, slice_spec, var)
1038 var_empty = constant([], dtype=dtypes.int32)
1039 packed_begin = packed_end = packed_strides = var_empty
-> 1040 return strided_slice(
1041 tensor,
1042 packed_begin,
1043 packed_end,
1044 packed_strides,
1045 begin_mask=begin_mask,
1046 end_mask=end_mask,
1047 shrink_axis_mask=shrink_axis_mask,
1048 new_axis_mask=new_axis_mask,
1049 ellipsis_mask=ellipsis_mask,
1050 var=var,
1051 name=name)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206, in add_dispatch_support.<locals>.wrapper(*args, **kwargs)
204 """Call target, and fall back on dispatchers if there is a TypeError."""
205 try:
--> 206 return target(*args, **kwargs)
207 except (TypeError, ValueError):
208 # Note: convert_to_eager_tensor currently raises a ValueError, not a
209 # TypeError, when given unexpected types. So we need to catch both.
210 result = dispatch(wrapper, args, kwargs)
File ~/venv/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:1213, in strided_slice(input_, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, var, name)
1210 if strides is None:
1211 strides = ones_like(begin)
-> 1213 op = gen_array_ops.strided_slice(
1214 input=input_,
1215 begin=begin,
1216 end=end,
1217 strides=strides,
1218 name=name,
1219 begin_mask=begin_mask,
1220 end_mask=end_mask,
1221 ellipsis_mask=ellipsis_mask,
1222 new_axis_mask=new_axis_mask,
1223 shrink_axis_mask=shrink_axis_mask)
1225 parent_name = name
1227 if var is not None:
File ~/venv/lib/python3.8/site-packages/tensorflow/python/ops/gen_array_ops.py:10505, in strided_slice(input, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, name)
10503 return _result
10504 except _core._NotOkStatusException as e:
> 10505 _ops.raise_from_not_ok_status(e, name)
10506 except _core._FallbackException:
10507 pass
File ~/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:6897, in raise_from_not_ok_status(e, name)
6895 message = e.message + (" name: " + name if name is not None else "")
6896 # pylint: disable=protected-access
-> 6897 six.raise_from(core._status_to_exception(e.code, message), None)
File <string>:3, in raise_from(value, from_value)
InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: decoder/repeat/strided_slice/
I appreciate very much any advice you would give me!
Edit
Here is the code I used to build the mode:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.backend import shape
def repeat_vector(args):
"""Builds the repeat vector layer dynamically by the size of the input series"""
layer_to_repeat = args[0]
sequence_layer = args[1]
return RepeatVector(shape(sequence_layer)[1])(layer_to_repeat)
n_atts = 3 # time series of 3 measurements
n_units = [120, 80, 50] # encoder - 1st layer: 120, 2nd layer: 80, 3rd layer: 50 (and symmetric for decoder)
n_layers = len(n_units)
init = GlorotUniform(seed=420)
reg = None
optimizer = Adam(learning_rate=0.0001)
activ = 'tanh'
loss_metric = 'mse'
inputs = Input(shape=(None, n_atts), name='input_layer')
# the encoder
encoded = LSTM(n_units[0], name='encoder_1', return_sequences=(n_layers != 1), kernel_initializer=init,
kernel_regularizer=reg, activation=activ)(inputs)
for i in range(1, n_layers):
if i != n_layers - 1:
encoded = LSTM(n_units[i], name='encoder_{}'.format(i + 1), return_sequences=(n_layers != 1),
kernel_initializer=init, kernel_regularizer=reg, activation=activ)(encoded)
else:
encoded = LSTM(n_units[i], name='encoder_{}'.format(i + 1), return_sequences=False,
kernel_initializer=init, kernel_regularizer=reg, activation=activ)(encoded)
# repeat the vector (plug the encoder to the decoder)
repeated = Lambda(repeat_vector, output_shape=(None, n_units[-1]), name='repeat')([encoded, inputs])
# the decoder
decoded = LSTM(n_units[n_layers - 1], return_sequences=True, name='decoder_1',
kernel_initializer=init, kernel_regularizer=reg, activation=activ)(repeated) # first layer
for i in range(1, n_layers):
decoded = LSTM(n_units[n_layers - 1 - i], return_sequences=True, name='decoder_{}'.format(i + 1),
kernel_initializer=init, kernel_regularizer=reg, activation=activ)(decoded)
# last layer
tdist = TimeDistributed(Dense(n_atts))(decoded)
# compile the model
model = Model(inputs, tdist, name='lstm-ae')
model.compile(optimizer=optimizer, loss=loss_metric)
For information, I use tensorflow 2.5.
Because the number of units is read from a config file, I wrote the code this way to add the layers programmatically.

A simple linear regression model with a DenseVariational layer in Tensorflow-Probability returns: TypeError: 'NoneType' object is not callable

This is an attempt to use Tensforflow-Probability and more specifically the DenseVariational layer but it fails for some reason. How can I correct the code?
x_train = np.linspace(-1, 1, 100)[:, np.newaxis]
y_train = x_train + 0.3*np.random.randn(100)[:, np.newaxis]
def prior(kernel_size, bias_size, dtype = None):
n = kernel_size + bias_size
prior_model = Sequential([
tfpl.DistributionLambda(
lambda t: tfd.MultivariateNormalDiag(loc = tf.zeros(n) , scale_diag = tf.ones(n)
))
])
def posterior(kernel_size, bias_size, dtype = None):
n = kernel_size + bias_size
posterior_model = Sequential([
tfpl.VariableLayer(tfpl.MultivariateNormalTriL.params_size(n) , dtype = dtype), # The parameters of the model are declared Variables that are trainable
tfpl.MultivariateNormalTriL(n) # The posterior function will return to the Variational layer that will call it a MultivariateNormalTril object that will have as many dimensions
# as the parameters of the Variational Dense Layer. That means that each parameter will be generated by a distinct Normal Gaussian shifted and scaled
# by a mu and sigma learned from the data, independently of all the other weights. The output of this Variablelayer will become the input to the
# MultivariateNormalTriL object.
# The shape of the VariableLayer object will be defined by the number of parameters needed to create the MultivariateNormalTriL object given
# that it will live in a Space of n dimensions (event_size = n). This number is returned by the tfpl.MultivariateNormalTriL.params_size(n)
])
return(posterior_model)
model = Sequential([
tfpl.DenseVariational(
input_shape = (1, ), # The input is of dimensionality 1, a series
units = 1, # A linear regression is represented by a Dense layer with one single unit
make_prior_fn = prior, # We pass the function we have defined which returns the prior distribution on the weights
make_posterior_fn = posterior, # We pass the function we have defined which returns the variational approximation of the posterior distribution on the weights
kl_weight = 1/ x_train.shape[0], # Tensorflow scales the likelihood loss calculated using the mini-batch to become an unbiased estimator of the true loss but does not do the
# same for the DL divergence loss. Here we instruct it to do the necessary scaling.
kl_use_exact = True # Unless there is a closed form equation for the KL divergence in the library of Tensorflow setting True will return error. By setting False instead
# the KL Divergence will be approxiated using Sampling
)
])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-14-e7cf0bfd5902> in <module>
17 # same for the DL divergence loss. Here we instruct it to do the necessary scaling.
18
---> 19 kl_use_exact = True # Unless there is a closed form equation for the KL divergence in the library of Tensorflow setting True will return error. By setting False instead
20 # the KL Divergence will be approxiated using Sampling
21
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\training\tracking\base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\sequential.py in __init__(self, layers, name)
140 layers = [layers]
141 for layer in layers:
--> 142 self.add(layer)
143
144 #property
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\training\tracking\base.py in _method_wrapper(self, *args, **kwargs)
455 self._self_setattr_tracking = False # pylint: disable=protected-access
456 try:
--> 457 result = method(self, *args, **kwargs)
458 finally:
459 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\sequential.py in add(self, layer)
204 # and create the node connecting the current layer
205 # to the input layer we just created.
--> 206 layer(x)
207 set_inputs = True
208
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\base_layer.py in __call__(self, *args, **kwargs)
924 if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
925 return self._functional_construction_call(inputs, args, kwargs,
--> 926 input_list)
927
928 # Maintains info about the `Layer.call` stack.
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\base_layer.py in _functional_construction_call(self, inputs, args, kwargs, input_list)
1115 try:
1116 with ops.enable_auto_cast_variables(self._compute_dtype_object):
-> 1117 outputs = call_fn(cast_inputs, *args, **kwargs)
1118
1119 except errors.OperatorNotAllowedInGraphError as e:
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\autograph\impl\api.py in wrapper(*args, **kwargs)
253 try:
254 with conversion_ctx:
--> 255 return converted_call(f, args, kwargs, options=options)
256 except Exception as e: # pylint:disable=broad-except
257 if hasattr(e, 'ag_error_metadata'):
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\autograph\impl\api.py in converted_call(f, args, kwargs, caller_fn_scope, options)
455 if conversion.is_in_whitelist_cache(f, options):
456 logging.log(2, 'Whitelisted %s: from cache', f)
--> 457 return _call_unconverted(f, args, kwargs, options, False)
458
459 if ag_ctx.control_status_ctx().status == ag_ctx.Status.DISABLED:
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow\python\autograph\impl\api.py in _call_unconverted(f, args, kwargs, options, update_cache)
337
338 if kwargs is not None:
--> 339 return f(*args, **kwargs)
340 return f(*args)
341
~\Anaconda3\envs\tf2\lib\site-packages\tensorflow_probability\python\layers\dense_variational_v2.py in call(self, inputs)
120
121 q = self._posterior(inputs)
--> 122 r = self._prior(inputs)
123 self.add_loss(self._kl_divergence_fn(q, r))
124
TypeError: 'NoneType' object is not callable
Did you fail to return ?
def prior(kernel_size, bias_size, dtype=None):
n = kernel_size + bias_size
prior_model = tf.keras.Sequential([
tfp.layers.DistributionLambda(
lambda t: tfd.MultivariateNormalDiag(loc=tf.zeros(n), scale_diag=tf.ones(n)
))
])
return (prior_model)

can't apply sklearn.compose.ColumnTransformer on only one column of pandas dataframe

I have defined a custom tansformer that takes a pandas dataframe, apply a function on only one column and leaves all the remaining columns untouched. The transformer is working fine during testing, but not when I include it as part of a Pipeline.
Here's the transformer:
import re
from sklearn.base import BaseEstimator, TransformerMixin
class SynopsisCleaner(BaseEstimator, TransformerMixin):
def __init__(self):
return None
def fit(self, X, y=None, **fit_params):
# nothing to learn from data.
return self
def clean_text(self, text):
text = text.lower()
text = re.sub(r'#[a-zA-Z0-9_]+', '', text)
text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
text = re.sub(r'www.[^ ]+', '', text)
text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', '', text)
text = re.sub(r'[^a-zA-Z]', ' ', text)
text = [token for token in text.split() if len(token) > 2]
text = ' '.join(text)
return text
def transform(self, X, y=None, **fit_params):
for i in range(X.shape[0]):
X[i] = self.clean_text(X[i])
return X
When I test it manually like this, it is working just as expected.
train_synopsis = SynopsisCleaner().transform(train_data['Synopsis'])
But, when I include it as a part of sklearn pipeline:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# part 1: defining a column transformer that learns on only one column and transforms it
synopsis_clean_col_tran = ColumnTransformer(transformers=[('synopsis_clean_col_tran', SynopsisCleaner(), ['Synopsis'])],
# set remainder to passthrough to pass along all the un-specified columns untouched to the next steps
remainder='passthrough')
# make a pipeline now with all the steps
pipe_1 = Pipeline(steps=[('synopsis_cleaning', synopsis_clean_col_tran)])
pipe_1.fit(train_data)
I get KeyError, like shown below:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2890 try:
-> 2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
16 frames
<ipython-input-10-3396fa5d6092> in <module>()
6 # make a pipeline now with all the steps
7 pipe_1 = Pipeline(steps=[('synopsis_cleaning', synopsis_clean_col_tran)])
----> 8 pipe_1.fit(train_data)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
352 self._log_message(len(self.steps) - 1)):
353 if self._final_estimator != 'passthrough':
--> 354 self._final_estimator.fit(Xt, y, **fit_params)
355 return self
356
/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in fit(self, X, y)
482 # we use fit_transform to make sure to set sparse_output_ (for which we
483 # need the transformed data) to have consistent output type in predict
--> 484 self.fit_transform(X, y=y)
485 return self
486
/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
516 self._validate_remainder(X)
517
--> 518 result = self._fit_transform(X, y, _fit_transform_one)
519
520 if not result:
/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
455 message=self._log_message(name, idx, len(transformers)))
456 for idx, (name, trans, column, weight) in enumerate(
--> 457 self._iter(fitted=fitted, replace_strings=True), 1))
458 except ValueError as e:
459 if "Expected 2D array, got 1D array instead" in str(e):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self)
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
252 return [func(*args, **kwargs)
--> 253 for func, args, kwargs in self.items]
254
255 def __reduce__(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in <listcomp>(.0)
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
252 return [func(*args, **kwargs)
--> 253 for func, args, kwargs in self.items]
254
255 def __reduce__(self):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
569 if y is None:
570 # fit method of arity 1 (unsupervised transformation)
--> 571 return self.fit(X, **fit_params).transform(X)
572 else:
573 # fit method of arity 2 (supervised transformation)
<ipython-input-6-004ee595d544> in transform(self, X, y, **fit_params)
20 def transform(self, X, y=None, **fit_params):
21 for i in range(X.shape[0]):
---> 22 X[i] = self.clean_text(X[i])
23 return X
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
-> 2893 raise KeyError(key) from err
2894
2895 if tolerance is not None:
KeyError: 0
What am I doing wrong here?
EDIT 1: without brackets and the column name specified as string, this is the error I see:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-bdd42b09e2af> in <module>()
6 # make a pipeline now with all the steps
7 pipe_1 = Pipeline(steps=[('synopsis_cleaning', synopsis_clean_col_tran)])
----> 8 pipe_1.fit(train_data)
3 frames
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
352 self._log_message(len(self.steps) - 1)):
353 if self._final_estimator != 'passthrough':
--> 354 self._final_estimator.fit(Xt, y, **fit_params)
355 return self
356
/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in fit(self, X, y)
482 # we use fit_transform to make sure to set sparse_output_ (for which we
483 # need the transformed data) to have consistent output type in predict
--> 484 self.fit_transform(X, y=y)
485 return self
486
/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
536
537 self._update_fitted_transformers(transformers)
--> 538 self._validate_output(Xs)
539
540 return self._hstack(list(Xs))
/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in _validate_output(self, result)
400 raise ValueError(
401 "The output of the '{0}' transformer should be 2D (scipy "
--> 402 "matrix, array, or pandas DataFrame).".format(name))
403
404 def _validate_features(self, n_features, feature_names):
ValueError: The output of the 'synopsis_clean_col_tran' transformer should be 2D (scipy matrix, array, or pandas DataFrame).
In your manual test, you are passing the Series train_data['Synopsis'], but the column transformer is passing the Frame train_data[['Synopsis']]. (So, to clarify the error: X[i] is trying to get the column named 0, which indeed does not exist.) You should be able to fix this as easily as dropping the brackets around 'Synopsis' in the column specification of the transformer. From the docs:
...A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. ...
That is,
synopsis_clean_col_tran = ColumnTransformer(
transformers=[('synopsis_clean_col_tran', SynopsisCleaner(), 'Synopsis')],
# set remainder to passthrough to pass along all the un-specified columns untouched to the next steps
remainder='passthrough',
)
Ah, but then ColumnTransformer complains that the output of your transformer is one-dimensional; that's unfortunate. I think the cleanest thing then is to switch your transform to expect both input and output as 2D. If you'll only ever need dataframes as input (no other sklearn transformers converting to numpy arrays), then this can be relatively simple using a FunctionTransformer instead of your custom class.
def clean_text_frame(X):
return X.applymap(clean_text) # the function "clean_text" currently in your class.
synopsis_clean_col_tran = ColumnTransformer(
transformers=[('synopsis_clean_col_tran', FunctionTransformer(clean_text_frame), ['Synopsis'])],
# set remainder to passthrough to pass along all the un-specified columns untouched to the next steps
remainder='passthrough',
)

How to use SHAP with a linear SVC model from sklearn using Pipeline?

I am doing text classification using a linear SVC model from sklearn. Now I want to visualize which words/tokens have the highest impact on the classification decision by using SHAP (https://github.com/slundberg/shap).
Right now this does not work because I am getting an error that seems to originate from the vectorizer step in the pipeline I have defined - whats wrong here?
Is my general approach on how to use SHAP in this case correct?
x_Train, x_Test, y_Train, y_Test = train_test_split(df_all['PDFText'], df_all['class'], test_size = 0.2, random_state = 1234)
pipeline = Pipeline([
(
'tfidv',
TfidfVectorizer(
ngram_range=(1,3),
analyzer='word',
strip_accents = ascii,
use_idf = True,
sublinear_tf=True,
max_features=6000,
min_df=2,
max_df=1.0
)
),
(
'lin_svc',
svm.SVC(
C=1.0,
probability=True,
kernel='linear'
)
)
])
pipeline.fit(x_Train, y_Train)
shap.initjs()
explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
shap_values = explainer.shap_values(x_Test, nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_Test.iloc[0,:])
This is the error message I get:
Provided model function fails when applied to the provided data set.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-4bca63616b3b> in <module>
3
4 # use Kernel SHAP to explain test set predictions
----> 5 explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
6 shap_values = explainer.shap_values(x_Test, nsamples=100)
7
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\explainers\kernel.py in __init__(self, model, data, link, **kwargs)
95 self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
96 self.data = convert_to_data(data, keep_index=self.keep_index)
---> 97 model_null = match_model_to_data(self.model, self.data)
98
99 # enforce our current input type limitations
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\common.py in match_model_to_data(model, data)
80 out_val = model.f(data.convert_to_df())
81 else:
---> 82 out_val = model.f(data.data)
83 except:
84 print("Provided model function fails when applied to the provided data set.")
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py in predict_proba(self, X)
379 for name, transform in self.steps[:-1]:
380 if transform is not None:
--> 381 Xt = transform.transform(Xt)
382 return self.steps[-1][-1].predict_proba(Xt)
383
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents, copy)
1631 check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
1632
-> 1633 X = super(TfidfVectorizer, self).transform(raw_documents)
1634 return self._tfidf.transform(X, copy=False)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
1084
1085 # use the same matrix-building strategy as fit_transform
-> 1086 _, X = self._count_vocab(raw_documents, fixed_vocab=True)
1087 if self.binary:
1088 X.data.fill(1)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
940 for doc in raw_documents:
941 feature_counter = {}
--> 942 for feature in analyze(doc):
943 try:
944 feature_idx = vocabulary[feature]
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
326 tokenize)
327 return lambda doc: self._word_ngrams(
--> 328 tokenize(preprocess(self.decode(doc))), stop_words)
329
330 else:
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
KernelExplainer expects to receive a classification model as the first argument. Please check the use of Pipeline with Shap following the link.
In your case, you can use the Pipeline as follows:
x_Train = pipeline.named_steps['tfidv'].fit_transform(x_Train)
explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train)

Resources