Reading a custom pyspark transformer - apache-spark

After messing with this for quite a while, in Spark 2.3 I am finally able to get a pure Python custom transformer saved, but I get an error when loading the transformer back.
I checked the content of what was saved and found all the relevant variables saved in the file on HDFS. It would be great if someone could spot what I am missing in this simple transformer.
from pyspark.ml import Transformer
from pyspark.ml.param.shared import Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  # needed for the mixins below
from pyspark.sql import functions as F  # needed for the aggregations in _transform

class AggregateTransformer(Transformer, DefaultParamsWritable, DefaultParamsReadable):
    aggCols = Param(Params._dummy(), "aggCols", "", TypeConverters.toListString)
    valCols = Param(Params._dummy(), "valCols", "", TypeConverters.toListString)

    def __init__(self, aggCols, valCols):
        super(AggregateTransformer, self).__init__()
        self._setDefault(aggCols=[''])
        self._set(aggCols=aggCols)
        self._setDefault(valCols=[''])
        self._set(valCols=valCols)

    def getAggCols(self):
        return self.getOrDefault(self.aggCols)

    def setAggCols(self, aggCols):
        self._set(aggCols=aggCols)

    def getValCols(self):
        return self.getOrDefault(self.valCols)

    def setValCols(self, valCols):
        self._set(valCols=valCols)

    def _transform(self, dataset):
        # Build one set of aggregations per value column, then group by the agg columns.
        aggFuncs = []
        for valCol in self.getValCols():
            aggFuncs.append(F.sum(valCol).alias("sum_" + valCol))
            aggFuncs.append(F.min(valCol).alias("min_" + valCol))
            aggFuncs.append(F.max(valCol).alias("max_" + valCol))
            aggFuncs.append(F.count(valCol).alias("cnt_" + valCol))
            aggFuncs.append(F.avg(valCol).alias("avg_" + valCol))
            aggFuncs.append(F.stddev(valCol).alias("stddev_" + valCol))
        dataset = dataset.groupBy(self.getAggCols()).agg(*aggFuncs)
        return dataset
I get this error when I load an instance of this transformer after saving it.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-172-44e20f7e3842> in <module>()
----> 1 x = agg.load("/tmp/test")
/usr/hdp/current/spark2.3-client/python/pyspark/ml/util.py in load(cls, path)
309 def load(cls, path):
310 """Reads an ML instance from the input path, a shortcut of `read().load(path)`."""
--> 311 return cls.read().load(path)
312
313
/usr/hdp/current/spark2.3-client/python/pyspark/ml/util.py in load(self, path)
482 metadata = DefaultParamsReader.loadMetadata(path, self.sc)
483 py_type = DefaultParamsReader.__get_class(metadata['class'])
--> 484 instance = py_type()
485 instance._resetUid(metadata['uid'])
486 DefaultParamsReader.getAndSetParams(instance, metadata)
TypeError: __init__() missing 2 required positional arguments: 'aggCols' and 'valCols'

Figured out the answer!
The problem was that a new Transformer instance is created by the reader, but the __init__ function for my AggregateTransformer didn't have default values for its arguments.
So changing the following line of code fixed the issue:
def __init__(self, aggCols=[], valCols=[]):
I am going to leave this question and answer here, since it was incredibly difficult for me to find a working example anywhere of a pure Python transformer that could be saved and read back. It might help someone looking for this.
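For anyone copying this, here is a minimal sketch of the class with the fixed constructor plus a hypothetical save/load round trip; the path and column names are illustrative, and the getters, setters and the body of _transform are unchanged from the question and omitted here:
from pyspark.ml import Transformer
from pyspark.ml.param.shared import Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class AggregateTransformer(Transformer, DefaultParamsWritable, DefaultParamsReadable):
    aggCols = Param(Params._dummy(), "aggCols", "", TypeConverters.toListString)
    valCols = Param(Params._dummy(), "valCols", "", TypeConverters.toListString)

    # Default values make the class constructible with no arguments, which is
    # what DefaultParamsReader does before it restores the saved params.
    def __init__(self, aggCols=[], valCols=[]):
        super(AggregateTransformer, self).__init__()
        self._setDefault(aggCols=[''], valCols=[''])
        self._set(aggCols=aggCols, valCols=valCols)

    def _transform(self, dataset):
        # same aggregation logic as in the question (omitted here for brevity)
        ...

# Hypothetical round trip once the defaults are in place (path is illustrative):
agg = AggregateTransformer(aggCols=["group_id"], valCols=["amount"])
agg.write().overwrite().save("/tmp/agg_transformer")
restored = AggregateTransformer.load("/tmp/agg_transformer")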

Related

Cannot interpret SVM model using Shapash

Currently, I'm exploring machine learning interpretability tools for one of my projects. Shapash is a fairly new tool, and many people suggest using it to create a few easily interpretable charts for an ML model. When I tried it with RandomForestClassifier it worked fine and generated a webpage full of different charts, but I cannot achieve the same while using SVM (I'm just exploring this library, not focusing on the perfect ML model for the problem).
Note - using Shapash link here
from sklearn import svm                               # imports assumed; not shown in the original snippet
from sklearn.metrics import accuracy_score, f1_score

# Fit blackbox model
svc = svm.SVC()
svc.fit(X_train_smote, y_train_smote)
y_pred = svc.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

from shapash import SmartExplainer
xpl = SmartExplainer(model=svc)
The error I'm getting:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
/tmp/ipykernel_13648/1233939729.py in <module>
----> 1 xpl = SmartExplainer(model=svc)
~/Python_AI/ai_env/lib/python3.8/site-packages/shapash/explainer/smart_explainer.py in __init__(self, model, backend, preprocessing, postprocessing, features_groups, features_dict, label_dict, title_story, palette_name, colors_dict, **kwargs)
194 if isinstance(backend, str):
195 backend_cls = get_backend_cls_from_name(backend)
--> 196 self.backend = backend_cls(
197 model=self.model, preprocessing=preprocessing, **kwargs)
198 elif isinstance(backend, BaseBackend):
~/Python_AI/ai_env/lib/python3.8/site-packages/shapash/backend/shap_backend.py in __init__(self, model, preprocessing, explainer_args, explainer_compute_args)
16 self.explainer_args = explainer_args if explainer_args else {}
17 self.explainer_compute_args = explainer_compute_args if explainer_compute_args else {}
---> 18 self.explainer = shap.Explainer(model=model, **self.explainer_args)
19
20 def run_explainer(self, x: pd.DataFrame) -> dict:
~/Python_AI/ai_env/lib/python3.8/site-packages/shap/explainers/_explainer.py in __init__(self, model, masker, link, algorithm, output_names, feature_names, **kwargs)
166 # if we get here then we don't know how to handle what was given to us
167 else:
--> 168 raise Exception("The passed model is not callable and cannot be analyzed directly with the given masker! Model: " + str(model))
169
170 # build the right subclass
Exception: The passed model is not callable and cannot be analyzed directly with the given masker! Model: SVC()
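For context on what the exception is checking for, below is a rough sketch of the callable form that shap.Explainer (which Shapash calls under the hood) does accept: a plain prediction function plus background data. This bypasses Shapash's SmartExplainer entirely and only illustrates what "callable" means in the message; X_train_smote, y_train_smote and X_test are assumed from the question's earlier preprocessing.
import shap
from sklearn import svm

svc = svm.SVC()
svc.fit(X_train_smote, y_train_smote)

# Pass a callable (the SVC's decision function) plus background data as the masker.
explainer = shap.Explainer(svc.decision_function, X_train_smote)
shap_values = explainer(X_test)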

Got error "AttributeError: 'TimestepEmbedSequential' object has no attribute '__globals__'" while torch.jit.script()

While trying to script the Stable Diffusion model using torch.jit.script(), I got the following error:
AttributeError: 'TimestepEmbedSequential' object has no attribute '__globals__'
I'm trying to export this model to ONNX and found out that running torch.onnx.export() will torch.jit.trace the model, which unrolls every loop, so I'm trying to script it first.
When I follow the traceback, the error occurs in this function while reading fn.__globals__ in torch's _jit_internal.py:
def get_closure(fn):
    """
    Get a dictionary of closed over variables from a function
    """
    captures = {}
    captures.update(fn.__globals__)

    for index, captured_name in enumerate(fn.__code__.co_freevars):
        captures[captured_name] = fn.__closure__[index].cell_contents

    return captures
The code for scripting is as follows:
stablediffusion_wrapper = StableDiffusionWrapper(model, sampler, opt)
scripted_module = torch.jit.script(stablediffusion_wrapper, example_inputs=[(token_dummy, img_dummy)])
and TimestepEmbedSequential module:
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    """

    def forward(self, x, emb, context=None):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb)
            elif isinstance(layer, SpatialTransformer):
                x = layer(x, context)
            else:
                x = layer(x)
        return x
Any suggestions on how I can figure this out?
I tried setting the @torch.jit.export decorator on the parent class TimestepBlock, but it showed no effect. Actually, I have no idea what to look for with this problem. I would appreciate any suggestions. Thank you.
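Not a fix, but a rough diagnostic sketch for "what to look for": scripting submodules one at a time usually narrows down which class torch.jit.script actually fails on, assuming stablediffusion_wrapper is an nn.Module as in the snippet above.
import torch

# Try to script each submodule on its own and report the first failure; this is
# purely diagnostic and does not change how the full model is exported.
for name, module in stablediffusion_wrapper.named_modules():
    try:
        torch.jit.script(module)
    except Exception as err:
        print(f"{name or '<root>'} ({type(module).__name__}): {err}")
        break  # drop the break to list every failing submodule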

Unable to load a PPO model

Hello, I've trained a PPO model from stable_baselines3 on Colab and saved it with
model.save("model")
but when I tried loading it I got the following error:
m = PPO.load("model", env=env)
AttributeError Traceback (most recent call last)
/tmp/ipykernel_25649/121834194.py in <module>
2 env = e.MinitaurBulletEnv(render=False)
3 env.reset()
----> 4 m2 = PPO.load("model", env=env)
5 for episode in range(1, 6):
6 obs = env.reset()
~/anaconda3/lib/python3.8/site-packages/stable_baselines3/common/base_class.py in load(cls, path, env, device, custom_objects, **kwargs)
668 env = cls._wrap_env(env, data["verbose"])
669 # Check if given env is valid
--> 670 check_for_correct_spaces(env, data["observation_space"], data["action_space"])
671 else:
672 # Use stored env, if one exists. If not, continue as is (can be used for predict)
~/anaconda3/lib/python3.8/site-packages/stable_baselines3/common/utils.py in check_for_correct_spaces(env, observation_space, action_space)
217 :param action_space: Action space to check against
218 """
--> 219 if observation_space != env.observation_space:
220 raise ValueError(f"Observation spaces do not match: {observation_space} != {env.observation_space}")
221 if action_space != env.action_space:
~/anaconda3/lib/python3.8/site-packages/gym/spaces/box.py in __eq__(self, other)
138
139 def __eq__(self, other):
--> 140 return isinstance(other, Box) and (self.shape == other.shape) and np.allclose(self.low, other.low) and np.allclose(self.high, other.high)
AttributeError: 'Box' object has no attribute 'shape'
For context, the env is a Box env from pybullet:
import pybullet_envs.bullet.minitaur_gym_env as e
import gym
env = e.MinitaurBulletEnv(render=False)
env.reset()
Additional info: the model loaded perfectly in Colab.
From your question, I can't tell whether or not you are working on Google Colab, but if you are, I think you should definitely include the whole path to the saved model when you load it. Maybe you need to do this even if you're not in Colab.
What I mean is that your line of code should probably look something like this when you're loading the model:
m = PPO.load("./model.zip", env=env)
I hope this helps!
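For example, a small sketch of resolving the absolute path before loading; the "model.zip" name assumes the default extension that model.save("model") adds, and on Colab the resolved path typically ends up as /content/model.zip.
import os
from stable_baselines3 import PPO

# Resolve the full path to the saved archive before loading it with the env from the question.
model_path = os.path.abspath("model.zip")
m = PPO.load(model_path, env=env)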

Setting `remove_unused_columns=False` causes error in HuggingFace Trainer class

I am training a model using the HuggingFace Trainer class. The following code does a decent job:
!pip install datasets
!pip install transformers
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
dataset = load_dataset('glue', 'mnli')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

args = TrainingArguments(
    "test-glue",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    remove_unused_columns=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    tokenizer=tokenizer
)
trainer.train()
However, setting remove_unused_columns=False results in the following error:
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
704 if not is_tensor(value):
--> 705 tensor = as_tensor(value)
706
ValueError: too many dimensions 'str'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
8 frames
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
720 )
721 raise ValueError(
--> 722 "Unable to create tensor, you should probably activate truncation and/or padding "
723 "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
724 )
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.
Any suggestions are highly appreciated.
It fails because the value in line 705 is a list of str, which corresponds to hypothesis, and hypothesis is one of the ignored_columns in trainer.py.
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
704 if not is_tensor(value):
--> 705 tensor = as_tensor(value)
See the below snippet from trainer.py for the remove_unused_columns flag:
def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
    if not self.args.remove_unused_columns:
        return dataset
    if self._signature_columns is None:
        # Inspect model forward signature to keep only the arguments it accepts.
        signature = inspect.signature(self.model.forward)
        self._signature_columns = list(signature.parameters.keys())
        # Labels may be named label or label_ids, the default data collator handles that.
        self._signature_columns += ["label", "label_ids"]
    columns = [k for k in self._signature_columns if k in dataset.column_names]
    ignored_columns = list(set(dataset.column_names) - set(self._signature_columns))
There could be a potential pull request on HuggingFace to provide a fallback option when the flag is False. But in general, it looks like the flag's implementation is not complete; for example, it can't be used with TensorFlow.
On the other hand, it doesn't hurt to keep it True, unless there is some special need.
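If you do need remove_unused_columns=False for some other reason, a hedged workaround is to drop the raw-text columns yourself before building the Trainer, mirroring what _remove_unused_columns would have done; the column names below assume the glue/mnli schema from the question.
# Drop the columns the model's forward() cannot consume; adjust the list if your
# dataset has different raw columns.
encoded_dataset = encoded_dataset.remove_columns(["premise", "hypothesis", "idx"])

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    tokenizer=tokenizer
)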

huggingface's ReformerForMaskedLM configuration issue

I'm trying to pass all of huggingface's ...ForMaskedLM models to the FitBert model for a fill-in-the-blank task and see which pretrained model yields the best result on the data I've prepared. But in the Reformer module I get an error saying that I need to set 'config.is_decoder=False', and I don't really get what this means (this is my first time using huggingface). I tried to pass a ReformerConfig(is_decoder=False) to the model but still get the same error. How can I fix this?
My code:
pretrained_weights = ['google/reformer-crime-and-punishment',
                      'google/reformer-enwik8']

configurations = ReformerConfig(is_decoder=False)

for weight in pretrained_weights:
    print(weight)
    model = ReformerForMaskedLM(configurations).from_pretrained(weight)
    tokenizer = ReformerTokenizer.from_pretrained(weight)
    fb = FitBert(model=model, tokenizer=tokenizer)

    predicts = []
    for _, row in df.iterrows():
        predicts.append(fb.rank(row['question'], options=[row['1'], row['2'], row['3'], row['4']])[0])

    print(weight, ':', np.sum(df.anwser==predicts) / df.shape[0])
Error:
AssertionError Traceback (most recent call last)
<ipython-input-5-a6016e0015ba> in <module>()
4 for weight in pretrained_weights:
5 print(weight)
----> 6 model = ReformerForMaskedLM(configurations).from_pretrained(weight)
7 tokenizer = ReformerTokenizer.from_pretrained(weight)
8 fb = FitBert(model=model, tokenizer=tokenizer)
/usr/local/lib/python3.7/dist-packages/transformers/modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
1032
1033 # Instantiate model.
-> 1034 model = cls(config, *model_args, **model_kwargs)
1035
1036 if state_dict is None and not from_tf:
/usr/local/lib/python3.7/dist-packages/transformers/models/reformer/modeling_reformer.py in __init__(self, config)
2304 assert (
2305 not config.is_decoder
-> 2306 ), "If you want to use `ReformerForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention."
2307 self.reformer = ReformerModel(config)
2308 self.lm_head = ReformerOnlyLMHead(config)
AssertionError: If you want to use `ReformerForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.
You can override certain model configuration values by loading the model config separately and providing it as a parameter to the from_pretrained() method. This will ensure that you are using the proper model configuration with the changes you have made:
from transformers import ReformerConfig, ReformerForMaskedLM
config = ReformerConfig.from_pretrained('google/reformer-crime-and-punishment')
print(config.is_decoder)
config.is_decoder=False
print(config.is_decoder)
model = ReformerForMaskedLM.from_pretrained('google/reformer-crime-and-punishment', config=config)
Output:
True
False
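Applied to the loop from the question, the same idea looks roughly like this (the FitBert ranking part is unchanged and omitted):
from transformers import ReformerConfig, ReformerForMaskedLM, ReformerTokenizer

pretrained_weights = ['google/reformer-crime-and-punishment',
                      'google/reformer-enwik8']

for weight in pretrained_weights:
    # Load each checkpoint's own config, flip is_decoder, and pass it explicitly.
    config = ReformerConfig.from_pretrained(weight)
    config.is_decoder = False
    model = ReformerForMaskedLM.from_pretrained(weight, config=config)
    tokenizer = ReformerTokenizer.from_pretrained(weight)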
