How to load a Spark model - apache-spark

I cannot load the model I have just saved; I get a strange error.
from transforms.api import Output, transform, transform_df
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
import logging

logger = logging.getLogger(__name__)


def save_model(spark_session, output, model, model_name='model4'):
    foundry_file_system = output.filesystem()._foundry_fs
    logger.info("The path 1 is : " + str(foundry_file_system))
    path = foundry_file_system._root_path + "/" + model_name
    logger.info("The path 2 is : " + str(path))
    model.write().overwrite().session(spark_session).save(path)
    model = LogisticRegressionModel.read().session(spark_session).load(path)
    df_to_predict = spark_session.createDataFrame([(
        Vectors.dense([0.0, 1.1, 0.1]),
        Vectors.dense([2.0, 1.0, -1.0]),
        Vectors.dense([2.0, 1.3, 1.0]),
        Vectors.dense([0.0, 1.2, -0.5]),)], ["features"])
    df_predicted = model.transform(df_to_predict)
    logger.info(df_predicted.show())
    logger.info(df_predicted.count())


def my_compute_function(ctx, output_model):
    training = ctx.spark_session.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model1 = lr.fit(training)
    save_model(ctx.spark_session, output_model, model1, 'model4')
Here is the error I get:
NonRetryableError: Py4JJavaError: An error occurred while calling
o266.load. : scala.MatchError:
[2,3,[1,null,null,WrappedArray(0.06817659473873602)],[1,1,3,null,null,WrappedArray(-3.1009356010205322,
2.6082147383214482, -0.38017912254303043),true],false] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) at
org.apache.spark.ml.classification.LogisticRegressionModel$LogisticRegressionModelReader.load(LogisticRegression.scala:1273)
....

That error typically indicates that the model was loaded with a different method than the one used to write it.
You should be using LogisticRegressionModel.load, not LogisticRegression.read().
This can also be caused if the Parquet metadata doesn't match. I recommend that you set the summary metadata level to NONE:
spark.conf.set("parquet.summary.metadata.level", "NONE")

Related

The Keras MultiHeadAttention() class does not return expected values

I would like to match the results of the self_attention() function on page 339 of Chollet's book, Deep Learning with Python, second edition, with those of the MultiHeadAttention() example just below on the same page.
I wrote an example with the same input and I get different results. Can somebody explain why? I have included the self_attention() function for clarity.
import numpy as np
from scipy.special import softmax
from tensorflow.keras.layers import MultiHeadAttention


def self_attention(input_sequence):
    # The output will consist of contextual embeddings of the same shape
    output = np.zeros(shape=input_sequence.shape)
    for i, pivot_vector in enumerate(input_sequence):
        scores = np.zeros(shape=(len(input_sequence),))
        for j, vector in enumerate(input_sequence):
            scores[j] = np.dot(pivot_vector, vector.T)  # Q K^T
        scores /= np.sqrt(input_sequence.shape[1])  # sqrt(d_k)
        scores = softmax(scores)  # softmax(Q K^T / sqrt(d_k))
        print(i, scores)
        new_pivot_representation = np.zeros(shape=pivot_vector.shape)
        for j, vector in enumerate(input_sequence):
            new_pivot_representation += vector * scores[j]
        output[i] = new_pivot_representation
    return output


test_input_sequence = np.array([[[1.0, 0.0, 0.0, 1.0],
                                 [0.0, 1.0, 0.0, 0.0],
                                 [0.0, 1.0, 1.0, 1.0]]])
test_input_sequence.shape
# (1, 3, 4)
self_attention(test_input_sequence[0])
"""
returns
[[0.50648039 0.49351961 0.30719589 0.81367628]
[0.23269654 0.76730346 0.38365173 0.61634827]
[0.21194156 0.78805844 0.57611688 0.78805844]]
the attention scores being:
[0.50648039 0.18632372 0.30719589]
[0.23269654 0.38365173 0.38365173]
[0.21194156 0.21194156 0.57611688]
"""
att_layer = MultiHeadAttention(num_heads=1,
                               key_dim=4,
                               use_bias=False,
                               attention_axes=(1,))
att_layer(test_input_sequence,
          test_input_sequence,
          test_input_sequence,
          return_attention_scores=True)
"""
returns
array([[[-0.46123487, 0.36683324, -0.47130704, -0.00722525],
[-0.49571565, 0.37488416, -0.52883905, -0.02713571],
[-0.4566634 , 0.38055322, -0.45884743, -0.00156384]]],
dtype=float32)
and the attention scores
array([[[[0.31446996, 0.36904442, 0.3164856 ],
[0.34567958, 0.2852166 , 0.36910382],
[0.2934979 , 0.3996053 , 0.30689687]]]], dtype=float32)>)
"""
I found the answer. The difference comes from the three dense layers applied to the query, key, and value, and from the one after the attention module (this last dense layer is missing from Fig. 11.8 in the book).
To reproduce the results of self_attention(), we just need to have pass-through dense layers:
i_4 = np.identity(4)
w_pt_4 = [i_4.reshape(4, 1, 4) for _ in range(3)] + [i_4.reshape(1, 4, 4)]
att_layer.set_weights(w_pt_4)
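With those pass-through weights in place, calling the layer again should reproduce the NumPy result. A quick check, as a sketch reusing the objects defined above:

    out, scores = att_layer(test_input_sequence,
                            test_input_sequence,
                            test_input_sequence,
                            return_attention_scores=True)
    print(out.numpy()[0])        # should now match self_attention(test_input_sequence[0])
    print(scores.numpy()[0, 0])  # should match the attention scores printed by self_attention()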

Use TransformDataset without using AnalyzeAndTransformDataset

I am trying to use TensorFlow Transform, and I would like to serialise a whole pipeline composed of different transformations. Say I have a transformation that does not have to be fitted (such as a feature interaction between numeric columns). I want to use the TransformDataset function directly on the preprocessing function I have already defined, but it seems this is not possible.
If I run something like this:
import pprint
import tempfile

import apache_beam as beam
import pandas as pd
import tensorflow as tf
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

NUMERIC_FEATURE_KEYS = ['a', 'b', 'c']
impute_dictionary = dict(b=1.0, c=0.0)

RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature([], tf.float32)) for name in NUMERIC_FEATURE_KEYS])
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC))


def interaction_fn(inputs):
    outputs = inputs.copy()
    new_numeric_feature_keys = []
    for i in range(len(NUMERIC_FEATURE_KEYS)):
        for j in range(i, len(NUMERIC_FEATURE_KEYS)):
            if i == j:
                outputs[f'{NUMERIC_FEATURE_KEYS[i]}_squared'] = outputs[NUMERIC_FEATURE_KEYS[i]] * outputs[NUMERIC_FEATURE_KEYS[i]]
                new_numeric_feature_keys.append(f'{NUMERIC_FEATURE_KEYS[i]}_squared')
            else:
                outputs[f'{NUMERIC_FEATURE_KEYS[i]}_{NUMERIC_FEATURE_KEYS[j]}'] = outputs[NUMERIC_FEATURE_KEYS[i]] * outputs[NUMERIC_FEATURE_KEYS[j]]
                new_numeric_feature_keys.append(f'{NUMERIC_FEATURE_KEYS[i]}_{NUMERIC_FEATURE_KEYS[j]}')
    NUMERIC_FEATURE_KEYS.extend(new_numeric_feature_keys)
    return outputs


if __name__ == '__main__':
    temp = tempfile.gettempdir()
    data = pd.DataFrame(dict(
        a=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        b=[1.0, 1.0, 1.0, 2.0, 0.0, 1.0],
        c=[0.9, 2.0, 1.0, 0.0, 0.0, 0.0]
    ))
    data.to_parquet('data_no_nans.parquet')

    x = {}
    for col in data.columns:
        x[col] = tf.constant(data[col], dtype=tf.float32, name=col)

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            raw_data = pipeline | 'ReadTrainData' >> beam.io.ReadFromParquet('data_no_nans.parquet')
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_data, _ = (raw_data, interaction_fn) | tft_beam.TransformDataset()
            transformed_data | beam.Map(pprint.pprint)
I get the error:
2020-02-11 15:49:37.025525: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-02-11 15:49:37.132944: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f87ddda6d30 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-02-11 15:49:37.132959: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
WARNING:tensorflow:Tensorflow version (2.1.0) found. Note that Tensorflow Transform support for TF 2.0 is currently in beta, and features such as tf.function may not work as intended.
WARNING:tensorflow:Tensorflow version (2.1.0) found. Note that Tensorflow Transform support for TF 2.0 is currently in beta, and features such as tf.function may not work as intended.
Traceback (most recent call last):
File "/Users/andrea.marchini/Hackathon/tfx_test/foo.py", line 56, in <module>
transformed_data, _ = (raw_data, interaction_fn) | tft_beam.TransformDataset()
File "/Users/andrea.marchini/.local/share/virtualenvs/tfx_test-jg7eSsGQ/lib/python3.7/site-packages/apache_beam/transforms/ptransform.py", line 482, in __ror__
pvalueish, pvalues = self._extract_input_pvalues(left)
File "/Users/andrea.marchini/.local/share/virtualenvs/tfx_test-jg7eSsGQ/lib/python3.7/site-packages/tensorflow_transform/beam/impl.py", line 908, in _extract_input_pvalues
dataset_and_transform_fn)
TypeError: cannot unpack non-iterable PCollection object
Is TransformDataset supposed to be used only on the result of AnalyzeAndTransformDataset?
Maybe you could try this:
transformed_data = (raw_dataset, interaction_fn) | tft_beam.TransformDataset()
I think it tried to unpack raw_data, which does not contain the metadata. Moreover, TransformDataset returns only one variable, not two.
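Applied to the pipeline above, the suggested change inside the tft_beam.Context block would look roughly like this (a sketch only; whether TransformDataset accepts a plain preprocessing function this way is exactly what the question is about):

    raw_dataset = (raw_data, RAW_DATA_METADATA)
    transformed_dataset = (raw_dataset, interaction_fn) | tft_beam.TransformDataset()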

How to use specific GPUs in Keras for multi-GPU training?

I have a server with 4 GPUs. I want to use exactly 2 of them for multi-GPU training.
The Keras documentation provided here gives some insight into how to use multiple GPUs, but I want to select specific GPUs. Is there a way to achieve this?
from keras import backend as K
import tensorflow as tf

c = []
for d in ['/device:GPU:2', '/device:GPU:3']:
    with K.tf.device(d):
        config = tf.ConfigProto(intra_op_parallelism_threads=4,
                                inter_op_parallelism_threads=4,
                                allow_soft_placement=True,
                                device_count={'CPU': 1, 'GPU': 2})
        a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
        b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2])
        c.append(tf.matmul(a, b))

with tf.device('/cpu:0'):
    sum = tf.add_n(c)

session = tf.Session(config=config)
K.set_session(session)
I think this should work. You just need the indices of the GPU devices you want to use; in this case they are 2 and 3.
Relevant links:
1) https://github.com/carla-simulator/carla/issues/116
2) https://www.tensorflow.org/guide/using_gpu#using_multiple_gpus
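As an aside (this is not part of the answer above, just a common approach added here as an assumption): you can also hide all but the chosen GPUs from TensorFlow with the CUDA_VISIBLE_DEVICES environment variable, set before TensorFlow is imported:

    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"  # only GPUs 2 and 3 will be visible to TensorFlow

    import tensorflow as tf  # import after setting the variable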
The best way is to compile the Keras model with a tf.distribute Strategy by creating and compiling your model in the strategy's scope. For example:
import contextlib


def model_scope(devices):
    if 1 < len(devices):
        strategy = tf.distribute.MirroredStrategy(devices)
        scope = strategy.scope()
    else:
        scope = contextlib.suppress()  # Python 3.4 and up
    return scope


devices = ['/device:GPU:2', '/device:GPU:3']

with model_scope(devices):
    # create and compile your model
    model = get_model()
    model.compile(optimizer=optimizer, loss=loss)
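Training then proceeds as usual under the mirrored strategy; a hypothetical follow-up call (x_train and y_train are assumed to exist) might be:

    model.fit(x_train, y_train, batch_size=64, epochs=5)  # replicated across GPU:2 and GPU:3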

GLM with Apache Spark 2.2.0 - Tweedie family default Link value

I am using Spark 2.2.0 with Python. I am trying to figure out the default value of the link function param that Spark accepts in GeneralizedLinearRegression in the case of the Tweedie family.
When I look at the documentation https://spark.apache.org/docs/2.2.0/api/scala/index.html#org.apache.spark.ml.regression.GeneralizedLinearRegression
class pyspark.ml.regression.GeneralizedLinearRegression(self, labelCol="label", featuresCol="features", predictionCol="prediction", family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None)
it seems that the default value when family='tweedie' should be None, but when I tried this (using a test similar to the unit test at https://github.com/apache/spark/pull/17146/files/fe1d3ae36314e385990f024bca94ab1e416476f2):
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                            (1.0, Vectors.dense(1.0, 2.0)),
                            (2.0, Vectors.dense(0.0, 0.0)),
                            (2.0, Vectors.dense(1.0, 1.0))], ["label", "features"])
glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.42, link=None)
model = glr.fit(df)
transformed = model.transform(df)
it raised a Java null pointer exception:
Py4JJavaError: An error occurred while calling o6739.w. :
java.lang.NullPointerException ...
It works well when I remove the explicit link=None from the initialization of the model.
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                            (1.0, Vectors.dense(1.0, 2.0)),
                            (2.0, Vectors.dense(0.0, 0.0)),
                            (2.0, Vectors.dense(1.0, 1.0))], ["label", "features"])
glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.42)
model = glr.fit(df)
transformed = model.transform(df)
I would like to be able to pass a standard set of params like
params={"family":"Onefamily","link":"OnelinkAccordingToFamily",..}
and then initialize GLM as:
glr = GeneralizedLinearRegression(family=params["family"], link=params["link"], ...)
so that it is more standard and works for any combination of family and link.
It just seems that the link value is not ignored when family='tweedie'. Any idea what default value I should use? I tried link='' and link='None', but both raise 'invalid link function'.
To deal with the GLR Tweedie family you need to define the power link function through the "linkPower" parameter, and you should not set link to None, which is what led to the exception you got.
Here is an example of how to use it:
df = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 0.0)),
     (1.0, Vectors.dense(1.0, 2.0)),
     (2.0, Vectors.dense(0.0, 0.0)),
     (2.0, Vectors.dense(1.0, 1.0))], ["label", "features"])

# in this case the default link power applies
glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
model = glr.fit(df)

# here the link power is set explicitly
model2 = glr.setLinkPower(-1.0).fit(df)
PS: the default link power for the Tweedie family is 1 - variancePower.
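Coming back to the standard params dict from the question, a sketch for the Tweedie case could use a linkPower entry instead of link (the dict keys below are illustrative, not a fixed convention):

    params = {"family": "tweedie", "variancePower": 1.42, "linkPower": 0.0}  # linkPower 0.0 corresponds to the log link
    glr = GeneralizedLinearRegression(family=params["family"],
                                      variancePower=params["variancePower"],
                                      linkPower=params["linkPower"])
    model = glr.fit(df)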

Get survival function with Spark ML

I am training an accelerated failure time model with PySpark (from pyspark.ml.regression import AFTSurvivalRegression).
Now I want to apply the model to new data and get the probability that the event will happen before time t (the survival function). Which method should I use? The documentation is not clear to me: https://spark.apache.org/docs/2.1.0/api/python/pyspark.ml.html#pyspark.ml.regression.AFTSurvivalRegression
For example, if I do the following:
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors

training = spark.createDataFrame([
    (1.218, 1.0, Vectors.dense(1.560, -0.605)),
    (2.949, 0.0, Vectors.dense(0.346, 2.158)),
    (3.627, 0.0, Vectors.dense(1.380, 0.231)),
    (0.273, 1.0, Vectors.dense(0.520, 1.151)),
    (4.199, 0.0, Vectors.dense(0.795, -0.226))], ["label", "censor", "features"])
quantileProbabilities = [0.25, 0.75]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(training)
model.transform(training).show(truncate=False)
I get as output a prediction table with a quantiles column (not reproduced here).
Does it mean that for the first line, P(event happening between 0.832 and 9.48) = 50%?
Thanks
