sagemaker giving UnicodeDecodeError while deserializing - python-3.x

In SageMaker, I was able to load and deploy a model from S3. While deserializing the data for prediction, I am getting "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd7 in position 2: invalid continuation byte" on the line
"results = predictor.predict(test_X)"
I tried the following sagemaker example https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/linear_time_series_forecast/linear_time_series_forecast.ipynb . I was able to create train, validate and deploy model and store model in s3.
After this I wanted to import the model from S3 into SageMaker and test with the imported model. I was able to load and deploy the model, but when predicting for the test values I get a UnicodeDecodeError.
# Deploy a Linear Learner model from an S3 artifact, rebuild the gasoline
# test features, and request predictions from the endpoint.
from sagemaker.predictor import csv_serializer, json_deserializer
role = get_execution_role()
sagemaker_session = sagemaker.Session()
model_data = sagemaker.session.s3_input( model_file_location_in_s3, distribution='FullyReplicated', content_type='application/x-sagemaker-model', s3_data_type='S3Prefix')
# NOTE(review): model_file is not defined anywhere in this snippet, and the
# model_data object built on the previous line is never used — confirm which
# value the model is meant to receive.
sagemaker_model = sagemaker.LinearLearnerModel(model_data=model_file,
role=role,
sagemaker_session=sagemaker_session)
predictor = sagemaker_model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')
# Load the test data: a single unnamed column read as 'thousands_barrels'.
gas = pd.read_csv('gasoline.csv', header=None, names=['thousands_barrels'],encoding='utf-8')
# Lagged copies of the target (1 to 4 rows back) used as features.
gas['thousands_barrels_lag1'] = gas['thousands_barrels'].shift(1)
gas['thousands_barrels_lag2'] = gas['thousands_barrels'].shift(2)
gas['thousands_barrels_lag3'] = gas['thousands_barrels'].shift(3)
gas['thousands_barrels_lag4'] = gas['thousands_barrels'].shift(4)
# Linear, log and squared trend features based on the row position.
gas['trend'] = np.arange(len(gas))
gas['log_trend'] = np.log1p(np.arange(len(gas)))
gas['sq_trend'] = np.arange(len(gas)) ** 2
# One-hot indicators for the values 0..51 repeated 15 times, cut to len(gas).
weeks = pd.get_dummies(np.array(list(range(52)) * 15)[:len(gas)], prefix='week')
gas = pd.concat([gas, weeks], axis=1)
# Drop the first four rows, whose lag columns are NaN after shift().
gas = gas.iloc[4:, ]
# split_train is computed but not used below; only split_test drives the slice.
split_train = int(len(gas) * 0.6)
split_test = int(len(gas) * 0.3)
test_y = gas['thousands_barrels'][split_test:]
test_X = gas.drop('thousands_barrels', axis=1).iloc[split_test:, ].as_matrix()
# Send requests as CSV and parse the endpoint response as JSON.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer
# The UnicodeDecodeError described in the question is raised by this call.
results = predictor.predict(test_X)
one_step = np.array([r['score'] for r in results['predictions']])
The program works fine when the model is trained and deployed (as in the example), but when the model is loaded from S3 it throws this error.
The test data is a numpy ndarray.

The deserializer does not seem to be appropriate for the content of the response.
To investigate, write a custom deserializer that just prints some details:
def debug_deserializer(data, content_type):
    """Print the response content type and payload instead of decoding them.

    Temporary stand-in for a predictor's deserializer, used to find out what
    the endpoint actually sends back.

    Args:
        data: Response body as handed to the deserializer.
        content_type: Content type reported for the response.

    Returns:
        None; nothing is decoded.
    """
    print(content_type)
    print(data)
and apply it like:
predictor.deserializer = debug_deserializer
This could, for example yield something like this:
application/x-recordio-protobuf
<botocore.response.StreamingBody object at 0x7fd3544883c8>
None
This tells you that the content type is application/x-recordio-protobuf. Then write a custom deserializer, for example:
from sagemaker.amazon.common import RecordDeserializer
def recordio_protobuf_deserialize(data, content_type):
    """Decode an ``application/x-recordio-protobuf`` response body.

    Args:
        data: Response body stream handed to the deserializer.
        content_type: Content type reported for the response.

    Returns:
        Whatever ``RecordDeserializer.deserialize`` returns for the payload.
    """
    return RecordDeserializer().deserialize(data, content_type)
and apply like:
predictor.deserializer = recordio_protobuf_deserialize

Related

How to update the weights of a pickled file?

I am training a Calibrated Classifier on Google Cloud Scheduler every day, which takes about 5 minutes to run. My Python script receives the latest data (from that day), concatenates it to the original data, trains the model, and saves the pickled files to Cloud Storage. The issue I am facing now is that if it takes more than 5 minutes (which it will at some point), it gives an upstream request timeout error.
I imagine that is because of the extra time the model takes to train, and I can think of one solution: train the model only on the new data and update the weights of the original model in the pickled file. However, I am not sure if that is possible.
Below is my function that runs on the scheduler:
def train_model():
    """Retrain the calibrated task-assignment classifier and persist it.

    Loads the pickled historical data from Cloud Storage, joins the current
    tasks / task-user / users / boards rows from Postgres, cleans the task
    text, fits CountVectorizer -> TfidfTransformer -> LinearSVC ->
    CalibratedClassifierCV on the combined data, and writes the fitted
    objects and the combined data back to Cloud Storage.

    Returns:
        The string ``"success"`` once everything has been written.
    """
    # Load the current rows from Postgres (tags / task_tags are not used here).
    users, tasks, tags, task_tags, task_user, boards = connect_postgres()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket('my-bucket')
    # NOTE(security): pickle.loads runs arbitrary code embedded in the payload;
    # this is only safe while the bucket is writable by trusted code alone.
    data = pickle.loads(bucket.blob('original_data.pkl').download_as_string())

    # Join tasks with their user assignments, skipping unnamed tasks.
    tasks = tasks.rename(columns={'id': 'task_id', 'name': 'task_name'})
    tasks = tasks[tasks.task_name.notnull()]
    task_user = task_user[['id', 'task_id', 'user_id']].rename(columns={'id': 'task_user_id'})
    task_data = tasks.merge(task_user, on='task_id', how='left')

    # Attach user e-mails and keep only tasks that are assigned to someone.
    users = users[['id', 'email']].rename(columns={'id': 'user_id'})
    users_tasks = task_data.merge(users, on='user_id', how='left')
    users_tasks = users_tasks[users_tasks.user_id.notnull()].reset_index(drop=True)

    # Attach board names.
    boards = boards[['id', 'name']].rename(columns={'id': 'board_id', 'name': 'board_name'})
    users_board = users_tasks.merge(boards, on='board_id', how='left').reset_index(drop=True)

    # Data cleaning: translate to English, then strip emoji and punctuation.
    translator = Translator()
    users_board["task_trans"] = users_board["task_name"].map(
        lambda x: translator.translate(x, dest="en").text
    )
    users_board['task_trans'] = users_board['task_trans'].apply(remove_emoji)
    users_board['task_trans'] = users_board['task_trans'].apply(remove_punct)
    users_board = users_board[['task_id', 'email', 'board_id', 'user_id', 'task_trans']]

    # pd.concat already returns a new frame. The original ``data1.copy``
    # (without parentheses) bound the method object instead of copying.
    combined = pd.concat([data, users_board], axis=0)
    X = combined.task_trans  # all the observations
    y = combined.user_id  # all the labels
    print(y.nunique())

    # Training: bag-of-words counts -> TF-IDF -> linear SVM -> calibration.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer().fit(X_train_counts)
    X_train_transformed = tf_transformer.transform(X_train_counts)
    print('model 1 done')

    y_train_labels = LabelEncoder().fit_transform(y)
    linear_svc = LinearSVC()
    linear_svc.fit(X_train_transformed, y_train_labels)
    print('model 2 done')

    # cv="prefit": the already-fitted SVM is only calibrated, not refitted.
    calibrated_svc = CalibratedClassifierCV(base_estimator=linear_svc, cv="prefit")
    calibrated_svc.fit(X_train_transformed, y_train_labels)
    print('model 3 done')

    # Save the fitted objects to Google Cloud Storage. Each handle is closed
    # explicitly so the buffered write is flushed before the function returns.
    fs = gcsfs.GCSFileSystem(project='my-project')
    artifacts = (
        ('~path/svc.sav', calibrated_svc),
        ('~path/count_vectorizer.sav', count_vect),
        ('~path/tfidf_vectorizer.sav', tf_transformer),
    )
    for filename, artifact in artifacts:
        with fs.open(filename, 'wb') as handle:
            pickle.dump(artifact, handle)

    # NOTE(review): the data is read from 'original_data.pkl' but written to
    # 'data.pkl', so the next run starts from the same original file —
    # confirm this is intended.
    bucket.blob('data.pkl').upload_from_string(pickle.dumps(combined))
    return "success"
Any idea how to achieve that? Or any other strategy that I can follow to solve this problem?
I couldn't find a way to update the weights of a pickle file and eventually settled with increasing the timeout parameter in cloud run to more than the training time and it fixed the issue for the time being.

Simple prediction from frozen .pb saved model

I have been trying for days to use a TF-exported .pb model file for prediction. The model was generated with the BestExporter function as follows:
# Export setup: the serving input receiver parses serialized tf.Example
# protos according to the spec derived from the serving feature columns.
features_specs = tf.feature_column.make_parse_example_spec(serving_features)
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec=features_specs,default_batch_size=None)
# Keep only the single best export (exports_to_keep=1).
exporter[n] = tf.estimator.BestExporter(name="best_exporter", serving_input_receiver_fn=serving_input_receiver_fn,event_file_pattern='eval/*.tfevents.*',exports_to_keep=1)
# Optional early stopping on 'average_loss', configured from train_params.
if train_params["use_early_stop"] == True:
hookModel[n] = tf.estimator.experimental.stop_if_no_decrease_hook(model[n], metric_name='average_loss', max_steps_without_decrease=train_params["early_stop_max_steps_without_decrease"], min_steps=train_params["early_stop_min_steps"],run_every_secs=train_params["early_stop_run_every_secs"], run_every_steps=train_params["early_stop_run_every_steps"],)
else:hookModel[n] = None
# NOTE(review): when early stopping is disabled hookModel[n] is None, so
# hooks becomes [None] — confirm TrainSpec accepts that.
train_spec[n] = tf.estimator.TrainSpec(input_fn=input_fn_["train"+m],hooks=[hookModel[n]])
eval_spec[n] = tf.estimator.EvalSpec(input_fn=input_fn_["test"+m],start_delay_secs = train_params["eval_specs_start_delay_secs"],throttle_secs = train_params["eval_specs_throttle_secs"],exporters=[exporter[n]])
tf.estimator.train_and_evaluate(model[n], train_spec[n], eval_spec[n])
I think in this way input dict names are referenced...
I successfully load the model with :
model_[model_stage+"_"+model_type] = tf.saved_model.load(model_path)
but I don't know how to correctly pass my features dictionary to the model_XX['prediction'](example) wrapped function.
I saw this topic but didn't help : TensorFlow v2: Replacement for tf.contrib.predictor.from_saved_model
There's no equivalent of old tf.contrib.predictor.from_saved_model i used before...
Thanks for answer.
I found a way to pass a dict to the wrapped model. It is a slightly modified synthesis of the solutions given below, adapted for TF 2.4 / Python 3.7:
TensorFlow v2: Replacement for tf.contrib.predictor.from_saved_model
https://www.programcreek.com/python/example/90440/tensorflow.Example
The second one is particularly complete and shows a lot of cases.
So :
# One entry per feature; values must be plain str / int / float scalars.
# presumably the keys are the feature names used when the model was
# exported — verify against the serving feature spec.
my_dict = {
    "feature_1": str(something),
    "feature_2": int(an_int),
    "feature_3": float(a_float),
    # ... add the remaining features here
}
# Load the model
my_model = tf.saved_model.load(model_path)
# Creates a serialized example from dict
def create_serialized_example(name_to_values):
    """Serialize a ``{feature_name: scalar}`` mapping as a ``tf.train.Example``.

    Args:
        name_to_values: Mapping from feature name to a single ``str``,
            ``float`` or ``int`` value.

    Returns:
        The serialized ``tf.train.Example``.

    Raises:
        AssertionError: If a value is not a ``str``, ``float`` or ``int``.
    """
    example = tf.train.Example()
    for name, value in name_to_values.items():
        feature = example.features.feature[name]
        if isinstance(value, str):
            # Strings have to be encoded to bytes in newer TF versions.
            feature.bytes_list.value.append(value.encode())
        elif isinstance(value, float):
            # TF 2 exposes float_list (not float_32).
            feature.float_list.value.append(value)
        elif isinstance(value, int):
            feature.int64_list.value.append(value)
        else:
            # Report the value's own type: indexing a scalar (values[0]) would
            # raise TypeError here and hide the real problem.
            raise AssertionError('Unsupported type: %s' % type(value))
    return example.SerializeToString()
# Predict: the "predict" signature takes a batch of serialized Examples.
# (The dict defined above is named my_dict; the original referenced an
# undefined name, mydict.)
pred = my_model.signatures["predict"](
    examples=tf.constant([create_serialized_example(my_dict)])
)

getting TypeError: Expected int32, got None of type 'NoneType' instead

I have implemented a sequence-to-sequence model with an attention layer. If I use 300000 data points I don't get any error, but if I use all of my data points I get the following error from model.fit:
TypeError: Expected int32, got None of type 'NoneType' instead.
what would be the reason for this?
the code before model.fit is
class encoder_decoder(tf.keras.Model):
    """Sequence-to-sequence model that wires an ``Encoder`` to a ``Decoder``.

    Args:
        embedding_size: Embedding dimension passed to both sub-models.
        encoder_inputs_length: Input sequence length passed to the encoder.
        output_length: Output sequence length passed to the decoder.
        vocab_size: Input vocabulary size passed to the encoder.
        output_vocab_size: Output vocabulary size passed to the decoder.
        score_fun: Attention scoring function identifier passed to the decoder.
        units: Unit count used for the encoder, decoder and attention.
    """

    def __init__(self, embedding_size, encoder_inputs_length, output_length,
                 vocab_size, output_vocab_size, score_fun, units):
        super().__init__()
        self.vocab_size = vocab_size
        self.enc_units = units
        self.embedding_size = embedding_size
        self.encoder_inputs_length = encoder_inputs_length
        self.output_length = output_length
        self.lstm_output = 0
        self.state_h = 0
        self.state_c = 0
        self.output_vocab_size = output_vocab_size
        self.dec_units = units
        self.score_fun = score_fun
        self.att_units = units
        self.encoder = Encoder(self.vocab_size, self.embedding_size,
                               self.enc_units, self.encoder_inputs_length)
        self.decoder = Decoder(self.output_vocab_size, self.embedding_size,
                               self.output_length, self.dec_units,
                               self.score_fun, self.att_units)
        # self.dense = Dense(self.output_vocab_size,activation = "softmax")

    def call(self, data):
        """Run the encoder on ``data[0]`` and the decoder on ``data[1]``.

        Args:
            data: Pair ``(input, output)`` of encoder and decoder inputs.

        Returns:
            The decoder output.
        """
        input, output = data[0], data[1]
        # NOTE(review): input.shape[0] is the *static* batch dimension. It is
        # None whenever the batch size is not fixed in the traced graph, which
        # is what produces "Expected int32, got None". tf.shape(input)[0]
        # would give the runtime batch size instead — confirm that
        # initialize_states accepts a tensor before switching.
        encoder_hidden = self.encoder.initialize_states(input.shape[0])
        encoder_output, encoder_hidden, encoder_cell = self.encoder(input, encoder_hidden)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell
        decoder_output = self.decoder(output, encoder_output, decoder_hidden, decoder_cell)
        return decoder_output
Inside the call function I'm initializing states for the encoder where I'm getting
the number of rows from input using the following line of code
encoder_hidden = self.encoder.initialize_states(input.shape[0])
If I print input, I'm getting shape as (None,55)
That's the reason I'm getting this error.
Here my total number of data points is 330614. When I use all my data I get this
error; when I use only 330000 data points I do not get this error.
If I print the batch inside the method, I get a shape of (64,55).
Please find below my code for creating the dataset for my sequence-to-sequence model:
the function to preprocess the data, the function to create the dataset,
and a function to load the dataset.
def preprocess_sentence(w):
    """Normalize a sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Punctuation in ``?.!,¿`` is separated from neighbouring text by spaces,
    runs of double quotes and spaces collapse to one space, and every run of
    characters outside ``a-zA-Z?.!,¿`` is replaced by a single space.

    Args:
        w: Raw sentence text.

    Returns:
        ``'<start> ' + cleaned_text + ' <end>'``.
    """
    # (The unicode_to_ascii / lower-casing step was disabled by the author.)
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    return '<start> ' + w.strip() + ' <end>'
def create_dataset(path, num_examples):
    """Read tab-separated sentence pairs and return them column-wise.

    Args:
        path: Path of a UTF-8 text file with one tab-separated pair per line.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        A ``zip`` iterator yielding two tuples: the preprocessed first-column
        sentences, then the preprocessed second-column sentences.

    Raises:
        IndexError: If a used line has fewer than two tab-separated columns.
    """
    # Close the file deterministically instead of leaking the handle.
    with io.open(path, encoding='UTF-8') as handle:
        lines = handle.read().strip().split('\n')

    word_pairs = []
    for line in lines[:num_examples]:
        columns = [preprocess_sentence(w) for w in line.split('\t')]
        # Keep only the first two columns; any further columns are ignored.
        word_pairs.append([columns[0], columns[1]])
    return zip(*word_pairs)
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and return the padded id sequences.

    Args:
        lang: Iterable of sentences.

    Returns:
        A pair ``(tensor, lang_tokenizer)``: the post-padded integer
        sequences and the fitted ``Tokenizer``.
    """
    # filters='' disables the tokenizer's default character filtering.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer
def load_dataset(path, num_examples=None):
    """Load, clean and tokenize the sentence pairs stored at ``path``.

    Args:
        path: Path of the tab-separated sentence-pair file.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        A 6-tuple ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer, targ_lang, inp_lang)``. The first file column is
        treated as the target language and the second as the input language.
    """
    # Cleaned (target, input) sentence tuples.
    targ_lang, inp_lang = create_dataset(path, num_examples)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return (input_tensor, target_tensor, inp_lang_tokenizer,
            targ_lang_tokenizer, targ_lang, inp_lang)
# Try experimenting with the size of that dataset.
# None means no limit: load_dataset's default, and lines[:None] keeps every line.
num_examples = None
input_tensor, target_tensor, inp_lang, targ_lang,targ_lang_text,inp_lang_text = load_dataset(path, num_examples)
# Calculate max_length of the target and input tensors (second dimension).
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
# Bare expression: has no effect in a script (presumably notebook display).
max_length_targ,max_length_inp
# Hold out 20% of the pairs for validation.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
the shape of datasets as follows
shape of input train (269291, 55)
shape of target train (269291, 53)
shape of input test (67323, 55)
shape of target test (67323, 53)
You can share the code block before the model.fit.
NoneType error is indicating that the final array which is passed to the model is for some reason empty. You can add print statements at previous steps to understand where along the way your array became empty.
Compare the scenario to the case where you are taking all your data points so that you can understand where the array is changing and how it is handled prior to passing it through model.fit.

AttributeError: 'DType' object has no attribute 'type' Tensorflow Serving

I am trying to use a function (from another module) inside tensorflow. The function accepts a numpy array and returns the changepoints. My main goal is to deploy this model on tensorflow serving. I am running into error
AttributeError: 'DType' object has no attribute 'type'
There are 2 functions, one is create_data() that creates a numpy array and returns it, another is change() which accepts numpy array and uses the before mentioned function to return changepoints. I have created a placeholder to accept input data, an operation to execute the function. Problem is, if i try to send data through placeholder, i run into error. If i send the data directly into the function, it runs. Following is my code.
def create_data():
    """Return a deterministic 300-sample signal with two mean shifts.

    Three consecutive segments of 100 normally distributed samples with
    means 0.0, 10.0 and 0 are concatenated.

    Returns:
        A 1-D numpy array of length 300.

    Note:
        Reseeds NumPy's global random state with 0 on every call.
    """
    np.random.seed(0)
    size = 100
    # Passed as the ``scale`` argument of np.random.normal, i.e. it acts as a
    # standard deviation despite the name.
    var = 0.1
    segments = [np.random.normal(mean, var, size) for mean in (0.0, 10.0, 0)]
    return np.concatenate(segments)
# Run PELT changepoint detection with a normal-mean cost.
# NOTE(review): the parameter is named data, but the body reads x, so the
# argument itself is never used.
def change(data):
# what else i tried
# data = np.array(data, dtype=np.float)
# above line gives another error mentioned after code
cpts = (pelt(normal_mean(x, np.var(x)), len(x)))
return cpts
# Build a TF1 graph with a 300-element float placeholder and run it once.
sess = tf.Session()
x = tf.placeholder(tf.float32, shape=[300, ], name="myInput")
# change() runs here, while the graph is being built, and receives the
# placeholder itself rather than the array fed in at run time.
y = tf.convert_to_tensor(change(x),np.float32,name="myOutput")
z = sess.run(y,feed_dict={x:create_data()})
If i try the code data = np.array(data, dtype=np.float) in the function change(), it gives me error
ValueError: setting an array element with a sequence.
I also tried data = np.hstack((data)).astype(np.float) and data = np.vstack((data)).astype(np.float) but it runs into a separate error that says use tf.map_fn. I also tried to use tf.eval() to convert the numbers but i couldn't get them to run inside a function with placeholders.
But if i send in the output directly,
y = tf.convert_to_tensor(change(create_data()),np.float32,name="myOutput")
It works.
How should i send in the input to make it work?
EDIT: The function in question is this if anyone wants to know.
This error is raised when you try to pass a Tensor into a numpy function.
You need to use tf.py_func to include a Python function in the TensorFlow graph
(also, your change() function takes data as its argument but uses x in its body).
Here is the code that worked for me
import numpy as np
import tensorflow as tf
from changepy import pelt
from changepy.costs import normal_mean
def create_data():
    """Return a deterministic 300-sample signal with two mean shifts.

    Three consecutive segments of 100 normally distributed samples with
    means 0.0, 10.0 and 0 are concatenated.

    Returns:
        A 1-D numpy array of length 300.

    Note:
        Reseeds NumPy's global random state with 0 on every call.
    """
    np.random.seed(0)
    size = 100
    # Passed as the ``scale`` argument of np.random.normal, i.e. it acts as a
    # standard deviation despite the name.
    var = 0.1
    segments = [np.random.normal(mean, var, size) for mean in (0.0, 10.0, 0)]
    return np.concatenate(segments)
def change(x):
    """Return the changepoints PELT finds in ``x``.

    Args:
        x: 1-D numpy array of samples.

    Returns:
        The result of ``pelt`` run over ``len(x)`` points with a
        ``normal_mean`` cost built from ``x`` and ``np.var(x)``.
    """
    return pelt(normal_mean(x, np.var(x)), len(x))
# Build a TF1 graph in which change() runs as a Python op at session time.
sess = tf.Session()
x = tf.placeholder(tf.float32, shape=[300, ], name="myInput")
# py_func defers the call to change() until sess.run, when x holds real data.
# 3*[tf.int64] declares three int64 outputs.
# NOTE(review): assumes change() yields exactly three values for this input — confirm.
y = tf.convert_to_tensor(tf.compat.v1.py_func(change, [x], 3*[tf.int64]),np.float32,name="myOutput")
z = sess.run(y,feed_dict={x:create_data()})
print(z)

'Word2Vec' object has no attribute 'index2word'

I'm getting the error "AttributeError: 'Word2Vec' object has no attribute 'index2word'" in the following Python code. Does anyone know how I can solve it?
Actually, it is "tfidf_weighted_averaged_word_vectorizer" that throws the error. "obli.csv" contains lines of sentences.
Thank you.
# Build TF-IDF-weighted averaged word vectors for the training corpus.
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
dataset = get_data2()
corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)
# print('Actual class label:', dataset.target_names[labels[10]])
# Hold out 30% of the documents for testing.
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
labels,
test_data_proportion=0.3)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(train_corpus)
vocab = tfidf_vectorizer.vocabulary_
# NOTE(review): tokenized_train and model are not defined in this snippet;
# presumably model is the Word2Vec model named in the error — confirm.
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train,
tfidf_vectors=tfidf_train_features,
tfidf_vocabulary=vocab,
model=model,
num_features=100)
def get_data2():
    """Load the two sentence CSVs and return them as one labelled dataset.

    Returns:
        A ``Db`` whose ``data`` holds the values from ``db/obli.csv``
        followed by those from ``db/nonObli.csv`` and whose ``target`` holds
        the matching labels (0 for obli, 1 for nonObli).
    """
    obli = pd.read_csv('db/obli.csv').values.ravel().tolist()
    non_obli = pd.read_csv('db/nonObli.csv').values.ravel().tolist()

    # Named "sentences" rather than "all" to avoid shadowing the builtin.
    sentences = obli + non_obli
    labels = [0] * len(obli) + [1] * len(non_obli)

    db = Db(sentences, labels)
    db.data = sentences
    # A separate list, as in the original, so target does not alias the
    # object handed to the constructor.
    db.target = list(labels)
    return db
This is code from chapter 4 of Text Analytics for Python by Dipanjan Sarkar.
index2word in gensim has been moved since that text was published.
Instead of model.index2word you should use model.wv.index2word (in gensim 4.0 and later the attribute is named model.wv.index_to_key).

Resources