How to update the weights of a pickled file? - python-3.x

I am training a Calibrated Classifier on Google Cloud Scheduler every day, and the job takes about 5 minutes to run. My Python script fetches the latest data (from that day), concatenates it to the original data, trains the model, and saves the pickled files to Cloud Storage. The issue I am facing is that if the job takes more than 5 minutes (which it will at some point), it fails with an upstream request timeout error.
I imagine this is because the model is taking longer and longer to train, and I can think of one solution: train the model only on the new data and update the weights of the original model stored in the pickled file. However, I am not sure if that's possible.
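A minimal sketch of that idea, with the caveat that LinearSVC and CalibratedClassifierCV do not support incremental updates, so it would mean swapping in an estimator that implements partial_fit (for example SGDClassifier with hinge loss) and a stateless HashingVectorizer that never needs refitting. The helper and path below are purely illustrative, not the script I'm running:

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
import pickle

vectorizer = HashingVectorizer(n_features=2 ** 18)  # stateless: no fitting, so nothing to re-pickle

def update_model(model_path, new_texts, new_labels):
    # Load yesterday's model, update its weights with only today's rows, save it back.
    with open(model_path, 'rb') as f:
        clf = pickle.load(f)
    clf.partial_fit(vectorizer.transform(new_texts), new_labels)
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)

# First run only: clf = SGDClassifier(loss='hinge'); clf.partial_fit(X, y, classes=all_label_ids)

The catch is that partial_fit needs the complete set of classes on its first call, so a brand-new user_id appearing later would still force a full retrain, and the probability calibration step would have to be redone separately.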
Below is my function that runs on the scheduler:
def train_model():
    users, tasks, tags, task_tags, task_user, boards = connect_postgres()  # loading the data from a postgres function
    storage_client = storage.Client()
    bucket = storage_client.get_bucket('my-bucket')
    blob = bucket.blob('original_data.pkl')
    pickle_in0 = blob.download_as_string()
    data = pickle.loads(pickle_in0)
    tasks = tasks.rename(columns={'id': 'task_id', 'name': 'task_name'})
    # Joining tasks and task_user_assigns tables
    tasks = tasks[tasks.task_name.isnull() == False]
    task_user = task_user[['id', 'task_id', 'user_id']].rename(columns={'id': 'task_user_id'})
    task_data = tasks.merge(task_user, on='task_id', how='left')
    # Joining users with the task_data
    users = users[['id', 'email']].rename(columns={'id': 'user_id'})
    users_tasks = task_data.merge(users, on='user_id', how='left')
    users_tasks = users_tasks[users_tasks.user_id.isnull() == False].reset_index(drop=True)
    # Joining boards table to user_tasks
    boards = boards[['id', 'name']].rename(columns={'id': 'board_id', 'name': 'board_name'})
    users_board = users_tasks.merge(boards, on='board_id', how='left').reset_index(drop=True)
    # Data cleaning
    translator = Translator()  # to translate tasks that are not in English
    users_board["task_trans"] = users_board["task_name"].map(lambda x: translator.translate(x, dest="en").text)
    users_board['task_trans'] = users_board['task_trans'].apply(lambda x: remove_emoji(x))  # removes emoticons from the text
    users_board['task_trans'] = users_board['task_trans'].apply(lambda x: remove_punct(x))  # removes punctuation from the text
    users_board = users_board[['task_id', 'email', 'board_id', 'user_id', 'task_trans']]
    data1 = pd.concat([data, users_board], axis=0)
    df1 = data1.copy()
    X = df1.task_trans  # all the observations
    y = df1.user_id  # all the labels
    print(y.nunique())
    # FROM HERE ON, THE TRAINING SCRIPT BEGINS
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer().fit(X_train_counts)
    X_train_transformed = tf_transformer.transform(X_train_counts)
    print('model 1 done')
    labels = LabelEncoder()
    y_train_labels_fit = labels.fit(y)
    y_train_labels_trf = labels.transform(y)
    linear_svc = LinearSVC()
    clf = linear_svc.fit(X_train_transformed, y_train_labels_trf)
    print('model 2 done')
    calibrated_svc = CalibratedClassifierCV(base_estimator=linear_svc, cv="prefit")
    calibrated_svc.fit(X_train_transformed, y_train_labels_trf)
    print('model 3 done')
    # SAVING THE MODELS ON GOOGLE CLOUD STORAGE
    # storage_client = storage.Client()
    fs = gcsfs.GCSFileSystem(project='my-project')
    filename = '~path/svc.sav'
    pickle.dump(calibrated_svc, fs.open(filename, 'wb'))
    filename = '~path/count_vectorizer.sav'
    pickle.dump(count_vect, fs.open(filename, 'wb'))
    filename = '~path/tfidf_vectorizer.sav'
    pickle.dump(tf_transformer, fs.open(filename, 'wb'))
    blob = bucket.blob('data.pkl')
    pickle_out = pickle.dumps(df1)
    blob.upload_from_string(pickle_out)
    return "success"
Any idea how to achieve that? Or any other strategy that I can follow to solve this problem?

I couldn't find a way to update the weights inside a pickle file, so I eventually settled for increasing the timeout parameter in Cloud Run to a value larger than the training time, which fixed the issue for the time being.
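For reference, the Cloud Run request timeout can be raised above the expected training time with something along these lines (the service name is a placeholder, and the maximum allowed value depends on the platform):

gcloud run services update my-training-service --timeout=900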

Related

What is an efficient way to make a dataset and dataloader for high frequency time series with multiple individuals?

I'm trying to forecast high-frequency time series using LSTMs and the PyTorch library. I'm going through the PyTorch tutorial on creating custom datasets and models, and I figured out how to create my Dataset class and my DataLoader. They work fine, but they take too much time to generate one batch.
I want to generate batches of fixed size; each batch contains time series from different individuals, and the input window has the same length as the output window (multi-step prediction).
I think the issue comes from the check that verifies each window is valid.
My dataframe has a little more than 3M rows and 6 columns. I have about 100 individuals, and for each individual I have 4 different time series $y_{1}$, $y_{2}$, $y_{3}$ and $y_{4}$. There are no missing values, the time steps are consecutive, and every individual has the same time steps.
My code is:
class TSDataset(Dataset):
    def __init__(self, train_data, unique_column='unique_id', input_length=3840, target_length=3840, targets=['y1', 'y2', 'y3', 'y4'], transform=None):
        self.train_data = train_data
        self.unique_column = unique_column
        self.input_length = input_length
        self.target_length = target_length
        self.total_window_length = input_length + target_length
        self.targets = targets

    def __len__(self):
        return len(self.train_data)

    def verify_time_steps(self, idx):
        change = False
        # Check that the window doesn't overlap several individuals
        num_individuals = self.train_data.iloc[np.arange(idx + self.total_window_length), :][self.unique_column].unique().shape[0]
        if num_individuals != 1:
            change = True
        if idx + self.total_window_length >= len(self.train_data):
            change = True
        return change

    def reshuffle(self):
        return np.random.randint(0, len(self.train_data))

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        change = self.verify_time_steps(idx)
        if change == True:
            while change != False:
                idx = self.reshuffle()
                change = self.verify_time_steps(idx)
        sample = self.train_data.iloc[np.arange(idx, idx + self.input_length), :][self.targets].values
        labels = self.train_data.iloc[np.arange(idx + self.input_length, idx + self.input_length + self.target_length), :][self.targets].values
        sample = torch.from_numpy(sample)
        labels = torch.from_numpy(labels)
        return sample, labels
I've tried using the TimeSeriesDataSet from PyTorch Forecasting, but I had a hard time creating models that suit it.
I've also tried building the dataset up front as a numpy array, but my RAM can't handle it.
I hope you can help me figure out how to speed up the batch generation.
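One way to avoid the per-item validity check is a hedged sketch like the one below: unlike materializing every window as a separate array (which blows up memory because of overlap), it stores each individual's raw series once and only enumerates the valid start indices in __init__, so __getitem__ is a plain slice. Column names follow the snippet above; the class and argument names are illustrative.

import numpy as np
import torch
from torch.utils.data import Dataset

class PrecomputedTSDataset(Dataset):
    """Sketch: enumerate valid window starts per individual up front."""
    def __init__(self, train_data, unique_column='unique_id',
                 input_length=3840, target_length=3840,
                 targets=('y1', 'y2', 'y3', 'y4')):
        self.targets = list(targets)
        self.input_length = input_length
        self.target_length = target_length
        total = input_length + target_length
        self.series = []   # one contiguous float32 array per individual
        self.index = []    # (series_position, start_row) pairs; starts never cross individuals
        for _, group in train_data.groupby(unique_column, sort=False):
            values = group[self.targets].to_numpy(dtype=np.float32)
            pos = len(self.series)
            self.series.append(values)
            for start in range(len(values) - total + 1):
                self.index.append((pos, start))

    def __len__(self):
        return len(self.index)

    def __getitem__(self, idx):
        pos, start = self.index[idx]
        values = self.series[pos]
        mid = start + self.input_length
        end = mid + self.target_length
        sample = torch.from_numpy(values[start:mid])
        labels = torch.from_numpy(values[mid:end])
        return sample, labels

Because valid starts are enumerated per individual, windows can never straddle two individuals, the reshuffle loop disappears, and __len__ reflects the true number of usable windows.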

How do I run a machine learning training in the background?

I have a function that trains a Support Vector Classifier and runs on a scheduler on Google Cloud Platform. The function fetches the new data, adds it to the original data, trains the model on the combined data, and saves it to Google Cloud Storage. All of this takes about 5 minutes to complete. I don't want to wait for the final output; instead, I want to run it in the background and end the process without waiting.
Below is my function with comments:
def train_model():
    users, tasks, tags, task_tags, task_user, boards = connect_postgres()  # loading the data from a postgres function
    storage_client = storage.Client()
    bucket = storage_client.get_bucket('my-bucket')
    blob = bucket.blob('original_data.pkl')
    pickle_in0 = blob.download_as_string()
    data = pickle.loads(pickle_in0)
    tasks = tasks.rename(columns={'id': 'task_id', 'name': 'task_name'})
    # Joining tasks and task_user_assigns tables
    tasks = tasks[tasks.task_name.isnull() == False]
    task_user = task_user[['id', 'task_id', 'user_id']].rename(columns={'id': 'task_user_id'})
    task_data = tasks.merge(task_user, on='task_id', how='left')
    # Joining users with the task_data
    users = users[['id', 'email']].rename(columns={'id': 'user_id'})
    users_tasks = task_data.merge(users, on='user_id', how='left')
    users_tasks = users_tasks[users_tasks.user_id.isnull() == False].reset_index(drop=True)
    # Joining boards table to user_tasks
    boards = boards[['id', 'name']].rename(columns={'id': 'board_id', 'name': 'board_name'})
    users_board = users_tasks.merge(boards, on='board_id', how='left').reset_index(drop=True)
    # Data cleaning
    translator = Translator()  # to translate tasks that are not in English
    users_board["task_trans"] = users_board["task_name"].map(lambda x: translator.translate(x, dest="en").text)
    users_board['task_trans'] = users_board['task_trans'].apply(lambda x: remove_emoji(x))  # removes emoticons from the text
    users_board['task_trans'] = users_board['task_trans'].apply(lambda x: remove_punct(x))  # removes punctuation from the text
    users_board = users_board[['task_id', 'email', 'board_id', 'user_id', 'task_trans']]
    data1 = pd.concat([data, users_board], axis=0)
    df1 = data1.copy()
    X = df1.task_trans  # all the observations
    y = df1.user_id  # all the labels
    print(y.nunique())
    # FROM HERE ON, THE TRAINING SCRIPT BEGINS
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer().fit(X_train_counts)
    X_train_transformed = tf_transformer.transform(X_train_counts)
    print('model 1 done')
    labels = LabelEncoder()
    y_train_labels_fit = labels.fit(y)
    y_train_labels_trf = labels.transform(y)
    linear_svc = LinearSVC()
    clf = linear_svc.fit(X_train_transformed, y_train_labels_trf)
    print('model 2 done')
    calibrated_svc = CalibratedClassifierCV(base_estimator=linear_svc, cv="prefit")
    calibrated_svc.fit(X_train_transformed, y_train_labels_trf)
    print('model 3 done')
    # SAVING THE MODELS ON GOOGLE CLOUD STORAGE
    # storage_client = storage.Client()
    fs = gcsfs.GCSFileSystem(project='my-project')
    filename = '~path/svc.sav'
    pickle.dump(calibrated_svc, fs.open(filename, 'wb'))
    filename = '~path/count_vectorizer.sav'
    pickle.dump(count_vect, fs.open(filename, 'wb'))
    filename = '~path/tfidf_vectorizer.sav'
    pickle.dump(tf_transformer, fs.open(filename, 'wb'))
    blob = bucket.blob('original_data.pkl')
    pickle_out = pickle.dumps(df1)
    blob.upload_from_string(pickle_out)
    return "success"
Now, I tried to do the following:
p = subprocess.Popen([sys.executable, '-c', train_model()], stdout=subprocess.PIPE, stderr=subprocess.STDOUT); print('finished')
This also took the same amount of time. Is there a way I can solve this?
Also, if I want to print the python logs for this process on client-side, is that possible?
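One note on the attempt above: because Python evaluates arguments before the call, train_model() runs to completion in the current process before subprocess.Popen is even started, which is why the timing didn't change. Below is a hedged sketch of handing the work off inside the same container using only the standard library; whether the background process keeps running after the HTTP response depends on the platform (on Cloud Run, CPU can be throttled once the response is sent, so a queue such as Cloud Tasks is generally the more reliable pattern). It assumes train_model is importable in the handler's scope.

import multiprocessing

def start_training_in_background():
    # Launch train_model in a separate process and return without waiting for it.
    proc = multiprocessing.Process(target=train_model)
    proc.start()
    return "training started"

Anything train_model() prints in the child process goes to the container's stdout, so it should show up in Cloud Logging rather than being returned to the client.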

sagemaker giving UnicodeDecodeError while deserializing

In SageMaker, I was able to load and deploy a model from S3. While deserializing the data for prediction, I am getting "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd7 in position 2: invalid continuation byte" on the line
"results = predictor.predict(test_X)"
I followed this SageMaker example: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/linear_time_series_forecast/linear_time_series_forecast.ipynb . I was able to train, validate and deploy the model and store it in S3.
After that, I wanted to import the model from S3 into SageMaker and test using the imported model. I was able to load and deploy it, but when predicting on the test values I get the UnicodeDecodeError.
from sagemaker.predictor import csv_serializer, json_deserializer
role = get_execution_role()
sagemaker_session = sagemaker.Session()
model_data = sagemaker.session.s3_input(model_file_location_in_s3, distribution='FullyReplicated', content_type='application/x-sagemaker-model', s3_data_type='S3Prefix')
sagemaker_model = sagemaker.LinearLearnerModel(model_data=model_file,
                                               role=role,
                                               sagemaker_session=sagemaker_session)
predictor = sagemaker_model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')
#loading test data
gas = pd.read_csv('gasoline.csv', header=None, names=['thousands_barrels'],encoding='utf-8')
gas['thousands_barrels_lag1'] = gas['thousands_barrels'].shift(1)
gas['thousands_barrels_lag2'] = gas['thousands_barrels'].shift(2)
gas['thousands_barrels_lag3'] = gas['thousands_barrels'].shift(3)
gas['thousands_barrels_lag4'] = gas['thousands_barrels'].shift(4)
gas['trend'] = np.arange(len(gas))
gas['log_trend'] = np.log1p(np.arange(len(gas)))
gas['sq_trend'] = np.arange(len(gas)) ** 2
weeks = pd.get_dummies(np.array(list(range(52)) * 15)[:len(gas)], prefix='week')
gas = pd.concat([gas, weeks], axis=1)
gas = gas.iloc[4:, ]
split_train = int(len(gas) * 0.6)
split_test = int(len(gas) * 0.3)
test_y = gas['thousands_barrels'][split_test:]
test_X = gas.drop('thousands_barrels', axis=1).iloc[split_test:, ].as_matrix()
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer
results = predictor.predict(test_X)
one_step = np.array([r['score'] for r in results['predictions']])
The program works fine when the model is trained and deployed as in the example, but when it is loaded from S3 it throws this error.
The test data is a numpy ndarray.
The deserializer does not seem to be appropriate for the content of the response.
To investigate, write a custom deserializer that just prints some details:
def debug_deserializer(data, content_type):
    print(content_type)
    print(data)
and apply it like:
predictor.deserializer = debug_deserializer
This could, for example, yield something like this:
application/x-recordio-protobuf
<botocore.response.StreamingBody object at 0x7fd3544883c8>
None
This tells you the content type is application/x-recordio-protobuf. Then write a custom deserializer, for example:
from sagemaker.amazon.common import RecordDeserializer

def recordio_protobuf_deserialize(data, content_type):
    rec_des = RecordDeserializer()
    return rec_des.deserialize(data, content_type)
and apply it like:
predictor.deserializer = recordio_protobuf_deserialize

Why can't I split files when generating some TFrecord files?

I'm working on predicting protein structures. As you may know, one protein molecule can have several strands, so I need to split the list of atoms into different TFRecords by strand name.
The problem is that this code ends up generating several TFRecords with nothing written in them. All blank.
Alternatively, is there a way to split the strands while training my model? Then I could ignore this problem and simply store the strand name in the TFRecords as a feature.
'''
with all modules imported and no errors raised
'''
def generate_TFrecord(intPosition, endPosition, path):
    CrtS = x  # x is the name of the current strand
    path = path + CrtS
    writer = tf.io.TFRecordWriter('%s.tfrecord' % path)
    for i in range(intPosition, endPosition):
        if identifyCoreCarbon(i):
            vectors = getVectors(i)
            features = {}
            '''
            feeding this dict
            '''
            tf_features = tf.train.Features(feature=features)
            tf_example = tf.train.Example(features=tf_features)
            tf_serialized = tf_example.SerializeToString()
            writer.write(tf_serialized)
            '''
            if checkStrand(i) == False:
                writer.write(tf_serialized)
                intPosition = i
            '''
    writer.close()

'''
strand_index is a list of the start positions of each strand
'''
for loop in strand_index:
    generate_TFrecord(loop, endPosition, path)

'''
________division___________
The code below works, but it only generates a single tfrecord containing all the atom information.

writer = tf.io.TFRecordWriter('%s.tfrecord' % path)
for i in range(0, endPosition):
    if identifyCoreCarbon(i):
        vectors = getVectors(i)
        features = {}
        # feeding features
        tf_features = tf.train.Features(feature=features)
        tf_example = tf.train.Example(features=tf_features)
        tf_serialized = tf_example.SerializeToString()
        writer.write(tf_serialized)
writer.close()
'''
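For what it's worth, here is a hedged sketch of one way to get one non-empty file per strand: keep a dict of TFRecordWriter objects keyed by strand name and route each serialized example to its strand's writer. The helper arguments are placeholders modeled on the snippet above (is_core_carbon plays the role of identifyCoreCarbon, build_features fills the feature dict, and strand_of stands in for however the strand name of atom i is looked up, e.g. via strand_index/checkStrand).

import tensorflow as tf

def write_records_by_strand(atom_count, path, strand_of, is_core_carbon, build_features):
    """Sketch: write one TFRecord file per strand name."""
    writers = {}
    for i in range(atom_count):
        if not is_core_carbon(i):
            continue
        strand = strand_of(i)
        if strand not in writers:
            # Opened lazily, so strands that never yield an example get no empty file.
            writers[strand] = tf.io.TFRecordWriter('%s%s.tfrecord' % (path, strand))
        example = tf.train.Example(features=tf.train.Features(feature=build_features(i)))
        writers[strand].write(example.SerializeToString())
    for writer in writers.values():
        writer.close()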

How to implement TensorBoard v2 (tf.contrib.summary) with while_loop?

I want to log the loss and accuracy values evaluated inside the nested body function of a while_loop during training. This is the structure: I have a class; one method of this class builds the graph using a while_loop (build_graph()), and another method calls build_graph() and then runs the session. It works, or it seems to work. However, I would like to use TensorBoard to check whether loss and accuracy are actually improving, but I'm not able to summarize those tensors. I've tried defining a tf.contrib.summary.create_file_writer('summary') and a graph and passing them to build_graph() as parameters so that the body function can see them. I have checked the list returned by tf.contrib.summary.all_summary_ops() during graph execution and it isn't empty. However, when I open TensorBoard I get "No dashboards are active for the current data set.", and the graph doesn't show up either. I am aware that tf.summary does not work inside a while_loop, but tf.contrib.summary seems to.
Here is a working example:
import tensorflow as tf
import sys
import datamanagement

class myNet:
    def __init__(self):
        self.varlist = ["x", "y"]
        self.data = []
        self.hsize = [10, 10]
        self.batch_size = 10
        self.tr_mainsteps = 1000
        self.learnrate = 0.001
        self.sourcedatafile = "XYfit.csv"  # source file
        # Dataset parameters
        self.seq_params = {'dim': len(self.varlist),
                           'batch_size': self.batch_size,
                           'shuffle': True,
                           'filepath': self.sourcedatafile}
        # Dataset from CSV file
        self.dataset = datamanagement.CSVDataSet(**self.seq_params).finaldataset
        # Iterator on the CSV file
        self.dataiterator = self.dataset.make_initializable_iterator()
        # Optimizer
        self.optim = tf.train.RMSPropOptimizer(learning_rate=self.learnrate)
        # Official creation of the graph
        self.graph = tf.get_default_graph()
        with self.graph.as_default():
            # Writer creation
            self.writer = tf.contrib.summary.create_file_writer('./summary')
            with self.writer.as_default():
                tf.contrib.summary.always_record_summaries()

    def mymodel(self, Zinp, reuse=False):
        # This function builds the graph of the network
        with tf.variable_scope("mymod/net", reuse=reuse):
            h1 = tf.layers.dense(Zinp, self.hsize[0], activation=tf.nn.leaky_relu, name='h1')
            h2 = tf.layers.dense(h1, self.hsize[1], activation=tf.nn.leaky_relu, name='h2')
            out = tf.layers.dense(h2, len(self.varlist), activation=None, name='final')  # None means linear activation
        return out

    def _trainepoch(self, ind):
        with self.writer.as_default():
            # Real data tensor from CSV file
            self.realdata = self.dataiterator.get_next()
            # Random input vector
            self.Znoise = tf.random_uniform([self.batch_size, len(self.varlist)], minval=-1., maxval=1.)
            # Model and output tensor
            self.output = self.mymodel(self.Znoise, reuse=tf.AUTO_REUSE)
            # Loss
            self.loss = tf.losses.mean_squared_error(self.realdata, self.output)
            tf.contrib.summary.scalar("loss", self.loss)
            # Trainable variables
            t_vars = tf.trainable_variables()
            # Evaluation of the weight gradients
            grad = self.optim.compute_gradients(self.loss, var_list=t_vars)
            # Update weights based on gradients
            return self.optim.apply_gradients(grad), tf.contrib.summary.all_summary_ops()

    def _train_buildgraph(self):
        def body(ind, ops):
            train_up, ops = self._trainepoch(ind)
            # Ensure that the update is applied before continuing.
            with tf.control_dependencies([train_up]):
                ind = ind + 1
            return ind, ops

        def cond(ind, ops):
            return ind < self.tr_mainsteps

        return tf.while_loop(cond, body, [tf.constant(0), [tf.Variable(False)]])

    def config_run(self, trepoch=50, testNet=False):
        self.tr_mainsteps = trepoch  # Number of adversarial training epochs
        with self.graph.as_default():
            with self.writer.as_default():
                tr_loop, summary_ops = self._train_buildgraph()
        # Graph execution
        with self.graph.as_default():
            with self.writer.as_default():
                with tf.Session() as sess:
                    sess.run(tf.initializers.global_variables())
                    sess.run(self.dataiterator.initializer)
                    tf.contrib.summary.initialize(
                        graph=tf.get_default_graph()
                    )
                    sess.run([summary_ops, tr_loop, summary_ops])


def main(argv):
    hmodel = myNet()
    hmodel.config_run()


if __name__ == "__main__":
    main(sys.argv[1:])
Can someone help me?
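For comparison, here is a minimal, hedged sketch of the plain graph-mode tf.contrib.summary pattern in TF 1.x, outside any while_loop. One thing it highlights: always_record_summaries() returns a context manager, so it only takes effect inside a with block (the example above calls it without one). The loss here is a dummy constant and the step handling is simplified; the exact behavior inside a while_loop body may differ.

import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
writer = tf.contrib.summary.create_file_writer('./summary')
with writer.as_default(), tf.contrib.summary.always_record_summaries():
    loss = tf.constant(0.5)  # dummy value standing in for the real loss tensor
    tf.contrib.summary.scalar("loss", loss, step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    tf.contrib.summary.initialize(graph=tf.get_default_graph(), session=sess)
    sess.run(tf.contrib.summary.all_summary_ops())
    sess.run(tf.contrib.summary.flush())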
