Training procedure - python-3.x

I noticed that sklearn.linear_model.SGDClassifier implements gradient descent for a linear model, so one could say that the class combines the fitting procedure (SGD) and the model (linear model) in one class.
SGD is, however, not inherent to linear models, and linear models can be trained with many other optimizers, each with its own pros (memory usage, convergence speed, local-optimum avoidance*, ...). One could say that such optimization techniques define how to iterate over the training data, whether to do it online or offline, in which feature dimension to apply an update, and when to stop (possibly based on an error function over a validation set).
In particular, I implemented a model using Theano and wrapped it in a fit/predict interface. Theano is cool because it allows one to define a callable which applies gradient descent on one sample, or on a set of samples, as well as a callable which returns the error on a validation set. But this coolness is not inherent to Theano; many more models can simply define an update and an error-evaluation function, which can then be used by different iteration and stopping policies for fitting.
The Theano examples often use minibatches, and the minibatch code is copy-pasted or reimplemented over and over with just minor adjustments which could easily be factored out. So I was hoping that sklearn implements something that you initialize with some parameters and an update/error callable to fit 'any model'. Or possibly there is some good practice on how to do this yourself (especially w.r.t. the interface of the fitter).
Is there anything like this (in sklearn), that is, fitters which do not define the model?
*In the particular case of linear models and an l2 cost function, local optima do not exist of course, but still.
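For comparison (this example is mine, not part of the original question), scipy.optimize.minimize is an existing fitter that does not define the model: it only takes a cost callable and optionally its gradient, however those were produced. A minimal sketch with a linear least-squares cost:

import numpy as np
from scipy.optimize import minimize

def cost(w, X, y):
    # mean squared error of a linear model, parameterized only by the flat vector w
    return np.mean((X @ w - y) ** 2)

def grad(w, X, y):
    # analytic gradient of the cost above
    return 2 * X.T @ (X @ w - y) / len(y)

X = np.random.randn(100, 3)
y = X @ np.array([1.0, -2.0, 0.5])
result = minimize(cost, x0=np.zeros(3), jac=grad, args=(X, y))
w_fit = result.x  # fitted coefficients

The optimizer never sees the model, only the two callables, which is exactly the separation asked about (it is a batch optimizer, though, not a stochastic one).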
EDIT
Fair enough, this calls for a suggestion. I coded these two classes, which are not 100% clean, but they give an idea of what I mean:
import numpy

class StochasticUpdate():
    def __init__(self, model, update, n_epochs, n_data_points, error=None, test_fraction=None):
        self.update = update
        self.n_epochs = n_epochs
        self.n_data_points = n_data_points
        self.error = error
        self.model = model
        if self.error is None and test_fraction is not None:
            raise ValueError('error parameter must be specified if a test_fraction (value: %s) should be used.' % test_fraction)
        self.do_test = test_fraction is not None
        # hold out the last test_fraction of the data points for testing
        self.n_train_samples = int(n_data_points * (1 - test_fraction)) if self.do_test else n_data_points
        if self.do_test:
            self.test_range = numpy.arange(self.n_train_samples, n_data_points)
            self.n_test_samples = int(n_data_points * test_fraction)
        self.train_range = numpy.arange(0, self.n_train_samples)

    def fit(self):
        if self.do_test: self.test_errors = []
        self.train_errors = []
        self.mean_cost_values = []
        for epoch in range(self.n_epochs):
            order = numpy.random.permutation(self.n_train_samples)
            mean_cost_value = 0
            for i in range(self.n_train_samples):
                mean_cost_value += self.update([order[i]])
            self.mean_cost_values.append(mean_cost_value / self.n_data_points)
            if self.error is not None:
                self.train_errors.append(self.error(self.train_range))
                if self.do_test:
                    self.test_errors.append(self.error(self.test_range))
        return self.model
from math import ceil

class MinibatchStochasticUpdate(StochasticUpdate):
    def __init__(self, model, update, n_epochs, n_data_points, error, batch_size, patience=5000, patience_increase=2,
                 improvement_threshold=0.995, validation_frequency=None, validate_fraction=0.1, test_fraction=None):
        super().__init__(model, update, n_epochs, n_data_points, error=error, test_fraction=test_fraction)
        self.batch_size = batch_size
        self.patience = patience
        self.patience_increase = patience_increase
        self.improvement_threshold = improvement_threshold
        self.n_validation_samples = int(n_data_points * validate_fraction)
        self.validation_range = numpy.arange(self.n_train_samples, self.n_train_samples + self.n_validation_samples)
        self.n_train_batches = int(ceil(self.n_train_samples / self.batch_size))
        self.train_batch_ranges = [
            numpy.arange(minibatch_index * self.batch_size, min((minibatch_index + 1) * self.batch_size, self.n_train_samples))
            for minibatch_index in range(self.n_train_batches)
        ]
        self.validation_frequency = min(self.n_train_batches, patience // 2) if validation_frequency is None else validation_frequency

    def fit(self):
        self.best_validation_error = numpy.inf
        best_params = None
        iteration = 0
        for epoch in range(self.n_epochs):
            for minibatch_index in range(self.n_train_batches):
                self.update(self.train_batch_ranges[minibatch_index])
                if (iteration + 1) % self.validation_frequency == 0:
                    current_validation_error = self.error(self.validation_range)
                    if current_validation_error < self.best_validation_error:
                        if current_validation_error < self.best_validation_error * self.improvement_threshold:
                            self.patience = max(self.patience, iteration * self.patience_increase)
                        best_params = self.model.copy_parameters()
                        self.best_validation_error = current_validation_error
                if iteration >= self.patience:
                    # early stopping: restore the best parameters seen so far
                    self.model.set_parameters(best_params)
                    return self.model
                iteration += 1
        self.model.set_parameters(best_params)
        return self.model
Then, in the model's fit, one could support different training approaches and stopping criteria like this:
def fit(self, X, y):
    X_shared = theano.shared(X, borrow=True)
    y_shared = theano.shared(y, borrow=True)
    learning_rate = self.training_method_options['learning_rate']
    trainer = {
        'stochastic_gradient_descent': lambda: StochasticUpdate(
            self,
            update=self.update_stochastic_gradient_descent_function(X_shared, y_shared, learning_rate),
            n_epochs=self.training_method_options['n_epochs'],
            n_data_points=X.shape[0],
            error=self.evaluation_function(X_shared, y_shared),
        ),
        'minibatch_gradient_descent': lambda: MinibatchStochasticUpdate(
            self,
            update=self.update_stochastic_gradient_descent_function(X_shared, y_shared, learning_rate),
            n_epochs=self.training_method_options['n_epochs'],
            n_data_points=X.shape[0],
            error=self.evaluation_function(X_shared, y_shared),
            batch_size=self.training_method_options['batch_size']
        )
    }[self.training_method]()
    trainer.fit()
    return self
Obviously the hash-map part is hacky, and could be done more elegantly using a standardized interface for the two classes above (since the hash maps are still O(N*M) in size for N fitters and M models).
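For example, here is a minimal sketch (mine, not from the code above) of a registry keyed by method name; it assumes the two fitter classes above keep a compatible keyword-argument constructor, with FITTERS and make_fitter as hypothetical names:

# Hypothetical sketch: one registry entry per fitter, so a model's fit() passes
# the shared arguments once and only fitter-specific options vary.
FITTERS = {
    'stochastic_gradient_descent': StochasticUpdate,
    'minibatch_gradient_descent': MinibatchStochasticUpdate,
}

def make_fitter(method, model, update, error, n_epochs, n_data_points, **options):
    # options carries fitter-specific settings such as batch_size or test_fraction
    fitter_cls = FITTERS[method]
    return fitter_cls(model, update, n_epochs=n_epochs, n_data_points=n_data_points,
                      error=error, **options)

A model's fit then only needs make_fitter(self.training_method, self, update=..., error=..., ...).fit(), keeping the mapping O(N) in the number of fitters instead of O(N*M).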

Related

What is an efficient way to make a dataset and dataloader for high frequency time series with multiple individuals?

I'm trying to forecast high frequency time series using LSTMs and the PyTorch library. I'm going through the PyTorch tutorial for creating custom datasets and models and figured out how to create my Dataset class and my DataLoader, and they work perfectly fine, but they take too much time to generate one batch.
I want to generate batches of fixed size, each batch contains time series from different individuals and the input window is of the same length as the output window (multi-step prediction).
I think the issue is due to the fact that I'm verifying the windows are correct.
My dataframe has a little more than 3M rows and 6 columns. I have some 100 individuals and for each individual I have 4 different time series $y_{1}$, $y_{2}$, $y_{3}$ and $y_{4}$. I have no missing values at all and the time steps are consecutive. For each individual I have the same time steps.
My code is:
import numpy as np
import torch
from torch.utils.data import Dataset

class TSDataset(Dataset):
    def __init__(self, train_data, unique_column='unique_id', input_length=3840, target_length=3840,
                 targets=['y1', 'y2', 'y3', 'y4'], transform=None):
        self.train_data = train_data
        self.unique_column = unique_column
        self.input_length = input_length
        self.target_length = target_length
        self.total_window_length = input_length + target_length
        self.targets = targets

    def __len__(self):
        return len(self.train_data)

    def verify_time_steps(self, idx):
        change = False
        # Check that the window stays inside the dataframe
        if idx + self.total_window_length >= len(self.train_data):
            return True
        # Check that the window doesn't overlap several individuals
        num_individuals = self.train_data.iloc[np.arange(idx, idx + self.total_window_length), :][self.unique_column].unique().shape[0]
        if num_individuals != 1:
            change = True
        return change

    def reshuffle(self):
        return np.random.randint(0, len(self.train_data))

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Redraw a start index until the window is valid
        while self.verify_time_steps(idx):
            idx = self.reshuffle()
        sample = self.train_data.iloc[np.arange(idx, idx + self.input_length), :][self.targets].values
        labels = self.train_data.iloc[np.arange(idx + self.input_length, idx + self.input_length + self.target_length), :][self.targets].values
        sample = torch.from_numpy(sample)
        labels = torch.from_numpy(labels)
        return sample, labels
I've tried using the TimeSeriesDataset from PyTorchForecasting but I had a hard time creating models that suit it.
I've also tried creating the dataset outside, as a numpy array but my RAM can't handle it.
Hope you can help me figure out how to alleviate the computations.
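Not an answer from the original thread, but a sketch of the usual fix for this pattern: precompute the valid window start positions once in __init__ (assuming each individual's rows are contiguous and time-ordered, as described), so __getitem__ does no checking or reshuffling at all. The class name and details below are hypothetical:

import numpy as np
import torch
from torch.utils.data import Dataset

class PrecomputedTSDataset(Dataset):
    # Hypothetical sketch: enumerate every valid window start per individual up front.
    def __init__(self, train_data, unique_column='unique_id',
                 input_length=3840, target_length=3840,
                 targets=('y1', 'y2', 'y3', 'y4')):
        self.values = train_data[list(targets)].to_numpy(dtype=np.float32)
        self.input_length = input_length
        self.target_length = target_length
        total = input_length + target_length
        starts = []
        offset = 0
        # assumes rows are sorted by individual and then by time
        for _, group in train_data.groupby(unique_column, sort=False):
            n = len(group)
            if n >= total:
                starts.extend(range(offset, offset + n - total + 1))
            offset += n
        self.starts = np.asarray(starts)

    def __len__(self):
        return len(self.starts)

    def __getitem__(self, idx):
        start = self.starts[idx]
        mid = start + self.input_length
        end = mid + self.target_length
        sample = torch.from_numpy(self.values[start:mid])
        labels = torch.from_numpy(self.values[mid:end])
        return sample, labels

With this, a plain DataLoader(dataset, batch_size=..., shuffle=True) only ever samples valid windows and never touches pandas inside the training loop.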

RuntimeError: Trying to backward through the graph a second time. Saved intermediate values of the graph are freed when you call .backward()

I am trying to train SRGAN from scratch. I have read solutions for this type of problem, but it would be great if someone could help me debug my code. The exact error is: "RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad()" Here is the snippet I am trying to train:
gen_model = Generator().to(device, non_blocking=True)
disc_model = Discriminator().to(device, non_blocking=True)
opt_gen = optim.Adam(gen_model.parameters(), lr=0.01)
opt_disc = optim.Adam(disc_model.parameters(), lr=0.01)
from torch.nn.modules.loss import BCELoss

def train_model(gen, disc):
    for epoch in range(20):
        run_loss_disc = 0
        run_loss_gen = 0
        for data in train:
            low_res, high_res = (
                data[0].to(device, non_blocking=True, dtype=torch.float).permute(0, 3, 1, 2),
                data[1].to(device, non_blocking=True, dtype=torch.float).permute(0, 3, 1, 2),
            )
            #--------Discriminator-----------------
            gen_image = gen(low_res)
            gen_image = gen_image.detach()
            disc_gen = disc(gen_image)
            disc_real = disc(high_res)
            p = nn.BCEWithLogitsLoss()
            loss_gen = p(disc_real, torch.ones_like(disc_real))
            loss_real = p(disc_gen, torch.zeros_like(disc_gen))
            loss_disc = loss_gen + loss_real
            opt_disc.zero_grad()
            loss_disc.backward()
            run_loss_disc += loss_disc
            #---------Generator--------------------
            cont_loss = vgg_loss(high_res, gen_image)
            adv_loss = 1e-3 * p(disc_gen, torch.ones_like(disc_gen))
            gen_loss = cont_loss + (10^-3) * adv_loss
            opt_gen.zero_grad()
            gen_loss.backward()
            opt_disc.step()
            opt_gen.step()
            run_loss_gen += gen_loss
        print("Run Loss Discriminator: %d", run_loss_disc)
        print("Run Loss Generator: %d", run_loss_gen)

train_model(gen_model, disc_model)
Apparently the graph that produced your disc_gen value is freed by the first backward() call, as the error says.
It should work if you change the discriminator part a bit:
gen_image = gen(low_res)
disc_gen = disc(gen_image.detach())
and add this at the start of the generator part:
disc_gen = disc(gen_image)
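Putting the two changes together, here is a sketch of the reordered loop body (keeping the names from the question; vgg_loss, p, the optimizers and the data tensors are assumed to exist as in the original code):

gen_image = gen(low_res)

# -------- Discriminator --------
disc_real = disc(high_res)
disc_fake = disc(gen_image.detach())   # detached: discriminator backward won't touch the generator graph
loss_disc = p(disc_real, torch.ones_like(disc_real)) + p(disc_fake, torch.zeros_like(disc_fake))
opt_disc.zero_grad()
loss_disc.backward()
opt_disc.step()

# -------- Generator --------
disc_gen = disc(gen_image)             # fresh forward pass, so the graph reaches back to the generator
cont_loss = vgg_loss(high_res, gen_image)
adv_loss = 1e-3 * p(disc_gen, torch.ones_like(disc_gen))
gen_loss = cont_loss + adv_loss        # note: in the original, (10^-3) is bitwise XOR in Python, not 1e-3
opt_gen.zero_grad()
gen_loss.backward()
opt_gen.step()

This way the discriminator update only backpropagates through the detached fake images, and the generator loss is built on a second discriminator pass whose graph has not been freed.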

Creating a custom estimator

A federal agency, Centers for Medicare and Medicaid Services (CMS), imposes regulations on nursing homes. However, nursing homes are inspected by state agencies for compliance with regulations, and fines for violations can vary widely between states.
Let's develop a very simple initial model to predict the amount of fines a nursing home might expect to pay based on its location. Fill in the class definition of the custom estimator, StateMeanEstimator, below.
I am getting the following error:
Your solution did not match the expected type: 53 * number
Specifically, solution[52] did not match {'type': 'number'}: None
Here is my code
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin

class GroupMeanEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, grouper):
        self.grouper = grouper
        self.group_averages = {}

    def fit(self, X, y):
        # Use self.group_averages to store the average penalty by group
        Xy = X.join(y)
        state_mean_series = Xy.groupby(self.grouper)[y.name].mean()
        for row in pd.DataFrame(state_mean_series).itertuples():
            self.group_averages[row[0]] = row[1]
        return self

    def predict(self, X):
        # Return a list of predicted penalties based on the group of each sample in X
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        dictionary = self.group_averages
        group = self.grouper
        list_of_predictions = []
        for row in X.itertuples():
            prediction = dictionary.get(row.STATE)
            list_of_predictions.append(prediction)
        return list_of_predictions
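One likely cause of the None at solution[52] is a group key in the prediction data that never appeared during fit, so dict.get returns None. Below is a minimal sketch (mine, not from the original post) of a fallback to the overall mean, assuming the grader accepts that choice and that grouper is a single column name such as 'STATE':

# Hypothetical sketch: keep a global fallback so predict never returns None
# for a group unseen during fit.
def fit(self, X, y):
    Xy = X.join(y)
    self.group_averages = Xy.groupby(self.grouper)[y.name].mean().to_dict()
    self.global_average = y.mean()  # fallback for unseen groups
    return self

def predict(self, X):
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    return [self.group_averages.get(key, self.global_average)
            for key in X[self.grouper]]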

How do I get the coefficients/intercept for each group/model so I can plot the fitted line for each group?

I've written a custom class to group elements of a dataset, fit each group, and then run predictions for each group based on the fitted model. I want to be able to return the coefficients of each fitting (presumably in a dictionary), so that I can refer back to them and plot the line of best fit for each.
Calling the standard .coef_ or .get_params methods does not work because the items these methods attempt to retrieve are groupby objects. Alternatively, I tried to introduce the following:
def get_coefs():
    coefs_dict = {}
    for name, values in dataframe.groupby(self.groupby_column):
        coefs_dict[name] = self.drugs_dict[name].coefs_
    return coefs_dict
But I get the following:
<bound method GroupbyEstimator.get_coefs of GroupbyEstimator(groupby_column='ndc',
pipeline_factory=<function pipeline_factory at 0x0000018DAD207268>)>
Here's the class I've written:
from sklearn import base
import numpy as np
import pandas as pd

class GroupbyEstimator(base.BaseEstimator, base.RegressorMixin):
    def __init__(self, groupby_column, pipeline_factory):
        self.groupby_column = groupby_column
        self.pipeline_factory = pipeline_factory

    def fit(self, dataframe, label):
        self.drugs_dict = {}
        self.label = label
        dataframe = pd.get_dummies(dataframe)
        for name, values in dataframe.groupby(self.groupby_column):
            y = values[label]
            X = values.drop(columns=[label, self.groupby_column], axis=1)
            self.drugs_dict[name] = self.pipeline_factory().fit(X, y)
        return self

    def get_coefs():
        self.coefs_dict = {}
        self.coefs_dict[name] = self.drugs_dict[name].named_steps["lin_reg"].coef_
        return self.coefs_dict

    def predict(self, test_data):
        price_pred_list = []
        for idx, row in test_data.iterrows():
            name = row[self.groupby_column]
            regression_coefs = self.drugs_dict[name]
            row = pd.DataFrame(row).T
            X = row.drop(columns=[self.label, self.groupby_column], axis=1).values.reshape(1, -1)
            drug_price_pred = regression_coefs.predict(X)
            price_pred_list.append([name, drug_price_pred])
        return price_pred_list
Expected result is a dictionary of the format:
{drug_a: [coefficient_1, coefficient_2,...coefficient_n],
drug_b: [coefficient_1, coefficient_2,...coefficient_n],
drug_c: [coefficient_1, coefficient_2,...coefficient_n]}
The pipeline factory is like this. I'll be building this out with alternative regressors, PCA, GridSearchCV, etc. at a later time (so long as I can get the parameters out of the groupby objects for the individual regressions).
def pipeline_factory():
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    return Pipeline([
        ('lin_reg', LinearRegression())
    ])
EDIT: Added the get_coefs method as suggested. Unfortunately, as displayed above, it is still returning the same error.
The problem is with self.drugs_dict, which is a dictionary of Pipeline objects, so you can't use coef_ on them directly. coef_ is an attribute of the estimator object, which in your case is a LinearRegression object. So the correct way of accessing the coefficients is self.drugs_dict[name].named_steps["lin_reg"].coef_ instead of self.drugs_dict[name].coefs_ in your get_coefs() method.
While @Parthasarathy Subburaj led me to the right answer, here's the completed code for anyone that may be looking for a similar solution:
from sklearn import base
import numpy as np
import pandas as pd

class GroupbyEstimator(base.BaseEstimator, base.RegressorMixin):
    def __init__(self, groupby_column, pipeline_factory):
        # column is the value to group by; pipeline_factory can be called to produce estimators
        self.groupby_column = groupby_column
        self.pipeline_factory = pipeline_factory

    def fit(self, dataframe, label):
        # Create an estimator and fit it with the portion of the data in each group (one model per group)
        self.drugs_dict = {}
        self.label = label
        self.coefs_dict = {}
        dataframe = pd.get_dummies(dataframe)  # one-hot encoder had problems with the data, so getting the dummies with pandas here
        for name, values in dataframe.groupby(self.groupby_column):
            y = values[label]
            X = values.drop(columns=[label, self.groupby_column], axis=1)
            self.drugs_dict[name] = self.pipeline_factory().fit(X, y)
            self.coefs_dict[name] = self.drugs_dict[name].named_steps["lin_reg"].coef_
        return self

    def get_coefs(self):
        return self.coefs_dict

    def predict(self, test_data):
        price_pred_list = []
        for idx, row in test_data.iterrows():
            name = row[self.groupby_column]  # get drug name from the drug column
            regression_coefs = self.drugs_dict[name]  # the pipeline fitted for this drug
            row = pd.DataFrame(row).T
            X = row.drop(columns=[self.label, self.groupby_column], axis=1).values.reshape(1, -1)
            drug_price_pred = regression_coefs.predict(X)  # predict with the pipeline fitted for this drug
            price_pred_list.append([name, drug_price_pred])
        return price_pred_list
The TL;DR of the comments is that the dictionary holding model names and coefficients needs to be created under the fit method using sklearn's .named_steps on the desired portion of the pipeline, and then returned in a separate method (in this case get_coefs).
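A hypothetical usage sketch (the column name 'ndc' and label name 'price' are assumptions for illustration, not confirmed by the original post):

# Assumed names: 'ndc' is the grouping column, 'price' is the label column.
model = GroupbyEstimator(groupby_column='ndc', pipeline_factory=pipeline_factory)
model.fit(train_df, label='price')
coefs = model.get_coefs()   # e.g. {'drug_a': array([...]), 'drug_b': array([...]), ...}

From coefs you can read the slope(s) per group, and the matching intercept is available via model.drugs_dict[name].named_steps['lin_reg'].intercept_, which together are enough to plot the fitted line for each group.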

How to implement TensorBoard v2 (tf.contrib.summary) with while_loop?

I want to log the loss and accuracy values evaluated inside the nested body function of a while_loop during training. This is the structure: I have a class; one method of this class builds the graph using a while_loop (build_graph()). Another method calls build_graph() and then runs the session. It works, or it seems to work. However, I would like to use TensorBoard to check whether loss and accuracy are actually improving, but I'm not able to summarize those tensors. I've tried to define a tf.contrib.summary.create_file_writer('summary') and a graph and pass them to build_graph() as parameters, so that the body function can see them. I have checked the list coming from tf.contrib.summary.all_summary_ops() during graph execution and it isn't empty. However, when I open TensorBoard I get "No dashboards are active for the current data set.", and the graph doesn't show up either. I am aware that tf.summary does not work in a while_loop, but it seems that tf.contrib.summary does.
Here is a working example
import tensorflow as tf
import sys
import datamanagement

class myNet:
    def __init__(self):
        self.varlist = ["x", "y"]
        self.data = []
        self.hsize = [10, 10]
        self.batch_size = 10
        self.tr_mainsteps = 1000
        self.learnrate = 0.001
        self.sourcedatafile = "XYfit.csv"  # source file
        # Dataset parameters
        self.seq_params = {'dim': len(self.varlist),
                           'batch_size': self.batch_size,
                           'shuffle': True,
                           'filepath': self.sourcedatafile}
        # Dataset from CSV file
        self.dataset = datamanagement.CSVDataSet(**self.seq_params).finaldataset
        # Iterator on the CSV file
        self.dataiterator = self.dataset.make_initializable_iterator()
        # Optimizer
        self.optim = tf.train.RMSPropOptimizer(learning_rate=self.learnrate)
        # Official creation of the graph
        self.graph = tf.get_default_graph()
        with self.graph.as_default():
            # Writer creation
            self.writer = tf.contrib.summary.create_file_writer('./summary')
            with self.writer.as_default():
                tf.contrib.summary.always_record_summaries()

    def mymodel(self, Zinp, reuse=False):
        # This function builds the graph of the network
        with tf.variable_scope("mymod/net", reuse=reuse):
            h1 = tf.layers.dense(Zinp, self.hsize[0], activation=tf.nn.leaky_relu, name='h1')
            h2 = tf.layers.dense(h1, self.hsize[1], activation=tf.nn.leaky_relu, name='h2')
            out = tf.layers.dense(h2, len(self.varlist), activation=None, name='final')  # None means linear activation
            return out

    def _trainepoch(self, ind):
        with self.writer.as_default():
            # Real data tensor from CSV file
            self.realdata = self.dataiterator.get_next()
            # Random input vector
            self.Znoise = tf.random_uniform([self.batch_size, len(self.varlist)], minval=-1., maxval=1.)
            # Model and output tensor
            self.output = self.mymodel(self.Znoise, reuse=tf.AUTO_REUSE)
            # Loss
            self.loss = tf.losses.mean_squared_error(self.realdata, self.output)
            tf.contrib.summary.scalar("loss", self.loss)
            # Trainable variables
            t_vars = tf.trainable_variables()
            # Evaluation of the weight gradients
            grad = self.optim.compute_gradients(self.loss, var_list=t_vars)
            # Update weights based on gradients
            return self.optim.apply_gradients(grad), tf.contrib.summary.all_summary_ops()

    def _train_buildgraph(self):
        def body(ind, ops):
            train_up, ops = self._trainepoch(ind)
            # Ensure that the update is applied before continuing.
            with tf.control_dependencies([train_up]):
                ind = ind + 1
            return ind, ops

        def cond(ind, ops):
            return ind < self.tr_mainsteps

        return tf.while_loop(cond, body, [tf.constant(0), [tf.Variable(False)]])

    def config_run(self, trepoch=50, testNet=False):
        self.tr_mainsteps = trepoch  # Number of adversarial training epochs
        with self.graph.as_default():
            with self.writer.as_default():
                tr_loop, summary_ops = self._train_buildgraph()
        # Graph execution
        with self.graph.as_default():
            with self.writer.as_default():
                with tf.Session() as sess:
                    sess.run(tf.initializers.global_variables())
                    sess.run(self.dataiterator.initializer)
                    tf.contrib.summary.initialize(
                        graph=tf.get_default_graph()
                    )
                    sess.run([summary_ops, tr_loop, summary_ops])

def main(argv):
    hmodel = myNet()
    hmodel.config_run()

if __name__ == "__main__":
    main(sys.argv[1:])
Can someone help me?