Creating a custom estimator - python-3.x

A federal agency, Centers for Medicare and Medicaid Services (CMS), imposes regulations on nursing homes. However, nursing homes are inspected by state agencies for compliance with regulations, and fines for violations can vary widely between states.
Let's develop a very simple initial model to predict the amount of fines a nursing home might expect to pay based on its location. Fill in the class definition of the custom estimator, StateMeanEstimator, below.
I am getting the following error:
Your solution did not match the expected type: 53 * number
Specifically, solution[52] did not match {'type': 'number'}: None
Here is my code
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin

class GroupMeanEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, grouper):
        self.grouper = grouper
        self.group_averages = {}

    def fit(self, X, y):
        # Use self.group_averages to store the average penalty by group
        Xy = X.join(y)
        state_mean_series = Xy.groupby(self.grouper)[y.name].mean()
        for row in pd.DataFrame(state_mean_series).itertuples():
            self.group_averages[row[0]] = row[1]
        return self

    def predict(self, X):
        # Return a list of predicted penalties based on the group of each sample in X
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        dictionary = self.group_averages
        group = self.grouper
        list_of_predictions = []
        for row in X.itertuples():
            prediction = dictionary.get(row.STATE)
            list_of_predictions.append(prediction)
        return list_of_predictions
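The grader output shows that solution[52] is None, which suggests predict is seeing a group that never appeared during fit, so self.group_averages.get(...) returns None for it. A minimal sketch of one way to handle that, falling back to the overall mean of y (the attribute name overall_average_ and the assumption that the grouper column is STATE are mine, following the code above):

    def fit(self, X, y):
        # Store per-group means plus an overall mean to use for unseen groups
        Xy = X.join(y)
        self.group_averages = Xy.groupby(self.grouper)[y.name].mean().to_dict()
        self.overall_average_ = y.mean()
        return self

    def predict(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        # Fall back to the overall mean instead of None for unknown groups
        return [self.group_averages.get(row.STATE, self.overall_average_)
                for row in X.itertuples()]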

Related

What is an efficient way to make a dataset and dataloader for high frequency time series with multiple individuals?

I'm trying to forecast high-frequency time series using LSTMs and the PyTorch library. I'm going through the PyTorch tutorial on creating custom datasets and models and figured out how to create my Dataset class and my DataLoader. They work fine, but they take too much time to generate one batch.
I want to generate batches of fixed size; each batch contains time series from different individuals, and the input window is of the same length as the output window (multi-step prediction).
I think the issue is due to the fact that I'm verifying that the windows are correct.
My dataframe has a little more than 3M lines with 6 columns. I have about 100 individuals, and for each individual I have 4 different time series $y_{1}$, $y_{2}$, $y_{3}$ and $y_{4}$. I have no missing values at all and the time steps are consecutive. Each individual has the same time steps.
My code is:
import numpy as np
import torch
from torch.utils.data import Dataset

class TSDataset(Dataset):
    def __init__(self, train_data, unique_column='unique_id', input_length=3840,
                 target_length=3840, targets=['y1', 'y2', 'y3', 'y4'], transform=None):
        self.train_data = train_data
        self.unique_column = unique_column
        self.input_length = input_length
        self.target_length = target_length
        self.total_window_length = input_length + target_length
        self.targets = targets

    def __len__(self):
        return len(self.train_data)

    def verify_time_steps(self, idx):
        change = False
        # Check that the window doesn't overlap several individuals
        num_individuals = self.train_data.iloc[np.arange(idx + self.total_window_length), :][self.unique_column].unique().shape[0]
        if num_individuals != 1:
            change = True
        if idx + self.total_window_length >= len(self.train_data):
            change = True
        return change

    def reshuffle(self):
        return np.random.randint(0, len(self.train_data))

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        change = self.verify_time_steps(idx)
        if change == True:
            while change != False:
                idx = self.reshuffle()
                change = self.verify_time_steps(idx)
        sample = self.train_data.iloc[np.arange(idx, idx + self.input_length), :][self.targets].values
        labels = self.train_data.iloc[np.arange(idx + self.input_length, idx + self.input_length + self.target_length), :][self.targets].values
        sample = torch.from_numpy(sample)
        labels = torch.from_numpy(labels)
        return sample, labels
I've tried using the TimeSeriesDataset from PyTorchForecasting but I had a hard time creating models that suit it.
I've also tried creating the dataset outside, as a numpy array, but my RAM can't handle it.
Hope you can help me figure out how to speed up these computations.
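Not an authoritative fix, but one common way to avoid the per-item validity check entirely is to precompute, once in __init__, every start index whose full window stays inside a single individual's block of rows, and let __getitem__ just slice. A sketch under the assumptions stated in the question (rows for each individual are contiguous and time steps are consecutive); the class name PrecomputedTSDataset and its internals are mine:

import numpy as np
import torch
from torch.utils.data import Dataset

class PrecomputedTSDataset(Dataset):
    """Sketch: precompute valid window starts so __getitem__ never has to verify."""

    def __init__(self, train_data, unique_column='unique_id',
                 input_length=3840, target_length=3840,
                 targets=['y1', 'y2', 'y3', 'y4']):
        self.values = train_data[targets].to_numpy()
        self.input_length = input_length
        self.target_length = target_length
        total = input_length + target_length
        # For each individual, every start index whose full window stays inside
        # that individual's contiguous block of rows is valid.
        starts = []
        groups = train_data.groupby(unique_column, sort=False).indices
        for idx in groups.values():
            first, last = idx[0], idx[-1]
            if last - first + 1 >= total:
                starts.append(np.arange(first, last - total + 2))
        self.valid_starts = np.concatenate(starts)

    def __len__(self):
        return len(self.valid_starts)

    def __getitem__(self, i):
        start = self.valid_starts[i]
        mid = start + self.input_length
        end = mid + self.target_length
        sample = torch.from_numpy(self.values[start:mid])
        labels = torch.from_numpy(self.values[mid:end])
        return sample, labels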

How do I get the coefficients/intercept for each group/model so I can plot the fitted line for each group?

I've written a custom class to group elements of a dataset, fit each group, and then run predictions for each group based on the fitted model. I want to be able to return the coefficients of each fitting (presumably in a dictionary), so that I can refer back to them and plot the line of best fit for each.
Calling the standard .coef_ or .get_params methods does not work because the items these methods attempt to retrieve are groupby objects. Alternatively, I tried to introduce the following:
def get_coefs():
    coefs_dict = {}
    for name, values in dataframe.groupby(self.groupby_column):
        coefs_dict[name] = self.drugs_dict[name].coefs_
    return coefs_dict
But I get the following:
<bound method GroupbyEstimator.get_coefs of GroupbyEstimator(groupby_column='ndc',
pipeline_factory=<function pipeline_factory at 0x0000018DAD207268>)>
Here's the class I've written:
from sklearn import base
import numpy as np
import pandas as pd

class GroupbyEstimator(base.BaseEstimator, base.RegressorMixin):
    def __init__(self, groupby_column, pipeline_factory):
        self.groupby_column = groupby_column
        self.pipeline_factory = pipeline_factory

    def fit(self, dataframe, label):
        self.drugs_dict = {}
        self.label = label
        dataframe = pd.get_dummies(dataframe)
        for name, values in dataframe.groupby(self.groupby_column):
            y = values[label]
            X = values.drop(columns=[label, self.groupby_column], axis=1)
            self.drugs_dict[name] = self.pipeline_factory().fit(X, y)
        return self

    def get_coefs():
        self.coefs_dict = {}
        self.coefs_dict[name] = self.drugs_dict[name].named_steps["lin_reg"].coef_
        return self.coefs_dict

    def predict(self, test_data):
        price_pred_list = []
        for idx, row in test_data.iterrows():
            name = row[self.groupby_column]
            regression_coefs = self.drugs_dict[name]
            row = pd.DataFrame(row).T
            X = row.drop(columns=[self.label, self.groupby_column], axis=1).values.reshape(1, -1)
            drug_price_pred = regression_coefs.predict(X)
            price_pred_list.append([name, drug_price_pred])
        return price_pred_list
Expected result is a dictionary of the format:
{drug_a: [coefficient_1, coefficient_2,...coefficient_n],
drug_b: [coefficient_1, coefficient_2,...coefficient_n],
drug_c: [coefficient_1, coefficient_2,...coefficient_n]}
The pipeline factory is like this. I'll be building this out with alternative regressors, PCA, GridSearchCV, etc. at a later time (so long as I can get the parameters out of the groupby objects for the individual regressions).
def pipeline_factory():
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    return Pipeline([
        ('lin_reg', LinearRegression())
    ])
EDIT: Added the get_coefs method as suggested. Unfortunately, as displayed above, it is still returning the same error.
The problem is with self.drugs_dict, which is a dictionary of Pipeline objects, so you can't use coef_ on them directly. coef_ is an attribute of the estimator object, which in your case is a LinearRegression object. So the correct way of accessing the coefficients is self.drugs_dict[name].named_steps["lin_reg"].coef_ instead of self.drugs_dict[name].coefs_ in your get_coefs() method.
While @Parthasarathy Subburaj led me to the right answer, here's the completed code for anyone who may be looking for a similar solution:
from sklearn import base
import numpy as np
import pandas as pd

class GroupbyEstimator(base.BaseEstimator, base.RegressorMixin):
    def __init__(self, groupby_column, pipeline_factory):
        # column is the value to group by; estimator_factory can be called to produce estimators
        self.groupby_column = groupby_column
        self.pipeline_factory = pipeline_factory

    def fit(self, dataframe, label):
        # Create an estimator and fit it with the portion in each group (create and fit a model per group)
        self.drugs_dict = {}
        self.label = label
        self.coefs_dict = {}
        dataframe = pd.get_dummies(dataframe)  # the one-hot encoder had problems with the data, so I'm getting the dummies with pandas here
        for name, values in dataframe.groupby(self.groupby_column):
            y = values[label]
            X = values.drop(columns=[label, self.groupby_column], axis=1)
            self.drugs_dict[name] = self.pipeline_factory().fit(X, y)
            self.coefs_dict[name] = self.drugs_dict[name].named_steps["lin_reg"].coef_
        return self

    def get_coefs(self):
        return self.coefs_dict

    def predict(self, test_data):
        price_pred_list = []
        for idx, row in test_data.iterrows():
            name = row[self.groupby_column]  # get drug name from drug column
            regression_coefs = self.drugs_dict[name]  # get the fitted pipeline for this drug
            row = pd.DataFrame(row).T
            X = row.drop(columns=[self.label, self.groupby_column], axis=1).values.reshape(1, -1)
            drug_price_pred = regression_coefs.predict(X)  # use the pipeline fitted for this drug (key = drug name) to predict
            price_pred_list.append([name, drug_price_pred])
        return price_pred_list
The TL;DR of the comments is that the dictionary holding model names and coefficients needs to be created under the fit method using sklearn's .named_steps on the desired portion of the pipeline, and then returned in a separate method (in this case get_coefs).
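A quick usage sketch (train_df and the 'price' label are placeholder names; only groupby_column='ndc' comes from the question). Note that get_coefs must be called with parentheses; referring to model.get_coefs without them is what produces the <bound method ...> output shown above.

model = GroupbyEstimator(groupby_column='ndc', pipeline_factory=pipeline_factory)
model.fit(train_df, 'price')
coefs = model.get_coefs()   # note the parentheses
print(coefs)                # e.g. {'drug_a': array([...]), 'drug_b': array([...]), ...}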

How to separate two concatenated words

I have a review dataset and I want to process it using NLP techniques. I did all the preprocessing stages (removing stop words, stemming, etc.). My problem is that there are some words which are concatenated together, and my function doesn't understand those. Here is an example:
Great services. I had a nicemeal and I love it a lot.
How can I correct it from nicemeal to nice meal?
Peter Norvig has a nice solution to the word segmentation problem that you are encountering. Long story short, he uses a large dataset of word (and bigram) frequencies and some dynamic programming to split long strings of connected words into their most likely segmentation.
You can download the zip file with the source code and the word frequencies and adapt it to your use case. Here is the relevant bit, for completeness.
import operator
from functools import reduce  # needed on Python 3

def memo(f):
    "Memoize function f."
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table
    return fmemo

@memo
def segment(text):
    "Return a list of words that is the best segmentation of text."
    if not text: return []
    candidates = ([first] + segment(rem) for first, rem in splits(text))
    return max(candidates, key=Pwords)

def splits(text, L=20):
    "Return a list of all possible (first, rem) pairs, len(first)<=L."
    return [(text[:i+1], text[i+1:])
            for i in range(min(len(text), L))]

def Pwords(words):
    "The Naive Bayes probability of a sequence of words."
    return product(Pw(w) for w in words)

#### Support functions (p. 224)

def product(nums):
    "Return the product of a sequence of numbers."
    return reduce(operator.mul, nums, 1)

class Pdist(dict):
    "A probability distribution estimated from counts in datafile."
    def __init__(self, data=[], N=None, missingfn=None):
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.values()))  # itervalues() in the original Python 2 code
        self.missingfn = missingfn or (lambda k, N: 1./N)
    def __call__(self, key):
        if key in self: return self[key]/self.N
        else: return self.missingfn(key, self.N)

def datafile(name, sep='\t'):
    "Read key,value pairs from file."
    for line in open(name):  # file(name) in the original Python 2 code
        yield line.split(sep)

def avoid_long_words(key, N):
    "Estimate the probability of an unknown word."
    return 10./(N * 10**len(key))

N = 1024908267229  ## Number of tokens
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)
You can also use the segment2 method as it uses bigrams and is much more accurate.
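A quick usage sketch, assuming count_1w.txt from the downloaded zip sits in the working directory:

print(segment('nicemeal'))       # expected to give ['nice', 'meal']
print(segment('greatservices'))  # expected to give ['great', 'services']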

Characterised inventory matrix in MultiLCA calculations with Brightway

What does the characterised inventory matrix of a MultiLCA object represent in Brightway2? I would have expected to find several of these matrices in the object, representing the characterised inventory of the different activities and different impact assessment methods.
For a simple LCA object, the sum of all the elements of the characterised inventory matrix gives the total impact of that activity. But this does not seem to be the case for MultiLCA objects, e.g.:
# impact assessment methods
i2002 = [('IMPACT 2002+ (Endpoint)', 'resources', 'total'),
         ('IMPACT 2002+ (Endpoint)', 'climate change', 'climate change'),
         ('IMPACT 2002+ (Endpoint)', 'human health', 'total'),
         ('IMPACT 2002+ (Endpoint)', 'ecosystem quality', 'total')
         ]

fu = []
for j in range(1, 11):
    fu.append({bw.Database('ei_33c').random(): 1})

testsetup_i2002 = {'inv': fu, 'ia': i2002}
bw.calculation_setups['testsetup_i2002'] = testsetup_i2002
mlca_test = bw.MultiLCA('testsetup_i2002')
result = mlca_test.lca.characterized_inventory.sum()
the result is different from the scores or sum of scores obtained from
mlca_test.results()
You can see the MultiLCA source code, which is relatively simple. It overwrites the characterized_inventory matrix for each step of the calculation, and only stores the results in a Numpy array: self.results = np.zeros((len(self.func_units), len(self.methods))). To get what you want - separate characterized inventory matrices for each combination of functional unit and LCIA method - you would have to write your own subclass. Here is one example:
from bw2calc.multi_lca import *

class PersistentMultiLCA(MultiLCA):
    def __init__(self, cs_name):
        if not calculation_setups:
            raise ImportError
        assert cs_name in calculation_setups
        try:
            cs = calculation_setups[cs_name]
        except KeyError:
            raise ValueError(
                "{} is not a known `calculation_setup`.".format(cs_name)
            )
        self.func_units = cs['inv']
        self.methods = cs['ia']
        self.lca = LCA(demand=self.all, method=self.methods[0])
        self.lca.lci(factorize=True)
        self.method_matrices = []
        self.results = {}
        for method in self.methods:
            self.lca.switch_method(method)
            self.method_matrices.append(self.lca.characterization_matrix)
        for row, func_unit in enumerate(self.func_units):
            self.lca.redo_lci(func_unit)
            for col, cf_matrix in enumerate(self.method_matrices):
                self.lca.characterization_matrix = cf_matrix
                self.lca.lcia_calculation()
                self.results[row, col] = self.lca.characterized_inventory.copy()
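A short usage sketch of the subclass above, reusing the calculation setup from the question; since the sum of a characterised inventory matrix gives the total impact, each stored matrix should now sum to the corresponding score:

mlca = PersistentMultiLCA('testsetup_i2002')
ci = mlca.results[0, 0]   # characterized inventory for the first functional unit, first method
print(ci.sum())           # should match the score for that functional unit / method pair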

Training procedure

I noticed that the sklearn.linear_model.SGDClassifier implements gradient descent for a linear model, therefore one could state that the class combines the fitting procedure (SGD) and the model (linear model) in one class.
SGD is, however, not inherent to linear models, and linear models can be trained using many other optimizers, each with particular pros (memory usage, convergence speed, local optima avoidance*, ...). One could say that such optimization techniques define how to iterate over the training data, whether to do it online or offline, in which feature dimension to apply an update, and when to stop (possibly based on an error function evaluated on a validation set).
In particular, I implemented a model using theano and wrapped it in a fit/predict interface. Theano is cool because it allows one to define a callable which applies gradient descent on one sample, or on a set of samples, as well as a callable which returns the error on a validation set. But this coolness is not inherent to theano, a lot more models can simply define an update and error-evaluation function, which can then be used by different iteration- and stopping policies for fitting.
The theano examples often use minibatches, and the minibatch code is copy-pasted or reimplemented a lot with just minor adjustments that could easily be factored out. So I was hoping that sklearn implements something that you initialize with some parameters and an update/error callable in order to fit 'any model'. Or possibly there is some good practice on how to do this yourself (especially w.r.t. the interface of the fitter).
Is there anything like this (in sklearn), that is, fitters which do not define the model?
*In the particular case of linear models and a l2 cost function, local optima do not exist of course, but still.
EDIT
Fair enough, this calls for a suggestion. I coded these two classes, which are not 100% clean, but they give an idea of what I mean:
import numpy

class StochasticUpdate():
    def __init__(self, model, update, n_epochs, n_data_points, error=None, test_fraction=None):
        self.update = update
        self.n_epochs = n_epochs
        self.n_data_points = n_data_points
        self.error = error
        self.model = model
        if self.error is None and test_fraction is not None:
            raise ValueError('error parameter must be specified if a test_fraction (value: %s) should be used.' % test_fraction)
        self.do_test = test_fraction is not None
        self.n_train_samples = int(n_data_points * (1 - test_fraction)) if self.do_test else n_data_points
        if self.do_test:
            self.test_range = numpy.arange(self.n_train_samples, n_data_points)
            self.n_test_samples = int(n_data_points * test_fraction)
        self.train_range = numpy.arange(0, self.n_train_samples)

    def fit(self):
        if self.do_test: self.test_errors = []
        self.train_errors = []
        self.mean_cost_values = []
        for epoch in range(self.n_epochs):
            order = numpy.random.permutation(self.n_train_samples)
            mean_cost_value = 0
            for i in range(self.n_train_samples):
                mean_cost_value += self.update([order[i]])
            self.mean_cost_values.append(mean_cost_value / self.n_data_points)
            if self.error is not None:
                self.train_errors.append(self.error(self.train_range))
            if self.do_test:
                self.test_errors.append(self.error(self.test_range))
        return self.model

from math import ceil

class MinibatchStochasticUpdate(StochasticUpdate):
    def __init__(self, model, update, n_epochs, n_data_points, error, batch_size, patience=5000, patience_increase=2,
                 improvement_threshold=0.995, validation_frequency=None, validate_faction=0.1, test_fraction=None):
        super().__init__(model, update, n_epochs, n_data_points, error=error, test_fraction=test_fraction)
        self.update = update
        self.n_epochs = n_epochs
        self.n_data_points = n_data_points
        self.model = model
        self.batch_size = batch_size
        self.patience = patience
        self.patience_increase = patience_increase
        self.improvement_threshold = improvement_threshold
        self.n_validation_samples = int(n_data_points * validate_faction)
        self.validation_range = numpy.arange(self.n_train_samples, self.n_train_samples + self.n_validation_samples)
        self.n_train_batches = int(ceil(self.n_train_samples / self.batch_size))
        self.train_batch_ranges = [
            numpy.arange(minibatch_index * self.batch_size, min((minibatch_index + 1) * self.batch_size, self.n_train_samples))
            for minibatch_index in range(self.n_train_batches)
        ]
        self.validation_frequency = min(self.n_train_batches, patience / 2) if validation_frequency is None else validation_frequency

    def fit(self):
        self.best_validation_error = numpy.inf
        best_params = None
        patience = self.patience
        iteration = 0
        for epoch in range(self.n_epochs):
            for minibatch_index in range(self.n_train_batches):
                self.update(self.train_batch_ranges[minibatch_index])
                if (iteration + 1) % self.validation_frequency == 0:
                    current_validation_error = self.error(self.validation_range)
                    if current_validation_error < self.best_validation_error:
                        if current_validation_error < self.best_validation_error * self.improvement_threshold:
                            patience = max(patience, iteration * self.patience_increase)
                        best_params = self.model.copy_parameters()
                        self.best_validation_error = current_validation_error
                if iteration > patience:
                    self.model.set_parameters(best_params)
                    return self.model
                iteration += 1
        self.model.set_parameters(best_params)
        return self.model
Then, in the model's fit method, one could support different training approaches and stopping criteria like this:
def fit(self, X, y):
    X_shared = theano.shared(X, borrow=True)
    y_shared = theano.shared(y, borrow=True)
    learning_rate = self.training_method_options['learning_rate']
    trainer = {
        'stochastic_gradient_descent': lambda: StochasticUpdate(
            self,
            update=self.update_stochastic_gradient_descent_function(X_shared, y_shared, learning_rate),
            n_epochs=self.training_method_options['n_epochs'],
            n_data_points=X.shape[0],
            error=self.evaluation_function(X_shared, y_shared),
        ),
        'minibatch_gradient_descent': lambda: MinibatchStochasticUpdate(
            self,
            update=self.update_stochastic_gradient_descent_function(X_shared, y_shared, learning_rate),
            n_epochs=self.training_method_options['n_epochs'],
            n_data_points=X.shape[0],
            error=self.evaluation_function(X_shared, y_shared),
            batch_size=self.training_method_options['batch_size']
        )
    }[self.training_method]()
    trainer.fit()
    return self
Obviously the hash-map part is hacky and could be done more elegantly using a standardized interface for the two classes above (since the hash maps are still O(N*M) in size for N fitters and M models).
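One way to keep that lookup out of every model's fit, sketched under the assumption that all trainers accept the same core keyword arguments, is a small registry of trainer factories; the names below (TRAINERS, make_trainer) are mine, not an existing sklearn facility:

# Hypothetical registry: maps a training-method name to a trainer class.
TRAINERS = {
    'stochastic_gradient_descent': StochasticUpdate,
    'minibatch_gradient_descent': MinibatchStochasticUpdate,
}

def make_trainer(name, model, update, error, n_data_points, **options):
    # Shared arguments are passed explicitly; trainer-specific ones
    # (n_epochs, batch_size, ...) come in through **options.
    return TRAINERS[name](model, update, error=error,
                          n_data_points=n_data_points, **options)

# Usage inside a model's fit (update_fn and error_fn built from theano as above):
# trainer = make_trainer(self.training_method, self, update_fn, error_fn,
#                        n_data_points=X.shape[0], **self.training_method_options)
# trainer.fit()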
