K-Means clustering Hyperparameter Tuning - python-3.x

I am trying to perform hyperparameter tuning for Spatio-Temporal K-Means clustering by using it in a pipeline with a Decision Tree classifier. The idea is to use K-Means clustering algorithm to generate cluster-distance space matrix and clustered labels which will be then passed to Decision Tree classifier. For hyperparameter tuning, just use parameters for K-Means algorithm.
I am using Python 3.8 and sklearn 0.22.
The data I am interested is having 3 columns/attributes: 'time', 'x' and 'y' (x and y are spatial coordinates).
The code is:
class ST_KMeans(BaseEstimator, TransformerMixin):
# class ST_KMeans():
"""
Note that K-means clustering algorithm is designed for Euclidean distances.
It may stop converging with other distances, when the mean is no longer a
best estimation for the cluster 'center'.
The 'mean' minimizes squared differences (or, squared Euclidean distance).
If you want a different distance function, you need to replace the mean with
an appropriate center estimation.
Parameters:
k: number of clusters
eps1 : float, default=0.5
The spatial density threshold (maximum spatial distance) between
two points to be considered related.
eps2 : float, default=10
The temporal threshold (maximum temporal distance) between two
points to be considered related.
metric : string default='euclidean'
The used distance metric - more options are
‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’,
‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’,
‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘rogerstanimoto’, ‘sqeuclidean’,
‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘yule’.
n_jobs : int or None, default=-1
The number of processes to start; -1 means use all processors (BE AWARE)
Attributes:
labels : array, shape = [n_samples]
Cluster labels for the data - noise is defined as -1
"""
def __init__(self, k, eps1 = 0.5, eps2 = 10, metric = 'euclidean', n_jobs = 1):
self.k = k
self.eps1 = eps1
self.eps2 = eps2
# self.min_samples = min_samples
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X):
"""
Apply the ST K-Means algorithm
X : 2D numpy array. The first attribute of the array should be time attribute
as float. The following positions in the array are treated as spatial
coordinates.
The structure should look like this [[time_step1, x, y], [time_step2, x, y]..]
For example 2D dataset:
array([[0,0.45,0.43],
[0,0.54,0.34],...])
Returns:
self
"""
# check if input is correct
X = check_array(X)
# type(X)
# numpy.ndarray
# Check arguments for DBSCAN algo-
if not self.eps1 > 0.0 or not self.eps2 > 0.0:
raise ValueError('eps1, eps2, minPts must be positive')
# Get dimensions of 'X'-
# n - number of rows
# m - number of attributes/columns-
n, m = X.shape
# Compute sqaured form Euclidean Distance Matrix for 'time' and spatial attributes-
time_dist = squareform(pdist(X[:, 0].reshape(n, 1), metric = self.metric))
euc_dist = squareform(pdist(X[:, 1:], metric = self.metric))
'''
Filter the euclidean distance matrix using time distance matrix. The code snippet gets all the
indices of the 'time_dist' matrix in which the time distance is smaller than 'eps2'.
Afterward, for the same indices in the euclidean distance matrix the 'eps1' is doubled which results
in the fact that the indices are not considered during clustering - as they are bigger than 'eps1'.
'''
# filter 'euc_dist' matrix using 'time_dist' matrix-
dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)
# Initialize K-Means clustering model-
kmeans_clust_model = KMeans(
n_clusters = self.k, init = 'k-means++',
n_init = 10, max_iter = 300,
precompute_distances = 'auto', algorithm = 'auto')
# Train model-
kmeans_clust_model.fit(dist)
self.labels = kmeans_clust_model.labels_
self.X_transformed = kmeans_clust_model.fit_transform(X)
return self
def transform(self, X):
pass
# Initialize ST-K-Means object-
st_kmeans_algo = ST_KMeans(
k = 5, eps1=0.6,
eps2=9, metric='euclidean',
n_jobs=1
)
# Train on a chunk of dataset-
st_kmeans_algo.fit(data.loc[:500, ['time', 'x', 'y']])
# Get clustered data points labels-
kmeans_labels = st_kmeans_algo.labels
kmeans_labels.shape
# (501,)
# Get labels for points clustered using trained model-
kmeans_transformed = st_kmeans_algo.X_transformed
kmeans_transformed.shape
# (501, 5)
dtc = DecisionTreeClassifier()
dtc.fit(kmeans_transformed, kmeans_labels)
y_pred = dtc.predict(kmeans_transformed)
# Get model performance metrics-
accuracy = accuracy_score(kmeans_labels, y_pred)
precision = precision_score(kmeans_labels, y_pred, average='macro')
recall = recall_score(kmeans_labels, y_pred, average='macro')
print("\nDT model metrics are:")
print("accuracy = {0:.4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
accuracy, precision, recall
))
# DT model metrics are:
# accuracy = 1.0000, precision = 1.0000 & recall = 1.0000
However, when I try to perform hyper-paramter tuning using sklearn's pipeline:
# Hyper-parameter Tuning:
# Define steps of pipeline-
pipeline_steps = [
('st_kmeans_algo' ,ST_KMeans(k = 5, eps1=0.6, eps2=9, metric='euclidean', n_jobs=1)),
('dtc', DecisionTreeClassifier())
]
# Instantiate a pipeline-
pipeline = Pipeline(pipeline_steps)
# Train pipeline-
pipeline.fit(kmeans_transformed, kmeans_labels)
It gives me the following error:
--------------------------------------------------------------------------- TypeError Traceback (most recent call
last) in
8
9 # Train pipeline-
---> 10 pipeline.fit(kmeans_transformed, kmeans_labels)
~/.local/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self,
X, y, **fit_params)
348 This estimator
349 """
--> 350 Xt, fit_params = self._fit(X, y, **fit_params)
351 with _print_elapsed_time('Pipeline',
352 self._log_message(len(self.steps) - 1)):
~/.local/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self,
X, y, **fit_params)
309 cloned_transformer = clone(transformer)
310 # Fit or load from cache the current transformer
--> 311 X, fitted_transformer = fit_transform_one_cached(
312 cloned_transformer, X, y, None,
313 message_clsname='Pipeline',
~/.local/lib/python3.8/site-packages/joblib/memory.py in
call(self, *args, **kwargs)
353
354 def call(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~/.local/lib/python3.8/site-packages/sklearn/pipeline.py in
_fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.8/site-packages/sklearn/base.py in
fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
--> 574 return self.fit(X, y, **fit_params).transform(X)
575
576
TypeError: fit() takes 2 positional arguments but 3 were given

The fit method in your ST_KMeans takes in only X as input but in this line:
pipeline.fit(kmeans_transformed, kmeans_labels)
you pass both X and Y as input to your pipeline which tries to call the fit method of the first stage of your pipeline i.e. ST_KKeans with these two arguments resulting in this error. In order to overcome this just add a dummy parameter y to the fit method of your ST_KMeans objects as shown below:
def fit(self, X, Y):
The additional parameter Y is not used anywhere inside the method it just maintains the consistency.

Related

K-Means GridSearchCV hyperparameter tuning

I am trying to perform hyperparameter tuning for Spatio-Temporal K-Means clustering by using it in a pipeline with a Decision Tree classifier. The idea is to use K-Means clustering algorithm to generate cluster-distance space matrix and clustered labels which will be then passed to Decision Tree classifier. For hyperparameter tuning, just use parameters for K-Means algorithm.
I am using Python 3.8 and sklearn 0.22.
The data I am interested is having 3 columns/attributes: 'time', 'x' and 'y' (x and y are spatial coordinates).
The code is:
class ST_KMeans(BaseEstimator, TransformerMixin):
# class ST_KMeans():
"""
Note that K-means clustering algorithm is designed for Euclidean distances.
It may stop converging with other distances, when the mean is no longer a
best estimation for the cluster 'center'.
The 'mean' minimizes squared differences (or, squared Euclidean distance).
If you want a different distance function, you need to replace the mean with
an appropriate center estimation.
Parameters:
k: number of clusters
eps1 : float, default=0.5
The spatial density threshold (maximum spatial distance) between
two points to be considered related.
eps2 : float, default=10
The temporal threshold (maximum temporal distance) between two
points to be considered related.
metric : string default='euclidean'
The used distance metric - more options are
‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’,
‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’,
‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘rogerstanimoto’, ‘sqeuclidean’,
‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘yule’.
n_jobs : int or None, default=-1
The number of processes to start; -1 means use all processors (BE AWARE)
Attributes:
labels : array, shape = [n_samples]
Cluster labels for the data - noise is defined as -1
"""
def __init__(self, k, eps1 = 0.5, eps2 = 10, metric = 'euclidean', n_jobs = 1):
self.k = k
self.eps1 = eps1
self.eps2 = eps2
# self.min_samples = min_samples
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X, Y = None):
"""
Apply the ST K-Means algorithm
X : 2D numpy array. The first attribute of the array should be time attribute
as float. The following positions in the array are treated as spatial
coordinates.
The structure should look like this [[time_step1, x, y], [time_step2, x, y]..]
For example 2D dataset:
array([[0,0.45,0.43],
[0,0.54,0.34],...])
Returns:
self
"""
# check if input is correct
X = check_array(X)
# type(X)
# numpy.ndarray
# Check arguments for DBSCAN algo-
if not self.eps1 > 0.0 or not self.eps2 > 0.0:
raise ValueError('eps1, eps2, minPts must be positive')
# Get dimensions of 'X'-
# n - number of rows
# m - number of attributes/columns-
n, m = X.shape
# Compute sqaured form Euclidean Distance Matrix for 'time' and spatial attributes-
time_dist = squareform(pdist(X[:, 0].reshape(n, 1), metric = self.metric))
euc_dist = squareform(pdist(X[:, 1:], metric = self.metric))
'''
Filter the euclidean distance matrix using time distance matrix. The code snippet gets all the
indices of the 'time_dist' matrix in which the time distance is smaller than 'eps2'.
Afterward, for the same indices in the euclidean distance matrix the 'eps1' is doubled which results
in the fact that the indices are not considered during clustering - as they are bigger than 'eps1'.
'''
# filter 'euc_dist' matrix using 'time_dist' matrix-
dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)
# Initialize K-Means clustering model-
self.kmeans_clust_model = KMeans(
n_clusters = self.k, init = 'k-means++',
n_init = 10, max_iter = 300,
precompute_distances = 'auto', algorithm = 'auto')
# Train model-
self.kmeans_clust_model.fit(dist)
self.labels = self.kmeans_clust_model.labels_
self.X_transformed = self.kmeans_clust_model.fit_transform(X)
return self
def transform(self, X):
if not isinstance(X, np.ndarray):
# Convert to numpy array-
X = X.values
# Get dimensions of 'X'-
# n - number of rows
# m - number of attributes/columns-
n, m = X.shape
# Compute sqaured form Euclidean Distance Matrix for 'time' and spatial attributes-
time_dist = squareform(pdist(X[:, 0].reshape(n, 1), metric = self.metric))
euc_dist = squareform(pdist(X[:, 1:], metric = self.metric))
# filter 'euc_dist' matrix using 'time_dist' matrix-
dist = np.where(time_dist <= self.eps2, euc_dist, 2 * self.eps1)
# return self.kmeans_clust_model.transform(X)
return self.kmeans_clust_model.transform(dist)
# Initialize ST-K-Means object-
st_kmeans_algo = ST_KMeans(
k = 5, eps1=0.6,
eps2=9, metric='euclidean',
n_jobs=1
)
Y = np.zeros(shape = (501,))
# Train on a chunk of dataset-
st_kmeans_algo.fit(data.loc[:500, ['time', 'x', 'y']], Y)
# Get clustered data points labels-
kmeans_labels = st_kmeans_algo.labels
kmeans_labels.shape
# (501,)
# Get labels for points clustered using trained model-
# kmeans_transformed = st_kmeans_algo.X_transformed
kmeans_transformed = st_kmeans_algo.transform(data.loc[:500, ['time', 'x', 'y']])
kmeans_transformed.shape
# (501, 5)
dtc = DecisionTreeClassifier()
dtc.fit(kmeans_transformed, kmeans_labels)
y_pred = dtc.predict(kmeans_transformed)
# Get model performance metrics-
accuracy = accuracy_score(kmeans_labels, y_pred)
precision = precision_score(kmeans_labels, y_pred, average='macro')
recall = recall_score(kmeans_labels, y_pred, average='macro')
print("\nDT model metrics are:")
print("accuracy = {0:.4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(
accuracy, precision, recall
))
# DT model metrics are:
# accuracy = 1.0000, precision = 1.0000 & recall = 1.0000
# Hyper-parameter Tuning:
# Define steps of pipeline-
pipeline_steps = [
('st_kmeans_algo' ,ST_KMeans(k = 5, eps1=0.6, eps2=9, metric='euclidean', n_jobs=1)),
('dtc', DecisionTreeClassifier())
]
# Instantiate a pipeline-
pipeline = Pipeline(pipeline_steps)
kmeans_transformed.shape, kmeans_labels.shape
# ((501, 5), (501,))
# Train pipeline-
pipeline.fit(kmeans_transformed, kmeans_labels)
# Specify parameters to be hyper-parameter tuned-
params = [
{
'st_kmeans_algo__k': [3, 5, 7]
}
]
# Initialize GridSearchCV object-
grid_cv = GridSearchCV(estimator=pipeline, param_grid=params, cv = 2)
# Train GridSearch on computed data from above-
grid_cv.fit(kmeans_transformed, kmeans_labels)
The 'grid_cv.fit()' call gives the following error:
ValueError Traceback (most recent call
last) in
5
6 # Train GridSearch on computed data from above-
----> 7 grid_cv.fit(kmeans_transformed, kmeans_labels)
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py
in fit(self, X, y, groups, **fit_params)
708 return results
709
--> 710 self._run_search(evaluate_candidates)
711
712 # For multi-metric evaluation, store the best_index_, best_params_ and
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py
in _run_search(self, evaluate_candidates) 1149 def
_run_search(self, evaluate_candidates): 1150 """Search all candidates in param_grid"""
-> 1151 evaluate_candidates(ParameterGrid(self.param_grid)) 1152 1153
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py
in evaluate_candidates(candidate_params)
680 n_splits, n_candidates, n_candidates * n_splits))
681
--> 682 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
683 X, y,
684 train=train, test=test,
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
call(self, iterable) 1002 # remaining jobs. 1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator): 1005 self._iterating = self._original_iterator is not None 1006
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
_dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in
apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in
init(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
call(self)
253 # change the default number of processes to -1
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255 return [func(*args, **kwargs)
256 for func, args, kwargs in self.items]
257
~/.local/lib/python3.8/site-packages/joblib/parallel.py in
(.0)
253 # change the default number of processes to -1
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255 return [func(*args, **kwargs)
256 for func, args, kwargs in self.items]
257
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py
in _fit_and_score(estimator, X, y, scorer, train, test, verbose,
parameters, fit_params, return_train_score, return_parameters,
return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py
in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in
call(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in
_passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~/.local/lib/python3.8/site-packages/sklearn/utils/metaestimators.py
in (*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~/.local/lib/python3.8/site-packages/sklearn/pipeline.py in
score(self, X, y, sample_weight)
617 if sample_weight is not None:
618 score_params['sample_weight'] = sample_weight
--> 619 return self.steps[-1][-1].score(Xt, y, **score_params)
620
621 #property
~/.local/lib/python3.8/site-packages/sklearn/base.py in score(self, X,
y, sample_weight)
367 """
368 from .metrics import accuracy_score
--> 369 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
370
371
~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py
in accuracy_score(y_true, y_pred, normalize, sample_weight)
183
184 # Compute accuracy for each possible representation
--> 185 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
186 check_consistent_length(y_true, y_pred, sample_weight)
187 if y_type.startswith('multilabel'):
~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py
in _check_targets(y_true, y_pred)
78 y_pred : array or indicator matrix
79 """
---> 80 check_consistent_length(y_true, y_pred)
81 type_true = type_of_target(y_true)
82 type_pred = type_of_target(y_pred)
~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py in
check_consistent_length(*arrays)
209 uniques = np.unique(lengths)
210 if len(uniques) > 1:
--> 211 raise ValueError("Found input variables with inconsistent numbers of"
212 " samples: %r" % [int(l) for l in lengths])
213
ValueError: Found input variables with inconsistent numbers of
samples: [251, 250]
The different dimensions/shapes are:
kmeans_transformed.shape, kmeans_labels.shape, data.loc[:500, ['time', 'x', 'y']].shape
# ((501, 5), (501,), (501, 3))
I don't get it how the error arrives at the "samples: [251, 25]" ?
What's going wrong?
Thanks!
250 and 251 are respectively the shapes of your train and validation in GridSearchCV
look at your custom estimator...
def transform(self, X):
return self.X_transformed
the original transform method doesn't apply any sort of operation it simply returns the train data. we need an estimator that is able to transform the new data (in sour case the validation inside gridsearch) in a flexible way. change the transform method in this way
def transform(self, X):
return self.kmeans_clust_model.transform(X)

I have no idea where 'plot_learning_curve' file or 'learning.utils' module is

While following an example about classifying cats and dogs using AlexNet on some post I got stuck on this import error:
Traceback (most recent call last):
File "C:\Users\Gsum\Desktop\Asirra 개 고양이\asirra-dogs-cats-classification-master\learning\optimizers.py", line 5, in <module>
from learning.utils import plot_learning_curve
ImportError: No module named 'learning'
I've been looking for modules named similar to learning or learn which includes 'plot_learning_curve' function.
Anyone who knows which library includes plot learning curve function, I would appreciate some help.
Here is my code:
import os
import time
from abc import abstractmethod
import tensorflow as tf
from learning.utils import plot_learning_curve
class Optimizer(object):
"""Base class for gradient-based optimization algorithms."""
def __init__(self, model, train_set, evaluator, val_set=None, **kwargs):
"""
Optimizer initializer.
:param model: ConvNet, the model to be learned.
:param train_set: DataSet, training set to be used.
:param evaluator: Evaluator, for computing performance scores during training.
:param val_set: DataSet, validation set to be used, which can be None if not used.
:param kwargs: dict, extra arguments containing training hyperparameters.
- batch_size: int, batch size for each iteration.
- num_epochs: int, total number of epochs for training.
- init_learning_rate: float, initial learning rate.
"""
self.model = model
self.train_set = train_set
self.evaluator = evaluator
self.val_set = val_set
# Training hyperparameters
self.batch_size = kwargs.pop('batch_size', 256)
self.num_epochs = kwargs.pop('num_epochs', 320)
self.init_learning_rate = kwargs.pop('init_learning_rate', 0.01)
self.learning_rate_placeholder = tf.placeholder(tf.float32) # Placeholder for current learning rate
self.optimize = self._optimize_op()
self._reset()
def _reset(self):
"""Reset some variables."""
self.curr_epoch = 1
self.num_bad_epochs = 0 # number of bad epochs, where the model is updated without improvement.
self.best_score = self.evaluator.worst_score # initialize best score with the worst one
self.curr_learning_rate = self.init_learning_rate # current learning rate
#abstractmethod
def _optimize_op(self, **kwargs):
"""
tf.train.Optimizer.minimize Op for a gradient update.
This should be implemented, and should not be called manually.
"""
pass
#abstractmethod
def _update_learning_rate(self, **kwargs):
"""
Update current learning rate (if needed) on every epoch, by its own schedule.
This should be implemented, and should not be called manually.
"""
pass
def _step(self, sess, **kwargs):
"""
Make a single gradient update and return its results.
This should not be called manually.
:param sess: tf.Session.
:param kwargs: dict, extra arguments containing training hyperparameters.
- augment_train: bool, whether to perform augmentation for training.
:return loss: float, loss value for the single iteration step.
y_true: np.ndarray, true label from the training set.
y_pred: np.ndarray, predicted label from the model.
"""
augment_train = kwargs.pop('augment_train', True)
# Sample a single batch
X, y_true = self.train_set.next_batch(self.batch_size, shuffle=True,
augment=augment_train, is_train=True)
# Compute the loss and make update
_, loss, y_pred = \
sess.run([self.optimize, self.model.loss, self.model.pred],
feed_dict={self.model.X: X, self.model.y: y_true,
self.model.is_train: True,
self.learning_rate_placeholder: self.curr_learning_rate})
return loss, y_true, y_pred
def train(self, sess, save_dir='/tmp', details=False, verbose=True, **kwargs):
"""
Run optimizer to train the model.
:param sess: tf.Session.
:param save_dir: str, the directory to save the learned weights of the model.
:param details: bool, whether to return detailed results.
:param verbose: bool, whether to print details during training.
:param kwargs: dict, extra arguments containing training hyperparameters.
:return train_results: dict, containing detailed results of training.
"""
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer()) # initialize all weights
train_results = dict() # dictionary to contain training(, evaluation) results and details
train_size = self.train_set.num_examples
num_steps_per_epoch = train_size // self.batch_size
num_steps = self.num_epochs * num_steps_per_epoch
if verbose:
print('Running training loop...')
print('Number of training iterations: {}'.format(num_steps))
step_losses, step_scores, eval_scores = [], [], []
start_time = time.time()
# Start training loop
for i in range(num_steps):
# Perform a gradient update from a single minibatch
step_loss, step_y_true, step_y_pred = self._step(sess, **kwargs)
step_losses.append(step_loss)
# Perform evaluation in the end of each epoch
if (i+1) % num_steps_per_epoch == 0:
# Evaluate model with current minibatch, from training set
step_score = self.evaluator.score(step_y_true, step_y_pred)
step_scores.append(step_score)
# If validation set is initially given, use it for evaluation
if self.val_set is not None:
# Evaluate model with the validation set
eval_y_pred = self.model.predict(sess, self.val_set, verbose=False, **kwargs)
eval_score = self.evaluator.score(self.val_set.labels, eval_y_pred)
eval_scores.append(eval_score)
if verbose:
# Print intermediate results
print('[epoch {}]\tloss: {:.6f} |Train score: {:.6f} |Eval score: {:.6f} |lr: {:.6f}'\
.format(self.curr_epoch, step_loss, step_score, eval_score, self.curr_learning_rate))
# Plot intermediate results
plot_learning_curve(-1, step_losses, step_scores, eval_scores=eval_scores,
mode=self.evaluator.mode, img_dir=save_dir)
curr_score = eval_score
# else, just use results from current minibatch for evaluation
else:
if verbose:
# Print intermediate results
print('[epoch {}]\tloss: {} |Train score: {:.6f} |lr: {:.6f}'\
.format(self.curr_epoch, step_loss, step_score, self.curr_learning_rate))
# Plot intermediate results
plot_learning_curve(-1, step_losses, step_scores, eval_scores=None,
mode=self.evaluator.mode, img_dir=save_dir)
curr_score = step_score
# Keep track of the current best model,
# by comparing current score and the best score
if self.evaluator.is_better(curr_score, self.best_score, **kwargs):
self.best_score = curr_score
self.num_bad_epochs = 0
saver.save(sess, os.path.join(save_dir, 'model.ckpt')) # save current weights
else:
self.num_bad_epochs += 1
self._update_learning_rate(**kwargs)
self.curr_epoch += 1
if verbose:
print('Total training time(sec): {}'.format(time.time() - start_time))
print('Best {} score: {}'.format('evaluation' if eval else 'training',
self.best_score))
print('Done.')
if details:
# Store training results in a dictionary
train_results['step_losses'] = step_losses # (num_iterations)
train_results['step_scores'] = step_scores # (num_epochs)
if self.val_set is not None:
train_results['eval_scores'] = eval_scores # (num_epochs)
return train_results
class MomentumOptimizer(Optimizer):
"""Gradient descent optimizer, with Momentum algorithm."""
def _optimize_op(self, **kwargs):
"""
tf.train.MomentumOptimizer.minimize Op for a gradient update.
:param kwargs: dict, extra arguments for optimizer.
- momentum: float, the momentum coefficient.
:return tf.Operation.
"""
momentum = kwargs.pop('momentum', 0.9)
update_vars = tf.trainable_variables()
return tf.train.MomentumOptimizer(self.learning_rate_placeholder, momentum, use_nesterov=False)\
.minimize(self.model.loss, var_list=update_vars)
def _update_learning_rate(self, **kwargs):
"""
Update current learning rate, when evaluation score plateaus.
:param kwargs: dict, extra arguments for learning rate scheduling.
- learning_rate_patience: int, number of epochs with no improvement
after which learning rate will be reduced.
- learning_rate_decay: float, factor by which the learning rate will be updated.
- eps: float, if the difference between new and old learning rate is smaller than eps,
the update is ignored.
"""
learning_rate_patience = kwargs.pop('learning_rate_patience', 10)
learning_rate_decay = kwargs.pop('learning_rate_decay', 0.1)
eps = kwargs.pop('eps', 1e-8)
if self.num_bad_epochs > learning_rate_patience:
new_learning_rate = self.curr_learning_rate * learning_rate_decay
# Decay learning rate only when the difference is higher than epsilon.
if self.curr_learning_rate - new_learning_rate > eps:
self.curr_learning_rate = new_learning_rate
self.num_bad_epochs = 0

Any way to fold uncertainties/weights into scikit-learn validation_curve?

I have a time series dataset that includes uncertainties on each point. I want to use sklearn.model_selection.validation_curve() to figure out the best value of my hyperparameter, gamma, for a sklearn.svm.svr() (support vector machine regression). It is simple to pass in the uncertainties (actually their inverse, sample_weights) to the svr().fit() function, which works great. None of the inputs for validation_curve() seem to allow this though. Is there a workaround?
# Create some fake data
times = np.linspace(0, 10, 10)[:, None]
data = np.sin(times).ravel()
uncertainty = np.linspace(0.05, 0.1, num=10)
np.random.shuffle(uncertainty)
sample_weight = 1 / uncertainty
# Quick helper function
def jpm_svr(gamma=1e-6, **kwargs):
return make_pipeline(SVR(kernel='rbf', C=1e3, gamma=gamma, **kwargs))
# Find the best value of gamma
gamma = np.logspace(-2, 5, num=11, base=10)
shuffle_split = ShuffleSplit(n_splits=20, train_size=0.5, test_size=0.5, random_state=None)
######## The next line is where I want to input the sample_weight but can't
train_score, val_score = validation_curve(jpm_svr(), X, y, #sample_weight :(
'svr__gamma',
gamma, cv=shuffle_split, scoring=evs)
score = np.median(val_score, axis=1)
best_score_index = np.argmax(score)
# Generate model with best value of gamma and plot it over the data
# Also note that I can now pass in sample_weight
model = SVR(kernel='rbf', C=1e3, gamma=gamma[best_score_index]).fit(times, data, sample_weight)
X_test = np.linspace(0, 10, 50)[:, None]
y_test = model.predict(X_test)
plt.errorbar(times, data, yerr=uncertainty, fmt='o');
plt.plot(X_test.ravel(), y_test, linewidth=6);

How do I obtain predictions and probabilities from new data input to a CNN in Tensorflow

I'll preface this by saying this is my first posted question on SO. I've just recently started working with Tensorflow, and have been attempting to apply a convolutional-neural network model approach for classification of .csv records in a file representing images from scans of microarray data. (FYI: Microarrays are a grid of spotted DNA on a glass slide, representing specific DNA target sequences for determining the presence of those DNA targets in a sample. The individual pixels represent fluorescence intensity value from 0-1). The file has ~200,000 records in total. Each record (image) has 10816 pixels that represent DNA sequences from known viruses, and one index label which identifies the virus species. The pixels create a pattern which is unique to each of the different viruses. There are 2165 different viruses in total represented within the 200,000 records. I have trained the network on images of labeled microarray datasets, but when I try to pass a new dataset through to classify it/them as one of the 2165 different viruses and determine predicted values and probabilities, I don't seem to be having much luck. This is the code that I am currently using for this:
import tensorflow as tf
import numpy as np
import csv
def extract_data(filename):
print("extracting data...")
NUM_LABELS = 2165
NUM_FEATURES = 10816
labels = []
fvecs = []
rowCount = 0
#iterate over the rows, split the label from the features
#convert the labels to integers and features to floats
for line in open(filename):
rowCount = rowCount + 1
row = line.split(',')
labels.append(row[3])#(int(row[7])) #<<<IT ALWAYS PREDICTS THIS VALUE!
for x in row [4:10820]:
fvecs.append(float(x))
#convert the array of float arrasy into a numpy float matrix
fvecs_np = np.matrix(fvecs).astype(np.float32)
#convert the array of int lables inta a numpy array
labels_np = np.array(labels).astype(dtype=np.uint8)
#convert the int numpy array into a one-hot matrix
labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
print("arrays converted")
return fvecs_np, labels_onehot
def TestModels():
fvecs_np, labels_onehot = extract_data("MicroarrayTestData.csv")
print('RESTORING NN MODEL')
weights = {}
biases = {}
sess=tf.Session()
init = tf.global_variables_initializer()
#Load meta graph and restore weights
ModelID = "MicroarrayCNN_Data-1000.meta"
print("RESTORING:::", ModelID)
saver = tf.train.import_meta_graph(ModelID)
saver.restore(sess,tf.train.latest_checkpoint('./'))
graph = tf.get_default_graph()
x = graph.get_tensor_by_name("x:0")
y = graph.get_tensor_by_name("y:0")
keep_prob = tf.placeholder(tf.float32)
y_ = tf.placeholder("float", shape=[None, 2165])
wc1 = graph.get_tensor_by_name("wc1:0")
wc2 = graph.get_tensor_by_name("wc2:0")
wd1 = graph.get_tensor_by_name("wd1:0")
Wout = graph.get_tensor_by_name("Wout:0")
bc1 = graph.get_tensor_by_name("bc1:0")
bc2 = graph.get_tensor_by_name("bc2:0")
bd1 = graph.get_tensor_by_name("bd1:0")
Bout = graph.get_tensor_by_name("Bout:0")
weights = {wc1, wc2, wd1, Wout}
biases = {bc1, bc2, bd1, Bout}
print("NEXTArgmax")
prediction=tf.argmax(y,1)
probabilities = y
predY = prediction.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
probY = probabilities.eval(feed_dict={x: fvecs_np, y: labels_onehot}, session=sess)
accuracy = tf.reduce_mean(tf.cast(prediction, "float"))
print(sess.run(accuracy, feed_dict={x: fvecs_np, y: labels_onehot}))
print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
print("Predicted::: ", predY, accuracy)
print("%%%%%%%%%%%%%%%%%%%%%%%%%%")
feed_dictTEST = {y: labels_onehot}
probabilities=probY
print("probabilities", probabilities.eval(feed_dict={x: fvecs_np}, session=sess))
########## Run Analysis ###########
TestModels()
So, when I run this code I get the correct prediction for the test set, although I am not sure I believe it, because it appears that whatever value I append in line 14 (see below) is the output it predicts:
labels.append(row[3])#<<<IT ALWAYS PREDICTS THIS VALUE!
I don't understand this, and it makes me suspicious that I've set up the CNN incorrectly, as I would have expected it to ignore my input label and determine a bast match from the trained network based on the trained patterns. The only thing I can figure is that when I pass the value through for the prediction; it is instead training the model on this data as well, and then predicting itself. Is this a correct assumption, or am I misinterpreting how Tensorflow works?
The other issue is that when I try to use code that (based on other tutorials) which is supposed to output the probabilities of all of the 2165 possible outputs, I get the error:
InvalidArgumentError (see above for traceback): Shape [-1,2165] has negative dimensions
[[Node: y = Placeholder[dtype=DT_FLOAT, shape=[?,2165], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
To me, it looks like it is the correct layer based on the 2165 value in the Tensor shape, but I don't understand the -1 value. So, to wrap up the summary, my questions are:
Based on the fact that I get the value that I have in the label of the input data, is this the correct method to make a classification using this model?
Am I missing a layer or have I configured the model incorrectly in order to extract the probabilities of all of the possible output classes, or am I using the wrong code to extract the information? I try to print out the accuracy to see if that would work, but instead it outputs the description of a tensor, so clearly that is incorrect as well.
(ADDITIONAL INFORMATION)
As requested, I'm also including the original code that was used to train the model, which is now below. You can see I do sort of a piece meal training of a limited number of related records at a time by their taxonomic relationships as I iterate through the file. This is mostly because the Mac that I'm training on (Mac Pro w/ 64GB ram) tends to give me the "Killed -9" error due to overuse of resources if I don't do it this way. There may be a better way to do it, but this seems to work.
Original Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
from __future__ import print_function
import tensorflow as tf
import numpy as np
import csv
import random
# Parameters
num_epochs = 2
train_size = 1609
learning_rate = 0.001 #(larger >speed, lower >accuracy)
training_iters = 5000 # How much do you want to train (more = better trained)
batch_size = 32 #How many samples to train on, size of the training batch
display_step = 10 # How often to diplay what is going on during training
# Network Parameters
n_input = 10816 # MNIST data input (img shape: 28*28)...in my case 104x104 = 10816(rough array size)
n_classes = 2165 #3280 #2307 #787# Switched to 100 taxa/training set, dynamic was too wonky.
dropout = 0.75 # Dropout, probability to keep units. Jeffery Hinton's group developed it, that prevents overfitting to find new paths. More generalized model.
# Functions
def extract_data(filename):
print("extracting data...")
# arrays to hold the labels and feature vectors.
NUM_LABELS = 2165
NUM_FEATURES = 10826
taxCount = 0
taxCurrent = 0
labels = []
fvecs = []
rowCount = 0
#iterate over the rows, split the label from the features
#convert the labels to integers and features to floats
print("entering CNN loop")
for line in open(filename):
rowCount = rowCount + 1
row = line.split(',')
taxCurrent = row[3]
print("profile:", row[0:12])
labels.append(int(row[3]))
fvecs.append([float(x) for x in row [4:10820]])
#convert the array of float arrasy into a numpy float matrix
fvecs_np = np.matrix(fvecs).astype(np.float32)
#convert the array of int lables inta a numpy array
labels_np = np.array(labels).astype(dtype=np.uint8)
#convert the int numpy array into a one-hot matrix
labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
print("arrays converted")
return fvecs_np, labels_onehot
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1): #Layer 1 : Convolutional layer
# Conv2D wrapper, with bias and relu activation
print("conv2d")
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME') # Strides are the tensors...list of integers. Tensors=data
x = tf.nn.bias_add(x, b) #bias is the tuning knob
return tf.nn.relu(x) #rectified linear unit (activation function)
def maxpool2d(x, k=2): #Layer 2 : Takes samples from the image. (This is a 4D tensor)
print("maxpool2d")
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
padding='SAME')
# Create model
def conv_net(x, weights, biases, dropout):
print("conv_net setup")
# Reshape input picture
x = tf.reshape(x, shape=[-1, 104, 104, 1]) #-->52x52 , -->26x26x64
# Convolution Layer
conv1 = conv2d(x, weights['wc1'], biases['bc1']) #defined above already
# Max Pooling (down-sampling)
conv1 = maxpool2d(conv1, k=2)
print(conv1.get_shape)
# Convolution Layer
conv2 = conv2d(conv1, weights['wc2'], biases['bc2']) #wc2 and bc2 are just placeholders...could actually skip this layer...maybe
# Max Pooling (down-sampling)
conv2 = maxpool2d(conv2, k=2)
print(conv2.get_shape)
# Fully connected layer
# Reshape conv2 output to fit fully connected layer input
fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
fc1 = tf.nn.relu(fc1) #activation function for the NN
# Apply Dropout
fc1 = tf.nn.dropout(fc1, dropout)
# Output, class prediction
out = tf.add(tf.matmul(fc1, weights['Wout']), biases['Bout'])
return out
def Train_Network(Txid_IN, Sess_File_Name):
import tensorflow as tf
tf.reset_default_graph()
x,y = 0,0
weights = {}
biases = {}
# tf Graph input
print("setting placeholders")
x = tf.placeholder(tf.float32, [None, n_input], name="x") #Gateway for data (images)
y = tf.placeholder(tf.float32, [None, n_classes], name="y") # Gateway for data (labels)
keep_prob = tf.placeholder(tf.float32) #dropout # Gateway for dropout(keep probability)
# Store layers weight & bias
#CREATE weights
weights = {
# 5x5 conv, 1 input, 32 outputs
'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32]),name="wc1"), #
# 5x5 conv, 32 inputs, 64 outputs
'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64]),name="wc2"),
# fully connected, 7*7*64 inputs, 1024 outputs
'wd1': tf.Variable(tf.random_normal([26*26*64, 1024]),name="wd1"),
# 1024 inputs, 10 outputs (class prediction)
'Wout': tf.Variable(tf.random_normal([1024, n_classes]),name="Wout")
}
biases = {
'bc1': tf.Variable(tf.random_normal([32]), name="bc1"),
'bc2': tf.Variable(tf.random_normal([64]), name="bc2"),
'bd1': tf.Variable(tf.random_normal([1024]), name="bd1"),
'Bout': tf.Variable(tf.random_normal([n_classes]), name="Bout")
}
# Construct model
print("constructing model")
pred = conv_net(x, weights, biases, keep_prob)
print(pred)
# Define loss(cost) and optimizer
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) Deprecated version of the statement
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = pred, labels=y)) #added reduce_mean 6/27
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
print("%%%%%%%%%%%%%%%%%%%%")
print ("%% ", correct_pred)
print ("%% ", accuracy)
print("%%%%%%%%%%%%%%%%%%%%")
# Initializing the variables
#init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
saver = tf.train.Saver()
fvecs_np, labels_onehot = extract_data("MicroarrayDataOUT.csv") #CHAGE TO PICORNAVIRUS!!!!!AHHHHHH!!!
print("starting session")
# Launch the graph
FitStep = 0
with tf.Session() as sess: #graph is encapsulated by its session
sess.run(init)
step = 1
# Keep training until reach max iterations (training_iters)
while step * batch_size < training_iters:
if FitStep >= 5:
break
else:
#iterate and train
print(step)
print(fvecs_np, labels_onehot)
for step in range(num_epochs * train_size // batch_size):
sess.run(optimizer, feed_dict={x: fvecs_np, y: labels_onehot, keep_prob:dropout}) #no dropout???...added Keep_prob:dropout
if FitStep >= 5:
break
#else:
###batch_x, batch_y = mnist.train.next_batch(batch_size)
# Run optimization op (backprop)
###sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
### keep_prob: dropout}) <<<<SOMETHING IS WRONG IN HERE?!!!
if step % display_step == 0:
# Calculate batch loss and accuracy
loss, acc = sess.run([cost, accuracy], feed_dict={x: fvecs_np,
y: labels_onehot,
keep_prob: 1.})
print("Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(np.mean(loss)) + ", Training Accuracy= " + \
"{:.5f}".format(acc))
TrainAcc = float("{:.5f}".format(acc))
#print("******", TrainAcc)
if TrainAcc >= .99: #Changed from .95 temporarily
print(FitStep)
FitStep = FitStep+1
saver.save(sess, Sess_File_Name, global_step=1000) #
print("Saved Session:", Sess_File_Name)
step += 1
print("Optimization Finished!")
print("Testing Accuracy:", \
sess.run(accuracy, feed_dict={x: fvecs_np[:256],
y: labels_onehot[:256],
keep_prob: 1.}))
#feed_dictTEST = {x: fvecs_np[50]}
#prediction=tf.argmax(y,1)
#print(prediction)
#best = sess.run([prediction],feed_dictTEST)
#print(best)
print("DONE")
sess.close()
def Tax_Iterator(CSV_inFile, CSV_outFile): #Deprecate
#Need to copy *.csv file to MySQL for sorting
resultFileINIT = open(CSV_outFile,'w')
resultFileINIT.close()
TaxCount = 0
TaxThreshold = 2165
ThresholdStep = 2165
PrevTax = 0
linecounter = 0
#Open all GenBank profile list
for line in open(CSV_inFile):
linecounter = linecounter+1
print(linecounter)
resultFile = open(CSV_outFile,'a')
wr = csv.writer(resultFile, dialect='excel')
# Check for new TXID
row = line.split(',')
print(row[7], "===", PrevTax)
if row[7] != PrevTax:
print("X1")
TaxCount = TaxCount+1
PrevTax = row[7]
#Check it current Tax count is < or > threshold
# < threshold
print(TaxCount,"=+=", TaxThreshold)
if TaxCount<=3300:
print("X2")
CurrentTax= row[7]
CurrTxCount = CurrentTax
print("TaxCount=", TaxCount)
print( "Add to CSV")
print("row:", CurrentTax, "***", row[0:15])
wr.writerow(row[0:-1])
# is > threshold
else:
print("X3")
# but same TXID....
print(row[7], "=-=", CurrentTax)
if row[7]==CurrentTax:
print("X4")
CurrentTax= row[7]
print("TaxCount=", TaxCount)
print( "Add to CSV")
print("row:", CurrentTax, "***", row[0:15])
wr.writerow(row[0:-1])
# but different TXID...
else:
print(row[7], "=*=", CurrentTax)
if row[7]>CurrentTax:
print("X5")
TaxThreshold=TaxThreshold+ThresholdStep
resultFile.close()
Sess_File_Name = "CNN_VirusIDvSPECIES_XXALL"+ str(TaxThreshold-ThresholdStep)
print("<<<< Start Training >>>>"
print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
Train_Network(CurrTxCount, Sess_File_Name)
print("Training complete")
resultFileINIT = open(CSV_outFile,'w')
resultFileINIT.close()
CurrentTax= row[7]
#reset tax count
CurrTxCount = 0
TaxCount = 0
resultFile.close()
Sess_File_Name = "MicroarrayCNN_Data"+ str(TaxThreshold+ThresholdStep)
print("<<<< Start Training >>>>")
print("Training on :: ", CurrTxCount, "Taxa", TaxCount, "data points.")
Train_Network(CurrTxCount, Sess_File_Name)
resultFileINIT = open(CSV_outFile,'w')
resultFileINIT.close()
CurrentTax= row[7]
Tax_Iterator("MicroarrayInput.csv", "MicroarrayOutput.csv")
You defined prediction as prediction=tf.argmax(y,1). And in both feed_dict, you feed labels_onehot for y. Consequently, your "prediction" is always equal to the labels.
As you didn't post the code you used to train your network, I can't tell you what exactly you need to change.
Edit: I have isses understanding the underlying problem you're trying to solve - based on your code, you're trying to train a neural network with 2165 different classes using 1609 training examples. How is this even possible? If each example had a different class, there would still be some classes without any training example. Or does one image belong to many classes? From your statement at the beginning of your question, I had assumed you're trying to output a real-valued number between 0-1.
I'm actually surprised that the code actually worked as it looks like you're adding only a single number to your labels list, but your model expects a list with length 2165 for each training example.

Convergence issues in 2D RBF Neuron implemented as a Keras layer

We implemented a 2D Gaussian radial basis layer (RBF) in Keras and are running into convergence issues with batch sizes larger than 1. The Neuron should implement the following function:
f(x,y)=exp(-a((x-x_0)²+(y-y_0)²)
Here x_0, y_0 and a are fit parameters.
Testcase
Currently we are doing correctness tests and are trying to fit just a single Neuron on the 2D function above. The Neuron should be (and is in case of batch_size 1) able to approximate this function exactly. The optimal loss is 0.
Problem
If we choose a batch size of 1 in this code, the prediction with Keras will converge very often and will be nearly independent of the starting parameters.
If we increase the batch size, the fit might produce a random walk, freeze or not converge at all. In all of these cases (even batch_size 2) convergence is a lot worse than in the batch_size 1 case. If we choose the batch_size as the size of the trainingset (i.e. 1296, our desired batch size), the fit will freeze most of the time mostly independent of learning rate.
Code
We implemented this layer in the following code:
# 2D RBF Layer
# In case anybody wants to use this code afterwards:
# Licenses: Apache, MIT, BSD, LGPLv2 and v3 and Public Domain
# Input: x,y Pairs, shape: (2,)
# Output: exp(a* ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called: mean_x, mean_y and opening in the following code:
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
def __init__(self, **kwargs):
super(RBFLayer2D, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.mean_x = K.variable(0.35)
self.constraints[self.mean_x] = NonNeg()
self.mean_y = K.variable(0.35)
self.constraints[self.mean_y] = NonNeg()
self.opening = K.variable(2.0)
self.constraints[self.opening] = NonNeg()
self.trainable_weights = [self.mean_x,self.mean_y,self.opening]
super(RBFLayer2D, self).build(input_shape) # Be sure to call this somewhere!
def call(self, x):
x_m = x[:,0] - self.mean_x
y_m = x[:,1] - self.mean_y
out = x_m*x_m + y_m*y_m
outexp = 50.0*K.exp(-64.8*self.opening*out)
# Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2))
return outexp
def compute_output_shape(self, input_shape):
# If Inputshape is (None, N) Outputshape is (None,N/2)
# In our example we only look at (None, 2), which outputs (None,1)
output_shape = (input_shape[0], input_shape[1]//2)
return output_shape
Reproduction
To reproduce set a batch_size of 1 in the (not-so) minimal example after this section. When you run it, the code will display the target distribution (a circle in the lower left corner), the starting guess for our RBF ANN (a smaller circler in the middle) and then after each iteration the current guess (a circle getting bigger and moving to the lower left corner).
Afterwards set a batch_size of 12 and restart the code and you will not observe convergence anymore.
Minimal Example
from __future__ import print_function
from __future__ import division
import numpy as np
np.random.seed(1234)
import matplotlib.pyplot as plt
from keras.engine import Layer
from keras.optimizers import SGD
from keras.models import Sequential
from keras.constraints import NonNeg
from keras import backend as K
# 2D RBF Layer
# Input: x,y Pairs, shape: (2,)
# Output: exp(a* ((x-x_0)**2 + (y-y_0)**2)), shape: (1,)
# Parameters: x_0, y_0, a - called: mean_x, mean_y and opening in the following code:
# x and y should both lie in [0,1] - only [0,infinity] is enforced currently
class RBFLayer2D(Layer):
def __init__(self, **kwargs):
super(RBFLayer2D, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.mean_x = K.variable(0.35)
self.constraints[self.mean_x] = NonNeg()
self.mean_y = K.variable(0.35)
self.constraints[self.mean_y] = NonNeg()
self.opening = K.variable(2.0)
self.constraints[self.opening] = NonNeg()
self.trainable_weights = [self.mean_x,self.mean_y,self.opening]
super(RBFLayer2D, self).build(input_shape)
def call(self, x):
x_m = x[:,0] - self.mean_x
y_m = x[:,1] - self.mean_y
out = x_m*x_m + y_m*y_m
outexp = 50.0*K.exp(-64.8*self.opening*out)
# Output: exp(-a* ((x-x_0)**2 + (y-y_0)**2))
return outexp
def compute_output_shape(self, input_shape):
# If Inputshape is (None, N) Outputshape is (None,N/2)
# In our example we only look at (None, 2), which outputs (None,1)
output_shape = (input_shape[0], input_shape[1]//2)
return output_shape
# The function we want to train.
# It can be exactly represented using a single Neuron.
def twodenergy(phi, psi):
r0 = np.array([-180, -180])
b = 0.00005
return 50.0 * np.exp(- b * ((phi - r0[0]) ** 2 + (psi - r0[1]) ** 2))
# One of two plotting helper functions to show the results
def make_plot(y,numsteps,numbins,minangle,maxangle,plotnum, batch_size):
evaluation = np.zeros((numsteps, numsteps))
for i in range(0, numbins):
mx = i % numsteps
my = int(i / numsteps)
evaluation[mx,my]=y[i]
plt.imshow(evaluation.T, origin='lower',extent=[minangle, maxangle, minangle, maxangle])
plt.xlabel("x")
plt.ylabel("y")
if plotnum == 0:
plt.title("Startconfiguration")
else:
plt.title("RBF for batch_size %i at frame %03d" % (batch_size, plotnum))
plt.show()
# One of two plotting helper functions to show the target function
def plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps ):
eval_matrix_corr = np.zeros((numsteps, numsteps))
for i in range(0, numbins):
mx = i % numsteps
my = int(i / numsteps)
ph = phi[mx] +delta_angle_half
ps = psi[my] +delta_angle_half
eval_matrix_corr[mx,my] = twodenergy(ph,ps)
plt.imshow(eval_matrix_corr.T, origin='lower', extent=[minangle, maxangle, minangle, maxangle])
plt.title("Target Function")
plt.xlabel("phi")
plt.ylabel("psi")
plt.show()
if __name__ == "__main__":
# batch_size == 1: converges very often nearly independent of input parameters
# batch_size == 2: no to slow convergence, but distribution stays in the right place more or less
# batch_size == 3-12: random walk
# batch_size == 1296: no movement in case of low learning_rate, random_walk in case of high learning_rate
# (this is the case where the whole map is evaluated in every step.
# 1296 is our desired testcase, because it evaluates the whole map we want to fit.
batch_size = 1
learning_rate = 1E-5
### Here we generate the target function ###
### f(phi,psi)
### phi is [-180,180]
### psi is [-180,180]
anglestep = 10.0
minangle = -180.0
maxangle = 180.0
numsteps = int((maxangle - minangle)/anglestep)
anglerange = maxangle - minangle
numbins = numsteps*numsteps
delta_angle_half = anglerange /(2.0* numsteps)
phi = np.arange(minangle, maxangle, anglestep)
psi = np.arange(minangle, maxangle, anglestep)
#Target Function Plot, Gaussian in lower left
plot_target_function(phi, psi, minangle, maxangle, delta_angle_half, numbins, numsteps )
# Input Parameter Regularization
# we map -180..180 to 0..1
# we also calculate the training parameters for our x,y pairs:
x_train = np.zeros((numbins, 2))
y_train = np.zeros((numbins, 1))
for x,ph in enumerate(phi):
for y,ps in enumerate(psi):
myphi = (ph + delta_angle_half - minangle)/(anglerange)
mypsi = (ps + delta_angle_half- minangle)/(anglerange)
x_train[x * numsteps + y, 0] = (ph +delta_angle_half - minangle)/(anglerange)
x_train[x * numsteps + y, 1] = (ps + delta_angle_half- minangle)/(anglerange)
y_train[x * numsteps + y] = twodenergy(ph +delta_angle_half,ps +delta_angle_half)
# Prediction with Keras
model = Sequential()
# Single RBF Layer, only one node
model.add(RBFLayer2D(input_shape=(2,)))
sgd = SGD(lr=learning_rate)
model.compile(loss="mean_squared_error", optimizer=sgd)
# We plot the starting configuration.
y = model.predict(x_train, batch_size=batch_size)
make_plot(y, numsteps, numbins, minangle, maxangle, 0, batch_size)
#Plot the first 15 iterations:
for i in range(0,15):
# For demonstration purposes, we fit 1 epoch and plot the output.
model.fit(x_train,y_train, epochs=1, batch_size=batch_size)
y = model.predict(x_train, batch_size=batch_size)
make_plot(y, numsteps, numbins, minangle, maxangle, 1 + i, batch_size)

Resources