TypeError: Issue with GridSearchCV - python-3.x

I am trying to use GridSearchCV and Pipeline to tune the parameters of an SVM. The code is as follows.
parameter={'svm_C':(0.1, 1, 10, 100), 'svm_gamma':(0.001, 0.01, 0.1, 10)}
pipe=Pipeline([("scaler", StandardScaler), ("svm", SVC())])
print(parameter)
print(pipe)
print(xtrain.shape)
print(ytrain1.shape)
grid=GridSearchCV(pipe, parameter, cv=3, n_jobs=-1)
grid.fit(xtrain,ytrain1)
print("Best set score:{}".format(grid.best_score_))
print("Test set Score:{}".format(grid.score(xtest,ytest1)))
print("Best paameters:{}".format(grid.best_params_))
filename='finalized_svm.sav'
filename1='gridfinal_svm.sav'
joblib.dump(best_estimator_, filename)
joblib.dump(grid, filename1)
pred=grid.predict(xtest)
confusion=confusion_matrix(ytest1,pred)
print(confusion)
I loaded some .mat files for xtrain and ytrain. The error is raised at the line grid.fit(xtrain,ytrain1).
The error generated is as follows:
Traceback (most recent call last):
File "C:/code net/mysvm.py", line 44, in <module>
grid.fit(xtrain,ytrain1)
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\model_selection\_search.py", line 626, in fit
base_estimator = clone(self.estimator)
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 62, in clone
new_object_params[name] = clone(param, safe=False)
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 60, in clone
new_object_params = estimator.get_params(deep=False)
TypeError: get_params() missing 1 required positional argument: 'self'
When I instead use pipe=Pipeline([("scaler", StandardScaler()), ("svm", SVC())]) while keeping parameter={'svm_C':(0.1, 1, 10, 100), 'svm_gamma':(0.001, 0.01, 0.1, 10)}, the following error comes out:
Traceback (most recent call last):
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\externals\loky\process_executor.py", line 420, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 563, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 261, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 261, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_validation.py", line 514, in _fit_and_score
estimator.set_params(**parameters)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\pipeline.py", line 147, in set_params
self._set_params('steps', **kwargs)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\utils\metaestimators.py", line 52, in _set_params
super(_BaseComposition, self).set_params(**params)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\base.py", line 213, in set_params
(key, self))
ValueError: Invalid parameter svm_C for estimator Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-7-83184f586b10> in <module>
2 print(ytrain1.shape)
3 grid1=GridSearchCV(pipe, parameter, cv=3, n_jobs=-1)
----> 4 grid1.fit(xtrain,ytrain1)
5 print("Best set score:{}".format(grid.best_score_))
6 print("Test set Score:{}".format(grid.score(xtest,ytest1)))
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
994
995 with self._backend.retrieval_context():
--> 996 self.retrieve()
997 # Make sure that we get a last message telling us we are done
998 elapsed_time = time.time() - self._start_time
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
897 try:
898 if getattr(self._backend, 'supports_timeout', False):
--> 899 self._output.extend(job.get(timeout=self.timeout))
900 else:
901 self._output.extend(job.get())
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
515 AsyncResults.get from multiprocessing."""
516 try:
--> 517 return future.result(timeout=timeout)
518 except LokyTimeoutError:
519 raise TimeoutError()
~\AppData\Local\conda\conda\envs\tfp36\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~\AppData\Local\conda\conda\envs\tfp36\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Invalid parameter svm_C for estimator Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

I have done what you want to do in the following way, and it works:
# Setup the pipeline
steps = [('scaler', StandardScaler()),
('SVM', SVC())]
pipeline = Pipeline(steps)
# Specify the hyperparameter space
parameters = {'SVM__C':[1, 10, 100],
'SVM__gamma':[0.1, 0.01]}
I see two differences with respect to your code:
First, when defining the steps in the pipeline instantiation, you write StandardScaler without parentheses.
Your code is:
pipe=Pipeline([("scaler", StandardScaler), ("svm", SVC())])
In my code I write StandardScaler(), look:
steps = [('scaler', StandardScaler()),('SVM', SVC())]
pipeline = Pipeline(steps)
The second difference is in the keys you specify for your parameter grid. You provide the keys 'svm_C' and 'svm_gamma' instead of 'SVM__C' and 'SVM__gamma' respectively.
You need a double underscore (that is, __) between the step name and the parameter name, as the first person that answered your question said.
So, instead of writing:
parameter={'svm_C':(0.1, 1, 10, 100), 'svm_gamma':(0.001, 0.01, 0.1, 10)}
you should write:
parameters = {'SVM__C':[1, 10, 100],'SVM__gamma':[0.1, 0.01]}
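If in doubt, the error message itself tells you where to find the valid keys; for the pipeline defined above, a quick check looks like this:
# valid grid keys follow the pattern <step_name>__<parameter>; for a step
# named 'SVM' they include 'SVM__C' and 'SVM__gamma'
print(sorted(pipeline.get_params().keys()))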
I even got an error writing 'svm__c' instead of 'SVM__C' and 'svm__gamma' instead of 'SVM__gamma': the prefix must match the step name exactly, so the keys are case-sensitive.
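Putting both fixes together, here is a minimal end-to-end sketch; synthetic data stands in for the original xtrain/ytrain1:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# synthetic stand-in data
rng = np.random.RandomState(0)
X, y = rng.randn(60, 4), rng.randint(0, 2, 60)

# fix 1: instantiate StandardScaler with parentheses
pipe = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])
# fix 2: grid keys use <step_name>__<parameter>
parameter = {"svm__C": (0.1, 1, 10, 100), "svm__gamma": (0.001, 0.01, 0.1, 10)}

grid = GridSearchCV(pipe, parameter, cv=3, n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_)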
I hope it helps.

Related

Why does ndcg_score result in nan values?

Consider the following code:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ndcg_score, make_scorer
from sklearn.svm import SVC
X_data = pd.DataFrame(np.random.randint(0,1,size=(100, 4)), columns=list('ABCD'))
X_data = sp.csr_matrix(X_data.to_numpy())
Y_data = pd.DataFrame(np.random.choice([0,1,5], 100), columns=['Y'])
# Set the parameters by cross-validation
param_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]}
clf = GridSearchCV(SVC(), param_grid, scoring=ndcg_score, refit=True, verbose=3, n_jobs=-1, error_score='raise')
test = clf.fit(X_data, Y_data)
I am wondering why this would raise the following error:
Fitting 5 folds for each of 8 candidates, totalling 40 fits
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\externals\loky\process_executor.py", line 431, in _process_worker
r = call_item()
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\externals\loky\process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\metrics\_ranking.py", line 1564, in ndcg_score
y_true = check_array(y_true, ensure_2d=False)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 710, in check_array
array = array.astype(np.float64)
TypeError: float() argument must be a string or a number, not 'SVC'
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-45-93a8890b095c> in <module>
18
19 clf = GridSearchCV(SVC(), param_grid, scoring=ndcg_score, refit=True, verbose=3, n_jobs=-1, error_score='raise')
---> 20 test = clf.fit(X_data, Y_data)
21 #print(test.best_score_)
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\Anaconda3\envs\kaggleSVM\lib\concurrent\futures\_base.py in result(self, timeout)
442 raise CancelledError()
443 elif self._state == FINISHED:
--> 444 return self.__get_result()
445 else:
446 raise TimeoutError()
~\Anaconda3\envs\kaggleSVM\lib\concurrent\futures\_base.py in __get_result(self)
387 if self._exception:
388 try:
--> 389 raise self._exception
390 finally:
391 # Break a reference cycle with the exception in self._exception
TypeError: float() argument must be a string or a number, not 'SVC'
I am not quite sure why this would result in a TypeError.
I cannot recreate the error you are reporting, but using error_score="raise" and n_jobs=1 (not strictly necessary, but the output is a little easier to read), and wrapping ndcg_score with make_scorer with needs_proba=True, I get this one:
Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead
which supports my first comment: NDCG assumes multilabel format. That suggests you need to understand whether NDCG is really appropriate for your task, and if so either turn your problem into a multilabel one or write a custom scorer that converts the multiclass output into a multilabel (one-hot encoded) one before computing the score.
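If NDCG does fit the task, here is a minimal sketch of such a custom scorer, assuming the three classes from the question; the helper ndcg_multiclass is hypothetical, not part of scikit-learn:
import numpy as np
from sklearn.metrics import make_scorer, ndcg_score
from sklearn.preprocessing import label_binarize

def ndcg_multiclass(y_true, y_score, classes=(0, 1, 5)):
    # one-hot encode the multiclass labels into the multilabel format
    # that ndcg_score expects
    y_true_onehot = label_binarize(y_true, classes=list(classes))
    return ndcg_score(y_true_onehot, y_score)

# needs_proba=True makes GridSearchCV pass predict_proba output as y_score,
# so the SVC would also need to be constructed with probability=True
ndcg_scorer = make_scorer(ndcg_multiclass, needs_proba=True)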

Issue using imbalanced dataset with logloss and RFECV

I am using an imbalanced dataset (54:38:7%) with RFECV for feature selection, like this:
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import log_loss, make_scorer
# making a multi log-loss metric
log_loss_rfe = make_scorer(score_func=log_loss, greater_is_better=False)
# initiating the LightGBM classifier
lgb_rfe = LGBMClassifier(objective='multiclass', learning_rate=0.01, verbose=0, force_col_wise=True,
                         random_state=100, n_estimators=5_000, n_jobs=7)
# initiating RFECV
rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2, cv=3, scoring=log_loss_rfe)
# fitting it
rfe.fit(X=X_train, y=y_train)
And I got an error, presumably because the subsamples sklearn's RFECV has made don't contain all of the classes from my data. I had no issues fitting the very same data outside of RFECV.
Here's the complete error:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
r = call_item()
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/utils/fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 37, in _rfe_single_fit
return rfe._fit(
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 259, in _fit
self.scores_.append(step_score(estimator, features))
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 39, in <lambda>
lambda estimator, features: _score(
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
scores = scorer(estimator, X_test, y_test)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
return self._score(partial(_cached_call, None), estimator, X, y_true,
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
return self._sign * self._score_func(y_true, y_pred,
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 2265, in log_loss
raise ValueError("y_true and y_pred contain different number of "
ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-9-5feb62a6f457> in <module>
1 rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2, cv=3, scoring=log_loss_rfe)
----> 2 rfe.fit(X=X_train, y=y_train)
~/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in fit(self, X, y, groups)
603 func = delayed(_rfe_single_fit)
604
--> 605 scores = parallel(
606 func(rfe, self.estimator, X, y, train, test, scorer)
607 for train, test in cv.split(X, y, groups))
~/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/ds_jup_venv/lib/python3.8/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
1 frames
/usr/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]
How to fix this to be able to select features recursively?
Log-loss needs the probability predictions, not the class predictions, so you should add
log_loss_rfe = make_scorer(score_func=log_loss, needs_proba=True, greater_is_better=False)
The error arises because, without that, the y_pred passed to the scorer is one-dimensional (the class labels 0, 1, 2), and sklearn assumes it's a binary classification problem with those values being the probability of the positive class. To deal with that, it stacks on the probability of the negative class, but then there are only two columns, compared to your three classes.
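A tiny standalone illustration of that failure mode, with hypothetical labels:
import numpy as np
from sklearn.metrics import log_loss

# without probabilities, log_loss treats a 1-D y_pred as P(positive class)
# and stacks [1 - p, p] into two columns; three true classes then trigger
# the "different number of classes 3, 2" error quoted above
y_true = np.array([0, 1, 2, 1])
y_pred = np.array([0, 1, 2, 1])  # class labels, not probabilities
log_loss(y_true, y_pred)         # ValueError: ... classes 3, 2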
Consider applying stratified cross-validation, which will try to preserve the fraction of samples for each class. Experiment with one of these scikit-learn cross-validators, replacing cv=3 in your RFECV with the chosen cross-validator:
sklearn.model_selection.StratifiedKFold
StratifiedShuffleSplit
RepeatedStratifiedKFold
Edit
I had missed the fact that StratifiedKFold is the default cross-validator in RFECV. The error is actually related to log_loss_rfe, which was defined with needs_proba=False. Credit to @BenReiniger!
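Putting it together, a minimal sketch of the corrected call, with lgb_rfe, X_train and y_train as defined in the question:
from sklearn.feature_selection import RFECV
from sklearn.metrics import log_loss, make_scorer

# the substantive change is needs_proba=True, so log_loss receives
# predict_proba output instead of class labels
log_loss_rfe = make_scorer(score_func=log_loss, needs_proba=True,
                           greater_is_better=False)
rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3,
            n_jobs=2, cv=3, scoring=log_loss_rfe)
rfe.fit(X=X_train, y=y_train)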

Cannot feed value of shape (242,) for Tensor 'Placeholder_1:0', using core tensorflow api

I'm trying to train a deep dense neural network using core TensorFlow. Basically, I'm adapting the code used in this post https://www.kaggle.com/mohitguptaomg/4-layer-dense-neural-net-using-tensorflow to my data set and my own style of coding.
Here is the dataset that I'm using :
https://drive.google.com/open?id=1bDZVuiKyEDxUaY_mZgAKMIionicLs0yK
The main difference from that code is that I'm starting from a dataframe instead of a numpy array; even so, I believe I have adapted it properly. The error I get is
Cannot feed value of shape (242,) for Tensor 'Placeholder_1:0', which has shape '(242, 1)'
Here is my entire code. Data load and library imports:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
df= pd.read_csv('/home/nacho/Descargas/datasets/heart-disease-uci/heart.csv')
Variable assignments:
X = df.drop('target', axis = 1)
Y = df["target"]
X,Y = shuffle (X, Y, random_state = 0)
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.20, random_state = 0)
Theoretical architecture of the network:
# The learning rate we want for our gradient descent, and the number of epochs
# We want to use to train our data
learning_rate = 0.2
training_epochs = 500
# The number of layers we want, with the number of neurons we want them
n_hidden_1 = 60
n_hidden_2 = 60
n_hidden_3 = 60
n_hidden_4 = 60
# Define cost function and training algorithm
costf = 'cross entropy'
traininga = "gradient descent optimizer"
Creating the objects that will be placed in the neural network
# We define the inputs as placeholder, we shall fill them when we execute our code
n_dim = X.shape[1] # This will help define the vectors and matrices for calculation correctly
n_class = 1 # The number of possible category values for Y
# We need to solve the nlen thing
x = tf.placeholder( tf.float32, [None, n_dim]) # Specifying where we are going to put the vectors
y_ = tf.placeholder(tf.float32, [None, n_class])
# We define out weights and bias as variables also
W = tf.Variable(tf.zeros([n_dim, n_class]))
b = tf.Variable(tf.zeros([n_class]))
weights = {
    'h1': tf.Variable(tf.truncated_normal([n_dim, n_hidden_1])),
    'h2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.truncated_normal([n_hidden_2, n_hidden_3])),
    'h4': tf.Variable(tf.truncated_normal([n_hidden_3, n_hidden_4])),
    'out': tf.Variable(tf.truncated_normal([n_hidden_4, n_class]))
}
biases = {
    'b1': tf.Variable(tf.truncated_normal([n_hidden_1])),
    'b2': tf.Variable(tf.truncated_normal([n_hidden_2])),
    'b3': tf.Variable(tf.truncated_normal([n_hidden_3])),
    'b4': tf.Variable(tf.truncated_normal([n_hidden_4])),
    'out': tf.Variable(tf.truncated_normal([n_class]))
}
Coding the model:
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with ReLU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with ReLU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # Hidden layer with ReLU activation
    layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
    layer_3 = tf.nn.relu(layer_3)
    # Hidden layer with sigmoid activation
    layer_4 = tf.add(tf.matmul(layer_3, weights['h4']), biases['b4'])
    layer_4 = tf.nn.sigmoid(layer_4)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_4, weights['out']) + biases['out']
    return out_layer
# Calling model
y = multilayer_perceptron(x, weights, biases) # Basically, this will execute all our layers computations, resulting
# in a tensor y with our predicted results.
cost_function = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)) # Calculates the cross_entropy
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
Coding extra objects that will allow us to get extra data afterwards:
# We are going to create lists that will allow us to plot the evolution of the epochs' accuracy and error after training
cost_history = np.array([]) # the training loop below appends to this
mse_history = []
accuracy_history = []
Coding the execution (note: here is where the error happens):
init = tf.global_variables_initializer()
# The session object we are going to need for execution
sess = tf.Session()
sess.run(init) # We initialize the global variables
for epoch in range(training_epochs):
    sess.run(training_step, feed_dict={x: train_x, y_: train_y}) # We start with the training
    cost = sess.run(cost_function, feed_dict={x: train_x, y_: train_y}) # We calculate the loss for that epoch
    cost_history = np.append(cost_history, cost) # With that loss calculated we append it to a list
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) # We calculate what would be the correct prediction
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) # We define a function to calculate accuracy
    pred_y = sess.run(y, feed_dict={x: test_x}) # Predict after training in the epoch
    mse = tf.reduce_mean(tf.square(pred_y - test_y)) # define a function to calculate the error of that epoch
    mse_ = sess.run(mse) # we run said function
    mse_history.append(mse_) # We append the result to a list
    accuracy = (sess.run(accuracy, feed_dict={x: train_x, y_: train_y})) # Execute the accuracy function
    accuracy_history.append(accuracy) # Append the result of the accuracy to a list
The error we get:
ValueError Traceback (most recent call last)
<ipython-input-33-91216a39c8b4> in <module>
1 for epoch in range(training_epochs):
----> 2 sess.run(training_step, feed_dict = {x: train_x, y_: train_y}) # We start with the training
3 cost = sess.run(cost_function, feed_dict={x: train_x, y_: train_y}) #We calculate the loss for that epoch
4 cost_history = np.append(cost_history, cost) # With that loss calculted we append it to a list
5 correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) # We calculate what would be the correct prediction
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
928 try:
929 result = self._run(None, fetches, feed_dict, options_ptr,
--> 930 run_metadata_ptr)
931 if run_metadata:
932 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1127 'which has shape %r' %
1128 (np_val.shape, subfeed_t.name,
-> 1129 str(subfeed_t.get_shape())))
1130 if not self.graph.is_feedable(subfeed_t):
1131 raise ValueError('Tensor %s may not be fed.' % subfeed_t)
ValueError: Cannot feed value of shape (242,) for Tensor 'Placeholder_1:0', which has shape '(242, 1)'
I have tried replacing, in the execution section (the last one before the error), all references to the data sets with .values, so that they are treated as numpy arrays. For example, instead of calling X, I call X.values; the error persisted.
I'm new to tensorflow. I know I could probably code this more easily with the estimator API, but I really want to be able to code low-level networks to make sure I understand them properly. I also tried specifying the exact number of rows of data when calling the x and y_ placeholders; the error persisted, but with a slight variation in the exact message.
Versions used :
jupyterlab 0.35.4 py36hf63ae98_0
jupyterlab_server 0.2.0 py36_0
keras-applications 1.0.7 pypi_0 pypi
keras-preprocessing 1.0.9
scikit-image 0.14.2 py36he6710b0_0
scikit-learn 0.20.3 py36hd81dba3_0
scipy 1.2.1 py36h7c811a0_0
tensorflow 2.0.0a0
anaconda 2019.03 py36_0
anaconda-client 1.7.2 py36_0
anaconda-project 0.8.2
I'll keep working from my end, I just want to understand tensorflow.
Edit 2/05/2019:
So I changed the following lines, and I'm making interesting progress:
x = tf.placeholder(tf.float32) # Specifying where we are going to put the vectors
y_ = tf.placeholder(tf.float32)
Just changing these two lines at the beginning changed the presented error to the following:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-142-91216a39c8b4> in <module>
6 accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) # We define a function to calculate accuracy
7 pred_y = sess.run(y, feed_dict = {x: test_x}) # Predict after training in the epoch
----> 8 mse = tf.reduce_mean(tf.square(pred_y - test_y)) # define a function to Calculate the error of that epoch
9 mse_ = sess.run(mse) # we run said function
10 mse_history.append(mse_) # We append the result to a list
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(left, right)
1583 result = safe_na_op(lvalues, rvalues)
1584 return construct_result(left, result,
-> 1585 index=left.index, name=res_name, dtype=None)
1586
1587 wrapper.__name__ = op_name
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/pandas/core/ops.py in _construct_result(left, result, index, name, dtype)
1472 not be enough; we still need to override the name attribute.
1473 """
-> 1474 out = left._constructor(result, index=index, dtype=dtype)
1475
1476 out.name = name
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
260 else:
261 data = sanitize_array(data, index, dtype, copy,
--> 262 raise_cast_failure=True)
263
264 data = SingleBlockManager(data, index, fastpath=True)
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/pandas/core/internals/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
656 elif subarr.ndim > 1:
657 if isinstance(data, np.ndarray):
--> 658 raise Exception('Data must be 1-dimensional')
659 else:
660 subarr = com.asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
After that, I changed the execution code by adding the .values attribute (the dimensions seemed to be fine; it had to be the fact that I was using a dataframe as the argument), causing the code to look like this:
for epoch in range(training_epochs):
    sess.run(training_step, feed_dict={x: train_x.values, y_: train_y.values}) # We start with the training
    cost = sess.run(cost_function, feed_dict={x: train_x.values, y_: train_y.values}) # We calculate the loss for that epoch
    cost_history = np.append(cost_history, cost) # With that loss calculated we append it to a list
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) # We calculate what would be the correct prediction
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) # We define a function to calculate accuracy
    pred_y = sess.run(y, feed_dict={x: test_x.values}) # Predict after training in the epoch
    mse = tf.reduce_mean(tf.square(pred_y - test_y.values)) # define a function to calculate the error of that epoch
    mse_ = sess.run(mse) # we run said function
    mse_history.append(mse_) # We append the result to a list
    accuracy = (sess.run(accuracy, feed_dict={x: train_x.values, y_: train_y.values})) # Execute the accuracy function
    accuracy_history.append(accuracy)
This changed the error again, to the following:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1334 try:
-> 1335 return fn(*args)
1336 except errors.OpError as e:
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1319 return self._call_tf_sessionrun(
-> 1320 options, feed_dict, fetch_list, target_list, run_metadata)
1321
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1407 self._session, options, feed_dict, fetch_list, target_list,
-> 1408 run_metadata)
1409
InvalidArgumentError: Expected dimension in the range [-1, 1), but got 1
[[{{node ArgMax_1561}}]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-176-fc9234678b87> in <module>
9 mse_ = sess.run(mse) # we run said function
10 mse_history.append(mse_) # We append the result to a list
---> 11 accuracy = (sess.run(accuracy, feed_dict={x: train_x.values, y_: train_y.values})) # Execute the accuracy function
12 accuracy_history.append(accuracy)
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
928 try:
929 result = self._run(None, fetches, feed_dict, options_ptr,
--> 930 run_metadata_ptr)
931 if run_metadata:
932 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1151 if final_fetches or final_targets or (handle and feed_dict_tensor):
1152 results = self._do_run(handle, final_targets, final_fetches,
-> 1153 feed_dict_tensor, options, run_metadata)
1154 else:
1155 results = []
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1327 if handle is None:
1328 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1329 run_metadata)
1330 else:
1331 return self._do_call(_prun_fn, handle, feeds, fetches)
~/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1347 pass
1348 message = error_interpolation.interpolate(message, self._graph)
-> 1349 raise type(e)(node_def, op, message)
1350
1351 def _extend_graph(self):
InvalidArgumentError: Expected dimension in the range [-1, 1), but got 1
[[node ArgMax_1561 (defined at <ipython-input-176-fc9234678b87>:5) ]]
Errors may have originated from an input operation.
Input Source operations connected to node ArgMax_1561:
Placeholder_19 (defined at <ipython-input-166-844432d3b8cf>:11)
Original stack trace for 'ArgMax_1561':
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
self.io_loop.start()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 148, in start
self.asyncio_loop.run_forever()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
self._run_once()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
handle._run()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/asyncio/events.py", line 145, in _run
self._callback(*self._args)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/ioloop.py", line 690, in <lambda>
lambda f: self._run_callback(functools.partial(callback, future))
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
ret = callback()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/gen.py", line 781, in inner
self.run()
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/gen.py", line 742, in run
yielded = self.gen.send(value)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
yield gen.maybe_future(dispatch(*args))
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
user_expressions, allow_stdin,
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
yielded = next(result)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2848, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2874, in _run_cell
return runner(coro)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
coro.send(None)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3049, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3214, in run_ast_nodes
if (yield from self.run_code(code, result)):
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-176-fc9234678b87>", line 5, in <module>
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) # We calculate what would be the correct prediction
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 137, in argmax
return argmax_v2(input, axis, output_type, name)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 166, in argmax_v2
return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 938, in arg_max
name=name)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 800, in _apply_op_helper
op_def=op_def)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3479, in create_op
op_def=op_def)
File "/home/nacho/anaconda3/envs/deepl1/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1961, in __init__
self._traceback = tf_stack.extract_stack()
Afterwards, I tried removing these last two lines, as I read somewhere that they were part of a similar problem for someone else:
accuracy = (sess.run(accuracy, feed_dict={x: train_x.values, y_: train_y.values})) # Execute the accuracy function
accuracy_history.append(accuracy)
And the code seems to work, so the problem must be somewhere around there.
One of the following approaches could be tried:
For placeholder, the shape argument is optional. From the documentation: "The shape of the tensor to be fed (optional). If the shape is not specified, you can feed a tensor of any shape."
x_ = tf.placeholder(tf.float32)
Alternatively, expand the dimensions of train_y using np.expand_dims, so that it matches the (None, 1) shape declared for the y_ placeholder:
train_y = np.expand_dims(train_y, -1) # add a new axis
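A minimal sketch of how that feed would look, using the names from the question (sess, training_step, x, y_, train_x and train_y as defined above):
import numpy as np

# hypothetical illustration: reshape the (242,) label vector to (242, 1)
# so it matches the y_ placeholder declared as [None, n_class]
train_y_2d = np.expand_dims(train_y.values, -1) # (242,) -> (242, 1)
sess.run(training_step, feed_dict={x: train_x.values, y_: train_y_2d})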

Tensorflow Error "UnimplementedError: Cast string to float is not supported" - Linear Classifier Model using Estimator

Below are the steps that have been followed:
Created a CSV input file for TensorFlow.
Defined the input columns and their default data types to read with the tf.decode_csv function.
Defined a serving input function with appropriate placeholders (same data types as per step 2).
The order of the columns in the CSV file exactly matches step 2.
Defined a linear classifier model with Estimator.
Defined a Train Spec and an Eval Spec for the train_and_evaluate function.
The error occurs when the Estimator runs and reads the input data.
Error Log:
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'sample_dir', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001E370166828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Created Estimator
Defining Train Spec
Train Spec Defination Completed
Defining Exporter
Defining Eval Spec
Eval Spec Defination Completed
Running Estimator
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 10 secs (eval_spec.throttle_secs) or training is finished.
Created Dataset
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into sample_dir\model.ckpt.
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1321 try:
-> 1322 return fn(*args)
1323 except errors.OpError as e:
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1306 return self._call_tf_sessionrun(
-> 1307 options, feed_dict, fetch_list, target_list, run_metadata)
1308
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1408 self._session, options, feed_dict, fetch_list, target_list,
-> 1409 run_metadata)
1410 else:
UnimplementedError: Cast string to float is not supported
[[Node: linear/head/ToFloat = Cast[DstT=DT_FLOAT, SrcT=DT_STRING, _class=["loc:#linea...t/Switch_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](linear/head/labels/ExpandDims, ^linear/head/labels/assert_equal/Assert/Assert)]]
During handling of the above exception, another exception occurred:
UnimplementedError Traceback (most recent call last)
<ipython-input-229-7ea5d3d759fb> in <module>()
----> 1 train_and_evaluate(OUTDIR, num_train_steps=5)
<ipython-input-227-891dd877d57e> in train_and_evaluate(output_dir, num_train_steps)
26
27 print('Running Estimator')
---> 28 tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\training.py in train_and_evaluate(estimator, train_spec, eval_spec)
445 '(with task id 0). Given task id {}'.format(config.task_id))
446
--> 447 return executor.run()
448
449
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\training.py in run(self)
529 config.task_type != run_config_lib.TaskType.EVALUATOR):
530 logging.info('Running training and evaluation locally (non-distributed).')
--> 531 return self.run_local()
532
533 # Distributed case.
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\training.py in run_local(self)
667 input_fn=self._train_spec.input_fn,
668 max_steps=self._train_spec.max_steps,
--> 669 hooks=train_hooks)
670
671 if not self._continuous_eval_listener.before_eval():
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
364
365 saving_listeners = _check_listeners_type(saving_listeners)
--> 366 loss = self._train_model(input_fn, hooks, saving_listeners)
367 logging.info('Loss for final step: %s.', loss)
368 return self
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1117 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1118 else:
-> 1119 return self._train_model_default(input_fn, hooks, saving_listeners)
1120
1121 def _train_model_default(self, input_fn, hooks, saving_listeners):
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model_default(self, input_fn, hooks, saving_listeners)
1133 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
1134 hooks, global_step_tensor,
-> 1135 saving_listeners)
1136
1137 def _train_model_distributed(self, input_fn, hooks, saving_listeners):
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
1334 loss = None
1335 while not mon_sess.should_stop():
-> 1336 _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
1337 return loss
1338
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
575 feed_dict=feed_dict,
576 options=options,
--> 577 run_metadata=run_metadata)
578
579 def run_step_fn(self, step_fn):
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
1051 feed_dict=feed_dict,
1052 options=options,
-> 1053 run_metadata=run_metadata)
1054 except _PREEMPTION_ERRORS as e:
1055 logging.info('An error was raised. This may be due to a preemption in '
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
1142 raise six.reraise(*original_exc_info)
1143 else:
-> 1144 raise six.reraise(*original_exc_info)
1145
1146
C:\ProgramData\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
1127 def run(self, *args, **kwargs):
1128 try:
-> 1129 return self._sess.run(*args, **kwargs)
1130 except _PREEMPTION_ERRORS:
1131 raise
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, fetches, feed_dict, options, run_metadata)
1199 feed_dict=feed_dict,
1200 options=options,
-> 1201 run_metadata=run_metadata)
1202
1203 for hook in self._hooks:
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py in run(self, *args, **kwargs)
979
980 def run(self, *args, **kwargs):
--> 981 return self._sess.run(*args, **kwargs)
982
983 def run_step_fn(self, step_fn, raw_session, run_with_hooks):
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
898 try:
899 result = self._run(None, fetches, feed_dict, options_ptr,
--> 900 run_metadata_ptr)
901 if run_metadata:
902 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1133 if final_fetches or final_targets or (handle and feed_dict_tensor):
1134 results = self._do_run(handle, final_targets, final_fetches,
-> 1135 feed_dict_tensor, options, run_metadata)
1136 else:
1137 results = []
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1314 if handle is None:
1315 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1316 run_metadata)
1317 else:
1318 return self._do_call(_prun_fn, handle, feeds, fetches)
C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1333 except KeyError:
1334 pass
-> 1335 raise type(e)(node_def, op, message)
1336
1337 def _extend_graph(self):
UnimplementedError: Cast string to float is not supported
[[Node: linear/head/ToFloat = Cast[DstT=DT_FLOAT, SrcT=DT_STRING, _class=["loc:#linea...t/Switch_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](linear/head/labels/ExpandDims, ^linear/head/labels/assert_equal/Assert/Assert)]]
Caused by op 'linear/head/ToFloat', defined at:
File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
self.io_loop.start()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 127, in start
self.asyncio_loop.run_forever()
File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 422, in run_forever
self._run_once()
File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1432, in _run_once
handle._run()
File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 145, in _run
self._callback(*self._args)
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 117, in _handle_events
handler_func(fileobj, events)
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
raw_cell, store_history, silent, shell_futures)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in run_ast_nodes
if self.run_code(code, result):
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-229-7ea5d3d759fb>", line 1, in <module>
train_and_evaluate(OUTDIR, num_train_steps=5)
File "<ipython-input-227-891dd877d57e>", line 28, in train_and_evaluate
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\training.py", line 447, in train_and_evaluate
return executor.run()
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\training.py", line 531, in run
return self.run_local()
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\training.py", line 669, in run_local
hooks=train_hooks)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 366, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1119, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1132, in _train_model_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1107, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\linear.py", line 311, in _model_fn
config=config)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\linear.py", line 164, in _linear_model_fn
logits=logits)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\head.py", line 239, in create_estimator_spec
regularization_losses))
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\head.py", line 1208, in _create_tpu_estimator_spec
features=features, mode=mode, logits=logits, labels=labels))
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\estimator\canned\head.py", line 1114, in create_loss
labels = math_ops.to_float(labels)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 719, in to_float
return cast(x, dtypes.float32, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 665, in cast
x = gen_math_ops.cast(x, base_type, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1613, in cast
"Cast", x=x, DstT=DstT, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3414, in create_op
op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1740, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
UnimplementedError (see above for traceback): Cast string to float is not supported
[[Node: linear/head/ToFloat = Cast[DstT=DT_FLOAT, SrcT=DT_STRING, _class=["loc:#linea...t/Switch_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](linear/head/labels/ExpandDims, ^linear/head/labels/assert_equal/Assert/Assert)]]
Tensorflow Code:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import shutil
# Read data
df = pd.read_csv('sample.csv')
# Separate label from dataset
X = df.drop(['label'], axis=1).values
y = df[['label']].values
# Split into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Convert to dataframe
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
# Concatenate for writing into csv
train = pd.concat([X_train, y_train], axis=1)
valid = pd.concat([X_test, y_test], axis=1)
# Write into csv file
train.to_csv('train.csv', header=False, index=False)
valid.to_csv('valid.csv', header=False, index=False)
# Specify structure for tensorflow input
CSV_COLUMNS = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8']
LABEL_COLUMN = 'label'
DEFAULTS = [['none'], ['none'], ['none'], ['none'], ['none'], ['0'], [0], [0]]
# Function for reading input file and creating dataset
def read_dataset(filename, mode, batch_size=512):
    def _input_fn():
        def decode_csv(value_column):
            columns = tf.decode_csv(value_column, record_defaults=DEFAULTS)
            features = dict(zip(CSV_COLUMNS, columns))
            label = features.pop(LABEL_COLUMN)
            return features, label
        # Create list of files that match pattern
        file_list = tf.gfile.Glob(filename)
        # Create dataset from file list
        dataset = tf.data.TextLineDataset(file_list).map(decode_csv)
        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None  # indefinitely
            dataset = dataset.shuffle(buffer_size=10 * batch_size)
        else:
            num_epochs = 1  # end-of-input after this
        dataset = dataset.repeat(num_epochs).batch(batch_size)
        return dataset.make_one_shot_iterator().get_next()
    return _input_fn
# Input feature columns
INPUT_COLUMNS = [
    tf.feature_column.categorical_column_with_vocabulary_list('col1', vocabulary_list=['1', '2', '3', '4']),
    tf.feature_column.categorical_column_with_vocabulary_list('col2', vocabulary_list=['1', '2', '3', '4', '5', '6']),
    tf.feature_column.categorical_column_with_vocabulary_list('col3', vocabulary_list=['1', '2', '3', '4', '5', '6', '7', '8', '9']),
    tf.feature_column.categorical_column_with_vocabulary_list('col4', vocabulary_list=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']),
    tf.feature_column.categorical_column_with_vocabulary_list('col5', vocabulary_list=['0', '1', '2', '3', '4', '5']),
    tf.feature_column.categorical_column_with_vocabulary_list('col6', vocabulary_list=['0', '1']),
    tf.feature_column.numeric_column('col7'),
    tf.feature_column.numeric_column('col8')
]
def add_more_features(feats):
    # Placeholder for future feature engineering
    return feats
feature_cols = add_more_features(INPUT_COLUMNS)
# Serving function
def serving_input_fn():
    feature_placeholders = {
        'col1': tf.placeholder(tf.string, [None]),
        'col2': tf.placeholder(tf.string, [None]),
        'col3': tf.placeholder(tf.string, [None]),
        'col4': tf.placeholder(tf.string, [None]),
        'col5': tf.placeholder(tf.string, [None]),
        'col6': tf.placeholder(tf.string, [None]),
        'col7': tf.placeholder(tf.int64, [None]),
        'col8': tf.placeholder(tf.int64, [None])
    }
    features = {
        key: tf.expand_dims(tensor, -1)
        for key, tensor in feature_placeholders.items()
    }
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
# Train and evaluate function
def train_and_evaluate(output_dir, num_train_steps):
    estimator = tf.estimator.LinearClassifier(
        model_dir=output_dir,
        feature_columns=feature_cols)
    train_spec = tf.estimator.TrainSpec(
        input_fn=read_dataset('train.csv', mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=num_train_steps)
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=read_dataset('valid.csv', mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        start_delay_secs=1,
        throttle_secs=10,
        exporters=exporter)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
# Log level and cleanup
tf.logging.set_verbosity(tf.logging.INFO)
OUTDIR = 'sample_dir'
shutil.rmtree(OUTDIR, ignore_errors=True)
# Run training and evaluation
train_and_evaluate(OUTDIR, num_train_steps=1)
I have been struggling with this error. Help would be much appreciated.
While debugging, the issue got resolved, but I am not sure which step actually fixed it.
I tried the following:
1. Following the Stack Overflow thread "float64 with pandas to_csv", I changed the float format that is written to the CSV file:
Prior Code:
train.to_csv('train.csv', header=False, index=False)
valid.to_csv('valid.csv', header=False, index=False)
Modified Code:
train.to_csv('train.csv', header=False, index=False, float_format='%.4f')
valid.to_csv('valid.csv', header=False, index=False, float_format='%.4f')
2. I added columns one by one to the input CSV file and checked the corresponding default datatypes. I found one column that the pandas-written CSV file contained as 0.0 (although pandas had been treating it as an integer), while TensorFlow was reading it as int64. Changing that column's datatype to float64 resolved the datatype mismatch, as sketched below.
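For reference, a minimal sketch of what that fix looks like in this code: with tf.decode_csv, the dtype of each entry in record_defaults determines the dtype of the parsed column, so giving the affected column a float default (0.0 instead of 0) makes it come out as a float tensor rather than int64. Which position actually needs the change depends on your data; the last two entries are only assumed here for illustration.
# Hypothetical fix: float defaults for the numeric columns, so that
# tf.decode_csv parses them as floats instead of int64
DEFAULTS = [['none'], ['none'], ['none'], ['none'], ['none'], ['0'], [0.0], [0.0]]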
Now the model is up and running.

keras autoencoder resource exhausted error

I have a dataset with 25000 samples and 24995 features. I am trying to train a Keras autoencoder model on this data and am running into an OOM error. Some specifics of the model:
Input matrix shape: (25000, 24995)
This input matrix is split into training and test sets:
Train matrix shape: (18750, 24995)
Test matrix shape: (6250, 24995)
The code for training is
from keras.layers import Input, Dense
from keras.models import Model

input_layer = Input(shape=(train_matrix.shape[1],))
encoding_hlayer1_dims = 12500
encoding_hlayer1 = Dense(encoding_hlayer1_dims, activation='relu', trainable=True, name="layer1")(input_layer)
decoding_hlayer1 = Dense(train_matrix.shape[1], activation='relu')(encoding_hlayer1)
autoencoder = Model(input_layer, decoding_hlayer1)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
The summary of the model is
Layer (type)                 Output Shape              Param #
=================================================================
input_2 (InputLayer)         (None, 24995)             0
_________________________________________________________________
layer1 (Dense)               (None, 12500)             312450000
_________________________________________________________________
dense_1 (Dense)              (None, 24995)             312462495
=================================================================
Total params: 624,912,495
Trainable params: 624,912,495
Non-trainable params: 0
Code to train the model
## Train
history = autoencoder.fit(train_matrix.toarray(), train_matrix.toarray(),
                          epochs=50,
                          batch_size=64,
                          shuffle=True,
                          validation_data=(test_matrix.toarray(), test_matrix.toarray()))
When I start training the model, I get the following error:
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[24995,12500]
[[Node: mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](beta_1/read, Variable/read)]]
I am using 2 Nvidia Tesla K40c GPUs with 12 GB each. To my knowledge, the model should fit in memory, since 25000 * 12500 * 2 = 0.625 GB. Also, the input matrix dtype is numpy.float32.
Can anyone point out what exactly I am doing wrong here?
Update: Complete error log
Train on 18750 samples, validate on 6250 samples
Epoch 1/100
ResourceExhaustedErrorTraceback (most recent call last)
<ipython-input-8-503b20168fa5> in <module>()
6 batch_size=4096,
7 shuffle=True,
----> 8 validation_data=(test_matrix.toarray(), test_matrix.toarray()))
9 # autoencoder.save("/tmp/Models/sae_models/epochs_" + str(epochs) + ".model", include_optimizer=True)
10
/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
1428 val_f=val_f, val_ins=val_ins, shuffle=shuffle,
1429 callback_metrics=callback_metrics,
-> 1430 initial_epoch=initial_epoch)
1431
1432 def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):
/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch)
1077 batch_logs['size'] = len(batch_ids)
1078 callbacks.on_batch_begin(batch_index, batch_logs)
-> 1079 outs = f(ins_batch)
1080 if not isinstance(outs, list):
1081 outs = [outs]
/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.pyc in __call__(self, inputs)
2263 value = (indices, sparse_coo.data, sparse_coo.shape)
2264 feed_dict[tensor] = value
-> 2265 session = get_session()
2266 updated = session.run(self.outputs + [self.updates_op],
2267 feed_dict=feed_dict,
/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.pyc in get_session()
166 if not _MANUAL_VAR_INIT:
167 with session.graph.as_default():
--> 168 _initialize_variables()
169 return session
170
/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.pyc in _initialize_variables()
339 if uninitialized_variables:
340 sess = get_session()
--> 341 sess.run(tf.variables_initializer(uninitialized_variables))
342
343
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
787 try:
788 result = self._run(None, fetches, feed_dict, options_ptr,
--> 789 run_metadata_ptr)
790 if run_metadata:
791 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
995 if final_fetches or final_targets:
996 results = self._do_run(handle, final_targets, final_fetches,
--> 997 feed_dict_string, options, run_metadata)
998 else:
999 results = []
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1130 if handle is None:
1131 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1132 target_list, options, run_metadata)
1133 else:
1134 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
1150 except KeyError:
1151 pass
-> 1152 raise type(e)(node_def, op, message)
1153
1154 def _extend_graph(self):
ResourceExhaustedError: OOM when allocating tensor with shape[24995,12500]
[[Node: layer1/kernel/Assign = Assign[T=DT_FLOAT, _class=["loc:#layer1/kernel"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](layer1/kernel, layer1/random_uniform)]]
Caused by op u'layer1/kernel/Assign', defined at:
File "/usr/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-4-ee2fe8e92d7c>", line 4, in <module>
encoding_hlayer1 = Dense(encoding_hlayer1_dims, activation='relu', trainable=True, name="layer1")(input_layer)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 569, in __call__
self.build(input_shapes[0])
File "/usr/local/lib/python2.7/dist-packages/keras/layers/core.py", line 825, in build
constraint=self.kernel_constraint)
File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 391, in add_weight
weight = K.variable(initializer(shape), dtype=dtype, name=name)
File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 321, in variable
v = tf.Variable(value, dtype=_convert_string_dtype(dtype), name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 200, in __init__
expected_shape=expected_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 309, in _init_from_args
validate_shape=validate_shape).op
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/state_ops.py", line 271, in assign
validate_shape=validate_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_state_ops.py", line 45, in assign
use_locking=use_locking, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[24995,12500]
[[Node: layer1/kernel/Assign = Assign[T=DT_FLOAT, _class=["loc:#layer1/kernel"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](layer1/kernel, layer1/random_uniform)]]
The total number of parameters is 624,912,495, as per your code. That takes 624912495 * 4 / 1024**3 = 2.32 GB just to store the weights (not 0.625 GB as you computed).
In addition, you need to store the initializer and at least three more copies for the optimizer: one each for the first- and second-order momentum, and one for the actual update. On top of that come temporaries, since any time you write a + b you need memory to hold the result, and some of those allocations are hidden.
Overall, the total memory use quickly climbs far above 12 GB, which is why you run out of memory.
You could try an SGD optimizer, which uses less memory, but you may still run out.
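To make that accounting concrete, here is a back-of-the-envelope sketch (the five-copy count is an assumption that just follows the reasoning above; real usage adds temporaries and framework overhead), together with the suggested switch to SGD:
# Rough memory estimate, assuming float32 parameters (4 bytes each)
params = 624912495
copy_gb = params * 4.0 / 1024**3  # ~2.3 GB for one full copy of the weights
copies = 5                        # weights + initializer + 3 optimizer copies
print('%.1f GB before temporaries' % (copies * copy_gb))  # ~11.6 GB
# Plain SGD keeps fewer per-parameter slots than Adam:
autoencoder.compile(optimizer='sgd', loss='binary_crossentropy')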

Resources