Stacking with original data by mlxtend or other tools - scikit-learn

I want to predict result by meta-features made from stacking with original features.
I have used mlxtend for stacking, and I tried to use original features with meta-features but this library can't work well.
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from mlxtend.regressor import StackingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
boston= load_boston()
y = boston['target']
X = boston['data']
class extAll(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, X):
return self
def predict(self, X):
return self
RF = RandomForestRegressor()
LGBM = LGBMRegressor()
pipe = make_pipeline(extAll())
stack1 = StackingRegressor(regressors=[RF,LGBM,pipe], meta_regressor=LGBM, verbose=1)
scores = cross_validate(stack1, X, y, cv=10)
and errors occured as
Fitting 3 regressors...
Fitting regressor1: randomforestregressor (1/3)
Fitting regressor2: lgbmregressor (2/3)
Fitting regressor3: pipeline (3/3)
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\regressor\stacking_regression.py", line 154, in fit
meta_features = self.predict_meta_features(X)
File "C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\regressor\stacking_regression.py", line 221, in predict_meta_features
return np.column_stack([r.predict(X) for r in self.regr_])
File "C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\shape_base.py", line 369, in column_stack
return _nx.concatenate(arrays, 1)
ValueError: all the input array dimensions except for the concatenation axis must match exactly
I think it was caused by original data which had multi-dimensional.
I want to know a better method or tools.
What should I do?

The code has some mistake in the part of the prediction.
It should be correct as
class extAll(BaseEstimator, TransformerMixin,RegressorMixin):
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, X):
return self
def predict(self, X):
return X
When we develop a scikit-learn type method, RegressorMixin or ClassifierMixin is needed for predictions. This code works well.

Related

Saving Patsy for sklean inference

I am a huge lover of your sklego project, especially patsy implementation within sklean.
However, there is one thing I still would like your opinion on - how do you use a pipeline containing PatsyTransformer only for inference?
As the pickling is not yet supported on the patsy side I came up with a workaround.
import seaborn as sns
from joblib import dump, load
from sklego.preprocessing import PatsyTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Load The data
data = sns.load_dataset("tips")
# Basic Pipeline
pipe = Pipeline([
("patsy", PatsyTransformer("tip + C(day)")),
("model", LinearRegression())
])
data
# Train the pipeline
pipe.fit(data, data['total_bill'])
from sklearn.base import BaseEstimator, TransformerMixin
# Class for inferencing with pre-trained model (fit only passes, no training happens)
class Model_Inferencer(BaseEstimator, TransformerMixin):
"""
Function that applyes pre-trained models within a pipeline setting.
"""
def __init__(self, pre_trained_model=None):
self.pre_trained_model = pre_trained_model
def transform(self, X):
preds = self.pre_trained_model.predict(X)
return preds
def predict(self, X):
preds = self.pre_trained_model.predict(X)
return preds
def fit(self, X, y=None, **fit_params):
return self
pipe.predict(data)[:10]
# Save the model
dump(pipe['model'], 'model_github.joblib')
# Load The model
loaded_model = load('model_github.joblib')
# Create Inference Pipeline
pipe_inference = Pipeline([
("patsy", PatsyTransformer("tip + C(day)")),
("inferencer", Model_Inferencer(loaded_model))
])
# Inference pipeline needs to be fitted
# pipe_inference.fit(data)
# Save predictions (works only when fitted)
pipe_inference.predict(data)
I also tried saving the info by hand:
import h5py
def save_patsy(patsy_step, filename):
"""Save the coefficients of a linear model into a .h5 file."""
with h5py.File(filename, 'w') as hf:
hf.create_dataset("design_info", data=patsy_step.design_info_)
def load_coefficients(patsy_step, filename):
"""Attach the saved coefficients to a linear model."""
with h5py.File(filename, 'r') as hf:
design_info = hf['design_info'][:]
patsy_step.design_info_ = design_info
save_patsy(pipe['patsy'], "clf.h5")
However, a bummer error will occur.
Object dtype dtype('O') has no native HDF5 equivalent

Keras Tensorflow - Exception while predicting from multiple threads

I am using keras 2.0.8 with tensorflow 1.3.0 backend.
I am loading a model in the class init and then use it to predict multithreaded.
import tensorflow as tf
from keras import backend as K
from keras.models import load_model
class CNN:
def __init__(self, model_path):
self.cnn_model = load_model(model_path)
self.session = K.get_session()
self.graph = tf.get_default_graph()
def query_cnn(self, data):
X = self.preproccesing(data)
with self.session.as_default():
with self.graph.as_default():
return self.cnn_model.predict(X)
I initialize the CNN once and the query_cnn method happens from multiple threads.
The exception i get in my log is:
File "/home/*/Similarity/CNN.py", line 43, in query_cnn
return self.cnn_model.predict(X)
File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 913, in predict
return self.model.predict(x, batch_size=batch_size, verbose=verbose)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1713, in predict
verbose=verbose, steps=steps)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1269, in _predict_loop
batch_outs = f(ins_batch)
File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 2273, in __call__
**self.session_kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1124, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1321, in _do_run
options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: PruneForTargets: Some target nodes not found: group_deps
The code works fine most of the times, its probably some problem with the multithreading.
How can i fix it?
Make sure you finish the graph creation before creating the other threads.
Calling finalize() on the graph may help you with that.
def __init__(self, model_path):
self.cnn_model = load_model(model_path)
self.session = K.get_session()
self.graph = tf.get_default_graph()
self.graph.finalize()
Update 1: finalize() will make your graph read-only so it can be safely used in multiple threads. As a side effect, it will help you find unintentional behavior and sometimes memory leaks as it will throw an exception when you try to modify the graph.
Imagine that you have a thread that does for instance one hot encoding of your inputs. (bad example:)
def preprocessing(self, data):
one_hot_data = tf.one_hot(data, depth=self.num_classes)
return self.session.run(one_hot_data)
If you print the amount of objects in the graph you will notice that it will increase over time
# amount of nodes in tf graph
print(len(list(tf.get_default_graph().as_graph_def().node)))
But if you define the graph first that won't be the case (slightly better code):
def preprocessing(self, data):
# run pre-created operation with self.input as placeholder
return self.session.run(self.one_hot_data, feed_dict={self.input: data})
Update 2: According to this thread you need to call model._make_predict_function() on a keras model before doing multithreading.
Keras builds the GPU function the first time you call predict(). That
way, if you never call predict, you save some time and resources.
However, the first time you call predict is slightly slower than every
other time.
The updated code:
def __init__(self, model_path):
self.cnn_model = load_model(model_path)
self.cnn_model._make_predict_function() # have to initialize before threading
self.session = K.get_session()
self.graph = tf.get_default_graph()
self.graph.finalize() # make graph read-only
Update 3: I did a proof of concept of a warming up, because _make_predict_function() doesn't seems to work as expected.
First I created a dummy model:
import tensorflow as tf
from keras.layers import *
from keras.models import *
model = Sequential()
model.add(Dense(256, input_shape=(2,)))
model.add(Dense(1, activation='softmax'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.save("dummymodel")
Then in another script I loaded that model and made it run on multiple threads
import tensorflow as tf
from keras import backend as K
from keras.models import load_model
import threading as t
import numpy as np
K.clear_session()
class CNN:
def __init__(self, model_path):
self.cnn_model = load_model(model_path)
self.cnn_model.predict(np.array([[0,0]])) # warmup
self.session = K.get_session()
self.graph = tf.get_default_graph()
self.graph.finalize() # finalize
def preproccesing(self, data):
# dummy
return data
def query_cnn(self, data):
X = self.preproccesing(data)
with self.session.as_default():
with self.graph.as_default():
prediction = self.cnn_model.predict(X)
print(prediction)
return prediction
cnn = CNN("dummymodel")
th = t.Thread(target=cnn.query_cnn, kwargs={"data": np.random.random((500, 2))})
th2 = t.Thread(target=cnn.query_cnn, kwargs={"data": np.random.random((500, 2))})
th3 = t.Thread(target=cnn.query_cnn, kwargs={"data": np.random.random((500, 2))})
th4 = t.Thread(target=cnn.query_cnn, kwargs={"data": np.random.random((500, 2))})
th5 = t.Thread(target=cnn.query_cnn, kwargs={"data": np.random.random((500, 2))})
th.start()
th2.start()
th3.start()
th4.start()
th5.start()
th2.join()
th.join()
th3.join()
th5.join()
th4.join()
Commenting the lines for the warmingup and finalize I was able to reproduce your first issue

Combining w2vec and feature selection in pipeline

Based on this article: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ I am trying to implement a gensim word2vec model with the pretrained vectors of GloVe in a text classification task. However, I would like to do FeatureSelection also in my text data. I tried multiple sequences in the pipeline but i get fast a memory error which points to the transform part of TfidfEmbeddingVectorizer.
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
If I replace the TfidfEmbeddingVectorizer class with a regular TfIdfVectorizer it works properly. Is there a way I could combine SelectFromModel and W2vec in the pipeline?
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support as score, f1_score
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
import gensim
import collections
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
return (X[self.column])
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'REPORT_M': text}
for text in posts]
class TfidfEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.word2weight = None
self.dim = len(word2vec.values())
def fit(self, X, y):
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(X)
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of
# known idf's
max_idf = max(tfidf.idf_)
self.word2weight = collections.defaultdict(
lambda: max_idf,
[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
])
# training model
def train(data_train, data_val):
with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
for line in lines}
classifier = Pipeline([
('union', FeatureUnion([
('text', Pipeline([
('selector', ItemSelector(column='TEXT')),
("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False),threshold=0.01))
])),
('category', Pipeline([
('selector', ItemSelector(column='category')),
('stats', TextStats()),
('vect', DictVectorizer())
]))
])),
('clf',ExtraTreesClassifier(n_estimators=200, max_depth=500, min_samples_split=6, class_weight= 'balanced'))])
classifier.fit(data_train,data_train.CLASSES)
predicted = classifier.predict(data_val)
I think in here self.dim = len(word2vec.values()) you should specify the dimension of the model. If you are using glove.6B.50d.txt, then the dimension should be 50.
len(word2vec.values()) is the total number of words, thus will create a huge matrix, i.e., memory error.

Scikit-learn pipeline for same data and steps fails to classifiy

I have vectors of floats that I created from doc2vec algorithm, and their labels. When i use them with a simple classifier, it works normally and gives an expected accuracy. Working code is below:
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
train_vecs #ndarray (20418,100)
#train_vecs = [[0.3244, 0.3232, -0.5454, 1.4543, ...],...]
y_train #labels
test_vecs #ndarray (6885,100)
y_test #labels
classifier = LinearSVC()
classifier.fit(train_vecs, y_train )
print('Test Accuracy: %.2f'%classifier.score(test_vecs, y_test))
However now I want to move it into a pipeline, because in the future I plan to do a feature union with different features. What I do is move the vectors into a dataframe, then use 2 custom transformers to i)select the column, ii) change the array type. Strangely the exact same data, with exact same shape, dtype and type.. gives 0.0005 accuracy. Which it does not make sense to me at all, it should give almost equal accuracy. After the ArrayCaster transformer the shapes and types of the inputs are exactly the same as before. The whole thing has been really frustrating.
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
# transformer that picks a column from the dataframe
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
print('item selector type',type(X[self.column]))
print('item selector shape',len(X[self.column]))
print('item selector dtype',X[self.column].dtype)
return (X[self.column])
# transformer that converts the series into an ndarray
class ArrayCaster(BaseEstimator, TransformerMixin):
def fit(self, x, y=None):
return self
def transform(self, data):
print('array caster type',type(np.array(data.tolist())))
print('array caster shape',np.array(data.tolist()).shape)
print('array caster dtype',np.array(data.tolist()).dtype)
return np.array(data.tolist())
train_vecs #ndarray (20418,100)
y_train #labels
test_vecs #ndarray (6885,100)
y_test #labels
train['vecs'] = pd.Series(train_vecs.tolist())
val['vecs'] = pd.Series(test_vecs.tolist())
classifier = Pipeline([
('selector', ItemSelector(column='vecs')),
('array', ArrayCaster()),
('clf',LinearSVC())])
classifier.fit(train, y_train)
print('Test Accuracy: %.2f'%classifier.score(test, y_test))
Ok sorry about that.. I figured it out. The error is pretty annoying to notice. All I had to do is cast them as list and place them into the dataframe, instead of converting them to series.
Change this
train['vecs'] = pd.Series(train_vecs.tolist())
val['vecs'] = pd.Series(test_vecs.tolist())
into:
train['vecs'] = list(train_vecs)
val['vecs'] = list(test_vecs)

ValueError: Found arrays with inconsistent numbers of samples: [ 1 25]

I am running classification test on my articles and would like to perform leave one out CV as well, but after I run my function I get an error I do not quite understand. This is my code:
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
bunch = load_files('corpus')
X = bunch.data
y = bunch.target
count_vect = CountVectorizer(stop_words = 'english')
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
classifier = MultinomialNB().fit(X_tfidf, y)
def evaluate_cross_validation(clf, X, y, K):
# create a k-fold cross validation iterator of k=N folds
cv = KFold(len(y), K, shuffle=False, random_state=0)
# by default the score used is the one returned by score >>> method of the estimator (accuracy)
scores = cross_val_score(clf, X, y, cv=cv)
print scores
print ("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores))
clfs = [classifier]
for clf in clfs:
evaluate_cross_validation(clf, X, y, 26)
However, I get this error and I do not understand what is happening:
Traceback (most recent call last):
File "<ipython-input-289-8f2a0d6aa294>", line 4, in <module>
evaluate_cross_validation(clf, X, y, 26)
File "<ipython-input-287-ecb52eb2fc76>", line 5, in evaluate_cross_validation
scores = cross_val_score(clf, X, y, cv=cv)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1433, in cross_val_score
for train, test in cv)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
self.results = batch()
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py", line 527, in fit
X, y = check_X_y(X, y, 'csr')
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 520, in check_X_y
check_consistent_length(X, y)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 176, in check_consistent_length
"%s" % str(uniques))
ValueError: Found arrays with inconsistent numbers of samples: [ 1 25]
Help would be appreciated.
thanks
EDITED:
I get values for y.shape but for X.shape just an error.
X.shape
Traceback (most recent call last):
File "<ipython-input-305-270dd209b8a9>", line 1, in <module>
X.shape
AttributeError: 'list' object has no attribute 'shape'
y.shape
Out[306]: (26,)
EDITED II:
Yes, I am operating with 26 articles. What I tried to do with my code was to transform articles into tfidf representations and then use it for my classifier. However, it seems that there is a disconnection between my classifier and my def evaluate_cross_validation(clf, X, y, K) function. I am trying to perform leave-one-out CV. My X_tfidf.shape is as follows:
X_tfidf.shape
Out[17]: (26, 3777)
EDITED III:
I would like to use this pipeline approach but it seems to me that it does not tfidf vectorize the articles:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words = 'english')),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])
Cannot figure out how to modify this pipeline classifier to perform tfidf vectorization. :S

Resources