I am running a classification test on my articles and I would like to perform leave-one-out CV as well, but after I run my function I get an error I do not quite understand. This is my code:
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
bunch = load_files('corpus')
X = bunch.data
y = bunch.target
count_vect = CountVectorizer(stop_words = 'english')
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
classifier = MultinomialNB().fit(X_tfidf, y)
def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross validation iterator of k=N folds
    cv = KFold(len(y), K, shuffle=False, random_state=0)
    # by default the score used is the one returned by the score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))

clfs = [classifier]
for clf in clfs:
    evaluate_cross_validation(clf, X, y, 26)
However, I get this error and I do not understand what is happening:
Traceback (most recent call last):
File "<ipython-input-289-8f2a0d6aa294>", line 4, in <module>
evaluate_cross_validation(clf, X, y, 26)
File "<ipython-input-287-ecb52eb2fc76>", line 5, in evaluate_cross_validation
scores = cross_val_score(clf, X, y, cv=cv)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1433, in cross_val_score
for train, test in cv)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
self.results = batch()
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py", line 527, in fit
X, y = check_X_y(X, y, 'csr')
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 520, in check_X_y
check_consistent_length(X, y)
File "/home/fledrmaus/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 176, in check_consistent_length
"%s" % str(uniques))
ValueError: Found arrays with inconsistent numbers of samples: [ 1 25]
Help would be appreciated, thanks.
EDITED:
I get a value for y.shape, but for X.shape I just get an error.
X.shape
Traceback (most recent call last):
File "<ipython-input-305-270dd209b8a9>", line 1, in <module>
X.shape
AttributeError: 'list' object has no attribute 'shape'
y.shape
Out[306]: (26,)
EDITED II:
Yes, I am operating with 26 articles. What I tried to do with my code was to transform the articles into tf-idf representations and then use them for my classifier. However, it seems that there is a disconnect between my classifier and my evaluate_cross_validation(clf, X, y, K) function. I am trying to perform leave-one-out CV. My X_tfidf.shape is as follows:
X_tfidf.shape
Out[17]: (26, 3777)
EDITED III:
I would like to use this pipeline approach, but it seems to me that it does not tf-idf-vectorize the articles:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
I cannot figure out how to modify this pipeline classifier to perform the tf-idf vectorization. :S
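For what it's worth, the error above most likely comes from passing the raw X (a plain Python list of documents, which is also why X.shape fails) straight to cross_val_score, while the classifier expects the tf-idf matrix. The pipeline should already take care of the vectorization: cross-validating it on the raw documents re-fits the vectorizer inside each fold. A minimal sketch of wiring it into leave-one-out CV (assuming X and y are still bunch.data and bunch.target from above, and the old sklearn.cross_validation API shown in the traceback):

from sklearn.cross_validation import cross_val_score, LeaveOneOut

loo = LeaveOneOut(len(y))  # one fold per article, i.e. 26 folds
scores = cross_val_score(text_clf, X, y, cv=loo)  # X is the raw list of documents
print scores
print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))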
Running into an issue trying to get a custom metric callback to work with TensorFlow. I've created a minimal working example below to help troubleshoot. I'm running:
Windows 10
Python 3.6
scikit-learn==0.23.2
pandas==0.25.3
numpy==1.18.5
tensorflow==2.3.0
Using the breast cancer binary dataset, I'm trying to invoke the custom metric that was shown as a solution here, but I am running into the error below, probably because I'm not using it right.
This code...
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
# Get binary classification dataset
data = load_breast_cancer(as_frame=True)
print(data)
df = data['data']
df['target'] = data['target']
# Train Test split
train, test = train_test_split(df, test_size=0.10, shuffle=False)
# Define features and labels
x_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
x_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]
# https://github.com/keras-team/keras/issues/10472#issuecomment-472543538
class Metrics(Callback):
    def __init__(self, val_data, batch_size=20):
        super().__init__()
        self.validation_data = val_data
        self.batch_size = batch_size

    def on_train_begin(self, logs={}):
        # print(self.validation_data)
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        batches = len(self.validation_data)
        total = batches * self.batch_size
        val_pred = np.zeros((total, 1))
        val_true = np.zeros((total))
        for batch in range(batches):
            xVal, yVal = next(self.validation_data)
            val_pred[batch * self.batch_size : (batch+1) * self.batch_size] = np.asarray(self.model.predict(xVal)).round()
            val_true[batch * self.batch_size : (batch+1) * self.batch_size] = yVal
        val_pred = np.squeeze(val_pred)
        _val_f1 = f1_score(val_true, val_pred)
        _val_precision = precision_score(val_true, val_pred)
        _val_recall = recall_score(val_true, val_pred)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        return
# Define a function that creates a basic model
def make_deep_learning_classifier():
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=x_train.shape[1], kernel_initializer='normal'))
    model.add(Dense(32, activation='relu', input_dim=x_train.shape[1], kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model
# Get our model
model = make_deep_learning_classifier()
print(model.summary())
# Define some params
batch_size = 32
# Call our custom callback
callback = [Metrics(val_data=[x_test, y_test], batch_size=batch_size)] # < Issue here?
# Start training
model.fit(x_train, y_train, epochs=1000, batch_size=batch_size, verbose=1, callbacks=callback, validation_data=(x_test, y_test))
print(Metrics.val_precisions) # < Issue here?
...produces this traceback...
File "test.py", line 91, in <module>
model.fit(x_train, y_train, epochs=1000, batch_size=batch_size, verbose=1, callbacks=callback, validation_data=(x_test, y_test))
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1137, in fit
callbacks.on_epoch_end(epoch, epoch_logs)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\callbacks.py", line 416, in on_epoch_end
callback.on_epoch_end(epoch, numpy_logs)
File "test.py", line 54, in on_epoch_end
xVal, yVal = next(self.validation_data)
TypeError: 'list' object is not an iterator
When I change val_data=[x_test, y_test] to val_data=(x_test, y_test) in the callback variable, I get...
TypeError: 'tuple' object is not an iterator
The user who proposed this callback solution mentions something about generators, but I'm not sure how those work. I'm just trying to define my own custom metric for TensorFlow/Keras. I won't be using this exact callback, but once I get this one running, I can modify it for my own purposes. I'm providing it as an example that seemed to work in that GitHub post, in the hope that someone will be able to point out what I'm doing wrong.
Thanks!
UPDATE
Using the solution below, I try to properly call my iterator function on my val_data by using
iter_val_data = iter(self.validation_data)
for batch in range(batches):
    xVal, yVal = next(iter_val_data)
But then I get a "too many values to unpack" error, so I change it to:
iter_val_data = iter(self.validation_data)
for batch in range(batches):
    xVal = next(iter_val_data)
    yVal = next(iter_val_data)
Then I get the error:
Traceback (most recent call last):
File "test.py", line 89, in <module>
model.fit(x_train, y_train, epochs=1000, batch_size=batch_size, verbose=1, callbacks=callback, validation_data=(x_test, y_test))
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1137, in fit
callbacks.on_epoch_end(epoch, epoch_logs)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\callbacks.py", line 416, in on_epoch_end
callback.on_epoch_end(epoch, numpy_logs)
File "test.py", line 53, in on_epoch_end
val_pred[batch * self.batch_size : (batch+1) * self.batch_size] = np.asarray(self.model.predict(xVal)).round()
ValueError: could not broadcast input array from shape (57,1) into shape (32,1)
Any ideas from here? Try to run the code in the same environment as described above if you can. Thanks!
As you can see here, and as your error message states, you need to use next() with an iterator. You call next() on the list, but how should next() know which element is coming next? For that you need an iterator, which saves that state. So this should fix your issue:

iter_val_data = iter(self.validation_data)
for batch in range(batches):
    xVal, yVal = next(iter_val_data)
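That said, since val_data here is just the pair (x_test, y_test) of arrays rather than a generator of batches, a simpler variant of on_epoch_end (a sketch, not the version from the GitHub post) could skip the batch loop and predict on the whole validation set at once:

def on_epoch_end(self, epoch, logs={}):
    # self.validation_data is (x_test, y_test): plain arrays, not a generator
    x_val, y_val = self.validation_data
    val_pred = np.squeeze(np.asarray(self.model.predict(x_val)).round())
    self.val_f1s.append(f1_score(y_val, val_pred))
    self.val_recalls.append(recall_score(y_val, val_pred))
    self.val_precisions.append(precision_score(y_val, val_pred))

This also sidesteps the broadcast error from the update, which comes from predicting on the whole 57-row x_test while the loop expects a 32-row batch.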
I am trying to create a neural net model that returns the similarity score of two sentences using a Manhattan LSTM (e.g. https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07). I have used the Quora question-pairs dataset and generated embeddings for the questions using google-bert. Now I want to create an LSTM model like the one in the example above and use it, but I am getting the following error:
Using TensorFlow backend.
(100000, 1, 768)
(100000, 1, 768)
(100000,)
(100000, 100)
Traceback (most recent call last):
File "train_model_manhattan.py", line 151, in <module>
model = Model(inputs=[inp1,inp2], outputs=[malstm_distance])
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/engine/network.py", line 93, in __init__
self._init_graph_network(*args, **kwargs)
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/engine/network.py", line 231, in _init_graph_network
self.inputs, self.outputs)
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/engine/network.py", line 1366, in _map_graph_network
tensor_index=tensor_index)
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/engine/network.py", line 1353, in build_map
node_index, tensor_index)
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/engine/network.py", line 1353, in build_map
node_index, tensor_index)
File "/home/manishp/anaconda3/envs/bert_env/lib/python3.6/site-packages/keras/engine/network.py", line 1325, in build_map
node = layer._inbound_nodes[node_index]
AttributeError: 'NoneType' object has no attribute '_inbound_nodes'
Here is what I have already tried. Note that each embedding returned has shape (768,), i.e. it is a vector of size 768, like [1.2e+05 2.7e-01 7.8 .... 8.9]
print(np.shape(train_vec1)) => (100000, 1, 768)
print(np.shape(train_vec2)) => (100000, 1, 768)
print(np.shape(train_label)) => (100000,)
#################################################
def exponent_neg_manhattan_distance(left, right):
    return np.exp(-np.sum(np.abs(left - right), axis=1, keepdims=True))

def manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs '''
    print(np.shape(left))
    return K.sum(K.abs(left - right), axis=1, keepdims=True)
#################################################
import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model
inp1= Input(shape=(768,))
inp2= Input(shape=(768,))
x = keras.layers.concatenate([inp1, inp2],axis=-1)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5) (x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5) (x)
x = Dense(64, activation='relu')(x)
out=Dense(1)(x)
# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(100)
left_output = shared_lstm(train_vec1_tensor)
right_output = shared_lstm(train_vec2_tensor)
# Calculates the distance as defined by the MaLSTM model
malstm_distance = Lambda(function=lambda x: manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
#######################
# Getting an error when the code flow reaches the following line
#######################
model = Model(inputs=[inp1,inp2], outputs=[malstm_distance])
This is my entire code:
import os
data_file='quora_duplicate_questions.tsv'
# 0 means don't load, 1 means fetch from file
LOAD_ENCODING_FROM_FILE=1
encoding_data_file_quest1='encoding_quest1'
encoding_data_file_quest2='encoding_quest2'
encoding_data_file_label='quest_label'
#################################################
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from bert_serving.client import BertClient
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
from keras import models
from keras import layers
from keras import optimizers
from keras.layers import Dropout
from keras import backend as K
from keras.layers import Lambda
#################################################
maxlen = 125 # We will cut reviews after 125 words
# The next step is to transform all sentences to fixed length encoding using bert embeddings
# [0.1 0.4 0.4] [0.9 0.6 0.1] 2.4
# [0.4 0.1 0.3] [0.5 0.6 0.1] 1.0
# Save the encodings in a file
if LOAD_ENCODING_FROM_FILE == 1:
    with open(encoding_data_file_quest1, "rb") as fp:
        vec1 = pickle.load(fp)
    with open(encoding_data_file_quest2, "rb") as fp:
        vec2 = pickle.load(fp)
    with open(encoding_data_file_label, "rb") as fp:
        label = pickle.load(fp)

train_vec1 = np.asarray(vec1, np.float32)
train_vec2 = np.asarray(vec2, np.float32)
train_vec1 = train_vec1.reshape((100000, 1, 768))
train_vec2 = train_vec2.reshape((100000, 1, 768))
train_vec1_tensor = K.cast(train_vec1, dtype='float32')
train_vec2_tensor = K.cast(train_vec2, dtype='float32')
train_label = np.asarray(label, np.float32)
print(np.shape(train_vec1))
print(np.shape(train_vec2))
print(np.shape(train_label))
#################################################
def exponent_neg_manhattan_distance(left, right):
    return np.exp(-np.sum(np.abs(left - right), axis=1, keepdims=True))

def manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs '''
    print(np.shape(left))
    return K.sum(K.abs(left - right), axis=1, keepdims=True)
#################################################
import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model
inp1= Input(shape=(768,))
inp2= Input(shape=(768,))
x = keras.layers.concatenate([inp1, inp2],axis=-1)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5) (x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5) (x)
x = Dense(64, activation='relu')(x)
out=Dense(1)(x)
# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(100)
left_output = shared_lstm(train_vec1_tensor)
right_output = shared_lstm(train_vec2_tensor)
# Calculates the distance as defined by the MaLSTM model
malstm_distance = Lambda(function=lambda x: manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
#######################
# Getting an error when the code flow reaches the following line
#######################
model = Model(inputs=[inp1,inp2], outputs=[malstm_distance])
model.summary()
optimizer = optimizers.Adadelta(clipnorm=gradient_clipping_norm)
model.compile(optimizer,
              loss='mean_squared_error',
              metrics=['accuracy'])
history = model.fit([train_vec1, train_vec2], train_label,
                    epochs=30, batch_size=200,
                    validation_split=0.2)
I want the model to take two embeddings, calculate the Manhattan distance between them, and return the distance.
left_output and right_output are obtained from the LSTM layer. The inputs are fed to the Input layers and then through a series of Dense layers. However, note that there is no connection anywhere between the set of Dense layers and the LSTM. The Model expects its output to come from the LSTM layer, which is not possible here. The keras.layers.concatenate line should use the outputs from shared_lstm rather than the outputs of the Input layers directly, like this:

keras.layers.concatenate([left_output, right_output], axis=-1)

Only then can this be a Siamese network.
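Putting that together, a minimal sketch of the rewiring (my reading of the fix, not the original poster's code: the shared LSTM is applied to the symbolic Input layers instead of the constant train_vec1_tensor/train_vec2_tensor, and the inputs carry the (1, 768) shape of the embeddings):

from keras.layers import Input, LSTM, Lambda
from keras.models import Model
import keras.backend as K

inp1 = Input(shape=(1, 768))
inp2 = Input(shape=(1, 768))

# both sides share the same LSTM, applied to the symbolic inputs so that
# the graph is connected from the Input layers through to the distance
shared_lstm = LSTM(100)
left_output = shared_lstm(inp1)
right_output = shared_lstm(inp2)

malstm_distance = Lambda(lambda x: K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True),
                         output_shape=lambda s: (s[0][0], 1))([left_output, right_output])

model = Model(inputs=[inp1, inp2], outputs=[malstm_distance])
model.compile(optimizer='adadelta', loss='mean_squared_error', metrics=['accuracy'])
# model.fit([train_vec1, train_vec2], train_label, epochs=30, batch_size=200, validation_split=0.2)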
I am trying to start using TensorFlow (1.4.1, with Python 3.5.2) by building a simple sort of online-learning LSTM. I wrote the following code to test at least one-shot training (training the network with one single sample).
Here are the parameters:
input_size = 4
output_size = 4
rnn_size = 128
t=1
and here is the RNN code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.python.ops import rnn_cell
from tensorflow.contrib import rnn
import numpy as np
import rnnPar as par
x = tf.placeholder('float', [None, par.input_size])
y = tf.placeholder('float')
def reshapeData(x):
    x = tf.transpose(x, [1, 0])
    x = tf.reshape(x, [-1, par.input_size])
    x = tf.split(x, par.t)
    return x

def convertData(xs):
    return np.vstack([np.expand_dims(x, 0) for x in xs])

def neural_network_model(x):
    x = reshapeData(x)
    lstm_layer = rnn_cell.BasicLSTMCell(par.rnn_size)
    outputs, states = rnn.static_rnn(lstm_layer, x, dtype=tf.float32)
    output_layer = {'weights': tf.Variable(tf.random_normal([par.rnn_size, par.output_size])),
                    'biases': tf.Variable(tf.random_normal([par.output_size]))}
    output = tf.matmul(outputs[-1], output_layer['weights']) + output_layer['biases']
    return output

def train_neural_network(x, train_x, train_y):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_x = convertData(reshapeData(train_x))
        _, c = sess.run([optimizer, cost], feed_dict={x: train_x, y: train_y})
        epoch_loss += c
I then try to train the network by running the following script:
import exampleRNNTF as rnn
train_x, train_y = [[1.0,1.0,0.0,1.0]] , [2.0,4.0,3.0,2.0]
rnn.train_neural_network(rnn.x, train_x, train_y)
However, this results in Python giving me this strange error, which I can't seem to resolve (with or without the convertData function, I get this error):
tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
Traceback (most recent call last):
File "runRNN.py", line 4, in <module>
rnn.train_neural_network(rnn.x, train_x, train_y)
File "/home/daddabarba/Desktop/exampleRNNTF.py", line 51, in train_neural_network
_, c = sess.run([optimizer, cost], feed_dict={x: train_x, y: train_y})
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 889, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1089, in _run
np_val = np.asarray(subfeed_val, dtype=subfeed_dtype)
File "/home/daddabarba/.local/lib/python3.5/site-packages/numpy/core/numeric.py", line 531, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.
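One observation that may be relevant (a guess on my part, not a confirmed fix): reshapeData builds TensorFlow ops, so convertData(reshapeData(train_x)) stacks symbolic tensors into a numpy object array, which feed_dict cannot convert to floats; that matches the "setting an array element with a sequence" error. A sketch of train_neural_network that feeds plain numpy arrays instead, leaving the reshaping to the graph inside neural_network_model:

def train_neural_network(x, train_x, train_y):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # feed plain float arrays; reshapeData already runs inside the graph,
        # so it is not applied to the feed values here
        train_x = np.asarray(train_x, dtype=np.float32)                 # shape (1, 4)
        train_y = np.asarray(train_y, dtype=np.float32).reshape(1, -1)  # shape (1, 4), matching the logits
        _, c = sess.run([optimizer, cost], feed_dict={x: train_x, y: train_y})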
I am calling Keras predict_generator() like:
bottleneck_features_train = model.predict_generator(train_gen, len(telemetry))
where train_gen() is defined like
def train_gen():
    # ...
    yield (X, y)
and X is a numpy array with shape (48, 299, 299, 3), while y is a numpy array with shape (48,).
I get the error below. What should I do instead?
Otherwise, a link to a working example would help. The only examples I have found are for Keras 1 or use ImageDataGenerator.flow().
I am running Keras 2.0.2.
Here is the error:
Traceback (most recent call last):
File "/home/fanta/workspace/CarND-Behavioral-Cloning-P3/cache.py", line 143, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/home/fanta/workspace/CarND-Behavioral-Cloning-P3/cache.py", line 138, in main
bottleneck_features_train = model.predict_generator(train_gen, len(telemetry))
File "/usr/local/lib/python3.5/dist-packages/keras/legacy/interfaces.py", line 88, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 2094, in predict_generator
outs = self.predict_on_batch(x)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1677, in predict_on_batch
self._feed_input_shapes)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 100, in _standardize_input_data
'Found: array with shape ' + str(data.shape))
ValueError: The model expects 0 input arrays, but only received one array. Found: array with shape (48, 299, 299, 3)
Process finished with exit code 1
===== UPDATE =====
The issue is not related to the generator. Below is a short program to reproduce it. Note that if you switch the network from inception to vgg, it works fine.
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg16 import VGG16
from keras.layers import Input, AveragePooling2D
from keras.models import Model
from keras.datasets import cifar10
from scipy.misc import imresize
import pickle
import tensorflow as tf
import keras.backend as K
import numpy as np
network='inception' # Must be 'inception' or 'vgg'
dataset='cifar10'
batch_size=64
if network == 'vgg':
    size = (224, 224)
elif network == 'inception':
    size = (299, 299)
else:
    assert False, "network must be either 'inception' or 'vgg'"

def create_model():
    input_tensor = Input(shape=(size[0], size[1], 3))
    if network == 'inception':
        model = InceptionV3(input_tensor=input_tensor, include_top=False)
        x = model.output
        x = AveragePooling2D((8, 8), strides=(8, 8))(x)
        model = Model(model.input, x)
    elif network == 'vgg':
        model = VGG16(input_tensor=input_tensor, include_top=False)
        x = model.output
        x = AveragePooling2D((7, 7))(x)
        model = Model(model.input, x)
    else:
        assert False
    return model

def main():
    # Download and load cifar10 dataset
    (X_train, y_train), (_, _) = cifar10.load_data()

    # Reduce the dataset to the first 1000 entries, to save memory and computation time
    X_train = X_train[0:1000]
    y_train = y_train[0:1000]

    # Resize dataset images to comply with expected input image size
    X_train = [imresize(image, size) for image in X_train]
    X_train = np.array(X_train)

    # File name where to save bottlenecked features
    train_output_file = "{}_{}_{}.p".format(network, dataset, 'bottleneck_features_train')
    print("Saving to", train_output_file)

    with tf.Session() as sess:
        K.set_session(sess)
        K.set_learning_phase(1)
        model = create_model()
        # We skip pre-processing and bottleneck the features
        bottleneck_features_train = model.predict(X_train, batch_size=batch_size, verbose=1)
        data = {'features': bottleneck_features_train, 'labels': y_train}
        pickle.dump(data, open(train_output_file, 'wb'))

if __name__ == '__main__':
    main()
At the prediction step your generator should only yield the input, not the targets. So only the X, not the y.
Does that help?
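A minimal sketch of what that could look like (predict_gen is a hypothetical name; the batch-building logic, elided below, would be the same as in train_gen):

def predict_gen():
    while True:
        # ... build the next batch X as in train_gen() ...
        yield X  # input only: no targets at prediction time

bottleneck_features_train = model.predict_generator(predict_gen(), len(telemetry))

One more detail that may matter: predict_generator expects a generator object, so the function is called, predict_gen(), rather than passed by name.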
Hello, I would like to reduce the dimensionality of my training matrix in order to then use a support vector machine; my code looks as follows:
from sklearn.decomposition import PCA
First I tried performing the PCA:
pca = PCA(n_components=100)
#pca.fit(train_matrix)
train_matrix = np.concatenate([cities,state_matrix,work_type,company_matrix,seg,ag,rep], axis=1)
Then I assigned it to a variable in order to train my model, as follows:
train_matrix = pca.fit_transform(train_matrix)
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
X_train, X_test, y_train, y_test = train_test_split(
    pca, labels_list, test_size=0.1, random_state=47)
However, I am not sure what is wrong with what I got, so I would like to receive some support to overcome this situation:
state shape: (282521, 572)
work type shape: (282521, 164)
train matrix shape (5000, 100)
Traceback (most recent call last):
File "build_model.py", line 61, in <module>
pca, labels_list, test_size=0.1, random_state=47)
File "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py", line 2039, in train_test_split
arrays = indexable(*arrays)
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py", line 206, in indexable
check_consistent_length(*result)
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py", line 177, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py", line 177, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py", line 126, in _num_samples
" a valid collection." % x)
TypeError: Singleton array array(PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
svd_solver='auto', tol=0.0, whiten=False), dtype=object) cannot be considered a valid collection.
You are sending pca, the estimator object itself, to train_test_split. Check the arguments here.
Send the transformed data (train_matrix) into it instead.
The correct code should be:

X_train, X_test, y_train, y_test = train_test_split(
    train_matrix, labels_list, test_size=0.1, random_state=47)
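Putting the pieces together, the whole flow might look like this (a sketch; it assumes labels_list holds one label per row of train_matrix, and uses the old sklearn.cross_validation API seen in the traceback):

from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

pca = PCA(n_components=100)
train_matrix = pca.fit_transform(train_matrix)  # fit the PCA and reduce to 100 features

X_train, X_test, y_train, y_test = train_test_split(
    train_matrix, labels_list, test_size=0.1, random_state=47)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # accuracy on the held-out 10%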