Skipping a text line if the line is empty in tensorflow - string

I would like to process a text file containing sentences.
Each sentence is stored as a each line of that text file. I would like to retrieve each line using an iterator as follows:
class Reader(object):
def __init__(self, file_name):
dataset = tf.data.TextLineDataset(file_name)
self._iterator = dataset.make_one_shot_iterator()
def next_line(self):
# What I want to do is skipping blank lines here.
return self._iterator.get_next()
However, if the line is an empty line, I would like to skip that line. What would be the best way of implementing this skipping? I would like to implement that functionality in the above next_line method.
Any suggestion is welcomed.

You just need to apply filter to dataset.
filter(lambda line:tf.not_equal(tf.strings.length(line),0))
Suppose your data are as follows:
1
2,2
3,3,3
5,5,5
6,6,6
An example:
import tensorflow as tf
tf.enable_eager_execution()
dataset = tf.data.TextLineDataset('a.csv').filter(lambda line:tf.not_equal(tf.strings.length(line),0))
iterator = dataset.make_one_shot_iterator()
while True:
try:
print(iterator.get_next())
except tf.errors.OutOfRangeError:
break
The result:
tf.Tensor(b'1', shape=(), dtype=string)
tf.Tensor(b'2,2', shape=(), dtype=string)
tf.Tensor(b'3,3,3', shape=(), dtype=string)
tf.Tensor(b'5,5,5', shape=(), dtype=string)
tf.Tensor(b'6,6,6', shape=(), dtype=string)

Related

TypeError: __init__() takes from 1 to 2 positional arguments but 4 were given (Text Analysis with Python)

I was trying to follow https://github.com/foxbook/atap/blob/master/snippets/ch04/loader.py but having the below error:
TypeError: init() takes from 1 to 2 positional arguments but 4 were given
Any thoughts on how to resolve the error
I was able to make changes and run PickledCorpusReader but Corpus Loader is giving some issues as shared below.
from sklearn.model_selection import KFold
class CorpusLoader(object):
"""
The corpus loader knows how to deal with an NLTK corpus at the top of a
pipeline by simply taking as input a corpus to read from. It exposes both
the data and the labels and can be set up to do cross-validation.
If a number of folds is passed in for cross-validation, then the loader
is smart about how to access data for train/test splits. Otherwise it will
simply yield all documents in the corpus.
"""
def __init__(self, corpus, folds=None, shuffle=True):
self.n_docs = len(corpus.fileids())
self.corpus = corpus
self.folds = folds
self.shuffle = True
if folds is not None:
# Generate the KFold cross validation for the loader.
self.folds = KFold(self.n_docs, folds, shuffle)
#property
def n_folds(self):
"""
Returns the number of folds if it exists; 0 otherwise.
"""
if self.folds is None: return 0
return self.folds.n_folds
def fileids(self, fold=None, train=False, test=False):
"""
Returns a listing of the documents filtering to retreive specific
data from the folds/splits. If no fold, train, or test is specified
then the method will return all fileids.
If a fold is specified (should be an integer between 0 and folds),
then the loader will return documents from that fold. Further, train
or test must be specified to split the fold correctly.
"""
if fold is None:
# If no fold is specified, return all the fileids.
return self.corpus.fileids()
# Otherwise, identify the fold specifically and get the train/test idx
for fold_idx, (train_idx, test_idx) in enumerate(self.folds):
if fold_idx == fold: break
else:
# We have discovered no correct fold.
raise ValueError(
"{} is not a fold, specify an integer less than {}".format(
fold, self.folds.n_folds
)
)
# Now determine if we're in train or test mode.
if not (test or train) or (test and train):
raise ValueError(
"Please specify either train or test flag"
)
# Select only the indices to filter upon.
indices = train_idx if train else test_idx
return [
fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
if doc_idx in indices
]
def labels(self, fold=None, train=False, test=False):
"""
Fit will load a list of the labels from the corpus categories.
If a fold is specified (should be an integer between 0 and folds),
then the loader will return documents from that fold. Further, train
or test must be specified to split the fold correctly.
"""
return [
self.corpus.categories(fileids=fileid)[0]
for fileid in self.fileids(fold, train, test)
]
def documents(self, fold=None, train=False, test=False):
"""
A generator of documents being streamed from disk. Each document is
a list of paragraphs, which are a list of sentences, which in turn is
a list of tuples of (token, tag) pairs. All preprocessing is done by
NLTK and the CorpusReader object this object wraps.
If a fold is specified (should be an integer between 0 and folds),
then the loader will return documents from that fold. Further, train
or test must be specified to split the fold correctly. This method
allows us to maintain the generator properties of document reads.
"""
for fileid in self.fileids(fold, train, test):
yield list(self.corpus.tagged(fileids=fileid))
if __name__ == '__main__':
from reader import PickledCorpusReader
corpus4 = PickledCorpusReader(nomi,r'.+\.txt')
loader = CorpusLoader(corpus, folds=12)
for fid in loader.fileids(0):
print(fid)
I had an error with KFold as well. Specifying the keyword-arguments explicitly, like so:
self.folds = KFold(n_splits=folds, shuffle=shuffle)
resolved it for me. I do not know why, though.
Hope this helps
I had this error as well, and I solve it by the following code:
KFold(n_splits=2, random_state=None, shuffle=False)
based on this link:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

problem saving pre-trained fasttext vectors in "word2vec" format with _save_word2vec_format()

For a list of words I want to get their fasttext vectors and save them to a file in the same "word2vec" .txt format (word+space+vector in txt format).
This is what I did:
dict = open("word_list.txt","r") #the list of words I have
path = "cc.en.300.bin"
model = load_facebook_model(path)
vectors = []
words =[]
for word in dict:
vectors.append(model[word])
words.append(word)
vectors_array = np.array(vectors)
*I want to take the list "words" and nd.array "vectors_array" and save in the original .txt format.
I try to use the function from gensim "_save_word2vec_format":
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
"""Store the input-hidden weight matrix in the same format used by the original
C word2vec-tool, for compatibility.
Parameters
----------
fname : str
The file path used to save the vectors in.
vocab : dict
The vocabulary of words.
vectors : numpy.array
The vectors to be stored.
fvocab : str, optional
File path used to save the vocabulary.
binary : bool, optional
If True, the data wil be saved in binary word2vec format, else it will be saved in plain text.
total_vec : int, optional
Explicitly specify total number of vectors
(in case word vectors are appended with document vectors afterwards).
"""
if not (vocab or vectors):
raise RuntimeError("no input")
if total_vec is None:
total_vec = len(vocab)
vector_size = vectors.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.open(fvocab, 'wb') as vout:
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(vocab), vector_size) == vectors.shape
with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
row = vectors[vocab_.index]
if binary:
row = row.astype(REAL)
fout.write(utils.to_utf8(word) + b" " + row.tostring())
else:
fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
but I get the error:
INFO:gensim.models._fasttext_bin:loading 2000000 words for fastText model from cc.en.300.bin
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:Updating model with new vocabulary
INFO:gensim.models.word2vec:New added 2000000 unique words (50% of original 4000000) and increased the count of 2000000 pre-existing words (50% of original 4000000)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 2000000 items
INFO:gensim.models.word2vec:sample=1e-05 downsamples 6996 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 390315457935 word corpus (70.7% of prior 552001338161)
INFO:gensim.models.fasttext:loaded (4000000, 300) weight matrix for fastText model from cc.en.300.bin
trials.py:42: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
vectors.append(model[word])
INFO:__main__:storing 8664x300 projection weights into arrays_to_txt_oct3.txt
loading the model for: en
finish loading the model for: en
len(vectors): 8664
len(words): 8664
shape of vectors_array (8664, 300)
mission launched!
Traceback (most recent call last):
File "trials.py", line 102, in <module>
_save_word2vec_format(YOUR_VEC_FILE_PATH, words, vectors_array, fvocab=None, binary=False, total_vec=None)
File "trials.py", line 89, in _save_word2vec_format
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
File "/cs/snapless/oabend/tailin/transdiv/lib/python3.7/site-packages/six.py", line 589, in iteritems
return iter(d.items(**kw))
AttributeError: 'list' object has no attribute 'items'
I understand that it has to do with the second argument in the function, but I don't understand how should I make a list of words into a dictionary object?
I tried doing that with:
#convert list of words into a dictionary
words_dict = {i:x for i,x in enumerate(words)}
But still got the error message:
Traceback (most recent call last):
File "trials.py", line 99, in <module>
_save_word2vec_format(YOUR_VEC_FILE_PATH, dict, vectors_array, fvocab=None, binary=False, total_vec=None)
File "trials.py", line 77, in _save_word2vec_format
total_vec = len(vocab)
TypeError: object of type '_io.TextIOWrapper' has no len()
I don't understand how to insert the word list in the right format...
You can directly import & re-use the Gensim KeyedVectors class to assemble your own (sub)set of word-vectors as one instance of KeyedVectors, then use its .save_word2vec_format() method.
For example, roughly this should work:
from gensim.models import KeyedVectors
words_file = open("word_list.txt","r") # your word-list as a text file
words_list = list(words_file) # reads each line of file into a new `list` object
fasttext_path = "cc.en.300.bin"
model = load_facebook_model(path)
kv = KeyedVectors(vector_size=model.wv.vector_size) # new empty KV object
vectors = []
for word in words_list:
vectors.append(model[word]) # vectors for words_list, in same order
kv.add(words_list, vectors) # adds those keys (words) & vectors as batch
kv.save_word2vec_format('my_kv.vec', binary=False)

Cannot get the value of hidden weights of RNN

I declare my RNN as
self.rnn = torch.nn.RNN(input_size=encoding_dim, hidden_size=1, num_layers=1, nonlinearity='relu')
Later
self.rnn.all_weights
# [[Parameter containing:
tensor([[-0.8099, -0.9543, 0.1117, 0.6221, 0.5034, -0.6766, -0.3360, -0.1700,
-0.9361, -0.3428]], requires_grad=True), Parameter containing:
tensor([[-0.1929]], requires_grad=True), Parameter containing:
tensor([0.7881], requires_grad=True), Parameter containing:
tensor([0.4320], requires_grad=True)]]
self.rnn.all_weights[0][0][0].values
# {RuntimeError}Could not run 'aten::values' with arguments from the 'CPU' backend. 'aten::values' is only available for these backends: [SparseCPU, Autograd, Profiler, Tracer].
Clearly I see the value of the weights, but cannot access to it. Documentation says I need to specify requires_grad=True, but that does not work.
Is there a more elegant and usable way than self.rnn.all_weights[0][0][0]?
Use torch.nn.Module.named_parameters or torch.nn.Module.parameters.
>>> import torch.nn as nn
>>> model = nn.RNN(input_size=encoding_dim, hidden_size=1, num_layers=1, nonlinearity='relu')
>>> weights = []
>>> for name, parameter in model.named_parameters():
... weights.append({name: parameter[0]})
...
>>> just_weights = []
>>> for parameter in model.parameters():
... just_weights.append(parameter[0])
...

error in converting tensor to numpy array

I'm trying to convert input_image which is a tensor to numpy array.Following the already answered questions here and several others that suggested to use input_image.eval() or equivalently sess.run() for this conversion, I did the same, but it throws an error and apparently expects a feed_dict value for the sess.run(). But since here I'm not trying to run an operation dependent on unknown values, I don't see the need for the feed_dict here because all I'm doing here is just conversion.
Besides, just so as to check I also tried converting a tf.constant([1,2,3]) value right above it using the same method and it got successfully compiled despite its data type being the same as input_image. Here's my code which is the part of larger script:
def call(self, x):
input_image = Input(shape=(None, None, 3))
print(input_image.shape)
print(type(tf.constant([1,2,3])))
print(type(input_image))
print(type(K.get_session().run(tf.constant([1,2,3]))))
print(type(K.get_session().run(input_image)))
and here's the error:
(?, ?, ?, 3)
<class 'tensorflow.python.framework.ops.Tensor'>
<class 'tensorflow.python.framework.ops.Tensor'>
<class 'numpy.ndarray'>
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: You must feed a value for placeholder tensor 'input_1' with dtype float and shape [?,?,?,3]
[[{{node input_1}}]]
[[input_1/_1051]]
(1) Invalid argument: You must feed a value for placeholder tensor 'input_1' with dtype float and shape [?,?,?,3]
[[{{node input_1}}]]
0 successful operations.
0 derived errors ignored.
I wonder why the former would work and the latter won't.
There is no such thing as "converting" a symbolic tensor to a numpy array, as the latter cannot hold the same kind of information as the former.
When you use eval() or session.run(), what you are doing is evaluating a symbolic expression to get a numerical result, which is a numpy array, but this is not a conversion. Evaluating an expression might or might not require additional input data (that's what the feed_dict is for), depending on the expression.
Evaluating a constant (tf.constant) does not require any input data, but evaluating your other expression does require the input data, so you cannot "convert" this to a numpy array.
Just adding to (or elaborating on) what #MatiasValdenegro said,
TensorFlow follows something called graph execution (or define-then-run). In other words, when you write a TensorFlow program it defines something called a data-flow graph which shows how the operations you defined are related to each other. And then you execute bits and pieces of that graph depending on the results you're after.
Let's consider two examples. (I am switching to a simple TensorFlow program instead of Keras bits as it makes things more clear - After all K.get_session() returns a Session object).
Example 1
Say you have the following program.
import tensorflow as tf
a = tf.placeholder(shape=[2,2], dtype=tf.float32)
b = tf.constant(1, dtype=tf.float32)
c = a * b
# Wrong: This is what you're doing essentially when you do sess.run(input_image)
with tf.Session() as sess:
print(sess.run(c))
# Right: You need to feed values that c is dependent on
with tf.Session() as sess:
print(sess.run(c, feed_dict={a: np.array([[1,2],[2,3]])}))
Whenever a resulting tensor (e.g. c) is dependent on a placeholder you cannot execute it and get the result without feeding values to all the dependent placeholders.
Example 2
When you define a tf.constant(1) this is not dependent on anything. In other words you don't need a feed_dict and can directly run eval() or sess.run() on it.
Update: Further explanation on why you need a feed_dict for input_image
TLDR: You need a feed_dict because your resulting Tensor is produced by an Input layer.
Your input_image is basically the resulting tensor you get by feeding something to the Input layer. Usually in Keras, you are not exposed to the internal placeholder level details. But you would do that via using model.fit() or model.evaluate(). You can see that Keras Input layer in fact uses a placeholder by analysing this line.
Hope I made my point clear that you do need to feed in a value to the placeholder to successfully evaluate the output of an Input layer. Because that basically holds a placeholder.
Update 2: How to feed to your Input layer
So, appears you can use feed_dict with Keras Input layer in the following manner. Instead of defining shape argument you straight away pass a placeholder to the tensor argument, which will bypass the internal placeholder creation in the layer.
from tensorflow.keras.layers import InputLayer
import numpy as np
import tensorflow.keras.backend as K
x = tf.placeholder(shape=[None, None, None, 3], dtype=tf.float32)
input_image = Input(tensor=x)
arr = np.array([[[[1,1,1]]]])
print(arr.shape)
print(K.get_session().run(input_image, feed_dict={x: arr}))

Keras fit_generator() & Input array should have the same as target samples

So far I'm trying to implement the fit-generator for sentiment analysis as I only have a small PGU and big dataset. But I keep getting this error
Using Theano backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'In file included from C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/driver_types.h:53:0,\r\n from C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/cudnn.h:63,\r\n from C:\\Users\\Def\\AppData\\Local\\Temp\\try_flags_p2iwer2o.c:4:\r\nC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/host_defines.h:84:0: warning: "__cdecl" redefined\r\n #define __cdecl\r\n ^\r\n<built-in>: note: this is the location of the previous definition\r\nd000029.o:(.idata$5+0x0): multiple definition of `__imp___C_specific_handler\'\r\nd000026.o:(.idata$5+0x0): first defined here\r\nC:/Users/Def/Anaconda3/envs/Final/Library/mingw-w64/bin/../lib/gcc/x86_64-w64-mingw32/5.3.0/../../../../x86_64-w64-mingw32/lib/../lib/crt2.o: In function `__tmainCRTStartup\':\r\nC:/repo/mingw-w64-crt-git/src/mingw-w64/mingw-w64-crt/crt/crtexe.c:285: undefined reference to `_set_invalid_parameter_handler\'\r\ncollect2.exe: error: ld returned 1 exit status\r\n'
Mapped name None to device cuda: GeForce GTX 960M (0000:01:00.0)
Epoch 1/10
Traceback (most recent call last):
File "C:/Users/Def/PycharmProjects/KerasUkExpenditure/TweetParsing.py", line 136, in <module>
epochs=10)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\legacy\interfaces.py", line 88, in wrapper
return func(*args, **kwargs)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\models.py", line 1097, in fit_generator
initial_epoch=initial_epoch)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\legacy\interfaces.py", line 88, in wrapper
return func(*args, **kwargs)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1876, in fit_generator
class_weight=class_weight)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1614, in train_on_batch
check_batch_axis=True)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1307, in _standardize_user_data
_check_array_lengths(x, y, sample_weights)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 229, in _check_array_lengths
'and ' + str(list(set_y)[0]) + ' target samples.')
ValueError: Input arrays should have the same number of samples as target arrays. Found 1000 input samples and 1 target samples.
I have a matrix that is 1000 elements long since I only have a maximum corpus of 1000 words which is specified in the Tokenizer().
I then have the sentiment which is either a 0 for negative or a 1 for positive.
My question is why do I receive the error? I have tried to use the transform on both the data and labels and I still receive the same error. here is my code.
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import re
"""
the amount of samples out to the 1 million to use, my 960m 2GB can only handle
about 30,000ish at the moment depending on a number of neurons in the
deep layer and a number of layers.
"""
maxSamples = 3000
#Load the CSV and get the correct columns
data = pd.read_csv("C:\\Users\\Def\\Desktop\\Sentiment Analysis Dataset1.csv")
dx = pd.DataFrame()
dy = pd.DataFrame()
dy[['Sentiment']] = data[['Sentiment']]
dx[['SentimentText']] = data[['SentimentText']]
dataY = dy.iloc[0:maxSamples]
dataX = dx.iloc[0:maxSamples]
testY = dy.iloc[maxSamples: maxSamples + 1000]
testX = dx.iloc[maxSamples: maxSamples + 1000]
"""
here I filter the data and clean it up by removing # tags, hyperlinks and
also any characters that are not alpha-numeric.
"""
def removeTagsAndLinks(dataframe):
for x in dataframe.iterrows():
#Removes Hyperlinks
x[1].values[0] = re.sub("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:/~+#-]*[\w#?^=%&/~+#-])?", "", str(x[1].values[0]))
#Removes # tags
x[1].values[0] = re.sub("#\\w+", '', str(x[1].values[0]))
#keeps only alpha-numeric chars
x[1].values[0] = re.sub("\W+", ' ', str(x[1].values[0]))
return dataframe
xData = removeTagsAndLinks(dataX)
xTest = removeTagsAndLinks(testX)
"""
This loop looks for any Tweets with characters shorter than 2 and once found write the
index of that Tweet to an array so I can remove from the Dataframe of sentiment and the
list of Tweets later
"""
indexOfBlankStrings = []
for index, string in enumerate(xData):
if len(string) < 2:
indexOfBlankStrings.append(index)
for row in indexOfBlankStrings:
dataY.drop(row, axis=0, inplace=True)
"""
This makes a BOW model out of all the tweets then creates a
vector for each of the tweets containing all the words from
the BOW model, each vector is the same size becuase the
network expects it
"""
def vectorise(tokenizer, list):
return tokenizer.fit_on_texts(list)
#Make BOW model and vectorise it
t = Tokenizer(lower=False, num_words=1000)
t.fit_on_texts(dataX.iloc[:,0].tolist())
t.fit_on_texts(dataX.iloc[:,0].tolist())
"""
Here im experimenting with multiple layers of the total
amount of words in the syllabus divided by ^2 - This
has given me quite accurate results compared to random guess's
of amount of neron's.
"""
l1 = int(xData.shape[0] / 4) #To big for my GPU
l2 = int(xData.shape[0] / 8) #To big for my GPU
l3 = int(xData.shape[0] / 16)
l4 = int(xData.shape[0] / 32)
l5 = int(xData.shape[0] / 64)
l6 = int(xData.shape[0] / 128)
#Make the model
model = Sequential()
model.add(Dense(l1, input_dim=xData.shape[1]))
model.add(Dropout(0.15))
model.add(Dense(l2))
model.add(Dropout(0.2))
model.add(Dense(l3))
model.add(Dropout(0.2))
model.add(Dense(l4))
model.add(Dense(1, activation='relu'))
#Compile the model
model.compile(optimizer='RMSProp', loss='binary_crossentropy', metrics=['acc'])
"""
This here will use multiple batches to train the model.
startIndex:
This is the starting index of the array for which you want to
start training the network from.
dataRange:
The number of elements use to train the network in each batch so
since dataRange = 1000 this mean it goes from
startIndex...dataRange OR 0...1000
amountOfEpochs:
This is kinda self explanitory, the more Epochs the more it
is supposed to learn AKA updates the optimisation algo numbers
"""
amountOfEpochs = 1
dataRange = 1000
startIndex = 0
def generator(tokenizer, data, labels, totalSize=maxSamples, startIndex=0):
l = labels.as_matrix()
while True:
for i in range(startIndex, totalSize):
batch_features = tokenizer.texts_to_matrix(xData.iloc[i])
batch_labels = l[i]
yield batch_features, batch_labels
derp = generator(t, data=xData, labels=dataY)
##This runs the model for batch AKA load a little them process then load a little more
for amountOfData in range(1000, maxSamples, 1000):
#(loss, acc) = model.train_on_batch(x=dim[startIndex:amountOfData], y=np.asarray(dataY.iloc[startIndex:amountOfData]))
history = model.fit_generator(generator=generator(tokenizer=t,
data=xData,
labels=dataY),
steps_per_epoch=1,
epochs=10)
Thanks
The problem you are having is that the number of samples in your input array, do not equal the number of samples in your target array. This means the number of rows in you matrices do not match. The problems stems from your generator function. You index the data as
batch_labels = l[i]
which is only returning one sample (row of matrix). When instead it should be something like...
batch_labels = l[i:i+1000]
However there are other problems with your use of the fit_generator. You should not be using this within a loop. I don't see how it is benefiting the program, and calling the fit_generator in a loop defeats the purpose of using a generator. The function you would use to train an an individual batch of data would be
train_on_batch()
as seen in the docs

Resources