Getting batches in tensorflow - python-3.x

This question may have been asked but I failed to find it.
What is the simplest way to constantly get batches of data from a dataset? Is there a built-in tensorflow function to do so?
for instance:
for i in range(num_trains):
    x_batch, y_batch = get_batch(x_train, y_train, batch_size)
    sess.run(train_step, feed_dict={x: x_batch, y: y_batch})
If there is no such built-in function, how would you implement it? I tried it myself, but I could not figure out how to get a new batch, different from the previous ones, each time the function is called.
Thanks!

You can try:
# Feed batch data: yields consecutive, non-overlapping batches
# (a trailing partial batch is dropped)
def get_batch(inputX, inputY, batch_size):
    duration = len(inputX)
    for i in range(0, duration // batch_size):
        idx = i * batch_size
        yield inputX[idx:idx + batch_size], inputY[idx:idx + batch_size]
You can also use TensorFlow's Dataset API:
dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
dataset = dataset.batch(batch_size)
Getting the batch:
X = np.arange(100)
Y = X
batch = get_batch(X, Y, 5)
batch_x, batch_y = next(batch)
print(batch_x, batch_y)
#[0 1 2 3 4] [0 1 2 3 4]
batch_x, batch_y = next(batch)
print(batch_x, batch_y)
#[5 6 7 8 9] [5 6 7 8 9]
Typically, to run over the dataset for multiple epochs, you would do:
for epoch in range(num_epochs):
    for x_batch, y_batch in get_batch(x_train, y_train, batch_size):
        sess.run(train_step, feed_dict={x: x_batch, y: y_batch})
The inner loop already makes size_of_dataset // batch_size steps per epoch, since that is how many batches the generator yields.
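The generator always yields batches in the same order. If you also want a fresh order each epoch (not part of the original answer, just a common addition), you can shuffle both arrays with the same permutation before each pass:
import numpy as np

for epoch in range(num_epochs):
    # reshuffle inputs and labels together each epoch
    perm = np.random.permutation(len(x_train))
    x_train, y_train = x_train[perm], y_train[perm]
    for x_batch, y_batch in get_batch(x_train, y_train, batch_size):
        sess.run(train_step, feed_dict={x: x_batch, y: y_batch})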
Using the dataset API:
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
dataset = dataset.batch(5)
iterator = dataset.make_initializable_iterator()
train_x, train_y = iterator.get_next()
with tf.Session() as sess:
    sess.run(iterator.initializer)
    for i in range(2):
        print(sess.run([train_x, train_y]))
# [array([0, 1, 2, 3, 4]), array([0, 1, 2, 3, 4])]
# [array([5, 6, 7, 8, 9]), array([5, 6, 7, 8, 9])]
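For multiple epochs with the Dataset API, a minimal sketch (assuming TF 1.x, where make_initializable_iterator and tf.Session exist) is to add shuffle and repeat, then iterate until the data runs out:
num_epochs = 10
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
dataset = dataset.shuffle(buffer_size=len(X))  # reshuffles each epoch
dataset = dataset.batch(5)
dataset = dataset.repeat(num_epochs)
iterator = dataset.make_initializable_iterator()
train_x, train_y = iterator.get_next()
with tf.Session() as sess:
    sess.run(iterator.initializer)
    while True:
        try:
            sess.run([train_x, train_y])  # replace with your train_step
        except tf.errors.OutOfRangeError:
            break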

Related

Keras: How to define input shape for 1st DENSE layer?

I am new to deep learning & Keras.
Refer to the code below.
I want to confirm the terminology. Before the TimeseriesGenerator creates batches, there are 10 samples. After processing by the generator, am I correct to say there are 8 samples in 1 batch?
I don't understand why yhat differs when I define the 1st layer's input shape with 'input_shape' vs 'input_dim'. yhat should be (1, 1), a single value.
If instead I use a simple RNN layer as my 1st layer, what should the input shape be?
Thank you
# univariate one step problem with mlp
from numpy import array
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.sequence import TimeseriesGenerator
# define dataset
series = array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # 10 samples before processing by the Generator
# define generator
timestep = 2
generator = TimeseriesGenerator(series, series, length=timestep, batch_size=8)
# number of batches
print('Batches: %d' % len(generator))
# OUT --> Batches: 1
# print each batch
for i in range(len(generator)):
    x, y = generator[i]
    print('%s => %s' % (x, y))
#OUT:
[[1 2]
 [2 3]
 [3 4]
 [4 5]
 [5 6]
 [6 7]
 [7 8]
 [8 9]] => [ 3 4 5 6 7 8 9 10]
#After processing by the Generator, there are 8 samples in 1 batch.
x, y = generator[0]
print(x.shape)
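# (8, 2): 8 samples, each a window of 2 time steps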
# define model
model = Sequential()
# TensorFlow assumes the first dimension is the batch_size, which can have
# any size, so you don't need to define it. The 2nd dim is the number of
# time steps; the 3rd dim is the number of features.
#1st LAYER with input shape defined by input_shape
#model.add(Dense(100, activation='relu', input_shape= (timestep,1)))
#1st LAYER with input shape defined by input_dim
model.add(Dense(100, activation='relu', input_dim=timestep))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit_generator(generator, steps_per_epoch=1, epochs=200, verbose=0)
# make a one step prediction out of sample
x_input = array([9, 10]).reshape((1, timestep))
print(x_input.shape)
yhat = model.predict(x_input, verbose=0)
print(yhat)
# OUT: [[9.3066435, 10.239568]] if 1st layer's shape is input_shape
# OUT: [[11.545249]] if 1st layer's shape is input_dim
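Not an answer from the original page, but a likely explanation of the difference: in Keras, a Dense layer acts on the last axis of its input. With input_shape=(timestep, 1) the model sees inputs of shape (batch, 2, 1) and the final Dense(1) produces shape (batch, 2, 1), i.e. two values per sample, while input_dim=timestep means inputs of shape (batch, 2) and a single output per sample. A quick way to check the output shapes (a sketch using the same layer definitions):
from keras.models import Sequential
from keras.layers import Dense

m1 = Sequential([Dense(100, activation='relu', input_shape=(2, 1)), Dense(1)])
m2 = Sequential([Dense(100, activation='relu', input_dim=2), Dense(1)])
print(m1.output_shape)  # (None, 2, 1): Dense keeps the time axis
print(m2.output_shape)  # (None, 1): one value per sample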

How to make manual batch in pytorch with MNIST dataset

I am new to pytorch.
I am trying to use a different approach when training on the MNIST dataset in PyTorch.
Normally, we put all shuffled numbers in each batch.
What I want to do (assuming my batch size is 20 and there are 1000 training examples) is make the batches look like this (I wrote only the labels for simplicity):
batch_1 = [0, 0, 1, 0, ... ] only 0 and 1 for this batch
batch_2 = [6, 6, 7, 7, ... ] only 6 and 7 for this batch
batch_3 = [2, 3, 2, 2, ... ] only 2 and 3 for this batch
batch_4 = [4, 5, 5, 4, ... ] only 4 and 5 for this batch
batch_5 = [6, 6, 7, 7, ... ] only 6 and 7 for this batch
batch_6 = [8, 9, 9, 9, ... ] only 8 and 9 for this batch
batch_7 = [2, 3, 2, 2, ... ] only 2 and 3 for this batch
batch_8 = [4, 5, 5, 4, ... ] only 4 and 5 for this batch
...
batch_50 = [4, 5, 5, 4, ... ] only 4 and 5 for this batch
How can I do this in PyTorch?
Thank you
So far this is my implementation for training / testing.
I will expand/manipulate this code below to develop the above.
import torch
from torch.utils.data import DataLoader, Subset
import torchvision.datasets as datasets
from torchvision import transforms
import numpy as np
mnist_trainset = datasets.MNIST(root='./data', train=True,
                                download=True,
                                transform=transforms.Compose([transforms.ToTensor()]))
mnist_testset = datasets.MNIST(root='./data',
                               train=False,
                               download=True,
                               transform=transforms.Compose([transforms.ToTensor()]))

class_inds = [torch.where(mnist_trainset.targets == class_idx)[0]
              for class_idx in mnist_trainset.class_to_idx.values()]

train_dataloaders = [
    DataLoader(dataset=Subset(mnist_trainset, inds),
               batch_size=50,
               shuffle=True,
               drop_last=False)
    for inds in class_inds
]

test_dataloader = torch.utils.data.DataLoader(mnist_testset,
                                              batch_size=50,
                                              shuffle=False)
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear_1 = torch.nn.Linear(784, 256)
        self.linear_2 = torch.nn.Linear(256, 10)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = x.reshape(x.size(0), -1)
        x = self.linear_1(x)
        x = self.sigmoid(x)
        pred = self.linear_2(x)
        return pred
model = Model()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 20
criterion = torch.nn.CrossEntropyLoss()
test_acc = list()

for epoch in range(epochs):
    print(f"epoch {epoch} started")
    model.train()
    itr = 0
    iterators = list(map(iter, train_dataloaders))
    while iterators:
        itr = itr + 1
        iterator = np.random.choice(iterators)
        try:
            image, label = next(iterator)
            optimizer.zero_grad()
            pred = model(image)
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step()
        except StopIteration:
            iterators.remove(iterator)

    model.eval()
    total = 0
    for itr, (image, label) in enumerate(test_dataloader):
        pred = model(image)
        loss = criterion(pred, label)
        # we now need softmax because we are testing.
        pred = torch.nn.functional.softmax(pred, dim=1)
        for i, p in enumerate(pred):
            if label[i] == torch.max(p.data, 0)[1]:
                total = total + 1
    accuracy = total / len(mnist_testset)
    # append accuracy here
    test_acc.append(accuracy)
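The code above builds one DataLoader per digit class, so each batch contains a single class. To get batches that mix exactly two classes, e.g. (0, 1), (2, 3), ..., as described in the question, one option (a sketch, not from the post) is to concatenate the index tensors of each class pair before building the Subsets:
# Group the per-class index tensors in pairs: (0,1), (2,3), (4,5), (6,7), (8,9)
pair_inds = [torch.cat((class_inds[c], class_inds[c + 1]))
             for c in range(0, len(class_inds), 2)]

pair_dataloaders = [
    DataLoader(dataset=Subset(mnist_trainset, inds),
               batch_size=20,  # batch size from the question
               shuffle=True)   # shuffling within the pair mixes both classes
    for inds in pair_inds
]
Replacing train_dataloaders with pair_dataloaders in the training loop then yields batches drawn from one class pair at a time.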

How to adjust the batch data by the amount of labels in PyTorch

I have made n-grams / doc-ids for document classification:
import torch
from torch.utils.data import TensorDataset

def create_dataset(tok_docs, vocab, n):
    n_grams = []
    document_ids = []
    for i, doc in enumerate(tok_docs):
        # all length-n windows of the document's tokens
        for n_gram in [doc[0][j:j + n] for j in range(len(doc[0]) - n + 1)]:
            n_grams.append(n_gram)
            document_ids.append(i)
    return n_grams, document_ids

def create_pytorch_datasets(n_grams, doc_ids):
    n_grams_tensor = torch.tensor(n_grams)
    doc_ids_tensor = torch.tensor(doc_ids)
    full_dataset = TensorDataset(n_grams_tensor, doc_ids_tensor)
    return full_dataset
create_dataset returns a pair of (n_grams, document_ids) like below:
n_grams, doc_ids = create_dataset( ... )
train_data = create_pytorch_datasets(n_grams, doc_ids)
>>> train_data[0:100]
(tensor([[2076, 517, 54, 3647, 1182, 7086],
         [517, 54, 3647, 1182, 7086, 1149],
         ...
        ]),
 tensor([0, 0, 0, 0, 0, ..., 3, 3, 3]))
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
The first tensor contains the n-grams and the second one the doc_ids.
But as you know, the amount of training data per label varies with the length of the documents.
If one document is very long, the training data contains a great many pairs carrying its label.
I think this can cause overfitting, because the classification model would tend to favor long documents.
So I want to draw input batches from a uniform distribution over the labels (doc_ids). How can I fix this in the code above?
P.S. If train_data looks like below, I want to sample each row with the probability shown next to it:
n-grams doc_ids
([1, 2, 3, 4], 1) ====> 0.33
([1, 3, 5, 7], 2) ====> 0.33
([2, 3, 4, 5], 3) ====> 0.33 * 0.25
([3, 5, 2, 5], 3) ====> 0.33 * 0.25
([6, 3, 4, 5], 3) ====> 0.33 * 0.25
([2, 3, 1, 5], 3) ====> 0.33 * 0.25
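In other words, each of the 3 documents should be picked with probability 1/3, split evenly among that document's rows. A quick sketch (not in the original post) computing such per-row weights from the doc_ids alone:
from collections import Counter

doc_ids = [1, 2, 3, 3, 3, 3]
counts = Counter(doc_ids)  # rows per document
weights = [1 / (len(counts) * counts[d]) for d in doc_ids]
print(weights)  # [0.333, 0.333, 0.0833, 0.0833, 0.0833, 0.0833]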
In PyTorch you can pass a sampler or a batch_sampler to the DataLoader to change how datapoints are sampled.
docs on the dataloader:
https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler
documentation on the sampler: https://pytorch.org/docs/stable/data.html#torch.utils.data.Sampler
For instance, you can use WeightedRandomSampler to give every datapoint a weight; the weight could be the inverse of the document's length.
I would make the following modifications to the code:
from torch.utils.data import DataLoader, WeightedRandomSampler

def create_dataset(tok_docs, vocab, n):
    n_grams = []
    document_ids = []
    weights = []  # << list of weights for sampling
    for i, doc in enumerate(tok_docs):
        for n_gram in [doc[0][j:j + n] for j in range(len(doc[0]) - n + 1)]:
            n_grams.append(n_gram)
            document_ids.append(i)
            weights.append(1 / len(doc[0]))  # << n-grams of long documents are sampled less often
    return n_grams, document_ids, weights

# << create the sampler; one epoch draws len(weights) samples with replacement
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

# << pass the sampler to the dataloader (shuffle must be False when a sampler is given)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, sampler=sampler)
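A quick sanity check (illustrative only, assuming the loader above has at least a few batches): draw some batches and count how often each doc_id appears; long documents should no longer dominate.
from collections import Counter

counts = Counter()
data_iter = iter(train_loader)
for _ in range(10):
    _, doc_ids_batch = next(data_iter)
    counts.update(doc_ids_batch.tolist())
print(counts)  # counts should be roughly uniform across doc_ids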

Mismatching shapes in Tensorflow

I am trying to do a multivariate linear regression and I am having some issues. Namely, I am getting the following error:
ValueError: Cannot feed value of shape (3,) for Tensor 'X:0', which has shape '(1, 3)'
I have 3 feature variables, which I call trainX and 1 label, which I call trainY. Their shapes are the following (they are numpy arrays):
trainX.shape:
(2500, 3)
trainY.shape:
(2500,)
The following piece of code defines the tensors that I use to compute the model:
X = tf.compat.v1.placeholder("float", [1, 3], name="X")
Y = tf.compat.v1.placeholder("float", [1], name="Y")
W = tf.Variable(tf.zeros([3, 1]), name="W")
b = tf.Variable(tf.zeros([1]), name="b")
I compute the predicted label, the cost function, and the optimizer as follows:
predicted_y = tf.matmul(X, W) + b
cost = tf.reduce_sum(tf.pow(predicted_y-Y, 2)) / (2 * n)
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate).minimize(cost)
I am getting the error in the TensorFlow session, namely in the following piece of code:
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(training_epochs):
        for (_x, _y) in zip(trainX, trainY):
            sess.run(optimizer, feed_dict={X: _x, Y: _y})
        if (epoch + 1) % 100 == 0:
            c = sess.run(cost, feed_dict={X: trainX, Y: trainY})
            print("Epoch", (epoch + 1), ": cost =", c, "W =", sess.run(W), "b =", sess.run(b))
    # Storing necessary values to be used outside the Session
    training_cost = sess.run(cost, feed_dict={X: trainX, Y: trainY})
    weight = sess.run(W)
    bias = sess.run(b)
Any help would be greatly appreciated.
The problem is that _x is a vector with 3 elements, while X expects a matrix with 1 row and 3 columns. One possible solution would be to reshape _x:
_x = np.reshape(_x, [1, 3])
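Note (not stated in the original answer, but likely needed as well): the label has the same problem, since Y was declared with shape [1] while _y is a scalar, so it would need the same treatment:
import numpy as np

for (_x, _y) in zip(trainX, trainY):
    _x = np.reshape(_x, [1, 3])  # (3,) -> (1, 3) to match placeholder X
    _y = np.reshape(_y, [1])     # scalar -> (1,) to match placeholder Y
    sess.run(optimizer, feed_dict={X: _x, Y: _y})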
Another possibility would be to change the placeholder to match the input shape (note that predicted_y would then also need adjusting, since tf.matmul expects 2-D inputs):
X = tf.compat.v1.placeholder("float", [3], name="X")
Often one wants to train on more than one example. In this case, you may want to define the placeholders to allow any number of inputs:
X = tf.compat.v1.placeholder("float", [None, 3], name="X")
Y = tf.compat.v1.placeholder("float", [None], name="Y")
Then we could, for example, use batches of 100:
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(training_epochs):
        for i in range(trainX.shape[0] // 100):  # number of full batches
            sess.run(optimizer, feed_dict={X: trainX[i*100:(i+1)*100, ...],
                                           Y: trainY[i*100:(i+1)*100]})
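With the None-shaped placeholders, evaluating the cost over the whole training set, as in the question's logging code, also works directly (illustrative):
c = sess.run(cost, feed_dict={X: trainX, Y: trainY})
print("cost over the full training set:", c)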

Tensorflow error: Dimensions must be equal

I have a dataset of 25000 colored pictures 100*100(*3), and I am trying to build a simple neural network with one convolutional layer. They are pictures of cells that are infected or not by malaria, so my output is of size 2.
But it seems like I have a dimension mismatch, and I don't know where my error comes from.
My neural network:
def simple_nn(X_training, Y_training, X_test, Y_test):
    input = 100*100*3
    batch_size = 25

    X = tf.placeholder(tf.float32, [batch_size, 100, 100, 3])
    #Was:
    # W = tf.Variable(tf.zeros([input, 2]))
    # b = tf.Variable(tf.zeros([2]))
    #Now:
    W = tf.Variable(tf.truncated_normal([4, 4, 3, 3], stddev=0.1))
    B = tf.Variable(tf.ones([3])/10) # What should I put here ??
    init = tf.global_variables_initializer()

    # model
    #Was:
    # Y = tf.nn.softmax(tf.matmul(tf.reshape(X, [-1, input]), W) + b)
    #Now:
    stride = 1  # output is still 100x100
    Ycnv = tf.nn.conv2d(X, W, strides=[1, stride, stride, 1], padding='SAME')
    Y = tf.nn.relu(Ycnv + B)

    # placeholder for correct labels
    Y_ = tf.placeholder(tf.float32, [None, 2])

    # loss function
    cross_entropy = -tf.reduce_sum(Y_ * tf.log(Y))

    # % of correct answers found in batch
    is_correct = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

    learning_rate = 0.00001
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(cross_entropy)

    sess = tf.Session()
    sess.run(init)
    #Training here...
My error:
Traceback (most recent call last):
  File "neural_net.py", line 135, in <module>
    simple_nn(X_training, Y_training, X_test, Y_test)
  File "neural_net.py", line 69, in simple_nn
    cross_entropy = -tf.reduce_sum(Y_ * tf.log(Y))
...
ValueError: Dimensions must be equal, but are 2 and 3 for 'mul' (op: 'Mul') with input shapes: [?,2], [25,100,100,3].
I used a simple layer before, and it was working. I changed my weights and biases, and honestly, I don't know why my biases are set up like this; I followed a tutorial (https://codelabs.developers.google.com/codelabs/cloud-tensorflow-mnist/#11) but it is not explained.
I also replaced my Y with a conv2d.
And I don't know what my output should be if I want to get a vector of size 2*1 as a result.
You have correctly defined your labels as
Y_ = tf.placeholder(tf.float32, [None, 2])
So the last dimension is 2. However, the output from the convolution step is not directly suitable for comparison with the labels. What I mean is the following: if you do
Ycnv = tf.nn.conv2d(X, W, strides=[1, stride, stride, 1], padding='SAME')
Y = tf.nn.relu(Ycnv + B)
this output is going to have four dimensions, as the error says:
ValueError: Dimensions must be equal, but are 2 and 3 for 'mul' (op: 'Mul') with input shapes: [?,2], [25,100,100,3].
So it is impossible to directly multiply (or otherwise combine) the output of the convolution with the labels. What I recommend is to flatten the output of the convolution (reshape it so each example becomes one row) and pass it to a fully connected layer of 2 units (as many units as you have classes), like this:
Y = tf.reshape(Y, [batch_size, -1])  # one flat feature row per example
logits = tf.layers.dense(Y, units=2)
and you can pass this to the loss.
Also, I recommend changing the loss to a more appropriate version; for example, tf.losses.sigmoid_cross_entropy.
Also, the way you use convolutions is strange. Why do you put handmade filters in the convolution? Besides, you would then have to initialize them yourself and add them to a collection beforehand. In conclusion, I recommend you delete all the following code:
W = tf.Variable(tf.truncated_normal([4, 4, 3, 3], stddev=0.1))
B = tf.Variable(tf.ones([3])/10) # What should I put here ??
init = tf.global_variables_initializer()

# model
#Was:
# Y = tf.nn.softmax(tf.matmul(tf.reshape(X, [-1, input]), W) + b)
#Now:
stride = 1  # output is still 100x100
Ycnv = tf.nn.conv2d(X, W, strides=[1, stride, stride, 1], padding='SAME')
Y = tf.nn.relu(Ycnv + B)
and substitute it with:
conv1 = tf.layers.conv2d(X, filters=64, kernel_size=3,
                         strides=1, padding='SAME',
                         activation=tf.nn.relu, name="conv1")
Also, the init = tf.global_variables_initializer() should be at the end of the graph construction because, if not, there will be variables it won't catch.
My final working code is:
def simple_nn():
    inp = 100*100*3
    batch_size = 2

    X = tf.placeholder(tf.float32, [batch_size, 100, 100, 3])
    # placeholder for correct labels
    Y_ = tf.placeholder(tf.float32, [None, 2])

    # model
    conv1 = tf.layers.conv2d(X, filters=64, kernel_size=3,
                             strides=1, padding='SAME',
                             activation=tf.nn.relu, name="conv1")
    Y = tf.reshape(conv1, [batch_size, -1])  # one flat feature row per example
    logits = tf.layers.dense(Y, units=2, activation=tf.nn.relu)

    # loss function
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_, logits=logits)
    loss = tf.reduce_mean(cross_entropy)

    # % of correct answers found in batch
    is_correct = tf.equal(tf.argmax(logits, 1), tf.argmax(Y_, 1))
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

    learning_rate = 0.00001
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        ...
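For reference, the elided training part inside the with block could look something like this (a sketch, not part of the original answer; get_image_batches is a hypothetical helper yielding image arrays of shape (batch_size, 100, 100, 3) and one-hot label arrays of shape (batch_size, 2)):
for epoch in range(num_epochs):
    for batch_x, batch_y in get_image_batches(X_training, Y_training, batch_size):
        _, batch_loss, batch_acc = sess.run([train_step, loss, accuracy],
                                            feed_dict={X: batch_x, Y_: batch_y})
    print("epoch", epoch, "loss:", batch_loss, "accuracy:", batch_acc)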
