I am using Keras with a TensorFlow-GPU backend on an Ubuntu 17.04 VM.
I have created a custom generator to read inputs and classes from pickle files, but it keeps throwing the following error:
terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
The code for loading data can be seen here:
def data_gen(self, pklPaths, batch_size=16):
    while True:
        data = []
        labels = []
        for i, pklPath in enumerate(pklPaths):
            # print(pklPath)
            image = pickle.load(open(pklPath, 'rb'))
            for i in range(batch_size):
                # Set a label
                data.append(image[0][0])
                labels.append(image[1][1])
            yield np.array(data), np.array(labels)
Then, in the training section, I'm using fit_generator:
vm_model.fit_generator(vm.data_gen(pkl_train), validation_data=vm.data_gen(pkl_validate),
                       epochs=15, verbose=2, steps_per_epoch=(5000/16),
                       validation_steps=(1000/16), callbacks=[tb])
The generator should have better memory management than loading everything at once, but that doesn't seem to be the case! Any ideas?
OK, so I found the issue, so I'm answering my own question.
Basically, the previous version had an unnecessary inner loop and never cleared data and labels, so their size kept growing and the generator was essentially loading the entire dataset into memory:
def data_gen(self, pklPaths, batch_size=16):
    while True:
        data = []
        labels = []
        for i, pklPath in enumerate(pklPaths):
            # load pickle (with a context manager so file handles are closed)
            with open(pklPath, 'rb') as f:
                image = pickle.load(f)
            # append
            data.append(image[0][0])
            labels.append(image[1])
            # if the batch is complete, yield data and labels and reset;
            # (i + 1) % batch_size avoids an off-by-one that would make the
            # first batch one item larger than the rest
            if (i + 1) % batch_size == 0:
                yield np.array(data), np.array(labels)
                data.clear()
                labels.clear()
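For completeness, a minimal sketch of driving the fixed generator with the same fit_generator call as above; note that in Python 3 the step counts should be integers, hence the floor division:
import math  # not strictly needed here, but useful if the counts aren't multiples of 16

vm_model.fit_generator(vm.data_gen(pkl_train),
                       validation_data=vm.data_gen(pkl_validate),
                       epochs=15, verbose=2,
                       steps_per_epoch=5000 // 16,
                       validation_steps=1000 // 16,
                       callbacks=[tb])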
I am trying to generate text in a certain style using TensorFlow, and even when I copy and paste the code from the TensorFlow website it only predicts unknown characters, even though there is a mask to prevent this. I'm thinking it has something to do with the version of Python I'm running (3.9.15), since it doesn't work even with their code and dataset.
Has anyone else run into this issue?
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')] * len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # Convert from token ids to characters.
        predicted_chars = self.chars_from_ids(predicted_ids)

        # Return the characters and model state.
        return predicted_chars, states
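For context, the tutorial drives this class with a loop roughly like the following (assuming model, chars_from_ids, and ids_from_chars are built as in the tutorial):
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode('utf-8'))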
I lifted this straight from their tutorial and tried to run it in my environment, and it just predicted '[UNK]'.
I'm running a Mac M1 with the latest version of TensorFlow, so that may also be an issue.
PLEASE HELP!!
Tutorial for reference: https://www.tensorflow.org/text/tutorials/text_generation
Is there a way I can visualize the complete training loop for a GAN architecture in TensorBoard using PyTorch? I think it's possible using TF, but I am having a hard time figuring out how to do it with PyTorch.
You can use TensorboardX for this.
You can make use of SummaryWriter from TensorboardX to create an event file in a given directory and add summaries and events to it.
The code below is an example that you can use, but you have to add in the loss values, the ground-truth images, and the generated images yourself. I commented where they would have to go.
import logging

import numpy as np
import torchvision.utils as vutils
from tensorboardX import SummaryWriter

log = logging.getLogger(__name__)

REPORT_EVERY_ITER = 100
SAVE_IMAGE_EVERY_ITER = 1000

if __name__ == "__main__":
    writer = SummaryWriter()
    gen_losses = []
    dis_losses = []
    iter_no = 0
    # loop over the batches (iterate_batches, envs, and gen_output_v come
    # from your own training setup)
    for batch_v in iterate_batches(envs):
        # get the outputs
        # get the generator's loss
        # get the discriminator's loss
        iter_no += 1
        # log the loss values for both the generator and the discriminator
        # every 100 iterations
        if iter_no % REPORT_EVERY_ITER == 0:
            log.info(
                "Iter %d: gen_loss=%.3e, dis_loss=%.3e",
                iter_no,
                np.mean(gen_losses),
                np.mean(dis_losses),
            )
            writer.add_scalar("gen_loss", np.mean(gen_losses), iter_no)
            writer.add_scalar("dis_loss", np.mean(dis_losses), iter_no)
            gen_losses = []
            dis_losses = []
        # save the images produced by the generator alongside the ground
        # truth, every 1000 iterations
        if iter_no % SAVE_IMAGE_EVERY_ITER == 0:
            # the generated images from the generator
            writer.add_image(
                "fake",
                vutils.make_grid(gen_output_v.data[:64], normalize=True),
                iter_no,
            )
            # the ground-truth images; these will be the same throughout
            # the cycle
            writer.add_image(
                "real",
                vutils.make_grid(batch_v.data[:64], normalize=True),
                iter_no,
            )
To view the results, just run the command tensorboard --logdir runs in the same directory where you ran the model training (runs contains the results from the training). A link will be shown which you can open to view plots such as the ones below. If you want to run TensorBoard on a remote server, add the --bind_all flag to the command line to make it accessible from outside.
[Screenshot: viewing the generated images]
[Screenshot: viewing the loss values]
How can I use tfrecord files with PyTorch?
I have downloaded the YouTube8M dataset with video-level features, but it is stored in tfrecord format.
I tried to read some samples from these files to convert them to numpy and then load them in PyTorch, but it failed.
reader = YT8MAggregatedFeatureReader()
files = tf.gfile.Glob("/Data/youtube8m/train*.tfrecord")
filename_queue = tf.train.string_input_producer(
    files, num_epochs=5, shuffle=True)
training_data = [
    reader.prepare_reader(filename_queue) for _ in range(1)
]
unused_video_id, model_input_raw, labels_batch, num_frames = tf.train.shuffle_batch_join(
    training_data,
    batch_size=1024,
    capacity=1024 * 5,
    min_after_dequeue=1024,
    allow_smaller_final_batch=True,
    enqueue_many=True)

with tf.Session() as sess:
    label_numpy = labels_batch.eval()
    print(type(label_numpy))
But this step produces no result; it just gets stuck for a long while without any response.
One workaround is to use TensorFlow 1.1x eager mode or TensorFlow 2+ to loop through the dataset (so you can use variable-length features and bucket windows), and then just call
torch.as_tensor(val.numpy()).to(device) to use the values in torch.
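A minimal sketch of that workaround in TF2 eager mode. The feature names and sizes here (mean_rgb, labels) are assumptions based on the video-level YouTube8M schema, so verify them against the actual records:
import tensorflow as tf
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assumed video-level schema; check the real feature names and sizes.
feature_spec = {
    "mean_rgb": tf.io.FixedLenFeature([1024], tf.float32),
    "labels": tf.io.VarLenFeature(tf.int64),
}

def parse(example):
    return tf.io.parse_single_example(example, feature_spec)

files = tf.io.gfile.glob("/Data/youtube8m/train*.tfrecord")
ds = tf.data.TFRecordDataset(files).map(parse)
for ex in ds.take(1):
    # eager tensors expose .numpy(), so conversion to torch is direct
    rgb = torch.as_tensor(ex["mean_rgb"].numpy()).to(device)
    labels = torch.as_tensor(tf.sparse.to_dense(ex["labels"]).numpy()).to(device)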
You can use the DALI library to load the tfrecords directly in PyTorch code.
You can find out how to do it in their documentation.
Maybe this can help you: TFRecord reader for PyTorch
I cooked up this:
class LiTS(torch.utils.data.Dataset):
    def __init__(self, filenames):
        self.filenames = filenames

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        volume, segmentation = None, None
        if idx >= len(self):
            raise IndexError()
        # read_tfrecord is the user-defined parsing function for these records
        ds = tf.data.TFRecordDataset(self.filenames[idx:idx + 1])
        for x, y in ds.map(read_tfrecord):
            volume = torch.from_numpy(x.numpy())
            segmentation = torch.from_numpy(y.numpy())
        return volume, segmentation
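A hypothetical usage sketch, assuming filenames holds the .tfrecord paths; keeping num_workers=0 avoids forking the TensorFlow runtime into DataLoader workers:
loader = torch.utils.data.DataLoader(LiTS(filenames), batch_size=1, num_workers=0)
for volume, segmentation in loader:
    ...  # train on the batch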
ERROR MESSAGE:
NotFoundError: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:
""" PLACEHOLDER """
x=tf.placeholder(tf.float32,shape=[None,784])
y_true=tf.placeholder(tf.float32,shape=[None,4])
Weight=tf.global_variables()
bias=tf.global_variables()
""" LAYERS:"""
"the image input layer 28*28 input"
x_image = tf.reshape(x,[-1,28,28,1])
"""first convolution, using 6*6 filter, and then max pooling 2by2, final
output will have depth 32
here we are calculating 32 features
"""
convo_1 = convolutional_layer(x_image,shape=[6,6,1,32])
convo_1_pooling = max_pool_2by2(convo_1)
"""first convolution, using 6*6 filter, and then max pooling 2by2, final
output will have 64
features hence depth 64
"""
convo_2 = convolutional_layer(convo_1_pooling,shape=[6,6,32,64])
convo_2_pooling = max_pool_2by2(convo_2)
"flattening the output of the last pooling layer to fuly connect it"
convo_2_flat = tf.reshape(convo_2_pooling,[-1,7*7*64])
Wt,bs=normal_full_layer(convo_2_flat,1024)#1024 nodes in the final layer
full_layer_one = tf.nn.relu(tf.matmul(convo_2_flat,Wt)+bs)
# NOTE THE PLACEHOLDER HERE!
hold_prob = tf.placeholder(tf.float32)
full_one_dropout = tf.nn.dropout(full_layer_one,keep_prob=hold_prob)
Weight, bias= normal_full_layer(full_one_dropout,2)
y_pred=tf.matmul(full_one_dropout,Weight)+bias
"""INITIALIZE VARIABLES"""
init = tf.global_variables_initializer()
"""Add ops to save and restore all the variables"""
saver = tf.train.Saver()
"""Later, launch the model, use the saver to restore variables from disk,
and
# do some work with the model"""
with tf.Session() as sess:
# Restore variables from disk.
saver.restore(sess,SAVE_PATH)
print("Model restored.")
If you are using Spyder, restart the kernel! That made it work for me.
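A likely reason this helps (an assumption on my part, not stated in the answer): in Spyder the default graph persists between runs, so re-running the script piles duplicate variables into it and the checkpoint no longer matches the graph. Resetting the default graph before rebuilding it has the same effect as restarting the kernel:
import tensorflow as tf

tf.reset_default_graph()  # clear variables left over from a previous run
# ... rebuild the graph exactly as above, then:
# saver.restore(sess, SAVE_PATH)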
I want to be able to load an existing TensorFlow network simultaneously from several processes using the multiprocessing library, to do inference on different cores at the same time.
def spawn_process(x):
    g = tf.Graph()
    sess = tf.Session(graph=g)
    with g.as_default():
        meta_graph = tf.train.import_meta_graph('model.meta')
        meta_graph.restore(sess, tf.train.latest_checkpoint(chkp_dir))
        x_ph = tf.get_collection('x')[0]     # placeholder tensor used to pass in new x's to do inference on
        pred = tf.get_collection('pred')[0]  # tensor responsible for computing the prediction given x
        prediction = sess.run(pred, feed_dict={x_ph: x})
    return prediction
This is basically the function I want to pass to Pool.map to do inference in parallel.
The above function works with plain map, and it looks like this:
predictions = list(map(spawn_process, range(10)))
predictions = [spawn_process(x) for x in range(10)]
Both of the above work as expected.
But when I try to do this, it fails: each process just hangs right before the meta_graph.restore line, and I'm stumped.
p = multiprocess.Pool(4)
predictions = p.map(spawn_process, range(10))
p.close()
p.join()
I don't know why this isn't working with TensorFlow; this approach normally works for me when I do any sort of computation in parallel. It stops right before the meta_graph.restore line and all the processes hang there.
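One thing worth checking (an assumption on my part, not confirmed in the thread): on Linux, Pool forks the parent process, and a TensorFlow runtime that has already been initialized in the parent often deadlocks in forked children. Creating the pool with the spawn start method gives each worker a fresh interpreter:
import multiprocessing as mp

if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # fresh interpreter per worker, no forked TF state
    with ctx.Pool(4) as p:
        predictions = p.map(spawn_process, range(10))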