TensorFlow multithreading image loading

So I have this toy example code:
import glob
from tqdm import tqdm
import tensorflow as tf
imgPaths = glob.glob("/home/msmith/imgs/*/*")  # Some images
filenameQ = tf.train.string_input_producer(imgPaths)
reader = tf.WholeFileReader()
key, value = reader.read(filenameQ)
img = tf.image.decode_jpeg(value)
init_op = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in tqdm(range(10000)):
        img.eval().mean()
which loads images and computes the mean of each one. How do I edit it so that the image-loading part is multithreaded? That part is currently the bottleneck in my TensorFlow image scripts.

EDIT (2018/3/5): It's now easier to get the same results using the tf.data API.
import glob
from tqdm import tqdm
import tensorflow as tf
imgPaths = glob.glob("/home/msmith/imgs/*/*")  # Some images
dataset = (tf.data.Dataset.from_tensor_slices(imgPaths)
           .map(lambda x: tf.reduce_mean(tf.image.decode_jpeg(tf.read_file(x))),
                num_parallel_calls=16)
           .prefetch(128))
iterator = dataset.make_one_shot_iterator()
next_mean = iterator.get_next()
with tf.Session() as sess:
    for i in tqdm(range(10000)):
        sess.run(next_mean)
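If you want several per-image means per sess.run() call rather than one, a .batch() stage can be slotted in before the prefetch. This is a hypothetical variant, not part of the original answer, and the batch size of 32 is arbitrary:
dataset = (tf.data.Dataset.from_tensor_slices(imgPaths)
           .map(lambda x: tf.reduce_mean(tf.image.decode_jpeg(tf.read_file(x))),
                num_parallel_calls=16)
           .batch(32)       # each element is now a vector of 32 per-image means
           .prefetch(128))  # prefetch batches rather than individual elements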
As sygi suggests in their comment, a tf.train.QueueRunner can be used to define some ops that run in a separate thread, and (typically) enqueue values into a TensorFlow queue.
import glob
from tqdm import tqdm
import tensorflow as tf
imgPaths = glob.glob("/home/msmith/imgs/*/*")  # Some images
filenameQ = tf.train.string_input_producer(imgPaths)
# Define a subgraph that takes a filename, reads the file, decodes it, and
# enqueues it.
filename = filenameQ.dequeue()
image_bytes = tf.read_file(filename)
decoded_image = tf.image.decode_jpeg(image_bytes)
image_queue = tf.FIFOQueue(128, [tf.uint8], None)
enqueue_op = image_queue.enqueue(decoded_image)
# Create a queue runner that will enqueue decoded images into `image_queue`.
NUM_THREADS = 16
queue_runner = tf.train.QueueRunner(
    image_queue,
    [enqueue_op] * NUM_THREADS,  # Each element will be run from a separate thread.
    image_queue.close(),
    image_queue.close(cancel_pending_enqueues=True))
# Ensure that the queue runner threads are started when we call
# `tf.train.start_queue_runners()` below.
tf.train.add_queue_runner(queue_runner)
# Dequeue the next image from the queue, for returning to the client.
img = image_queue.dequeue()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in tqdm(range(10000)):
        img.eval().mean()
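If you also want the dequeue side to return whole batches, note that FIFOQueue.dequeue_many() requires fully defined element shapes, so the images would have to be decoded with a fixed channel count and resized to a fixed size before enqueueing. A hypothetical sketch, not part of the original answer (the 224x224 size and batch size of 32 are arbitrary):
# Fix the element shape so that dequeue_many() can be used.
decoded_image = tf.image.decode_jpeg(image_bytes, channels=3)
resized_image = tf.image.resize_images(decoded_image, [224, 224])      # float32, shape [224, 224, 3]
image_queue = tf.FIFOQueue(128, [tf.float32], shapes=[[224, 224, 3]])
enqueue_op = image_queue.enqueue(resized_image)
# ...build and register the queue runner exactly as above...
img_batch = image_queue.dequeue_many(32)                               # shape [32, 224, 224, 3]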

Related

GPU not used on d3rlpy

I am new to using d3rlpy for offline RL training; it builds on PyTorch. So I installed the CUDA 11.6 build as recommended in the PyTorch docs: pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116. I installed d3rlpy afterwards and ran the following sample code:
from d3rlpy.algos import BC, DDPG, CRR, PLAS, PLASWithPerturbation, TD3PlusBC, IQL
import d3rlpy
import numpy as np
import glob
import time
# models
continuous_models = {
    "BehaviorCloning": BC,
    "DeepDeterministicPolicyGradients": DDPG,
    "CriticRegularizedRegression": CRR,
    "PolicyLatentActionSpace": PLAS,
    "PolicyLatentActionSpacePerturbation": PLASWithPerturbation,
    "TwinDelayedPlusBehaviorCloning": TD3PlusBC,
    "ImplicitQLearning": IQL,
}
# load dataset; data_batch is created as a *.h5 file with d3rlpy
dataset = d3rlpy.dataset.MDPDataset.load(data_batch)
# preprocess
mean = np.mean(dataset.observations, axis=0, keepdims=True)
std = np.std(dataset.observations, axis=0, keepdims=True)
scaler = d3rlpy.preprocessing.StandardScaler(mean=mean, std=std)
# test models
for _model in continuous_models:
    the_model = continuous_models[_model](scaler=scaler)
    the_model.use_gpu = True
    the_model.build_with_dataset(dataset)
    the_model.fit(dataset=dataset.episodes,
                  n_steps_per_epoch=10800,
                  n_steps=54000,
                  logdir='./logs',
                  experiment_name=f"{_model}",
                  tensorboard_dir='logs',
                  save_interval=900,  # we don't want to save intermediate parameters
                  )
    # save model
    the_timestamp = int(time.time())
    the_model.save_model(f"./models/{_model}/{_model}_{the_timestamp}.pt")
The issue is that none of the models actually uses the GPU, despite use_gpu = True being set. With a PyTorch sample script that calls torch.cuda.current_device() I can see that PyTorch is set up correctly and detects the GPU. Any idea where to look to solve this? I am not sure this is a bug in d3rlpy, so I don't want to open an issue on GitHub yet :)
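One detail that may be worth checking (a hedged suggestion, not a confirmed fix): in d3rlpy 1.x, use_gpu is a constructor argument, so passing it when the algorithm object is created, rather than assigning the attribute afterwards, may behave differently. A minimal sketch:
# Hypothetical variant: request the GPU at construction time (verify against the
# installed d3rlpy version) instead of setting the attribute afterwards.
the_model = continuous_models[_model](scaler=scaler, use_gpu=True)
the_model.build_with_dataset(dataset)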

PyTorch cannot iterate the dataset when using multiple processes

When I use Process from the torch.multiprocessing package to spawn multiple processes, the DistributedSampler instance cannot iterate the dataset in each process.
import torch
import torchvision
from torch.multiprocessing import Process
import torch.distributed as dist
import torch.multiprocessing as mp
import os
from torchvision import transforms
from torchvision.transforms import RandomResizedCrop
print(torchvision.__version__)
print(torch.__version__)
mnist_transform = transforms.Compose([transforms.ToTensor(),
                                      RandomResizedCrop(224),
                                      transforms.Normalize(mean=[0.5], std=[0.5])])
train_data = torchvision.datasets.MNIST('data', train=True, transform=mnist_transform, download=True)
# train_data = torchvision.datasets.CIFAR10('data', train=True, download=True, transform=mnist_transform)
def main_fun(rank, world_size):
    print('rank=', rank)
    print('world_size=', world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist_backend = 'nccl'
    dist_url = 'env://'
    dist.init_process_group(backend=dist_backend, init_method=dist_url,
                            world_size=world_size, rank=rank)
    dist.barrier()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_batch_sampler = torch.utils.data.BatchSampler(
        train_sampler, batch_size=16, drop_last=True)
    print(len(train_data), len(train_batch_sampler), train_batch_sampler)
    for i in train_sampler:  # train_batch_sampler
        print(type(i), i)
    print('finished...')
def main():
    world_size = 2
    # mp.spawn(main_fun,
    #          args=(world_size,),
    #          nprocs=world_size,
    #          join=True)
    processes = []
    for rank in range(world_size):
        p = Process(target=main_fun, args=(rank, world_size))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
if __name__ == '__main__':
    main()
This is the output:
0.11.3
1.10.2
rank= 0
world_size= 2
rank= 1
world_size= 2
60000 1875 <torch.utils.data.sampler.BatchSampler object at 0x7f7848f5bb80>
60000 1875 <torch.utils.data.sampler.BatchSampler object at 0x7f7848f5ba90>
It cannot iterate the dataset and never prints 'finished...', so I think the processes were killed.
Strangely enough, when I use the validation dataset, such as
train_data = torchvision.datasets.MNIST('data', train=False, transform=mnist_transform, download=True)
it works.
It also works when I move the definition of train_data (with train=True) into main_fun().
Furthermore, it works when I use mp.spawn() to spawn the processes; see the sketch below. I think that may be the standard solution, but I want to keep using Process() and understand why this happens.
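For reference, a minimal sketch of the two variants the question itself reports as working: the dataset is built inside the worker rather than at module level, and the workers are launched with mp.spawn(). It assumes the same two-process NCCL setup as above (substitute 'gloo' to try it on CPU):
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torchvision
from torchvision import transforms
def main_fun(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=world_size, rank=rank)
    mnist_transform = transforms.Compose([transforms.ToTensor(),
                                          transforms.Normalize(mean=[0.5], std=[0.5])])
    # The dataset is created inside the spawned process, not at module import time.
    train_data = torchvision.datasets.MNIST('data', train=True,
                                            transform=mnist_transform, download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    for i in train_sampler:
        pass
    print('rank', rank, 'finished, sampler length', len(train_sampler))
if __name__ == '__main__':
    world_size = 2
    mp.spawn(main_fun, args=(world_size,), nprocs=world_size, join=True)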

Why the output of the model is different in PyTorch

I have a simple model with just one linear layer.
model = torch.nn.Linear(1,1).to(device)
x_train1 = torch.FloatTensor([[1], [2], [3]])
out = model(x_train1)
print(out)
But whenever I run this code, the printed output is different. I also set these random seeds:
import random
import torch
import numpy as np
random_seed=76
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed) # if use multi-GPU
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)
I want to know why the output keeps changing when the code is run.
You must set the seed every time you run the code, right before constructing the model, if you want to get the same result: torch.nn.Linear draws its initial weight and bias from the global random number generator when it is constructed, so the RNG state at that moment determines the output.
import torch
def my_func(device: str, seed: int):
    torch.manual_seed(seed)
    model = torch.nn.Linear(1, 1).to(device)
    x_train1 = torch.FloatTensor([[1], [2], [3]])
    out = model(x_train1)
    print(out)
# Whenever you run the function you'll get the same result!
my_func(device="cpu", seed=76)
# tensor([[0.3573],
#         [0.5021],
#         [0.6470]], grad_fn=<AddmmBackward>)
my_func(device="cpu", seed=76)
# tensor([[0.3573],
#         [0.5021],
#         [0.6470]], grad_fn=<AddmmBackward>)

How to train an image similarity model on 20 million images (total size 10GB)?

My system has 16GB of RAM. I have tried to train an image similarity model on 20 million images (total size 10GB) using VGG19 features and a KNN nearest-neighbor search. When I try to read the images I get a MemoryError. I even tried with only 200,000 images (total size 770MB), but the issue is the same. How can I read millions of images to train ML models?
Ubuntu 18.04.2 LTS, Core™ i7, Intel® HD Graphics 5500 (Broadwell GT2), 64-bit, 16GB RAM
import os
import skimage.io
import tensorflow as tf
from skimage.transform import resize
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from sklearn import manifold
import pickle
skimage.io.use_plugin('matplotlib')
dirPath = 'train_data'
args = [os.path.join(dirPath, filename) for filename in os.listdir(dirPath)]
imgs_train = [skimage.io.imread(arg, as_gray=False) for arg in args]
shape_img = (130, 130, 3)
model = tf.keras.applications.VGG19(weights='imagenet', include_top=False,
                                    input_shape=shape_img)
model.summary()
shape_img_resize = tuple([int(x) for x in model.input.shape[1:]])
input_shape_model = tuple([int(x) for x in model.input.shape[1:]])
output_shape_model = tuple([int(x) for x in model.output.shape[1:]])
n_epochs = None
def resize_img(img, shape_resized):
    img_resized = resize(img, shape_resized,
                         anti_aliasing=True,
                         preserve_range=True)
    assert img_resized.shape == shape_resized
    return img_resized
def normalize_img(img):
    return img / 255.
def transform_img(img, shape_resize):
    img_transformed = resize_img(img, shape_resize)
    img_transformed = normalize_img(img_transformed)
    return img_transformed
def apply_transformer(imgs, shape_resize):
    imgs_transform = [transform_img(img, shape_resize) for img in imgs]
    return imgs_transform
imgs_train_transformed = apply_transformer(imgs_train, shape_img_resize)
X_train = np.array(imgs_train_transformed).reshape((-1,) + input_shape_model)
E_train = model.predict(X_train)
E_train_flatten = E_train.reshape((-1, np.prod(output_shape_model)))
knn = NearestNeighbors(n_neighbors=5, metric="cosine")
knn.fit(E_train_flatten)
Keras works well with generators, so you should consider using one:
python generator tutorial,
using a generator with keras (example)
A generator lets you load your images during training, batch by batch, instead of holding them all in memory at once.
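A minimal sketch (not from the original answer) of that idea using tf.keras.utils.Sequence, so only one batch of images is in memory at a time; the paths, batch size, and target shape mirror the question's code but are otherwise illustrative, and RGB images are assumed:
import numpy as np
import skimage.io
import tensorflow as tf
from skimage.transform import resize
class ImageBatchSequence(tf.keras.utils.Sequence):
    def __init__(self, paths, batch_size=32, shape=(130, 130, 3)):
        self.paths = paths
        self.batch_size = batch_size
        self.shape = shape
    def __len__(self):
        # number of batches
        return int(np.ceil(len(self.paths) / self.batch_size))
    def __getitem__(self, idx):
        # load, resize, and normalize only the images belonging to batch `idx`
        batch_paths = self.paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch = [resize(skimage.io.imread(p), self.shape,
                        anti_aliasing=True, preserve_range=True) / 255.
                 for p in batch_paths]
        return np.array(batch, dtype=np.float32)
# Embeddings can then be computed batch by batch without loading every image up front:
# E_train = model.predict(ImageBatchSequence(args, batch_size=32))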

AttributeError: 'module' object has no attribute 'cuda'

I was trying to run this repository: https://github.com/WaqasSultani/AnomalyDetectionCVPR2018
In Test_Anomaly_Detector_public.py I am stuck on the line theano.sandbox.cuda.use('gpu0'), which raises:
AttributeError: 'module' object has no attribute 'cuda'
I am using Theano as the backend.
This is Test_Anomaly_Detector_public.py:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
from keras.optimizers import SGD, Adagrad
from scipy.io import loadmat, savemat
from keras.models import model_from_json
import theano.tensor as T
import theano
import csv
import ConfigParser
import collections
import time
import csv
import os
from os import listdir
import skimage.transform
from skimage import color
from os.path import isfile, join
import numpy as np
import numpy
from datetime import datetime
from scipy.spatial.distance import cdist, pdist, squareform
import theano.sandbox
import shutil
theano.sandbox.cuda.use('gpu0')
seed = 7
numpy.random.seed(seed)
def load_model(json_path):  # Function to load the model
    model = model_from_json(open(json_path).read())
    return model
def load_weights(model, weight_path):  # Function to load the model weights
    dict2 = loadmat(weight_path)
    dict = conv_dict(dict2)
    i = 0
    for layer in model.layers:
        weights = dict[str(i)]
        layer.set_weights(weights)
        i += 1
    return model
def conv_dict(dict2):
    i = 0
    dict = {}
    for i in range(len(dict2)):
        if str(i) in dict2:
            if dict2[str(i)].shape == (0, 0):
                dict[str(i)] = dict2[str(i)]
            else:
                weights = dict2[str(i)][0]
                weights2 = []
                for weight in weights:
                    if weight.shape in [(1, x) for x in range(0, 5000)]:
                        weights2.append(weight[0])
                    else:
                        weights2.append(weight)
                dict[str(i)] = weights2
    return dict
# Load Video
def load_dataset_One_Video_Features(Test_Video_Path):
    VideoPath = Test_Video_Path
    f = open(VideoPath, "r")
    words = f.read().split()
    num_feat = len(words) / 4096
    # Number of features per video to be loaded. In our case num_feat=32, as we divide the video into 32 segments. Note that
    # we have already computed C3D features for the whole video and divided the video features into 32 segments.
    count = -1
    VideoFeatues = []
    for feat in xrange(0, num_feat):
        feat_row1 = np.float32(words[feat * 4096:feat * 4096 + 4096])
        count = count + 1
        if count == 0:
            VideoFeatues = feat_row1
        if count > 0:
            VideoFeatues = np.vstack((VideoFeatues, feat_row1))
    AllFeatures = VideoFeatues
    return AllFeatures
print("Starting testing...")
AllTest_Video_Path = '/newdata/UCF_Anomaly_Dataset/Dataset/CVPR_Data/C3D_Complete_Video_txt/Test/'
# AllTest_Video_Path contains C3D features (txt file) of each video. Each file contains 32 features, each of 4096 dimensions.
Results_Path = '../Eval_Res/'
# Results_Path is the folder where you can save your results
Model_dir = '../Trained_AnomalyModel/'
# Model_dir is the folder where we have placed our trained weights
weights_path = Model_dir + 'weights_L1L2.mat'
# weights_path is Trained model weights
model_path = Model_dir + 'model.json'
if not os.path.exists(Results_Path):
    os.makedirs(Results_Path)
All_Test_files = listdir(AllTest_Video_Path)
All_Test_files.sort()
model = load_model(model_path)
load_weights(model, weights_path)
nVideos = len(All_Test_files)
time_before = datetime.now()
for iv in range(nVideos):
    Test_Video_Path = os.path.join(AllTest_Video_Path, All_Test_files[iv])
    inputs = load_dataset_One_Video_Features(Test_Video_Path)  # 32 segments features for one testing video
    predictions = model.predict_on_batch(inputs)  # Get anomaly prediction for each of 32 video segments.
    aa = All_Test_files[iv]
    aa = aa[0:-4]
    A_predictions_path = Results_Path + aa + '.mat'  # Save array of 1*32, containing anomaly score for each segment. Please see Evaluate Anomaly Detector to compute ROC.
print "Total Time took: " + str(datetime.now() - time_before)
My .theanorc file:
[global]
floatX = float32
device = cuda0
[gpuarray]
preallocate = 1
You can comment out the theano.sandbox.cuda.use('gpu0') line. Then, when you run the script, select the device through the flags instead:
THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python [...]
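Concretely, the top of Test_Anomaly_Detector_public.py would change roughly like this (a sketch of the suggested edit; the old theano.sandbox.cuda backend was replaced by the gpuarray backend, which picks the device up from THEANO_FLAGS or .theanorc):
import theano
# theano.sandbox.cuda.use('gpu0')  # old backend call removed; 'device = cuda0' in
#                                  # .theanorc or THEANO_FLAGS selects the GPU instead
print(theano.config.device)        # should report 'cuda0' once the flags are set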
