PyTorch cannot iterate the dataset when using multiple processes

When I use Process from the torch.multiprocessing package to spawn multiple processes, the DistributedSampler instance cannot iterate over the dataset in each process.
import torch
import torchvision
from torch.multiprocessing import Process
import torch.distributed as dist
import torch.multiprocessing as mp
import os
from torchvision import transforms
from torchvision.transforms import RandomResizedCrop

print(torchvision.__version__)
print(torch.__version__)

mnist_transform = transforms.Compose([transforms.ToTensor(),
                                      RandomResizedCrop(224),
                                      transforms.Normalize(mean=[0.5], std=[0.5])])
train_data = torchvision.datasets.MNIST('data', train=True, transform=mnist_transform, download=True)
# train_data = torchvision.datasets.CIFAR10('data', train=True, download=True, transform=mnist_transform)

def main_fun(rank, world_size):
    print('rank=', rank)
    print('world_size=', world_size)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist_backend = 'nccl'
    dist_url = 'env://'
    dist.init_process_group(backend=dist_backend, init_method=dist_url,
                            world_size=world_size, rank=rank)
    dist.barrier()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_batch_sampler = torch.utils.data.BatchSampler(
        train_sampler, batch_size=16, drop_last=True)
    print(len(train_data), len(train_batch_sampler), train_batch_sampler)
    for i in train_sampler:  # train_batch_sampler
        print(type(i), i)
    print('finished...')

def main():
    world_size = 2
    # mp.spawn(main_fun,
    #          args=(world_size,),
    #          nprocs=world_size,
    #          join=True)
    processes = []
    for rank in range(world_size):
        p = Process(target=main_fun, args=(rank, world_size))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

if __name__ == '__main__':
    main()
This is the output:
0.11.3
1.10.2
rank= 0
world_size= 2
rank= 1
world_size= 2
60000 1875 <torch.utils.data.sampler.BatchSampler object at 0x7f7848f5bb80>
60000 1875 <torch.utils.data.sampler.BatchSampler object at 0x7f7848f5ba90>
It cannot iterate over the dataset, and print('finished...') is never reached, so I think the processes were killed.
Strangely enough, it works when I use the val dataset instead, e.g.
train_data = torchvision.datasets.MNIST('data', train=False, transform=mnist_transform, download=True)
It also works when I move the definition of train_data (with train=True) into main_fun().
Furthermore, it works when I use mp.spawn() to spawn the processes. That is probably the standard solution, but I want to use Process() and understand why this happens.
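For reference, here is a minimal sketch of the workaround already observed above, i.e. constructing the dataset inside main_fun() so each worker builds its own copy instead of inheriting the module-level one. The 'gloo' backend and the plain DataLoader are my own substitutions to keep the sketch runnable on a CPU-only machine; they are not part of the original code:
import os
import torch
import torch.distributed as dist
import torchvision
from torch.multiprocessing import Process
from torchvision import transforms

def main_fun(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group(backend='gloo', init_method='env://',
                            world_size=world_size, rank=rank)

    # The dataset is created inside the worker, not at module import time.
    # (In practice, download the MNIST files once beforehand so the workers
    # do not race to download them.)
    mnist_transform = transforms.Compose([transforms.ToTensor(),
                                          transforms.Normalize(mean=[0.5], std=[0.5])])
    train_data = torchvision.datasets.MNIST('data', train=True,
                                            transform=mnist_transform, download=True)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    loader = torch.utils.data.DataLoader(train_data, batch_size=16,
                                         sampler=train_sampler, drop_last=True)
    for images, labels in loader:
        pass  # training step would go here
    print(f'rank {rank} finished...')

if __name__ == '__main__':
    world_size = 2
    processes = [Process(target=main_fun, args=(r, world_size)) for r in range(world_size)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()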

Related

GPU runs out of memory when training an ML model

I am trying to train an ML model using Dask. I am training on my local machine with 1 GPU, which has 24 GiB of memory.
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import xgboost as xgb

np.random.seed(42)

def get_columns(filename):
    return pd.read_csv(filename, nrows=10).iloc[:, :NUM_FEATURES].columns

def get_data(filename, target):
    import dask_cudf
    X = dask_cudf.read_csv(filename)
    # X = dd.read_csv(filename, assume_missing=True)
    y = X[[target]]
    X = X.iloc[:, :NUM_FEATURES]
    return X, y

def main(client: Client) -> None:
    X, y = get_data(FILENAME, TARGET)
    model = xgb.dask.DaskXGBRegressor(
        tree_method="gpu_hist",
        objective="reg:squarederror",
        seed=42,
        max_depth=5,
        eta=0.01,
        n_estimators=10)
    model.client = client
    model.fit(X, y, eval_set=[(X, y)])
    print("Saving the model..")
    model.get_booster().save_model("xgboost.model")
    print("Doing model importance..")
    columns = get_columns(FILENAME)
    pd.Series(model.feature_importances_, index=columns).sort_values(ascending=False).to_pickle("~/yolo.pkl")

if __name__ == "__main__":
    os.environ["MALLOC_TRIM_THRESHOLD_"] = "65536"
    with LocalCUDACluster(device_memory_limit="15 GiB", rmm_pool_size="20 GiB") as cluster:
    # with LocalCluster() as cluster:
        with Client(cluster) as client:
            print(client)
            main(client)
The error is as follows:
MemoryError: std::bad_alloc: out_of_memory: RMM failure at:/workspace/.conda-bld/work/include/rmm/mr/device/pool_memory_resource.hpp:192: Maximum pool size exceeded
Basically my GPU runs out of memory when I call model.fit. It works when I use a csv with 64100 rows and fails when I use a csv with 128198 rows (2x rows). These aren't large files so I assume I am doing something wrong.
I have tried fiddling around with
LocalCUDACluster: device_memory_limit and rmm_pool_size
dask_cudf.read_csv: chunksize
Nothing has worked.
I have been stuck on this all day so any help would be much appreciated.
You cannot train an XGBoost model if the model grows larger than the remaining GPU memory. You can scale out with dask_xgboost, but you need to ensure that the total GPU memory is sufficient.
Here is a great blog on this by Coiled: https://coiled.io/blog/dask-xgboost-python-example/
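As a hedged sketch only (untested on the asker's data, and assuming a reasonably recent xgboost release), the dask module of xgboost also exposes DaskDeviceQuantileDMatrix, which builds the training matrix with a lower peak GPU-memory footprint than a regular DMatrix, used here with the functional xgb.dask.train API. The filename/target/num_features arguments mirror the placeholders in the question:
import xgboost as xgb
import dask_cudf

def train_lower_memory(client, filename, target, num_features):
    # Same data loading as in the question.
    X = dask_cudf.read_csv(filename)
    y = X[[target]]
    X = X.iloc[:, :num_features]
    # The quantile DMatrix summarises the data on the fly instead of keeping a
    # second full copy on the GPU, which lowers peak memory during training.
    dtrain = xgb.dask.DaskDeviceQuantileDMatrix(client, X, y)
    params = {"tree_method": "gpu_hist",
              "objective": "reg:squarederror",
              "max_depth": 5,
              "eta": 0.01,
              "seed": 42}
    output = xgb.dask.train(client, params, dtrain, num_boost_round=10)
    return output["booster"]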

Why the output of the model is different in PyTorch

I have a simple model with just one linear layer.
model = torch.nn.Linear(1,1).to(device)
x_train1 = torch.FloatTensor([[1], [2], [3]])
out = model(x_train1)
print(out)
But whenever I run this code, the printed output is different.
I also set these random seeds:
import random
import torch
import numpy as np
random_seed=76
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed) # if use multi-GPU
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)
I want to know why the output keeps changing every time the code is run.
You must set the seed every time you run the code if you want to get the same result.
import torch

def my_func(device: str, seed: int):
    torch.manual_seed(seed)
    model = torch.nn.Linear(1, 1).to(device)
    x_train1 = torch.FloatTensor([[1], [2], [3]])
    out = model(x_train1)
    print(out)

# Whenever you run the function you'll get the same result!
my_func(device="cpu", seed=76)
# tensor([[0.3573],
#         [0.5021],
#         [0.6470]], grad_fn=<AddmmBackward>)
my_func(device="cpu", seed=76)
# tensor([[0.3573],
#         [0.5021],
#         [0.6470]], grad_fn=<AddmmBackward>)
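For completeness, a small illustration of why the seed has to be set before the layer is created: nn.Linear draws its initial weights from the global RNG at construction time, so every new construction (or every new run without a fixed seed) starts from a different RNG state:
import torch

torch.manual_seed(76)
a = torch.nn.Linear(1, 1)
b = torch.nn.Linear(1, 1)               # built from a later RNG state
print(torch.equal(a.weight, b.weight))  # False: different initial weights

torch.manual_seed(76)
c = torch.nn.Linear(1, 1)               # same RNG state as `a`
print(torch.equal(a.weight, c.weight))  # True: identical initial weights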

How to train an image similarity model on 20 million images (total size 10 GB)?

My system has 16 GB of RAM. I have tried to train an image similarity model on 20 million images (total size 10 GB) using VGG19 and KNN nearest neighbors. When I try to read the images I get a MemoryError. I have even tried to train the model on 200,000 images (total size 770 MB), but the issue is the same. How can I read millions of images to train ML models?
Ubuntu 18.04.2 LTS, Core™ i7, Intel® HD Graphics 5500 (Broadwell GT2), 64-bit, 16 GB RAM
import os
import skimage.io
import tensorflow as tf
from skimage.transform import resize
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from sklearn import manifold
import pickle

skimage.io.use_plugin('matplotlib')
dirPath = 'train_data'
args = [os.path.join(dirPath, filename) for filename in os.listdir(dirPath)]
imgs_train = [skimage.io.imread(arg, as_gray=False) for arg in args]
shape_img = (130, 130, 3)
model = tf.keras.applications.VGG19(weights='imagenet', include_top=False,
                                    input_shape=shape_img)
model.summary()
shape_img_resize = tuple([int(x) for x in model.input.shape[1:]])
input_shape_model = tuple([int(x) for x in model.input.shape[1:]])
output_shape_model = tuple([int(x) for x in model.output.shape[1:]])
n_epochs = None

def resize_img(img, shape_resized):
    img_resized = resize(img, shape_resized,
                         anti_aliasing=True,
                         preserve_range=True)
    assert img_resized.shape == shape_resized
    return img_resized

def normalize_img(img):
    return img / 255.

def transform_img(img, shape_resize):
    img_transformed = resize_img(img, shape_resize)
    img_transformed = normalize_img(img_transformed)
    return img_transformed

def apply_transformer(imgs, shape_resize):
    imgs_transform = [transform_img(img, shape_resize) for img in imgs]
    return imgs_transform

imgs_train_transformed = apply_transformer(imgs_train, shape_img_resize)
X_train = np.array(imgs_train_transformed).reshape((-1,) + input_shape_model)
E_train = model.predict(X_train)
E_train_flatten = E_train.reshape((-1, np.prod(output_shape_model)))
knn = NearestNeighbors(n_neighbors=5, metric="cosine")
knn.fit(E_train_flatten)
Since Keras works well with generators, you should consider using one:
a Python generator tutorial,
using a generator with Keras (example)
A generator lets you load your images during training, batch by batch; a sketch follows below.
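A minimal sketch of that idea, using tf.keras.utils.Sequence as one possible generator style (the directory name, batch size and target shape below are placeholders, not taken from the original post). Images are read from disk only when a batch is requested, so the whole dataset never has to sit in RAM:
import os
import numpy as np
import skimage.io
from skimage.transform import resize
import tensorflow as tf

class ImageBatchSequence(tf.keras.utils.Sequence):
    """Yields batches of preprocessed images, loading them lazily from disk."""
    def __init__(self, filepaths, batch_size=64, target_shape=(130, 130, 3)):
        self.filepaths = filepaths
        self.batch_size = batch_size
        self.target_shape = target_shape

    def __len__(self):
        return int(np.ceil(len(self.filepaths) / self.batch_size))

    def __getitem__(self, idx):
        batch_paths = self.filepaths[idx * self.batch_size:(idx + 1) * self.batch_size]
        imgs = [resize(skimage.io.imread(p), self.target_shape,
                       anti_aliasing=True, preserve_range=True) / 255.
                for p in batch_paths]
        return np.array(imgs, dtype=np.float32)

# Usage: hand the sequence to predict() instead of one huge in-memory array.
# paths = [os.path.join('train_data', f) for f in os.listdir('train_data')]
# E_train = model.predict(ImageBatchSequence(paths, batch_size=64))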

AttributeError: 'module' object has no attribute 'cuda'

I was trying to run this repository: https://github.com/WaqasSultani/AnomalyDetectionCVPR2018
In Test_Anomaly_Detector_public.py I am stuck at the line theano.sandbox.cuda.use('gpu0') with the error:
AttributeError: 'module' object has no attribute 'cuda'
I am using Theano as the backend.
This is Test_Anomaly_Detector_public.py:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
from keras.optimizers import SGD, Adagrad
from scipy.io import loadmat, savemat
from keras.models import model_from_json
import theano.tensor as T
import theano
import csv
import ConfigParser
import collections
import time
import csv
import os
from os import listdir
import skimage.transform
from skimage import color
from os.path import isfile, join
import numpy as np
import numpy
from datetime import datetime
from scipy.spatial.distance import cdist, pdist, squareform
import theano.sandbox
import shutil

theano.sandbox.cuda.use('gpu0')

seed = 7
numpy.random.seed(seed)

def load_model(json_path):  # Function to load the model
    model = model_from_json(open(json_path).read())
    return model

def load_weights(model, weight_path):  # Function to load the model weights
    dict2 = loadmat(weight_path)
    dict = conv_dict(dict2)
    i = 0
    for layer in model.layers:
        weights = dict[str(i)]
        layer.set_weights(weights)
        i += 1
    return model

def conv_dict(dict2):
    i = 0
    dict = {}
    for i in range(len(dict2)):
        if str(i) in dict2:
            if dict2[str(i)].shape == (0, 0):
                dict[str(i)] = dict2[str(i)]
            else:
                weights = dict2[str(i)][0]
                weights2 = []
                for weight in weights:
                    if weight.shape in [(1, x) for x in range(0, 5000)]:
                        weights2.append(weight[0])
                    else:
                        weights2.append(weight)
                dict[str(i)] = weights2
    return dict

# Load Video
def load_dataset_One_Video_Features(Test_Video_Path):
    VideoPath = Test_Video_Path
    f = open(VideoPath, "r")
    words = f.read().split()
    num_feat = len(words) / 4096
    # Number of features per video to be loaded. In our case num_feat=32, as we divide the video into 32 segments. Note that
    # we have already computed C3D features for the whole video and divided the video features into 32 segments.
    count = -1
    VideoFeatues = []
    for feat in xrange(0, num_feat):
        feat_row1 = np.float32(words[feat * 4096:feat * 4096 + 4096])
        count = count + 1
        if count == 0:
            VideoFeatues = feat_row1
        if count > 0:
            VideoFeatues = np.vstack((VideoFeatues, feat_row1))
    AllFeatures = VideoFeatues
    return AllFeatures

print("Starting testing...")

AllTest_Video_Path = '/newdata/UCF_Anomaly_Dataset/Dataset/CVPR_Data/C3D_Complete_Video_txt/Test/'
# AllTest_Video_Path contains C3D features (txt file) of each video. Each file contains 32 features, each of 4096 dimensions.
Results_Path = '../Eval_Res/'
# Results_Path is the folder where you can save your results
Model_dir = '../Trained_AnomalyModel/'
# Model_dir is the folder where we have placed our trained weights
weights_path = Model_dir + 'weights_L1L2.mat'
# weights_path is Trained model weights
model_path = Model_dir + 'model.json'

if not os.path.exists(Results_Path):
    os.makedirs(Results_Path)

All_Test_files = listdir(AllTest_Video_Path)
All_Test_files.sort()
model = load_model(model_path)
load_weights(model, weights_path)
nVideos = len(All_Test_files)
time_before = datetime.now()

for iv in range(nVideos):
    Test_Video_Path = os.path.join(AllTest_Video_Path, All_Test_files[iv])
    inputs = load_dataset_One_Video_Features(Test_Video_Path)  # 32 segments features for one testing video
    predictions = model.predict_on_batch(inputs)  # Get anomaly prediction for each of 32 video segments.
    aa = All_Test_files[iv]
    aa = aa[0:-4]
    A_predictions_path = Results_Path + aa + '.mat'  # Save array of 1*32, containing anomaly score for each segment. Please see Evaluate Anomaly Detector to compute ROC.

print "Total Time took: " + str(datetime.now() - time_before)
My .theanorc file:
[global]
floatX = float32
device = cuda0
[gpuarray]
preallocate = 1
You can comment out that line (theano.sandbox.cuda.use('gpu0')) and select the device via flags when you run the script instead:
THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python [...]

Tensorflow multithreading image loading

So I have this toy example code:
import glob
from tqdm import tqdm
import tensorflow as tf

imgPaths = glob.glob("/home/msmith/imgs/*/*")  # Some images

filenameQ = tf.train.string_input_producer(imgPaths)
reader = tf.WholeFileReader()
key, value = reader.read(filenameQ)
img = tf.image.decode_jpeg(value)

init_op = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in tqdm(range(10000)):
        img.eval().mean()
which loads images and computes the mean of each one. How do I edit it so that the image-loading part is multithreaded? It is currently the bottleneck in my TF image scripts.
EDIT (2018/3/5): It's now easier to get the same results using the tf.data API.
import glob
from tqdm import tqdm
import tensorflow as tf

imgPaths = glob.glob("/home/msmith/imgs/*/*")  # Some images

dataset = (tf.data.Dataset.from_tensor_slices(imgPaths)
           .map(lambda x: tf.reduce_mean(tf.decode_jpeg(tf.read_file(x))),
                num_parallel_calls=16)
           .prefetch(128))

iterator = dataset.make_one_shot_iterator()
next_mean = iterator.get_next()

with tf.Session() as sess:
    for i in tqdm(range(10000)):
        sess.run(next_mean)
As sygi suggests in their comment, a tf.train.QueueRunner can be used to define some ops that run in a separate thread, and (typically) enqueue values into a TensorFlow queue.
import glob
from tqdm import tqdm
import tensorflow as tf

imgPaths = glob.glob("/home/msmith/imgs/*/*")  # Some images

filenameQ = tf.train.string_input_producer(imgPaths)

# Define a subgraph that takes a filename, reads the file, decodes it, and
# enqueues it.
filename = filenameQ.dequeue()
image_bytes = tf.read_file(filename)
decoded_image = tf.image.decode_jpeg(image_bytes)
image_queue = tf.FIFOQueue(128, [tf.uint8], None)
enqueue_op = image_queue.enqueue(decoded_image)

# Create a queue runner that will enqueue decoded images into `image_queue`.
NUM_THREADS = 16
queue_runner = tf.train.QueueRunner(
    image_queue,
    [enqueue_op] * NUM_THREADS,  # Each element will be run from a separate thread.
    image_queue.close(),
    image_queue.close(cancel_pending_enqueues=True))

# Ensure that the queue runner threads are started when we call
# `tf.train.start_queue_runners()` below.
tf.train.add_queue_runner(queue_runner)

# Dequeue the next image from the queue, for returning to the client.
img = image_queue.dequeue()

init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in tqdm(range(10000)):
        img.eval().mean()
