I'm trying to build a PyTorch project on an IterableDataset with zarr as the storage backend.
from itertools import islice

import zarr
from torch.utils.data import IterableDataset

class Data(IterableDataset):
    def __init__(self, path, start=None, end=None):
        super(Data, self).__init__()
        store = zarr.DirectoryStore(path)
        self.array = zarr.open(store, mode='r')
        if start is None:
            start = 0
        if end is None:
            end = self.array.shape[0]
        assert end > start
        self.start = start
        self.end = end

    def __iter__(self):
        return islice(self.array, self.start, self.end)
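For reference, the dataset is consumed through a plain DataLoader, roughly like this (the path and batch size here are placeholders, not my real values):

from torch.utils.data import DataLoader

dataset = Data('/path/to/data.zarr')  # hypothetical path
# num_workers=0 keeps a single iterator; with multiple workers, start/end
# would be used to shard the range so each worker reads a disjoint slice
loader = DataLoader(dataset, batch_size=1024, num_workers=0)
for batch in loader:
    ...  # training / validation step goes here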
This works quite nicely with small test datasets, but once I move to my actual dataset (480 000 000 x 290) I'm running into a memory leak. I've tried logging the Python heap periodically as everything slows to a crawl, but I couldn't see anything increasing in size abnormally, so the library I used (pympler) didn't actually catch the memory leak.
I'm at my wits' end, so if anybody has any idea how to debug this further, it would be greatly appreciated.
Cross-posted on PyTorch Forums.
Turns out that I had an issue in my validation routine:
with torch.no_grad():
    for batch in tqdm(testloader, **params):
        x = batch[:, 1:].to(device)
        y = batch[:, 0].unsqueeze(0).T
        y_test_pred = torch.sigmoid(sxnet(x))
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
        y_list.append(y.numpy())
I originally thought that I was well clear of running into trouble by appending my results to lists, but the issue is that the result of .numpy() was an array of arrays (since the original datatype was a 1xn tensor).
Adding .flatten() to the numpy arrays fixed this issue, and RAM consumption is now what I originally provisioned.
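For the record, the fixed append lines look roughly like this (same variable names as in the snippet above):

y_pred_list.append(y_pred_tag.cpu().numpy().flatten())
y_list.append(y.numpy().flatten())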
Related
I want to extract the VGG features of a set of images and keep them in memory in a dictionary. The dictionary ends up holding 8091 tensors, each of shape (1, 4096), but my machine crashes with an out-of-memory error about 6% of the way through. Does anybody have a clue why this is happening and how to prevent it?
In fact, this seems to be triggered by the call to VGG rather than by the storage itself, since storing the VGG classification output is sufficient to trigger the error.
Below is the simplest code I've found that reproduces the error. First, a helper function is defined:
import torch, torchvision
from tqdm import tqdm

vgg = torchvision.models.vgg16(weights='DEFAULT')

def try_and_crash(gen_data):
    store_out = {}
    for i in tqdm(range(8091)):
        my_output = gen_data(torch.randn(1, 3, 224, 224))
        store_out[i] = my_output
    return store_out
Calling it to quickly produce a large tensor doesn't cause a fuss
just_fine = try_and_crash(lambda x: torch.randn(1,4096))
but calling it to use vgg causes the machine to crash:
will_crash = try_and_crash(vgg)
The problem is that each element of the dictionary store_out[i] also keeps the autograd graph that led to its computation, and therefore ends up being much larger than a simple 1x4096 tensor.
Running the code under torch.no_grad(), or equivalently with torch.set_grad_enabled(False), solves the issue. We can test it by slightly changing the helper function:
def try_and_crash_grad(gen_data, grad_enabled):
    store_out = {}
    for i in tqdm(range(8091)):
        with torch.set_grad_enabled(grad_enabled):
            my_output = gen_data(torch.randn(1, 3, 224, 224))
            store_out[i] = my_output
    return store_out
Now the following works
works_fine = try_and_crash_grad(vgg, False)
while the following throws an out of memory error
crashes = try_and_crash_grad(vgg, True)
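An alternative, in case gradients are needed elsewhere but not for the stored copies, is to detach each output before storing it; a minimal sketch of that variant (not part of the original answer):

def try_and_crash_detached(gen_data):
    store_out = {}
    for i in tqdm(range(8091)):
        # detach() drops the autograd graph, so only the 1x4096 values are kept
        store_out[i] = gen_data(torch.randn(1, 3, 224, 224)).detach()
    return store_out

works_too = try_and_crash_detached(vgg)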
On my Windows 10 machine, if I create a GPU tensor directly, I can successfully release its memory.
import torch
a = torch.zeros(300000000, dtype=torch.int8, device='cuda')
del a
torch.cuda.empty_cache()
But if I create a normal tensor and convert it to a GPU tensor, I can no longer release its memory.
import torch
a = torch.zeros(300000000, dtype=torch.int8)
a.cuda()
del a
torch.cuda.empty_cache()
Why is this happening?
At least on Ubuntu, your script does not release the memory when it is run in the interactive shell, and it works as expected when run as a script. I think the issue is that a.cuda() is not an in-place operation: it returns a new GPU tensor, and a lingering reference to that result keeps the GPU memory alive. The following works in both the interactive shell and as a script:
import torch
a = torch.zeros(300000000, dtype=torch.int8)
a = a.cuda()
del a
torch.cuda.empty_cache()
Yes, this also happens on my PC with the following configuration:
Ubuntu 20.04.1
PyTorch 1.7.1+cu110
According to this fastai discussion: https://forums.fast.ai/t/gpu-memory-not-being-freed-after-training-is-over/10265/8
this is related to the Python garbage collector in the IPython environment.
def pretty_size(size):
    """Pretty prints a torch.Size object"""
    assert isinstance(size, torch.Size)
    return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
    """Prints a list of the Tensors being tracked by the garbage collector."""
    import gc
    total_size = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                if not gpu_only or obj.is_cuda:
                    print("%s:%s%s %s" % (type(obj).__name__,
                                          " GPU" if obj.is_cuda else "",
                                          " pinned" if obj.is_pinned else "",
                                          pretty_size(obj.size())))
                    total_size += obj.numel()
            elif hasattr(obj, "data") and torch.is_tensor(obj.data):
                if not gpu_only or obj.is_cuda:
                    print("%s → %s:%s%s%s%s %s" % (type(obj).__name__,
                                                   type(obj.data).__name__,
                                                   " GPU" if obj.is_cuda else "",
                                                   " pinned" if obj.data.is_pinned else "",
                                                   " grad" if obj.requires_grad else "",
                                                   " volatile" if obj.volatile else "",
                                                   pretty_size(obj.data.size())))
                    total_size += obj.data.numel()
        except Exception as e:
            pass
    print("Total size:", total_size)
If you do something like
import torch as th
a = th.randn(10, 1000, 1000)
aa = a.cuda()
del aa
th.cuda.empty_cache()
you will not see any decrease in nvidia-smi/nvtop.
But you can find out what is happening using the handy function
dump_tensors()
and you may observe the following:
Tensor: GPU pinned 10 × 1000 × 1000
Total size: 10000000
That means the garbage collector still holds a reference to the tensor.
For more discussion of the Python GC mechanism, see:
Force garbage collection in Python to free memory
I met the same issue.
Solution:
cuda = torch.device('cuda')
a = a.to(cuda)
To add to the excellent answer from @wstcegg, what worked for me to clear my GPU cache on Ubuntu (it did not work under Windows) was:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
You might also want to delete the elements you have created; see How can I explicitly free memory in Python?
For more details about garbage collection, see this good reference, from which I quote the interesting part below:
https://stackabuse.com/basics-of-memory-management-in-python/
Why Perform Manual Garbage Collection?
We know that the Python interpreter keeps track of references to
objects used in a program. In earlier versions of Python (until
version 1.6), the Python interpreter used only the reference counting
mechanism to handle memory. When the reference count drops to zero,
the Python interpreter automatically frees the memory. This classical
reference counting mechanism is very effective, except that it fails
to work when the program has reference cycles. A reference cycle
happens if one or more objects reference each other, so the
reference count never reaches zero.
Let's consider an example.
>>> def create_cycle():
... list = [8, 9, 10]
... list.append(list)
... return list
...
>>> create_cycle()
[8, 9, 10, [...]]
The above code creates a reference cycle, where the object list refers
to itself. Hence, the memory for the object list will not be freed
automatically when the function returns. The reference cycle problem
can't be solved by reference counting. However, it can be solved by
changing the behavior of the garbage collector in your Python
application.
To do so, we can use the gc.collect() function of the gc module.
import gc
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)
gc.collect() returns the number of objects it has collected and
deallocated.
There are two ways to perform manual garbage collection: time-based or
event-based garbage collection.
Time-based garbage collection is pretty simple: the gc.collect()
function is called after a fixed time interval.
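As a tiny illustration of the time-based variant (my own sketch, not from the quoted article; the 60-second interval is arbitrary):

import gc
import time

last_collect = time.time()
for step in range(1000000):  # stands in for a long-running job
    # ... do the actual work here ...
    if time.time() - last_collect > 60:
        n = gc.collect()
        print("GC collected", n, "unreachable objects")
        last_collect = time.time()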
You should not use torch.cuda.empty_cache(), as it will slow down your code for no gain: https://discuss.pytorch.org/t/what-is-torch-cuda-empty-cache-do-and-where-should-i-add-it/40975
I have to compute a function many, many times.
To compute this function, the elements of an array must be computed.
The array is quite large.
How can I avoid allocating the array on every function call?
The code I have tried goes something like this:
class FunctionCalculator(object):
    def __init__(self, data):
        """
        Get the data and do some small handling of it.
        Let's say that we do
            self.data = data
        """

    def function(self, point):
        return numpy.sum(numpy.array([somecomputations(item) for item in self.data]))
Well, maybe my concern is unfounded, so I have this question first.
Question: Is it true that the array [somecomputations(item) for item in data] is allocated and deallocated on every call to function?
Thinking that that is the case, I have tried:
class FunctionCalculator(object):
    def __init__(self, data):
        """
        Get the data and do some small handling of it.
        Let's say that we do
            self.data = data
        """
        self.number_of_data = range(0, len(data))
        self.my_array = numpy.zeros(len(data))

    def function(self, point):
        for i in self.number_of_data:
            self.my_array[i] = somecomputations(self.data[i])
        return numpy.sum(self.my_array)
This is slower than the previous version. I assume that the list comprehension in the first version can be run entirely in C, while in the second version only smaller parts of the script can be translated into optimized C code.
I have very little idea of how Python works internally.
Question: Is there a good way to skip the array allocation on every function call and at the same time take advantage of a well-optimized loop over the array?
I am using Python 3.5.
Looping over the array is unnecessary and crosses the Python-to-C boundary many times, hence the slowdown. The beauty of numpy arrays is that functions operate on them element by element. I think the fastest would be:
return numpy.sum(somecomputations(self.data))
somecomputations may need a bit of modification so that it works on whole arrays, but often it will work right off the bat. Also, note that you are not using point.
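As a concrete illustration of the vectorized form (somecomputations below is a made-up element-wise placeholder, not the asker's real function):

import numpy

def somecomputations(arr):
    # toy element-wise computation that works on a whole array at once
    return arr * 2.0 + 1.0

data = numpy.random.rand(1000000)
# one vectorized call instead of a Python-level loop over the elements
total = numpy.sum(somecomputations(data))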
When running an analysis under MPI with distributed components in a ParallelGroup, I get an error when adding a DumpRecorder to the analysis. Below is a small example that demonstrates this (this was run with the latest master branch commit aaa67a4d51f4081e9e41b250b0a76b077f6f0c21 from 28/10/2015):
import numpy as np
from openmdao.core.mpi_wrap import MPI
from openmdao.api import Component, Group, DumpRecorder, Problem, ParallelGroup

class Sliced(Component):
    def __init__(self):
        super(Sliced, self).__init__()
        self.add_param('x', 0.)
        self.add_output('y', 0.)

    def solve_nonlinear(self, params, unknowns, resids):
        unknowns['y'] = params['x'] * 2.

class VectorComp(Component):
    def __init__(self, size):
        super(VectorComp, self).__init__()
        self.add_param('xin', np.zeros(size))
        self.add_output('x', np.zeros(size))

    def solve_nonlinear(self, params, unknowns, resids):
        unknowns['x'] = params['xin'] * 2.

class Analysis(Group):
    def __init__(self, size):
        super(Analysis, self).__init__()
        self.add('v', VectorComp(size), promotes=['*'])
        par = self.add('par', ParallelGroup())
        for i in range(size):
            par.add('sec%02d' % i, Sliced())
            self.connect('x', 'par.sec%02d.x' % i, src_indices=[i])

if __name__ == '__main__':
    if MPI:
        from openmdao.core.petsc_impl import PetscImpl as impl
    else:
        from openmdao.core.basic_impl import BasicImpl as impl

    p = Problem(impl=impl, root=Analysis(4))

    recorder = DumpRecorder('optimization.log')
    # adding specific includes works, but leaving it out results in a crash
    # recorder.options['includes'] = ['x']
    p.driver.add_recorder(recorder)

    p.setup()
    p.run()
The error which is raised is:
RuntimeError: Cannot access remote Variable 'par.sec00.x' in this process.
I see that the recorder dumps one file per processor, so shouldn't the BaseRecorder._filter_vectors method filter out params that are not present on a specific processor? I'm not yet familiar enough with the code to propose a fix, so I hope the OpenMDAO devs can easily figure out what goes wrong.
Manually specifying the includes works, since the Sliced parameters are then excluded, but it would be nice if this were not necessary and were handled under the hood.
I also want to let you guys know how excited we are about the new framework. It is so much faster than the 0.x version, and the parallel FD feature is much appreciated and works like a charm!
There were some recent changes that broke the dump recorder in parallel. We put a story up for someone to fix it, but in the meantime you might want to try the SqliteRecorder. It's what I have been using for performance testing on CADRE. You set it up the same way, but then you read the values back using an sqlitedict. There is a small example in the docs, but a more practical example is here in the CADRE code:
https://github.com/OpenMDAO/CADRE/blob/master/plot_progress.py
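For what it's worth, switching the script above over should only require swapping the recorder; a sketch (the filename here is arbitrary):

from openmdao.api import SqliteRecorder

recorder = SqliteRecorder('optimization.sqlite')
p.driver.add_recorder(recorder)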
I am doing a Monte Carlo calculation and I'd like to save the intermediate results to disk. Below is a basic version of my code. In my original version, I had a data aggregator object that would collect the results from each trajectory and then, at the end, calculate some statistics and write to disk, but I began to run out of memory and the files were unwieldy. I am trying instead to tack on PyTables so that I can a) flush the data to disk and b) efficiently read it back in for further processing when it's done. I am working from this tutorial. My problem is that, for each run, the data that would go into the layer column is a 1xn vector, where n is set at the start of the script (it's actually passed in on the command line in real life).
Python won't let me define the table descriptor class inside the aggregator class, but outside the aggregator class the size n is out of scope for the descriptor class. I'm coming from a MATLAB background, where all of the table creation and flushing to disk is hidden behind the single matfile command, so I'm really lost here.
How should I properly initialize my data table so that it can be seen within the aggregator object? If I should be doing this differently, how can I do the least amount of damage to my already-working (except for the writing to disk) code?
import tables
import numpy

class Trajectory(tables.IsDescription):
    start = tables.Float32Col(shape=(1, 2))
    end = tables.Float32Col(shape=(1, 2))
    layer = tables.Float32Col(shape=(1, n))  # how do I pass n to here?

class AggregateResults(object):
    def __init__(self, n, filename):
        self.n = n
        self.h5 = tables.openFile(filename, mode="w")
        self.traj_group = self.h5.createGroup(self.h5.root, "Trajectories")
        self.traj_table = self.h5.createTable(self.traj_group, "trajectory", Trajectory, "Single Trajectory")

    def end_of_trajectory(self, results):
        trajectory = self.traj_table.row
        trajectory['start'] = results.start_position
        trajectory['end'] = results.end_position
        trajectory['layer'] = results.layer_path
        trajectory.append()
        self.traj_table.flush()

    def end_of_run(self):
        self.h5.close()

def do_code(aggregate):
    results = # long calculation goes here
    aggregate.end_of_trajectory(results)

def main():
    filename = "filename.h5"
    n = 7
    aggregate = AggregateResults(n, filename)
    for x in range(100000):
        do_code(aggregate)
    aggregate.end_of_run()
This doesn't completely answer my own question, but in solving a different problem, I came upon a solution. Rather than saving in a table, as above, I am saving the variable length vector as a separate array as described here. Then I save each of the scalar values as an attribute of that vector.
class AggregateResults(object):
    def __init__(self, n, filename):
        self.n = n
        self.h5 = tables.openFile(filename, mode="w")
        self.traj_group = self.h5.createGroup(self.h5.root, "Trajectories")

    def end_of_trajectory(self, results):
        i = current_photon  # current_photon is defined elsewhere in the original code
        current_vector_name = "vector%d" % i
        current_vector = self.h5.create_array(self.traj_group, current_vector_name, results.layer)
        current_vector.attrs.start = results.start
        current_vector.attrs.end = results.end
        self.h5.flush()

    def end_of_run(self):
        self.h5.close()
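For completeness, another way to handle the original problem of getting n into the table description (my own sketch, not part of the answer above): PyTables also accepts a plain dictionary as the description argument, so the column shapes can be built at runtime instead of in a class body.

def make_trajectory_description(n):
    # built at runtime, so n can simply be a function argument
    return {
        'start': tables.Float32Col(shape=(1, 2)),
        'end': tables.Float32Col(shape=(1, 2)),
        'layer': tables.Float32Col(shape=(1, n)),
    }

# inside AggregateResults.__init__ this would replace the Trajectory class:
# self.traj_table = self.h5.createTable(self.traj_group, "trajectory",
#                                       make_trajectory_description(n),
#                                       "Single Trajectory")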