Stacking/concatenating tensors increases memory usage - pytorch

In PyTorch, I found that stack-ing or cat-ing multiple tensors increases memory usage by the sum of the sizes of all the input tensors. An example is as follows:
import torch as tc
import torch.autograd as tag
import sys
import psutil
import os
import resource
def get_ru_maxrss():
    """ Return max RSS usage (in megabytes) """
    size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform == 'darwin':
        # on Mac OS X ru_maxrss is in bytes, on Linux it is in KB
        size //= 1024
    return size / 1024

def cpuStats():
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # physical memory usage
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30  # memory use in GB...I think
    print('memory GB:', memoryUse)
m0 = get_ru_maxrss()
x1 = tc.ones([8192, 8192], requires_grad=True)
print(x1.dtype)
print(get_ru_maxrss() - m0)
print('=======')
y = x1 * 1.1
print(y.dtype)
print(get_ru_maxrss() - m0)
print('=======')
for i in range(10):
    y = tc.cat([y, x1])
    print(y.dtype)
    print(get_ru_maxrss() - m0)
    print('=======')
loss = tc.mean(y)
print(get_ru_maxrss() - m0)
print('=======')
loss.backward()
print(get_ru_maxrss() - m0)
print('=======')
And we can see that each occurrence of cat in the for loop increases memory usage by 512 MB, which is the sum of the sizes of y and x1 (256 MB each). This isn't a major issue for me right now, but I'm curious about it. If I understand correctly, the vector-Jacobian product of stack or cat is just the reverse operation applied to the incoming gradient: it is split back into multiple arrays whose shapes match the original inputs of the stack or cat call. This process doesn't need the intermediate value of the stacked tensor computed in the forward pass, which makes it similar to a linear operation, yet a linear operation doesn't incur the additional memory usage (e.g., if I change the cat in the for loop to a linear op such as y = y * 1.1, memory consumption doesn't grow). So I'm wondering whether the increased memory usage is essentially just empty space allocated to hold the "split" gradient arrays in the backward pass, which has to be contiguous so that the splitting operation doesn't have to stride through memory. It would follow that a linear operation doesn't need this additional memory because the memory of the input gradient can be overwritten directly, without worrying about contiguity. Is that right?
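To make the intuition concrete, here is a toy re-implementation of cat along dim 0 as a custom autograd Function (just a sketch of the idea, not how PyTorch's actual CatBackward works): only the input shapes are stashed for backward, and the VJP is nothing more than slicing the incoming gradient.
import torch

class MyCat(torch.autograd.Function):
    """Toy version of concatenation along dim 0 (illustration only)."""
    @staticmethod
    def forward(ctx, a, b):
        # only the shapes are saved for backward, not the concatenated output
        ctx.split_sizes = (a.shape[0], b.shape[0])
        return torch.cat([a, b], dim=0)

    @staticmethod
    def backward(ctx, grad_out):
        n_a, n_b = ctx.split_sizes
        # the VJP of cat is just a split of the incoming gradient
        return grad_out[:n_a], grad_out[n_a:n_a + n_b]

a = torch.randn(4, 3, requires_grad=True)
b = torch.randn(2, 3, requires_grad=True)
MyCat.apply(a, b).sum().backward()
print(a.grad.shape, b.grad.shape)  # torch.Size([4, 3]) torch.Size([2, 3])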

Related

What is the fastest way to generate / cast a float32 matrix from a uint8 or uint16 C pointer for pytorch?

I am currently trying to access multiple live network cameras for machine learning.
The problem is that each camera has a resolution of ~ 20 MPix and therefore produces large amounts of data which need preprocessing before the next frame arrives.
The SDK gives access to the raw C pointer of the camera to speed up data acquisition.
The pointer is either uint8 or uint16 per pixel, depending on the camera model and settings.
The goal is to extract the image from the data and provide it as a float32 pytorch tensor.
My main problem is that my current pipeline, especially the conversion from uint8/uint16 to float32, is the main bottleneck limiting the framerate.
Currently I have to use numpy as a crutch to extract the buffer data, as pytorch is unable to work with uint16 buffers.
Afterwards, I copy the uint8/16 numpy matrix into a preallocated float32 matrix, converting the data type while avoiding the malloc overhead of .astype().
Using torch.from_numpy() and an element-wise copy I transfer the data to the shared tensor.
I attached a code snippet as an example where the camera data is replaced by the data of a numpy array.
The conversion alone takes 24 to 25 ms; with additional overhead, e.g. transferring the data to the GPU, this results in a massive reduction in framerate.
Is there a way to speed this whole process up?
import numpy as np
import torch
import time
# random image data from the camera SDK
pdata = np.random.randint(0, 65535, (5000, 4000, 3), dtype=np.uint16).data
# preallocate memory for typecast to float
float_frame = np.zeros((5000, 4000, 3), dtype=np.float32)
# preallocate memory for "shared tensor"
torch_frame = torch.zeros((5000, 4000, 3), dtype=torch.float32, requires_grad=False, device='cpu')
torch_frame.share_memory_()
# example loop for converting a frame
n = 100 # number of iterations
start = time.perf_counter()
for i in range(n):
    # get data from buffer
    frame = np.ctypeslib.as_array(pdata, (5000, 4000, 3))
    # convert uint8/uint16 to float32
    float_frame[:] = frame
    # move into shared tensor
    torch_frame[:] = torch.from_numpy(float_frame)
    # normalize float 0 to 1.0
    torch_frame = torch_frame / 65535
time_delta = time.perf_counter() - start
print(f"Time for N: {n} loops: {time_delta:.5}s. With {time_delta / n:.5} seconds per loop.")

TensorFlow vs PyTorch: Memory usage

I have PyTorch 1.9.0 and TensorFlow 2.6.0 in the same environment, and both recognize all the GPUs.
I was comparing the performance of both, so I did this small simple test, multiplying large matrices (A and B, both 2000x2000) several times (10000x):
import numpy as np
import os
import time
def mul_torch(A, B):
    # PyTorch matrix multiplication
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    import torch
    A, B = torch.Tensor(A.copy()), torch.Tensor(B.copy())
    A = A.cuda()
    B = B.cuda()
    start = time.time()
    for i in range(10000):
        C = torch.matmul(A, B)
        torch.cuda.empty_cache()
    print('PyTorch:', time.time() - start, 's')
    return C

def mul_tf(A, B):
    # TensorFlow Matrix Multiplication
    import tensorflow as tf
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    with tf.device('GPU:0'):
        A = tf.constant(A.copy())
        B = tf.constant(B.copy())
        start = time.time()
        for i in range(10000):
            C = tf.math.multiply(A, B)
        print('TensorFlow:', time.time() - start, 's')
    return C

if __name__ == '__main__':
    # A = np.load('A.npy')  # overwritten by the random matrices below
    # B = np.load('B.npy')
    n = 2000
    A = np.random.rand(n, n)
    B = np.random.rand(n, n)
    PT = mul_torch(A, B)
    time.sleep(5)
    TF = mul_tf(A, B)
As a result:
PyTorch: 19.86856198310852 s
TensorFlow: 2.8338065147399902 s
I was not expecting these results; I thought they would be similar.
Investigating the GPU, I noticed that both use the GPU at full capacity, but PyTorch uses only a small fraction of the memory that TensorFlow uses. That might explain the processing-time difference, but I cannot explain the difference in memory usage. Is it something intrinsic to the methods, or is it my computer configuration? Regardless of the matrix size (at least for matrices larger than 1000x1000), the memory plateaus are the same.
Thank you for your help.
It is because you are doing matrix multiplication in pytorch but element-wise multiplication in tensorflow. To do matrix multiplication in TF, use tf.matmul or simply:
for i in range(10000):
    C = A @ B
That does the same for both TF and torch. You also have to call torch.cuda.synchronize() inside the time measurement and move torch.cuda.empty_cache() outside of the measurement for the sake of fairness.
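A rough sketch of what a fairer timing loop on the PyTorch side could look like (assuming a CUDA device is available):
import time
import torch

A = torch.rand(2000, 2000, device='cuda')
B = torch.rand(2000, 2000, device='cuda')

torch.cuda.synchronize()          # make sure setup work is finished
start = time.time()
for _ in range(10000):
    C = A @ B
torch.cuda.synchronize()          # wait for the queued kernels to complete
print('PyTorch:', time.time() - start, 's')
torch.cuda.empty_cache()          # outside the timed region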
With those changes, the expected result is that TensorFlow's eager execution will be somewhat slower than PyTorch.
Regarding the memory usage: TF by default claims all GPU memory, so what nvidia-smi on Linux (or the Task Manager on Windows) shows does not reflect the actual memory usage of the operations.
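If you want the reported numbers to be closer to what the ops actually use, one option (a sketch assuming TF 2.x) is to enable memory growth before any GPU operation runs:
import tensorflow as tf

# ask TF to allocate GPU memory on demand instead of claiming it all up front;
# this must run before the first GPU op creates the device context
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)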

How to initialize an empty tensor with certain dimensions and append to it through a loop without CUDA out of memory?

I am trying to append tensors (t) generated in a for-loop to a list [T] that accumulates all these tensors. Next, the list [T] needs to be converted into a tensor and loaded onto the GPU.
b_output = []
for eachInputId, eachMask in zip(b_input_ids, b_input_mask):
    # unrolled into each individual document
    # print(eachInputId.size())  # individual document here
    outputs = model(eachInputId,
                    token_type_ids=None,
                    attention_mask=eachMask)
    # combine the [CLS] output layer to form the document
    doc_output = torch.mean(outputs[1], dim=0)  # size = [1, ncol]
    b_output.append(doc_output)
t_b_output = torch.tensor(b_output)
Another method that I tried was initializing a tensor {T} with fixed dimensions and filling it with the tensors (t) from the for-loop.
b_output = torch.zeros(batch_size, hidden_units)
b_output = b_output.to(device)  # cuda device (.to() is not in-place, so reassign)
for index, (eachInputId, eachMask) in enumerate(zip(b_input_ids, b_input_mask)):
    # unrolled into each individual document
    # print(eachInputId.size())  # individual document here
    outputs = model(eachInputId,
                    token_type_ids=None,
                    attention_mask=eachMask)
    # combine the [CLS] output layer to form the document
    doc_output = torch.mean(outputs[1], dim=0)  # size = [1, ncol]
    b_output[index] = doc_output
Doing either of these produces this error:
RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.17 GiB total capacity; 10.65 GiB already allocated; 2.81 MiB free; 10.86 GiB reserved in total by PyTorch)
I assume this is because I append the tensors (which are on the GPU) to a list (which is of course not on the GPU) and then try to convert the list into a tensor (that's not on the GPU).
What could be done to append those tensors to another tensor and then load the tensor to GPU for further processing?
I will be grateful for any hint or information.
Try using torch.cat instead of torch.tensor. You are currently trying to allocate memory for your new tensor while all the other tensors are still stored, which might be the cause of the out of memory error. Change:
t_b_output = torch.tensor( b_output )
with:
t_b_output = torch.cat( b_output )
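For reference, a quick standalone illustration (random tensors, not the asker's model) of the difference: torch.cat joins along an existing dimension, while torch.stack adds a new batch dimension.
import torch

outs = [torch.randn(768) for _ in range(4)]   # e.g. one vector per document
print(torch.cat(outs).shape)                  # torch.Size([3072])
print(torch.stack(outs).shape)                # torch.Size([4, 768])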
Hope this helps.

How to implement Pytorch 1D crosscorrelation for long signals in fourier domain?

I have a series of signals of length n = 36,000 on which I need to perform cross-correlation. Currently, my CPU implementation in numpy is a little slow. I've heard Pytorch can greatly speed up tensor operations, and provides a way to perform computations in parallel on the GPU. I'd like to explore this option, but I'm not quite sure how to accomplish this using the framework.
Because of the length of these signals, I'd prefer to perform the crosscorrelation operation in the frequency domain.
Normally using numpy I'd perform the operation like so:
import numpy as np
from scipy.fftpack import next_fast_len  # next_fast_len lives in scipy, not numpy

signal_length = 36000

# make the signals
signal_1 = np.random.uniform(-1, 1, signal_length)
signal_2 = np.random.uniform(-1, 1, signal_length)

# output target length of crosscorrelation
x_cor_sig_length = signal_length * 2 - 1

# get optimized array length for fft computation
fast_length = next_fast_len(x_cor_sig_length)

# move data into the frequency domain. axis=-1 to perform
# along last dimension
fft_1 = np.fft.rfft(signal_1, fast_length, axis=-1)
fft_2 = np.fft.rfft(signal_2, fast_length, axis=-1)

# take the complex conjugate of one of the spectrums. Which one you choose depends on domain specific conventions
fft_1 = np.conj(fft_1)
fft_multiplied = fft_1 * fft_2

# back to time domain.
prelim_correlation = np.fft.irfft(fft_multiplied, x_cor_sig_length, axis=-1)

# shift the signal to make it look like a proper crosscorrelation,
# and transform the output to be purely real
final_result = np.real(np.fft.fftshift(prelim_correlation, axes=-1)).astype(np.float64)
Looking at the Pytorch documentation, there doesn't seem to be an equivalent for numpy.conj(). I'm also not sure if/how I need to implement a fftshift after the irfft operation.
So how would you go about writing a 1D crosscorrelation in Pytorch using the fourier method?
A few things to consider.
The Python interpreter is very slow; what those vectorization libraries do is move the workload to a native implementation. To make any difference you need to be able to perform many operations in a single Python instruction. Evaluating things on the GPU follows the same principle: while the GPU has more compute power, it is slow to copy data to/from the GPU.
I adapted your example to process multiple signals simultaneously.
import numpy as np
def numpy_xcorr(BATCH=1, signal_length=36000, factors=[2, 3, 5], dtype=np.float64):
    # make the signals
    signal_1 = np.random.uniform(-1, 1, (BATCH, signal_length)).astype(dtype)
    signal_2 = np.random.uniform(-1, 1, (BATCH, signal_length)).astype(dtype)

    # output target length of crosscorrelation
    x_cor_sig_length = signal_length * 2 - 1

    # get optimized array length for fft computation
    fast_length = next_fast_len(x_cor_sig_length, factors)

    # move data into the frequency domain. axis=-1 to perform
    # along last dimension
    fft_1 = np.fft.rfft(signal_1, fast_length, axis=-1)
    fft_2 = np.fft.rfft(signal_2 + 0.1 * signal_1, fast_length, axis=-1)

    # take the complex conjugate of one of the spectrums.
    fft_1 = np.conj(fft_1)
    fft_multiplied = fft_1 * fft_2

    # back to time domain.
    prelim_correlation = np.fft.irfft(fft_multiplied, fast_length, axis=-1)

    # shift the signal to make it look like a proper crosscorrelation,
    # and transform the output to be purely real
    final_result = np.fft.fftshift(np.real(prelim_correlation), axes=-1)
    return final_result, np.sum(final_result)
Since torch 1.7 we have the torch.fft module, which provides an interface similar to numpy.fft; fftshift is missing there, but the same result can be obtained with torch.roll. Another point is that numpy uses 64-bit precision by default, while torch uses 32-bit precision.
The fast length is found by choosing smooth numbers (those that factorize into small prime numbers; I suppose you are familiar with this subject).
def next_fast_len(n, factors=[2, 3, 5, 7]):
    '''
    Returns the minimum integer not smaller than n that can
    be written as a product (possibly with repetitions) of
    the given factors.
    '''
    best = float('inf')
    stack = [1]
    while len(stack):
        a = stack.pop()
        if a >= n:
            if a < best:
                best = a
                if best == n:
                    break  # no reason to keep searching
        else:
            for p in factors:
                b = a * p
                if b < best:
                    stack.append(b)
    return best
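A quick sanity check of this helper (the expected values in the comments were worked out by hand):
print(next_fast_len(97))              # 98  = 2 * 7**2
print(next_fast_len(101, [2, 3, 5]))  # 108 = 2**2 * 3**3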
Then the torch implementation goes
import torch
import torch.fft

def torch_xcorr(BATCH=1, signal_length=36000, device='cpu', factors=[2, 3, 5], dtype=torch.float):
    # torch.rand is random in the range (0, 1)
    signal_1 = 1 - 2 * torch.rand((BATCH, signal_length), device=device, dtype=dtype)
    signal_2 = 1 - 2 * torch.rand((BATCH, signal_length), device=device, dtype=dtype)

    # just make the cross correlation more interesting
    signal_2 += 0.1 * signal_1

    # output target length of crosscorrelation
    x_cor_sig_length = signal_length * 2 - 1

    # get optimized array length for fft computation
    fast_length = next_fast_len(x_cor_sig_length, factors)

    # the transform is applied along the last axis (dim=-1)
    fft_1 = torch.fft.rfft(signal_1, fast_length, dim=-1)
    fft_2 = torch.fft.rfft(signal_2, fast_length, dim=-1)

    # take the complex conjugate of one of the spectrums. Which one you choose depends on domain specific conventions
    fft_multiplied = torch.conj(fft_1) * fft_2

    # back to time domain.
    prelim_correlation = torch.fft.irfft(fft_multiplied, dim=-1)

    # shift the signal to make it look like a proper crosscorrelation,
    # and transform the output to be purely real
    final_result = torch.roll(prelim_correlation, fast_length // 2, dims=-1)
    return final_result, torch.sum(final_result)
And here a code to test the results
import time
funcs = {'numpy-f64': lambda b: numpy_xcorr(b, factors=[2, 3, 5], dtype=np.float64),
         'numpy-f32': lambda b: numpy_xcorr(b, factors=[2, 3, 5], dtype=np.float32),
         'torch-cpu-f64': lambda b: torch_xcorr(b, device='cpu', factors=[2, 3, 5], dtype=torch.float64),
         'torch-cpu': lambda b: torch_xcorr(b, device='cpu', factors=[2, 3, 5], dtype=torch.float32),
         'torch-gpu-f64': lambda b: torch_xcorr(b, device='cuda', factors=[2, 3, 5], dtype=torch.float64),
         'torch-gpu': lambda b: torch_xcorr(b, device='cuda', factors=[2, 3, 5], dtype=torch.float32),
         }

times = {}
for batch in [1, 10, 100]:
    times[batch] = {}
    for l, f in funcs.items():
        t0 = time.time()
        t1, t2 = f(batch)
        tf = time.time()
        del t1
        del t2
        times[batch][l] = 1000 * (tf - t0) / batch
I obtained the following results (timing plot not reproduced here). What surprised me was the result when the lengths are not so smooth: e.g. with a 17-smooth length the torch implementation is so much better that I had to use a logarithmic scale (with batch size 100, torch on the GPU was up to 10000 times faster than numpy with batch size 1).
Remember that these functions generate the data directly on the GPU; in general we also want to copy the final result back to the CPU. When I include the time spent copying the final result to the CPU, I observed total times up to 10x higher than the cross-correlation computation itself (random data generation + three FFTs).
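For completeness, a rough way to include that device-to-host copy in the measurement (a sketch assuming a CUDA device and the torch_xcorr defined above):
import time
import torch

t0 = time.time()
result, _ = torch_xcorr(BATCH=100, device='cuda')
result_cpu = result.cpu()   # blocking copy back to the host, also syncs the GPU
print('xcorr + copy to CPU:', time.time() - t0, 's')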

Keras layer for slicing image data into sliding windows

I have a set of images, all of varying widths, but with a fixed height of 100 pixels and 3 channels of depth. My task is to classify whether each vertical line in the image is interesting or not. To do that, I look at the line in the context of its 10 predecessor and 10 successor lines. Imagine the algorithm sweeping from left to right over the image, detecting vertical lines containing points of interest.
My first attempt at doing this was to manually cut out these sliding windows using numpy before feeding the data into the Keras model. Like this:
# Pad left and right
s = np.repeat(D[:1], 10, axis = 0)
e = np.repeat(D[-1:], 10, axis = 0)
# D now has shape (w + 20, 100, 3)
D = np.concatenate((s, D, e))
# Sliding windows creation trick from SO question
idx = np.arange(21)[None,:] + np.arange(len(D) - 20)[:,None]
windows = D[idx]
Then all windows and all ground truth 0/1 values for all vertical lines in all images would be concatenated into two very long arrays.
I have verified that this works, in principle. I fed each window to a Keras layer looking like this:
Conv2D(20, (5, 5), input_shape = (21, 100, 3), padding = 'valid', ...)
But the windowing causes the memory usage to increase 21 times, so doing it this way becomes impractical. I think my scenario is very common in machine learning, though, so there must be some standard method in Keras to do this efficiently? E.g. I would like to feed Keras my raw image data of shape (w, 100, 3), tell it what the sliding window size is, and let it figure out the rest. I have looked at some sample code, but I'm an ML noob so I don't get it.
Unfortunately this isn't an easy problem, because it can involve using a variable-sized input for your Keras model. While I think it is possible to do this with proper use of placeholders, that's certainly no place for a beginner to start. Your other option is a data generator. As with many computationally intensive tasks, there is often a trade-off between compute speed and memory requirements: using a generator is more compute-heavy, and the generation is done entirely on your CPU (no GPU acceleration), but it won't make the memory usage increase.
The point of a data generator is that it applies the operation to images one at a time to produce a batch, trains on that batch, and then frees the memory - so you only ever keep one batch's worth of data in memory at a time. Unfortunately, if the generation is time-consuming, this can seriously affect performance.
The generator will be a Python generator (using the 'yield' keyword) and is expected to produce a single batch of data. Keras is very good at handling arbitrary batch sizes, so you can always make one image yield one batch, especially to start.
Here is the keras page on fit_generator - I warn you, this starts to become a lot of work very quickly, consider buying more memory:
https://keras.io/models/model/#fit_generator
Fine I'll do it for you :P
import numpy as np
import pandas as pd
import keras
from keras.models import Model, model_from_json
from keras.layers import Dense, Concatenate, Multiply, Add, Subtract, Input, Dropout, Lambda, Conv1D, Flatten
from tensorflow.python.client import device_lib

# check for my gpu
print(device_lib.list_local_devices())

# make some fake image data
# 1000 random widths
data_widths = np.floor(np.random.random(1000) * 100)

# producing 1000 random images with dimensions w x 100 x 3
# and a vector of which vertical lines are interesting
# I assume your data looks like this
images = []
interesting = []
for w in data_widths:
    images.append(np.random.random([int(w), 100, 3]))
    interesting.append(np.random.random(int(w)) > 0.5)

# this is a generator
def image_generator(images, interesting):
    num = 0
    while num < len(images):
        windows = None
        truth = None

        D = images[num]
        # this should look familiar
        # Pad left and right
        s = np.repeat(D[:1], 10, axis=0)
        e = np.repeat(D[-1:], 10, axis=0)
        # D now has shape (w + 20, 100, 3)
        D = np.concatenate((s, D, e))
        # Sliding windows creation trick from SO question
        idx = np.arange(21)[None, :] + np.arange(len(D) - 20)[:, None]
        windows = D[idx]
        truth = np.expand_dims(1 * interesting[num], axis=1)
        yield (windows, truth)
        num += 1
        # the generator MUST loop
        if num == len(images):
            num = 0

# basic model - replace with your own
input_layer = Input(shape=(21, 100, 3), name="input_node")
fc = Flatten()(input_layer)
fc = Dense(100, activation='relu', name="fc1")(fc)
fc = Dense(50, activation='relu', name="fc2")(fc)
fc = Dense(10, activation='relu', name="fc3")(fc)
output_layer = Dense(1, activation='sigmoid', name="output")(fc)

model = Model(input_layer, output_layer)
model.compile(optimizer="adam", loss='binary_crossentropy')
model.summary()

# and training
training_history = model.fit_generator(image_generator(images, interesting),
                                       epochs=5,
                                       initial_epoch=0,
                                       steps_per_epoch=len(images),
                                       verbose=1)
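Side note: if you are on a newer tf.keras (TF 2.1+), fit_generator is deprecated and model.fit accepts the generator directly; a minimal equivalent of the call above would be:
training_history = model.fit(image_generator(images, interesting),
                             epochs=5,
                             steps_per_epoch=len(images),
                             verbose=1)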
