How to make PyTorch streams work concurrently? - pytorch

I am trying to speed up matrix calculations on the GPU by using streams; however, it seems that the PyTorch streams are not working.
My code is like this:
RANGE = 1000
device = torch.device("cuda")
# Two separate stream objects on the same CUDA device.
s1 = torch.cuda.Stream(device=device)
s2 = torch.cuda.Stream(device=device)

# Synchronize before starting the clock.
torch.cuda.synchronize()
t0 = time.time()
for index in range(RANGE):
    # NOTE(review): both inputs are created with torch.rand(...) and moved with
    # .cuda() inside the timed loop, so the measurement covers that work as
    # well as the two matmuls.
    first_input = torch.rand(10000, 10000).cuda()
    second_input = torch.rand(10000, 10000).cuda()
    # Each product is issued under its own stream context.
    with torch.cuda.stream(s1):
        first_output = first_input.matmul(first_input)
    with torch.cuda.stream(s2):
        second_output = second_input.matmul(second_input)
# Synchronize again before reading the clock.
torch.cuda.synchronize()
t1 = time.time()
print(t1 - t0)
Whether I use streams or not, the calculation time is about 1621-1623 seconds.
Is there something wrong with my code?

Related

Use optimization to minimize an output-dependent function (Python)

Consider a simple code running inside a while loop:
# Step paramC upward in 0.001 increments until m_diff drops below
# convergence_lim or paramC exceeds M_lim.
while m_diff >= convergence_lim and paramC <= M_lim:
    paramC = temp_M
    try:
        paramB, M, P = some_function(paramA, paramB, paramC)
    except Exception:
        # some_function failed for this paramC: move on to the next sample.
        # Exception (not a bare except) so Ctrl-C can still stop the loop.
        temp_M += 0.001
        continue
    totalM0 = paramB['Mo']
    totalM1 = sum(M)  # Sum of all M
    m_diff = abs(totalM0 - totalM1)
    temp_M += 0.001
This would run forever and iterate each 0.001 sample until m_diff is acceptable. I would like to use scipy.optimize to arrive at the numbers faster.
The issue is with the paramB as the function some_function regenerates a different value of paramB. The optimizer should look for best paramC that minimizes m_diff.
I tried:
import scipy.optimize
def mass_diff_func(paramC, paramB):
    """Evaluate the mass imbalance for one candidate paramC.

    Calls some_function(paramA, paramB, paramC) with paramA fixed at 10 and
    compares paramB['Mo'] of the returned paramB with sum(M).

    Returns:
        A tuple (m_diff, paramB). If some_function raises, m_diff is
        float('inf') and paramB is the argument that was passed in.
    """
    paramA = 10  # Constant
    try:
        paramB, M, P = some_function(paramA, paramB, paramC)
    except Exception:
        # A failed evaluation is reported as an infinitely large imbalance.
        return float('inf'), paramB
    totalM0 = paramB['Mo']
    totalM1 = sum(M)
    m_diff = abs(totalM0 - totalM1)
    return m_diff, paramB
# NOTE(review): scipy.optimize.minimize expects the objective to return a
# single scalar, but mass_diff_func returns the tuple (m_diff, paramB) --
# confirm, and return only m_diff from the objective if so.
# paramB is passed once through args, so the same object is handed to every
# evaluation of the objective.
# NOTE(review): the lower bound for paramC is convergence_lim, which the
# while-loop version uses as the tolerance on m_diff -- confirm this is intended.
result = scipy.optimize.minimize(mass_diff_func, paramC, args=(paramB,), bounds=[(convergence_lim, M_lim)])
optimal_paramC = result.x
But changes to paramB don't seem to be carried into the minimize function, or the minimize function doesn't register changes to paramB. Either way, any idea how to fix this?
Open to other optimizers as well.

What is the synchronize function for Mac MPS?

I have a MacBook with an M1 Max and I am trying to test the performance of the M1 Max neural engine.
I want the CPU to wait until the Mac neural engine tasks have finished. What function should I use? I know that for CUDA I can use torch.cuda.synchronize(), but what is the equivalent function for torch on the M1?
I have the following code:
import time

import torch

# Use the "mps" device when torch.has_mps is truthy; otherwise use the CPU.
if torch.has_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("using", device, "device")

matrix_size = 32*512
x = torch.randn(matrix_size, matrix_size)
y = torch.randn(matrix_size, matrix_size)

# Baseline: multiply x and y as created (no device argument was given).
print("***********cpu speed***************")
start = time.time()
result = torch.matmul(x,y)
print(time.time()-start)
print("verify device: ", result.device)

# Copy both operands to the selected device before timing.
x_mps = x.to(device)
y_mps = y.to(device)
# torch.cuda.synchronize()
for i in range(3):
    print("***********mps speed***************")
    start = time.time()
    result_mps = torch.matmul(x_mps,y_mps)
    # torch.cuda.synchronize()
    # NOTE(review): nothing waits for the device between the matmul call and
    # reading the clock, so this may time only the launch -- confirm.
    print(time.time()-start)
    print("verify device: ", result_mps.device)

Why is PyTorch slower than PyOpenCL, which is slower than Numba on GPU?

I was working on an FDTD program that used the discrete Laplacian, which can be implemented as a convolution operation. From what I have read, the main component of PyTorch is a tensor library optimized to perform operations commonly used for machine learning (such as convolutions). I was interested in comparing it to other frameworks I have used, so I wrote a test program that applies the discrete Laplacian to a 1D array multiple times and compares execution times:
import torch as tr
import time
from numba import jit, cuda
import numpy as np
import pyopencl as cl
from pyopencl import array
#parameters
number_of_timesteps = 1000
number_of_elements = 10000000
#set up the initial conditions
torch_data = tr.rand((1,1,number_of_elements),dtype=tr.double) #torch convolution needs shape (minibatch,in_channels,iW)
numba_data = np.array([0] + list(torch_data[0][0].numpy()) + [0]) #add padding [0] for convolution. handled automatically in torch.
opencl_data = np.array([0] + list(torch_data[0][0].numpy()) + [0])
#test Torch
device = "cuda"
# Two copies of the data on the device; the loop writes them alternately.
torch_data_a = torch_data.to(device)
torch_data_b = torch_data.to(device)
# Three-tap kernel [1, -2, 1] used for the second difference.
kernel = tr.tensor([[[1,-2,1]]],dtype=tr.double,device=device)
with tr.no_grad():
    start_time = time.time()
    for t in range(round(number_of_timesteps/2)): # /2 because each loop is two convolutions
        torch_data_b = torch_data_a + 0.1* tr.nn.functional.conv1d(torch_data_a,kernel,padding=1)
        torch_data_a = torch_data_b + 0.1* tr.nn.functional.conv1d(torch_data_b,kernel,padding=1)
    print("Torch GPU time:",time.time()-start_time)
# Bring the result back to the host and pad it like numba_data / opencl_data
# so the three results can be compared element-wise.
torch_data_numpy = np.array([0] + list(torch_data_a[0][0].cpu().numpy()) + [0])
#Numba GPU kernel
@cuda.jit
def numba_conv_cuda(x,x_new):
    """Apply one update step of the discrete Laplacian from x into x_new.

    For every interior index gid, writes
    x[gid] + 0.1*(x[gid+1] + x[gid-1] - 2*x[gid]) to x_new[gid].
    The first and last elements of x_new are not written.
    """
    gid = cuda.grid(1)
    if 0 < gid < x.size - 1 : # Check array boundaries
        x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid])
threadsperblock = 100
# Ceiling division: enough blocks to cover every element of numba_data.
blockspergrid = (numba_data.size + (threadsperblock - 1)) // threadsperblock
# Two device copies of the padded data; the loop writes them alternately.
x_a = cuda.to_device(numba_data)
x_b = cuda.to_device(numba_data)
start_time = time.time()
#actually run the kernel
for t in range(round(number_of_timesteps/2)): #again /2 because each loop is two convolutions
    numba_conv_cuda[blockspergrid, threadsperblock](x_a,x_b)
    numba_conv_cuda[blockspergrid, threadsperblock](x_b,x_a)
# NOTE(review): there is no explicit synchronization before the clock is read;
# if kernel launches return before the work completes, this times only the
# launches -- confirm.
print("Numba GPU time:",time.time()-start_time)
numba_data = x_a.copy_to_host()
#test OpenCL
context = cl.create_some_context(interactive=False,answers=[0])
queue = cl.CommandQueue(context)
mem_flags = cl.mem_flags
# The kernel source uses the placeholder text "x.size"; it is replaced with the
# actual array length before the program is built.
program = cl.Program(context, """
#pragma OPENCL EXTENSION cl_khr_fp64 : enable //enable double precision calculations
__kernel void update_psi(__global const double *x, __global double *x_new)
{
int gid = get_global_id(0);
if(0 < gid && gid < x.size - 1){
x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid]);
}
}
""".replace("x.size",str(opencl_data.size))).build()
# Two device buffers initialized from the padded host data; the loop writes
# them alternately.
x_a_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)
x_b_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)
#actually run the OpenCL
start_time = time.time()
for t in range(round(number_of_timesteps/2)): #again /2 because each loop is two convolutions
    # Each launch is followed by event.wait(), so the host blocks on every step.
    event = program.update_psi(queue, [threadsperblock*blockspergrid], [threadsperblock], x_a_buf, x_b_buf)
    event.wait()
    event = program.update_psi(queue, [threadsperblock*blockspergrid], [threadsperblock], x_b_buf, x_a_buf)
    event.wait()
print("OpenCL GPU time:",time.time()-start_time)
# Copy the final state back into opencl_data on the host.
event = cl.enqueue_copy(queue, opencl_data, x_a_buf)
event.wait()
print("Results are same?",np.allclose(torch_data_numpy,numba_data) and np.allclose(numba_data,opencl_data))
And these are the results testing on an Nvidia GPU:
Torch GPU time: 13.544365406036377
Numba GPU time: 0.2404193878173828
OpenCL GPU time: 0.9025869369506836
Results are same? True
I am surprised that the results show a library designed for applying operations such as convolutions to be so much slower than Numba or PyOpenCL (which is not even optimized, because I did not use any local memory on the GPU). Is this really the case, or did I do something wrong?
Additionally, why is the kernel written in C more than 3x slower than the kernel written in Python?

Why is getting the first 30 keys of the dictionary in two statements faster than one statement?

I was doing a benchmark for myself when I encountered this interesting thing. I am trying to get the first 30 keys of a dictionary, and I have written three ways to get them, as follows:
import time
# Dictionary with one million string keys; each variant takes the first 30.
dic = {str(i): i for i in range(10 ** 6)}
# Variant 1: build the full key list and slice it in one statement.
start_time = time.time()
x = list(dic.keys())[0:30]
print(time.time() - start_time)
# Variant 2: build the full key list, then slice it in a second statement.
start_time = time.time()
y = list(dic.keys())
x = y[0:30]
print(time.time() - start_time)
# Variant 3: keep the keys view in its own variable, then list it, then slice.
# Each variant is timed exactly once with time.time().
start_time = time.time()
z = dic.keys()
y = list(z)
x = y[0:30]
print(time.time() - start_time)
The results are:
0.015970945358276367
0.010970354080200195
0.01691460609436035
Surprisingly, the second method is much faster! Any thoughts on this?
You can use Python's timeit module to measure the various alternatives. I added my own variant, which doesn't convert the keys to a list:
from timeit import timeit
# Dictionary with one million string keys; every variant below returns the
# first 30 of them.
dic = {str(i): i for i in range(10 ** 6)}


def f1():
    """Build the full key list and slice it in a single statement."""
    x = list(dic.keys())[0:30]
    return x


def f2():
    """Build the full key list, then slice it in a second statement."""
    y = list(dic.keys())
    x = y[0:30]
    return x


def f3():
    """Bind the keys view to a name first, then list it, then slice."""
    z = dic.keys()
    y = list(z)
    x = y[0:30]
    return x


def f4():
    """Take 30 keys straight from the keys view without listing all keys.

    zip stops as soon as range(30) is exhausted, so only the first 30 keys
    are ever read.
    """
    x = [k for _, k in zip(range(30), dic.keys())]
    return x
# timeit accepts a callable directly; passing f1..f4 themselves avoids timing
# an extra lambda call layer on every run.
t1 = timeit(f1, number=10)
t2 = timeit(f2, number=10)
t3 = timeit(f3, number=10)
t4 = timeit(f4, number=10)
# Total seconds for 10 calls of each variant, in order f1..f4.
print(t1)
print(t2)
print(t3)
print(t4)
Prints:
0.1911074290110264
0.20418328599771485
0.18727918600779958
3.5186996683478355e-05
Maybe this is due to inaccuracies in how you measure time. You can use timeit for this kind of thing:
import timeit
dic = {str(i): i for i in range(10 ** 6)}
# Statement strings are executed in timeit's own namespace, so dic has to be
# handed in through globals for every call.
# 27.5125/29.0836/26.8525
timeit.timeit("x = list(dic.keys())[0:30]", number=1000, globals={"dic": dic})
# 28.6648/26.4684/30.9534
timeit.timeit("y = list(dic.keys());x=y[0:30]", number=1000, globals={"dic": dic})
# 31.7345/29.5301/30.7541
timeit.timeit("z=dic.keys();y=list(z);x=y[0:30]", number=1000, globals={'dic': dic})
The comments show the times I got when running the same code 3 different times. As you can see, even by performing a large number of repetitions, it is possible to obtain quite large variations in time measured. This can be due to several different things:
An item can be in the cache of your processor or not.
Your processor can be occupied doing several other things.
Etc...
As stated by @Andrej Kesely, your bottleneck is that you convert your dictionary keys into a list. By doing so, Python goes through all of the dictionary's keys, because that is how it converts something to a list in general. Hence, by avoiding this, you can get much better results.

Tensorflow: using a FIFO queue for code running on GPUs

The code below shows my attempt to run an algorithm on a single GPU and feed data to it using a FIFO queue. The data exists in a CSV file. I use a separate Python thread to read from the file one line at a time and enqueue the line into a FIFO.
# Table dimensions: num_ckfs rows, N columns (r_ckfs has a single column).
N = 16
num_ckfs = 80000

# Each *_ckfs array repeats one template row num_ckfs times.
q = [0.01] * N
q_ckfs = np.array([q] * num_ckfs)

r = [5]
r_ckfs = np.array([r] * num_ckfs)

init_var = [10.0] * N
init_var_ckfs = np.array([init_var] * num_ckfs)

init_state = [0.0] * N
init_state_ckfs = np.array([init_state] * num_ckfs)
class CKF(object):
    """Graph pieces for num_ckfs rows of width N, fed through a FIFO queue.

    The constructor creates the variables, constants, placeholders and the
    queue; the processing that consumes the dequeued tensors is elided in
    this excerpt.
    """

    def __init__(self, num_ckfs, N):
        # Variables initialized from the module-level init_var_ckfs /
        # init_state_ckfs arrays.
        self.init_variances = tf.Variable(init_var_ckfs, name='init_variances', dtype=tf.float64)
        self.init_states = tf.Variable(init_state_ckfs, name='init_states', dtype=tf.float64)
        init_states_expanded = tf.expand_dims(self.init_states, 2) # num_ckfs X N X 1
        self.q_values = tf.constant(q_ckfs, name='q_values', dtype=tf.float64)
        self.r_values = tf.constant(r_ckfs, name='r_values', dtype=tf.float64)
        # Placeholders that the feeding thread fills on every enqueue.
        self.input_vectors = tf.placeholder(tf.float64, shape=[num_ckfs, N], name='input_vectors')
        self.z_k = tf.placeholder(tf.float64, shape=[num_ckfs, 1], name='z_k')
        # Queue of capacity 200 holding (z_k, input_vectors) pairs.
        q = tf.FIFOQueue(200, [tf.float64, tf.float64], shapes=[[num_ckfs,1], [num_ckfs,N]])
        self.enqueue_op = q.enqueue([self.z_k, self.input_vectors])
        observations, inputs = q.dequeue()
        #further processing using the input data
# Everything CKF.__init__ creates, including its tf.FIFOQueue, is built inside
# this '/gpu:0' device scope.
with tf.device('/gpu:0'):
    ckf_gpu0 = CKF(num_ckfs, N)
def load_and_enqueue():
    """Feed one (observation, data) pair into the queue of ckf_gpu0.

    The file-reading code is elided in this excerpt; only the enqueue step
    is shown.
    """
    #read one line at a time
    #obvs_list corresponds to the first column
    #data_list corresponds to the rest of the columns
    session.run(ckf_gpu0.enqueue_op, feed_dict={
        ckf_gpu0.input_vectors: data_list[0], ckf_gpu0.z_k: obvs_list[0]})
    # NOTE(review): count is not initialized anywhere in this excerpt --
    # presumably part of the elided reading loop; confirm.
    count += 1
# Run the feeder in its own thread while this thread runs the graph.
t = threading.Thread(target=load_and_enqueue)
t.start()
for i in range(num_rows):
    out = session.run([ckf_gpu0.projected_output])
The first problem that I have run into is:
InvalidArgumentError (see above for traceback): Cannot assign a device to node 'fifo_queue': Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Is there an alternate way to do such a thing, i.e. hide the I/O latency while the computation is being done on a GPU?

Resources