I have a MacBook with an M1 Max, and I am trying to test the performance of the M1 Max's Neural Engine.
I want the CPU to wait until the MPS tasks on the Mac have finished; what function should I use? I know that for CUDA I can use torch.cuda.synchronize(), but what is the equivalent function for PyTorch on the M1?
I have the following code:
import torch
import time

if torch.has_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("using", device, "device")

matrix_size = 32*512
x = torch.randn(matrix_size, matrix_size)
y = torch.randn(matrix_size, matrix_size)

print("***********cpu speed***************")
start = time.time()
result = torch.matmul(x, y)
print(time.time()-start)
print("verify device: ", result.device)

x_mps = x.to(device)
y_mps = y.to(device)
# torch.cuda.synchronize()

for i in range(3):
    print("***********mps speed***************")
    start = time.time()
    result_mps = torch.matmul(x_mps, y_mps)
    # torch.cuda.synchronize()
    print(time.time()-start)
    print("verify device: ", result_mps.device)
My problem definition might be long and tedious, but I am trying to make my case and the error clear to you.
**What I am doing:**
I am implementing socket transmission in Python. On the client side, object detection is performed, and the detected number of people and the detected frames are sent to the server side. The server side consists of multiple threaded classes that handle the data from the client and perform pose estimation, while GPU utilization is monitored and logged. When I run the code, everything runs and the logged GPU memory usage is about 46%, as expected. Below I provide DataManager.py (which handles data from the client) and ChildProcess.py (which gets frame data from a queue and performs pose estimation); both are parts of the whole code.
DataManager.py
# Imports used by this snippet (the rest of the file is omitted in the question):
from threading import Thread
import pickle
import struct

class DataManagerThread(Thread):
    def __init__(self, queue, sock, index):
        super().__init__()
        self.image_queue = queue
        self.server_socket = sock
        self.index = index

    def run(self):
        data = b""
        payload_size = struct.calcsize("Q")
        while True:
            while len(data) < payload_size:
                packet = self.server_socket.recv(4*1024)  # The server_socket attribute is no longer None, so this should work
                if not packet:
                    break
                data += packet
            packed_msg_size = data[:payload_size]
            data = data[payload_size:]
            msg_size = struct.unpack("Q", packed_msg_size)[0]
            while len(data) < msg_size:
                data += self.server_socket.recv(4*1024)
            frame_data = data[:msg_size]
            data = data[msg_size:]
            data_dict = pickle.loads(frame_data)
            # extract frame and detection information from the data dictionary
            img = data_dict['frame']
            people = data_dict['people']
            print(f'Detected number of people: {people}')
            # TODO: Passing data to the process manager thread as a queue
            self.put_data_to_queue(img)
        else:
            print("[System] end socket")
            self.put_data_to_queue("End")

    def put_data_to_queue(self, image):
        self.image_queue.put(image)
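For context, the framing this run() loop expects is an 8-byte length prefix packed with struct.pack("Q", ...) followed by a pickled dictionary. A hypothetical client-side sketch of the sending end (the helper name and socket handling are my assumptions; only the 'frame' and 'people' keys come from the code above) would be:

import pickle
import socket
import struct

def send_detection(sock: socket.socket, frame, people: int) -> None:
    # Serialize the payload exactly the way DataManagerThread unpacks it:
    # an 8-byte unsigned length prefix ("Q"), then the pickled dictionary.
    payload = pickle.dumps({'frame': frame, 'people': people})
    sock.sendall(struct.pack("Q", len(payload)) + payload)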
ChildProcess.py
import warnings
warnings.filterwarnings(action="ignore")

from multiprocessing import Process
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path, model_wh
import argparse
import cv2
import logging
import time

def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")

def init_logger():
    logger = logging.getLogger('TfPoseEstimator-WebCam')
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

class ChildProcess(Process):
    def __init__(self, queue):
        self.start_time = time.time()
        super().__init__()
        self.image_queue = queue
        self.start_time = time.time()

    def __del__(self):
        pass

    def run(self):
        args, w, h, e = self.init_model()
        print("[Time]", time.time() - self.start_time)
        while True:
            print("[System] Run motion")
            image = self.image_queue.get()
            if type(image) is str:
                print("[System end process]")
                break
            else:
                self.motionTracking(args, e, w, h, image)

    def motionTracking(self, args, e, w, h, decimg):
        humans = e.inference(decimg, resize_to_default=(w > 0 and h > 0),
                             upsample_size=args.resize_out_ratio)
        y1 = [0.0]
        y = 0
        image = TfPoseEstimator.draw_humans(decimg, humans, imgcopy=False)
        for human in humans:
            for i in range(len(humans)):
                try:
                    a = human.body_parts[0]
                    x = a.x * image.shape[1]
                    y = a.y * image.shape[0]
                    y1.append(y)
                except:
                    pass
                if ((y - y1[len(y1) - 2]) > 30):
                    pass
        cv2.imshow('tf-pose-estimation result', image)
        _ = 0xFF & cv2.waitKey(1)

    def init_model(self):
        print("[System] model init")
        parser = argparse.ArgumentParser(description='tf-pose-estimation realtime webcam')
        parser.add_argument('--camera', type=int, default=0)
        parser.add_argument('--resize', type=str, default='0x0',
                            help='if provided, resize images before they are processed. default=0x0, Recommends : 432x368 or 656x368 or 1312x736 ')
        parser.add_argument('--resize-out-ratio', type=float, default=4.0,
                            help='if provided, resize heatmaps before they are post-processed. default=1.0')
        parser.add_argument('--model', type=str, default='mobilenet_thin',
                            help='cmu / mobilenet_thin / mobilenet_v2_large / mobilenet_v2_small')
        parser.add_argument('--show-process', type=bool, default=False,
                            help='for debug purpose, if enabled, speed for inference is dropped.')
        parser.add_argument('--tensorrt', type=str, default="False",
                            help='for tensorrt process.')
        args = parser.parse_args()
        print('[System] initialization %s : %s' % (args.model, get_graph_path(args.model)))
        w, h = model_wh(args.resize)
        if w > 0 and h > 0:
            e = TfPoseEstimator(get_graph_path(args.model), target_size=(w, h), trt_bool=str2bool(args.tensorrt))
        else:
            e = TfPoseEstimator(get_graph_path(args.model), target_size=(432, 368), trt_bool=str2bool(args.tensorrt))
        print("[System] End model")
        return args, w, h, e
What I want to do and the error I am getting:
Here, I want to add my LSTM code to the DataManager.py file to predict the number of people (people = data_dict['people']).
DataManager(added_lstm).py
# import some necessary libraries
config = tf.compat.v1.ConfigProto()
graph = tf.compat.v1.get_default_graph()
first_session = tf.compat.v1.Session(config=config)

with graph.as_default(), first_session.as_default():
    with graph.as_default():
        with tf.device('CPU:0'):
            model = tf.keras.models.load_model('/home/tf-pose-estimation/modules/lstm_model/model1.h5', compile=False)
            print(model.summary())

def make_prediction(m):
    WINDOW_SIZE, alpha, theta = 5, 0.9, 3
    forecast_ewma, forecast_values, theta_values, arr_of_num = [0], [], [], [1, 1, 1, 1, 1]
    arr_of_num.append(m)
    if len(arr_of_num) > WINDOW_SIZE:
        arr_of_num = arr_of_num[1:]
    if len(arr_of_num) == WINDOW_SIZE:
        actual = arr_of_num[-1]
        with graph.as_default(), first_session.as_default():
            forecast = model.predict(np.array(arr_of_num[-WINDOW_SIZE:]).reshape(1, WINDOW_SIZE, 1))[0][0]
        forecast_values.append(forecast)
        a = alpha * forecast + (1 - alpha) * forecast_ewma[-1]
        theta += 1 if a > 0.5 else -1
        theta = min(max(theta, 0), 2)
        theta_values.append(theta)
        forecast_ewma.append(a)
        return actual, forecast
class DataManagerThread(Thread):
    def __init__(self, queue, sock, index):
        super().__init__()
        self.image_queue = queue
        self.server_socket = sock
        self.index = index

    def run(self):
        data = b""
        payload_size = struct.calcsize("Q")
        while True:
            while len(data) < payload_size:
                packet = self.server_socket.recv(4*1024)  # The server_socket attribute is no longer None, so this should work
                if not packet:
                    break
                data += packet
            packed_msg_size = data[:payload_size]
            data = data[payload_size:]
            msg_size = struct.unpack("Q", packed_msg_size)[0]
            while len(data) < msg_size:
                data += self.server_socket.recv(4*1024)
            frame_data = data[:msg_size]
            data = data[msg_size:]
            data_dict = pickle.loads(frame_data)
            # extract frame and detection information from the data dictionary
            img = data_dict['frame']
            people = data_dict['people']
            print(f'Detected number of people: {people}')
            self.put_data_to_queue(img)
            pred = make_prediction(people)  # Added for LSTM prediction
            print(f"Predictions: {pred}")

    def put_data_to_queue(self, image):
        self.image_queue.put(image)
Here, I am only adding to the DataManager.py file: loading the LSTM model, defining the make_prediction function, and calling pred = make_prediction(people) inside the DataManagerThread class to make a prediction. The rest of the code in this file remains unchanged.
When I run both models simultaneously, only the LSTM is predicting and the pose estimation is simply frozen. Also, even though the LSTM model is forced to use the CPU, about 84% of the GPU memory is occupied, and I do not know why. My expectation is that the LSTM model should use the CPU, the pose estimation model should use the GPU, and both models should run simultaneously.
When I run each model separately (i.e., the LSTM on the CPU and the pose estimation on the GPU), they work well. Specifically, I tested the LSTM model on its own by feeding it randomly generated numbers, and it worked as expected. The LSTM model was trained in TensorFlow 2.5, and both models run in TensorFlow 2.5.
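A likely reason for the 84% figure is that, by default, TensorFlow reserves most of the GPU memory for any process that initializes the GPU, even when the ops themselves are pinned to the CPU. A minimal sketch of switching to on-demand allocation instead, assuming the tf.config API available in TF 2.5 and that it runs before any model is loaded, is:

import tensorflow as tf

# Ask TensorFlow to allocate GPU memory on demand instead of grabbing
# (almost) all of it up front; must run before the first GPU op.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)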
Below are my PC and environment specifications:
GPU: NVIDIA GeForce RTX 2070 SUPER
Driver Version: 525
CUDA Version: 11.6
Python: 3.9.12
Tensorflow-gpu: 2.5.0
Is there a possible or relevant solution for running the LSTM model on the CPU and the pose estimation model on the GPU simultaneously using multithreading?
Any help appreciated!!!
U-Net code:
class UNet(nn.Module):
    def __init__(self):
        super(UNet, self).__init__()
        self.c1 = convBlock(1, 64).to('cuda:0')
        self.d1 = downSample(64).to('cuda:0')
        self.c2 = convBlock(64, 128).to('cuda:0')
        self.d2 = downSample(128).to('cuda:0')
        self.c3 = convBlock(128, 256).to('cuda:0')
        self.d3 = downSample(256).to('cuda:1')
        self.c4 = convBlock(256, 512).to('cuda:1')
        self.d4 = downSample(512).to('cuda:1')
        self.c5 = convBlock(512, 1024).to('cuda:1')
        self.u1 = upSample(1024).to('cuda:1')
        self.c6 = convBlock(1024, 512).to('cuda:1')
        self.u2 = upSample(512).to('cuda:1')
        self.c7 = convBlock(512, 256).to('cuda:1')
        self.u3 = upSample(256).to('cuda:1')
        self.c8 = convBlock(256, 128).to('cuda:1')
        self.u4 = upSample(128).to('cuda:0')
        self.c9 = convBlock(128, 64).to('cuda:0')
        self.out = nn.Conv3d(64, 1, 3, 1, 1).to('cuda:0')
        self.th = nn.Sigmoid().to('cuda:0')

    def forward(self, x):
        L1 = self.c1(x.to('cuda:0'))
        L2 = self.c2(self.d1(L1.to('cuda:0')).to('cuda:0'))
        L3 = self.c3(self.d2(L2.to('cuda:0')).to('cuda:0'))
        L4 = self.c4(self.d3(L3.to('cuda:1')).to('cuda:1'))
        L5 = self.c5(self.d4(L4.to('cuda:1')).to('cuda:1'))
        R4 = self.c6(self.u1(L5.to('cuda:1'), L4.to('cuda:1')).to('cuda:1'))
        R3 = self.c7(self.u2(R4.to('cuda:1'), L3.to('cuda:1')).to('cuda:1'))
        R2 = self.c8(self.u3(R3.to('cuda:1'), L2.to('cuda:1')).to('cuda:1'))
        R1 = self.c9(self.u4(R2.to('cuda:0'), L1.to('cuda:0')).to('cuda:0'))
        return self.th(self.out(R1.to('cuda:0')).to('cuda:0'))
convBlock, downSample, and upSample are layers defined in my own code.
I want to train a 3D U-Net, but the GPU memory is not enough, so I want to use multiple GPUs to train this model.
I assign different U-Net layers to different GPUs.
Is this the correct way to use multiple GPUs to train a model? And what is the best way to run multi-GPU training Python scripts with PyTorch on a Linux server?
Your code should work, but I'd suggest using some sort of flag variable to decide when to transfer submodules/tensors to different GPUs. Something like this is what I've been using:
class MyModel(nn.Module):
    def __init__(self, split_bool: bool = False):
        super().__init__()  # required before registering submodules
        self.submodule1 = ...
        self.submodule2 = ...
        self.split_bool = split_bool
        if split_bool:
            self.submodule1.cuda(0)
            self.submodule2.cuda(1)

    def forward(self, x):
        x = self.submodule1(x)
        if self.split_bool:
            x = x.cuda(1)  # Transfer the intermediate tensor to the second GPU
        return self.submodule2(x)
For running multiple trainings, it really depends on your server. Are you using TensorBoard/tensorboardX to plot results? You can launch multiple training scripts with different parameters with tmux, or even write your own bash script.
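As a usage illustration of the sketch above, here is a self-contained variant; the nn.Linear submodules and sizes are placeholders I introduced, not part of the original model:

import torch
import torch.nn as nn

class TwoGPUModel(nn.Module):
    def __init__(self, split: bool = False):
        super().__init__()
        self.submodule1 = nn.Linear(256, 256)  # placeholder submodule
        self.submodule2 = nn.Linear(256, 10)   # placeholder submodule
        self.split = split
        if split:
            self.submodule1.cuda(0)
            self.submodule2.cuda(1)

    def forward(self, x):
        x = self.submodule1(x)
        if self.split:
            x = x.cuda(1)  # move the activation to the GPU holding submodule2
        return self.submodule2(x)

model = TwoGPUModel(split=torch.cuda.device_count() > 1)
x = torch.randn(8, 256)
if torch.cuda.device_count() > 1:
    x = x.cuda(0)  # the first submodule lives on cuda:0 when the split is enabled
out = model(x)

The split flag keeps the single-device and two-GPU paths in one class, so the same script runs unchanged on either machine.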
I need to parse some datasets in parallel using the same network. The network is on CUDA and I call share_memory() before passing it to the parse function. I spawn multiple processes to parse in parallel using torch.multiprocessing.Pool.
The GPU usage grows linearly with the number of processes I spawn. I am afraid this is expected, because sharing CUDA models requires the spawn start method.
My model is used only for evaluation and runs with torch.no_grad() in the spawned function.
Can I prevent this? The same question was asked here but never got a reply.
Here is an MWE.
With M = 100 I get a CUDA OOM; with M = 10 I don't.
If I move my model to the CPU and use fork, memory stays constant; using spawn on the CPU still increases memory.
import signal
import numpy as np
from torch import multiprocessing as mp
import torch
import time

N = 1000
M = 100
DEVICE = 'cuda'
STOP = 'STOP'

data_in = {m: np.random.rand(N) for m in range(M)}
data_out = {m: np.random.rand(N) for m in range(M)}

def initializer():
    """Ignore CTRL+C in the worker process."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)

def online_test(queue, model, shared_stats):
    while True:  # keep process alive for testing
        # print(f'... {data_id} waiting ...')
        epoch, data_id = queue.get()
        if data_id == STOP:
            print(f'... test function is stopping ...')
            break
        print(f'testing function for {data_id} has started for epoch {epoch}')
        shared_stats.update({data_id: {k: [] for k in ['prediction', 'error']}})
        # print(f'... {data_id} evaluation ...')
        # time.sleep(np.random.randint(1,10))
        pred = model(torch.Tensor(data_in[data_id]).to(device=DEVICE)).cpu().detach().numpy()
        err = pred - data_out[data_id]
        shared_stats.update({data_id: {'prediction': epoch, 'error': -epoch}})
        # shared_stats.update({data_id: {'prediction': list(pred), 'error': list(err)}})
        queue.task_done()  # notify parent that testing is done for requested epoch

if __name__ == '__main__':
    stats = {**{'epoch': []},
             **{data_id: {k: [] for k in ['prediction', 'error']} for data_id in data_in.keys()}}
    train_model = torch.nn.Sequential(torch.nn.Linear(N, N)).to(device=DEVICE)
    test_model = torch.nn.Sequential(torch.nn.Linear(N, N)).to(device=DEVICE)
    test_model.share_memory()

    mp.set_start_method('spawn')
    manager = mp.Manager()
    test_queue = manager.JoinableQueue()
    shared_stats = manager.dict()
    pool = mp.Pool(initializer=initializer)
    for data_id in data_in.keys():
        pool.apply_async(online_test,
                         args=(test_queue, test_model, shared_stats))
        test_queue.put((0, data_id))  # testing can start

    try:  # wrap all in a try-except to handle KeyboardInterrupt
        for epoch in range(5):
            print('training epoch', epoch)
            # time.sleep(3)
            # ... here I do some training and then copy my parameters to test_model
            print('... waiting for testing before moving on to next epoch ...')
            test_queue.join()
            stats['epoch'].append(epoch + 1)
            test_model.load_state_dict(train_model.state_dict())
            print(f'... epoch {epoch} testing is done, stats are')
            for data_id in shared_stats.keys():  # but first copy stats here
                for k in stats[data_id].keys():
                    mu = np.mean(shared_stats[data_id][k])
                    stats[data_id][k].append(mu)
                    # print(' ', data_id, k, mu)
                test_queue.put((epoch + 1, data_id))
        for data_id in shared_stats.keys():  # notify all procs to end
            test_queue.put((-1, STOP))
        print(stats)
    except KeyboardInterrupt:
        pool.terminate()
    else:
        pool.close()
    pool.join()
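On the question above: every spawned worker that touches a CUDA tensor initializes its own CUDA context, which by itself typically costs a few hundred MB per process before any model weights are counted, so total usage growing with the pool size is expected. A small diagnostic sketch for confirming this from inside a worker, assuming a PyTorch version that provides torch.cuda.mem_get_info(), might be:

import os
import torch

def report_cuda_usage(tag: str = "") -> None:
    # Driver-level view (free/total across *all* processes on the device)
    # versus what this particular process has allocated through PyTorch.
    free, total = torch.cuda.mem_get_info()
    allocated = torch.cuda.memory_allocated()
    print(f"[pid {os.getpid()}] {tag} "
          f"free={free / 2**20:.0f} MiB, total={total / 2**20:.0f} MiB, "
          f"allocated_by_this_process={allocated / 2**20:.0f} MiB")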
I was working on an FDTD program that used the discrete Laplacian, which can be implemented as a convolution operation. From what I have read, the main component of PyTorch is a tensor library optimized to perform operations commonly used in machine learning (such as convolutions). I was interested in comparing it to other frameworks I have used, so I wrote a test program that applies the discrete Laplacian to a 1D array multiple times and compares execution times:
import torch as tr
import time
from numba import jit, cuda
import numpy as np
import pyopencl as cl
from pyopencl import array

# parameters
number_of_timesteps = 1000
number_of_elements = 10000000

# set up the initial conditions
torch_data = tr.rand((1, 1, number_of_elements), dtype=tr.double)  # torch convolution needs shape (minibatch, in_channels, iW)
numba_data = np.array([0] + list(torch_data[0][0].numpy()) + [0])  # add padding [0] for convolution. handled automatically in torch.
opencl_data = np.array([0] + list(torch_data[0][0].numpy()) + [0])

# test Torch
device = "cuda"
torch_data_a = torch_data.to(device)
torch_data_b = torch_data.to(device)
kernel = tr.tensor([[[1, -2, 1]]], dtype=tr.double, device=device)
with tr.no_grad():
    start_time = time.time()
    for t in range(round(number_of_timesteps/2)):  # /2 because each loop is two convolutions
        torch_data_b = torch_data_a + 0.1*tr.nn.functional.conv1d(torch_data_a, kernel, padding=1)
        torch_data_a = torch_data_b + 0.1*tr.nn.functional.conv1d(torch_data_b, kernel, padding=1)
    print("Torch GPU time:", time.time()-start_time)
    torch_data_numpy = np.array([0] + list(torch_data_a[0][0].cpu().numpy()) + [0])

# Numba GPU kernel
@cuda.jit
def numba_conv_cuda(x, x_new):
    gid = cuda.grid(1)
    if 0 < gid < x.size - 1:  # Check array boundaries
        x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid])

threadsperblock = 100
blockspergrid = (numba_data.size + (threadsperblock - 1)) // threadsperblock
x_a = cuda.to_device(numba_data)
x_b = cuda.to_device(numba_data)
start_time = time.time()
# actually run the kernel
for t in range(round(number_of_timesteps/2)):  # again /2 because each loop is two convolutions
    numba_conv_cuda[blockspergrid, threadsperblock](x_a, x_b)
    numba_conv_cuda[blockspergrid, threadsperblock](x_b, x_a)
print("Numba GPU time:", time.time()-start_time)
numba_data = x_a.copy_to_host()

# test OpenCL
context = cl.create_some_context(interactive=False, answers=[0])
queue = cl.CommandQueue(context)
mem_flags = cl.mem_flags
program = cl.Program(context, """
#pragma OPENCL EXTENSION cl_khr_fp64 : enable //enable double precision calculations
__kernel void update_psi(__global const double *x, __global double *x_new)
{
    int gid = get_global_id(0);
    if(0 < gid && gid < x.size - 1){
        x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid]);
    }
}
""".replace("x.size", str(opencl_data.size))).build()
x_a_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)
x_b_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)

# actually run the OpenCL kernel
start_time = time.time()
for t in range(round(number_of_timesteps/2)):  # again /2 because each loop is two convolutions
    event = program.update_psi(queue, [threadsperblock*blockspergrid], [threadsperblock], x_a_buf, x_b_buf)
    event.wait()
    event = program.update_psi(queue, [threadsperblock*blockspergrid], [threadsperblock], x_b_buf, x_a_buf)
    event.wait()
print("OpenCL GPU time:", time.time()-start_time)
event = cl.enqueue_copy(queue, opencl_data, x_a_buf)
event.wait()

print("Results are same?", np.allclose(torch_data_numpy, numba_data) and np.allclose(numba_data, opencl_data))
And these are the results testing on an Nvidia GPU:
Torch GPU time: 13.544365406036377
Numba GPU time: 0.2404193878173828
OpenCL GPU time: 0.9025869369506836
Results are same? True
I am surprised that a library designed for applying operations such as convolutions turns out to be so much slower than Numba or PyOpenCL (which is not even optimized, because I did not use any local memory on the GPU). Is this really the case, or did I do something wrong?
Additionally, why is the kernel written in C more than 3x slower than the kernel written in Python?
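One caveat about the timing methodology in the script above: PyTorch CUDA kernels launch asynchronously, and the first conv1d call also pays one-time cuDNN initialization costs, so wrapping the loop in time.time() can mislead unless the device is warmed up and synchronized. A sketch of a fairer measurement for the Torch case, keeping the same kernel and data shape, might be:

import time
import torch as tr

device = "cuda"
x = tr.rand((1, 1, 10_000_000), dtype=tr.double, device=device)
kernel = tr.tensor([[[1, -2, 1]]], dtype=tr.double, device=device)

with tr.no_grad():
    # Warm-up: pays cuDNN initialization/autotuning costs outside the timed region.
    for _ in range(10):
        x = x + 0.1 * tr.nn.functional.conv1d(x, kernel, padding=1)
    tr.cuda.synchronize()

    start = time.time()
    for _ in range(500):
        x = x + 0.1 * tr.nn.functional.conv1d(x, kernel, padding=1)
    tr.cuda.synchronize()  # wait for all queued kernels before reading the clock
    print("Torch GPU time:", time.time() - start)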
I have installed tensorflow-gpu on my laptop (GTX 1060 6GB) and I am developing a prime number generator program using TensorFlow. The following is the sample code I am using to calculate prime numbers:
def prime_tf_graf(max_count):
    # matrix version
    with tf.device('/gpu:0'):
        data = tf.range(2, max_count, dtype=tf.int32)
        Y, X = tf.meshgrid(data, data); Z = X % Y
    with tf.device('/cpu:0'):
        loer = tf.matrix_band_part(Z, -1, 0) - tf.matrix_band_part(Z, 0, 0)
    with tf.device('/gpu:0'):
        loer = tf.cast(tf.not_equal(loer, 0), tf.int32)
        sumr = tf.reduce_sum(loer, axis=1)
        nums = data - 2
        rato = tf.cast(sumr/nums, tf.int32)
        indx = tf.cast(tf.not_equal(rato, 0), tf.int32)
    return tf.reshape(tf.where(indx), [-1]) + 2

def prime_tf(max_count):
    graf = prime_tf_graf(max_count)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    prim = sess.run(graf)
    return prim
When I run this code, only my CPU and RAM are used, and my GPU stays out of it (only dedicated memory is allocated). Test run: prime_tf(tf.constant(20000))
Here is my system usage:
It shows that TensorFlow is only using my CPU and RAM, not the GPU.
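To check where TensorFlow actually places each op, one option (a minimal sketch using the same TF 1.x-style session API as the code above) is to enable device-placement logging, which prints the assigned device of every node when the graph runs:

import tensorflow as tf

def prime_tf_logged(max_count):
    graf = prime_tf_graf(max_count)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)  # print op -> device mapping
    with tf.Session(config=config) as sess:
        return sess.run(graf)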