CUDA error on WSL2 using pytorch with multiprocessing

I have a Python script as shown below:
import torch
from torch.multiprocessing import set_start_method, Pipe, Process

def func(conn):
    data = conn.recv()
    print(data)

if __name__ == "__main__":
    set_start_method('spawn')
    a, b = Pipe()
    data = torch.tensor([1, 2, 3], device='cuda')
    proc = Process(target=func, args=(data,))
    proc.start()
    b.send(data)
    proc.join()
I run this script on WSL2, but it fails with:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/zxc/anaconda3/envs/airctrl/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/home/zxc/anaconda3/envs/airctrl/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
File "/home/zxc/anaconda3/envs/airctrl/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 121, in rebuild_cuda_tensor
storage = storage_cls._new_shared_cuda(
File "/home/zxc/anaconda3/envs/airctrl/lib/python3.8/site-packages/torch/storage.py", line 807, in _new_shared_cuda
return torch.UntypedStorage._new_shared_cuda(*args, **kwargs)
RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
[W CudaIPCTypes.cpp:15] Producer process has been terminated before all shared CUDA tensors released. See Note [Sharing CUDA tensors]
[W CUDAGuardImpl.h:46] Warning: CUDA warning: driver shutting down (function uncheckedGetDevice)
[W CUDAGuardImpl.h:62] Warning: CUDA warning: invalid device ordinal (function uncheckedSetDevice)
[W CUDAGuardImpl.h:46] Warning: CUDA warning: driver shutting down (function uncheckedGetDevice)
[W CUDAGuardImpl.h:62] Warning: CUDA warning: invalid device ordinal (function uncheckedSetDevice)
My environment is:
OS: WSL2, Ubuntu 22.04
CUDA: 11.7
Python: 3.8
PyTorch: 1.13.0+cu117
Any idea on how to solve this issue?
Thanks.
I've run the same script on native Ubuntu 22.04 (without WSL2), and it works fine.

Try moving the data to the GPU inside your function instead of creating it on the GPU directly in the parent process. That worked for me.
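A minimal sketch of that workaround (my reading, not verbatim from the answer): create the tensor on the CPU in the parent, send it through the pipe, and move it to the GPU in the child. This avoids pickling a CUDA tensor across the process boundary, which goes through CUDA IPC, a feature WSL2 (unlike native Linux) does not support.

import torch
from torch.multiprocessing import set_start_method, Pipe, Process

def func(conn):
    data = conn.recv()      # arrives as a CPU tensor
    data = data.to('cuda')  # move to the GPU inside the child process
    print(data)

if __name__ == "__main__":
    set_start_method('spawn')
    a, b = Pipe()
    data = torch.tensor([1, 2, 3])  # create on the CPU in the parent
    proc = Process(target=func, args=(a,))
    proc.start()
    b.send(data)
    proc.join()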

Related

tensorRT with grpc multi threading error, how to fix it?

Description
Environment
TensorRT Version: 8.2.3.0
NVIDIA GPU: gtx 1080ti
NVIDIA Driver Version: 470.103.01
CUDA Version: 11.4
CUDNN Version: 8.2
Operating System: Linux 18.06
Python Version (if applicable): 3.8.0
Tensorflow Version (if applicable):
PyTorch Version (if applicable): 1.10
Baremetal or Container (if so, version):
grpc server code
server = grpc.server(
    futures.ThreadPoolExecutor(),
    options=[
        ("grpc.max_send_message_length", -1),
        ("grpc.max_receive_message_length", -1),
        ("grpc.so_reuseport", 1),
        ("grpc.use_local_subchannel_pool", 1),
    ],
)
grpc stub init
grpcObject(encoder=trt_model, decoder=decoder)
trt_model init code
def __init__(self):
    cuda_ctx = cuda.Device(0).make_context()
    self.cuda_ctx = cuda_ctx
    if self.cuda_ctx:
        self.cuda_ctx.push()
    ...
Hello.
I'm using TensorRT via gRPC.
However, after setting max_workers in gRPC's ThreadPoolExecutor, the following error occurs when requests come in from multiple clients.
With max_workers=1, no error occurs. Can you help?
infer method
def infer(self, wav_path):
    input_signal = preprocess_stt(wav_path)
    if self.cuda_ctx:
        self.cuda_ctx.push()
    self.context.set_binding_shape(0, input_signal.shape)
    assert self.context.all_binding_shapes_specified
    h_output = cuda.pagelocked_empty(tuple(self.context.get_binding_shape(1)), dtype=np.float32)
    h_input_signal = cuda.register_host_memory(np.ascontiguousarray(to_numpy(input_signal)))
    cuda.memcpy_htod_async(self.d_input, h_input_signal, self.stream)
    self.context.execute_async(bindings=[int(self.d_input), int(self.d_output)], stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(h_output, self.d_output, self.stream)
    self.stream.synchronize()
    if self.cuda_ctx:
        self.cuda_ctx.pop()
    return h_output
error
pycuda._driver.LogicError: cuMemHostAlloc failed: an illegal memory access was encountered
E0228 17:02:30.063214 140249774667520 _server.py:471] Exception iterating responses: cuMemHostAlloc failed: an illegal memory access was encountered
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/grpc/_server.py", line 461, in _take_response_from_response_iterator
return next(response_iterator), True
File "/data/grpc/stt_grpc/grpc_class/dummy_grpc_core.py", line 116, in getStream
stt_result = trt_inference(self.trt_model, 'aaa.wav', self.decoder)
File "/data/grpc/stt_grpc/stt_package/stt_func.py", line 525, in trt_inference
model_output = actor.infer('aaa.wav')
File "/data/grpc/stt_grpc/grpc_class/tensorrt_stt.py", line 153, in infer
h_output = cuda.pagelocked_empty(tuple(self.context.get_binding_shape(1)), dtype=np.float32)
pycuda._driver.LogicError: cuMemHostAlloc failed: an illegal memory access was encountered
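No definitive fix appears in this thread, but a common pattern when several gRPC worker threads share one TensorRT engine is to serialize GPU inference with a lock, which reproduces the safe max_workers=1 behavior while keeping a multi-threaded server. A sketch (not from the original post; the wrapper class name is hypothetical):

import threading

class SerializedTRTModel:
    """Wraps an existing trt_model so only one thread runs inference at a time."""
    def __init__(self, trt_model):
        self.trt_model = trt_model
        self._lock = threading.Lock()

    def infer(self, wav_path):
        # Prevents concurrent threads from interleaving context push/pop,
        # async memcpys, and kernel launches on the same engine and stream.
        with self._lock:
            return self.trt_model.infer(wav_path)

The stub init above would then become grpcObject(encoder=SerializedTRTModel(trt_model), decoder=decoder).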

changing rounding mode via importing libm in python 3

My environment: Ubuntu 18.04, Anaconda, Python 3.6
I am using the following code to load libm into Python via ctypes, in order to change the floating-point environment, such as the rounding mode.
import numpy as np
import ctypes
FE_TONEAREST = 0x0000
FE_DOWNWARD = 0x0400
FE_UPWARD = 0x0800
FE_TOWARDZERO = 0x0c00
#libm = ctypes.CDLL("libm.so", ctypes.RTLD_GLOBAL)
libm = ctypes.cdll.LoadLibrary(r'/usr/lib/x86_64-linux-gnu/libm.so')
v = 1. / (1<<23)
print( repr(np.float32(1+v) - np.float32(v/2))) # prints 1.0
#change mode
libm.fesetround(FE_UPWARD)
print( repr(np.float32(1+v) - np.float32(v/2))) # prints 1.0000002
However, I get the following error:
Traceback (most recent call last):
File "mode.py", line 10, in <module>
libm = ctypes.cdll.LoadLibrary(r'/usr/lib/x86_64-linux-gnu/libm.so')
File "/anaconda/envs/phat/lib/python3.6/ctypes/__init__.py", line 426, in LoadLibrary
return self._dlltype(name)
File "/anaconda/envs/phat/lib/python3.6/ctypes/__init__.py", line 348, in __init__
self._handle = _dlopen(self._name, mode)
OSError: /usr/lib/x86_64-linux-gnu/libm.so: invalid ELF header
libm is the default math library that comes with Ubuntu 18.04.
Could you please advise on the best way to import the library?
Thank you.
I changed the library path
from
/usr/lib/x86_64-linux-gnu/libm.so
to
/lib/x86_64-linux-gnu/libm.so.6
and it worked. (On Ubuntu, the file at the first path is a GNU ld linker script rather than an ELF binary, which is why dlopen reports an "invalid ELF header".)
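A more portable alternative (a sketch, not part of the original answer) is to let ctypes resolve the runtime library instead of hard-coding a path:

import ctypes
import ctypes.util

FE_UPWARD = 0x0800

# find_library('m') resolves to the loadable runtime library (e.g. libm.so.6),
# avoiding the linker-script libm.so that dlopen cannot load
libm = ctypes.CDLL(ctypes.util.find_library('m'))
libm.fesetround(FE_UPWARD)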

python pyttsx3 error -- _pickle.UnpicklingError: invalid load key, '\x00'

I am trying to convert text to speech using pyttsx3 in Python, but I am getting the error _pickle.UnpicklingError: invalid load key, '\x00'.
It worked once; later it didn't.
My code:
import pyttsx3
engine = pyttsx3.init()
engine.say("I will speak this text")
engine.runAndWait()
The error I am receiving is:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\pyttsx3\__init__.py", line 20, in init
    eng = _activeEngines[driverName]
  File "C:\ProgramData\Anaconda3\lib\weakref.py", line 137, in __getitem__
    o = self.data[key]()
KeyError: None
During handling of the above exception, another exception occurred:
...
  File "C:\ProgramData\Anaconda3\lib\site-packages\win32com\client\gencache.py", line 113, in _LoadDicts
    version = p.load()
_pickle.UnpicklingError: invalid load key, '\x00'.
Python version: 3.7.3
pyttsx3 version: 2.71
pywin32 version: 224
Please help.
I had this problem as well and fixed it by deleting gen_py in my temp directory.
You can find this folder here:
C:\Users\USERNAME\AppData\Local\Temp\gen_py
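If you prefer to do that from Python, a small sketch (assuming gen_py lives under the user's temp directory, as in the path above):

import os
import shutil
import tempfile

# gen_py caches win32com's generated COM wrappers; deleting it forces
# pywin32 to regenerate them cleanly on the next run
gen_py_dir = os.path.join(tempfile.gettempdir(), 'gen_py')
if os.path.isdir(gen_py_dir):
    shutil.rmtree(gen_py_dir)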

Cupy get error in multithread.pool if GPU already used

I tried to use cupy in two parts of my program, one of them being parallelized with a pool.
I managed to reproduce it with a simple example:
import cupy
import numpy as np
from multiprocessing import pool

def f(x):
    return cupy.asnumpy(2 * cupy.array(x))

input = np.array([1, 2, 3, 4])
print(cupy.asnumpy(cupy.array(input)))
print(np.array(list(map(f, input))))

p = pool.Pool(4)
output = p.map(f, input)
p.close()
p.join()
print(output)
The output is the following:
[1 2 3 4]
[2 4 6 8]
Exception in thread Thread-3:
Traceback (most recent call last):
File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/usr/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 489, in _handle_results
task = get()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
File "cupy/cuda/runtime.pyx", line 126, in cupy.cuda.runtime.CUDARuntimeError.__init__
TypeError: an integer is required
Also, the code freezes and doesn't exit, but I think that's not related to CuPy.
My configuration:
CuPy Version : 5.2.0
CUDA Root : /usr/local/cuda-10.0
CUDA Build Version : 10000
CUDA Driver Version : 10000
CUDA Runtime Version : 10000
cuDNN Build Version : 7301
cuDNN Version : 7301
NCCL Build Version : 2307
This issue is not specific to CuPy. Due to a limitation of CUDA, processes cannot be forked after CUDA initialization.
You need to use multiprocessing.set_start_method('spawn') (or 'forkserver'), or avoid initializing CUDA (i.e., do not use any CuPy API other than import cupy) until after you fork child processes.
When I tried multiprocessing with CuPy before, I needed to use the spawn context:
ctx = multiprocessing.get_context('spawn')
pool = ctx.Pool(4)
I don't know if this resolves your problem, but can you try it?
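Putting that advice together with the original repro, a sketch (with spawn, the pool must be created under the __main__ guard, and f must be defined at module top level so the workers can import it):

import cupy
import numpy as np
import multiprocessing

def f(x):
    return cupy.asnumpy(2 * cupy.array(x))

if __name__ == "__main__":
    data = np.array([1, 2, 3, 4])
    # spawn starts fresh interpreters, so each worker initializes CUDA
    # itself instead of inheriting a forked, already-initialized context
    ctx = multiprocessing.get_context('spawn')
    with ctx.Pool(4) as p:
        output = p.map(f, data)
    print(output)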

GlfwError: Failed to create GLFW window (windows)

I am using OpenAI Gym on Windows 10 x64 with Python 3.6.7, through Windows Remote Desktop,
and I succeeded in installing atari-py and mujoco-py, but when I tried running this code:
import gym

env = gym.make('Humanoid-v2')
for i_episode in range(100):
    env.reset()
    for t in range(100):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
I got this error:
GLFW error (code %d): %s 65544 b'Vulkan: Failed to query instance extension count: The requested version of Vulkan is not supported by the driver or is otherwise incompatible'
Creating window glfw
GLFW error (code %d): %s 65542 b'WGL: The driver does not appear to support OpenGL'
Traceback (most recent call last):
File "test.py", line 7, in <module>
env.render()
File "D:\ReinforceLearning\RLenv\lib\site-packages\gym\core.py", line 275, in render
return self.env.render(mode, **kwargs)
File "D:\ReinforceLearning\RLenv\lib\site-packages\gym\envs\mujoco\mujoco_env.py", line 118, in render
self._get_viewer(mode).render()
File "D:\ReinforceLearning\RLenv\lib\site-packages\gym\envs\mujoco\mujoco_env.py", line 130, in _get_viewer
self.viewer = mujoco_py.MjViewer(self.sim)
File "D:\ReinforceLearning\RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\mjviewer.py", line 130, in __init__
super().__init__(sim)
File "D:\ReinforceLearning\RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\mjviewer.py", line 25, in __init__
super().__init__(sim)
File "RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\mjrendercontext.pyx", line 244, in mujoco_py.cymj.MjRenderContextWindow.__init__
super().__init__(sim, offscreen=False)
File "RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\mjrendercontext.pyx", line 43, in mujoco_py.cymj.MjRenderContext.__init__
self._setup_opengl_context(offscreen)
File "RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\mjrendercontext.pyx", line 92, in mujoco_py.cymj.MjRenderContext._setup_opengl_context
self._opengl_context = GlfwContext(offscreen=offscreen)
File "RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\opengl_context.pyx", line 48, in mujoco_py.cymj.GlfwContext.__init__
self.window = self._create_window(offscreen)
File "RLenv\lib\site-packages\mujoco_py-1.50.1.0-py3.6.egg\mujoco_py\opengl_context.pyx", line 97, in mujoco_py.cymj.GlfwContext._create_window
raise GlfwError("Failed to create GLFW window")
mujoco_py.cymj.GlfwError: Failed to create GLFW window
OpenGL over Windows Remote is not supported on NVIDIA GPUs for OpenGL versions after 1.1.
I did a writeup on what workarounds exist:
Current state and solutions for OpenGL over Windows Remote
To add extra salt to the wound: you can launch an OpenGL context and then connect via Windows Remote, but launching one inside the session directly is impossible without workarounds.
