PyTorch distributed training fails when using the 'gloo' backend

I run the distributed training code from the PyTorch tutorial. When I use the 'gloo' backend, the code raises a RuntimeError in init_process_group.
"""run.py:"""
#!/usr/bin/env python
import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process
def run(rank, size):
""" Distributed function to be implemented later. """
pass
def init_process(rank, size, fn, backend='gloo'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend, rank=rank, world_size=size)
fn(rank, size)
if __name__ == "__main__":
size = 2
processes = []
for rank in range(size):
p = Process(target=init_process, args=(rank, size, run))
p.start()
processes.append(p)
for p in processes:
p.join()
I get the following error:
Process Process-2:
Traceback (most recent call last):
  File "/home/wuyiming/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/wuyiming/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-6cc8a94a6551>", line 16, in init_process
    dist.init_process_group(backend, rank=rank, world_size=size)
  File "/home/wuyiming/anaconda3/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 416, in init_process_group
    timeout=timeout)
  File "/home/wuyiming/anaconda3/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 484, in _new_process_group_helper
    timeout=timeout)
RuntimeError: [enforce fail at /pytorch/third_party/gloo/gloo/transport/tcp/device.cc:198] ifa != nullptr. Unable to find interface for: [0.0.0.27]
Process Process-1:
Traceback (most recent call last):
  File "/home/wuyiming/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/wuyiming/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-6cc8a94a6551>", line 16, in init_process
    dist.init_process_group(backend, rank=rank, world_size=size)
  File "/home/wuyiming/anaconda3/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 416, in init_process_group
    timeout=timeout)
  File "/home/wuyiming/anaconda3/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 484, in _new_process_group_helper
    timeout=timeout)
RuntimeError: [enforce fail at /pytorch/third_party/gloo/gloo/transport/tcp/device.cc:198] ifa != nullptr. Unable to find interface for: [0.0.0.27]
If I change the backend from 'gloo' to 'nccl', the code runs correctly.
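For reference, the [0.0.0.27] in the message means Gloo resolved the machine's hostname to an address that does not belong to any local network interface, so it cannot pick a TCP device. A workaround that often helps is to pin the interface explicitly via the GLOO_SOCKET_IFNAME environment variable before init_process_group; the sketch below assumes the loopback interface is named lo (check yours with `ip addr` or `ifconfig`):

import os
import torch.distributed as dist

def init_process(rank, size, fn, backend='gloo'):
    """ Initialize the distributed environment, pinning Gloo's interface. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    # Tell Gloo which interface to bind to instead of resolving the
    # hostname. 'lo' is an assumption for a single-machine run; use your
    # real NIC name (e.g. 'eth0') for multi-machine training.
    os.environ['GLOO_SOCKET_IFNAME'] = 'lo'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

Alternatively, making the machine's hostname resolve to a real local address (e.g. via /etc/hosts) addresses the root cause.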

Related

RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first

I am building an experimental FedAvg simulation using PyTorch RPC, but the server side throws errors when I run it. It seems to be a coding problem on my side, but I don't know what it is. Here are some related code snippets:
.
.
.
# Start training
if args.rank == 0:
    for e in range(args.epoch):
        processes = []
        q = mp.Queue()
        print("Server's Epoch:" + str(e + 1))
        weight = copy.deepcopy(model.state_dict())
        for r in range(args.world_size):
            p = mp.Process(
                target=run_worker,
                args=(
                    r,
                    model,
                    args.lr,
                    train_loader[r],
                    device,
                    args.epoch,
                    weight,
                    q))
            processes.append(p)
            p.start()
        for p in processes:
            p.join()
.
.
.
And here is the function run_worker:
def run_worker(rank, model, lr, train_loader, device, epoch, weight, q):
    out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
    q.put([rank, out_weight])
But I get the error below:
Server initialized!
Server's Epoch:1
Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/pi/FYP/FedAvg_RPC.py", line 96, in run_worker
    out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/rpc/api.py", line 75, in wrapper
    raise RuntimeError(
RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first.
Process Process-2:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/pi/FYP/FedAvg_RPC.py", line 96, in run_worker
    out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/rpc/api.py", line 75, in wrapper
    raise RuntimeError(
RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first.
Process Process-3:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/pi/FYP/FedAvg_RPC.py", line 96, in run_worker
    out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/rpc/api.py", line 75, in wrapper
    raise RuntimeError(
RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first.
So what is my coding mistake with RPC? Here is my setup:
python 3.9.2
torch==1.8.0a0+37c1f4a
torchvision==0.9.0a0+01dfa8e
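The traceback points at the cause: run_worker executes inside freshly spawned mp.Process children, and a child process does not inherit the parent's RPC state; only processes that have themselves called torch.distributed.rpc.init_rpc may use rpc_sync. Rather than spawning extra caller processes (each of which would need its own init_rpc, rank, and world-size slot), the already initialized server process can fan the calls out with rpc_async and collect the futures. A minimal sketch of that idea, hedged: it assumes the server already ran init_rpc, that workers are named Worker{r} as in the snippets above, and that rank 0 is the server; `train`, `model`, `args`, `train_loader`, and `device` are the names from the question's script.

import copy
import torch.distributed.rpc as rpc

# Inside the server process, which has already called rpc.init_rpc(...).
for e in range(args.epoch):
    weight = copy.deepcopy(model.state_dict())
    # rpc_async gives per-worker concurrency without extra processes,
    # so no mp.Process (and no mp.Queue) is needed on the caller side.
    ranks = range(1, args.world_size)  # assumption: rank 0 is the server
    futures = [
        rpc.rpc_async(f"Worker{r}", train,
                      args=(r, model, args.lr, train_loader[r], device,
                            args.epoch, weight))
        for r in ranks
    ]
    results = [(r, fut.wait()) for r, fut in zip(ranks, futures)]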

PyTorch DataLoader throws an error after some iterations

The code runs for several iterations and throws the following error.
My Dataset
import torch
from torch import load
from torch.utils.data import Dataset

class Dataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, input_feature_paths, target_feature_folder) -> None:
        self.input_feature_paths = input_feature_paths
        self.target_feature_folder = target_feature_folder

    def __len__(self):
        # return sum(1 for _ in self.input_feature_paths)
        return len(self.input_feature_paths)

    def __getitem__(self, index):
        input_feature_path = self.input_feature_paths[index]
        input_feature = load(input_feature_path, map_location='cpu')
        target_feature_path = self.target_feature_folder / input_feature_path.parts[-1]
        target_feature = load(target_feature_path, map_location='cpu')
        return input_feature.to(dtype=torch.float64), target_feature.to(dtype=torch.float64)
I set the dtype to torch.float64 because it threw the same error while writing to the TensorBoard summary writer.
Error Stack
Traceback (most recent call last):
  File "student_audio_feature_extractor.py", line 178, in <module>
    train(dt, input_frame)
  File "student_audio_feature_extractor.py", line 164, in train
    model, train_loss = train_step(model, train_loader, optimizer, criterion)
  File "student_audio_feature_extractor.py", line 80, in train_step
    for input_feature, target_feature in train_loader:
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
    data = self._next_data()
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1313, in _next_data
    return self._process_data(data)
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
    data.reraise()
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise
    raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 4.
Original Traceback (most recent call last):
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
    return self.collate_fn(data)
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 120, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/home/visge/miniconda3/envs/zk_torch/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable
I had a tensor of shape [] (a zero-dimensional tensor); that is why it threw this error. I changed it and it works now.
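For context: default_collate stacks the per-sample tensors into one batch tensor and expects every sample to have the same shape, so a stray zero-dimensional tensor among normally shaped ones makes that stacking fail inside the worker. A cheap guard in the loading path surfaces the bad file immediately; this is a hedged sketch, with load_checked a hypothetical helper rather than the asker's actual fix:

import torch

def load_checked(path):
    # Hypothetical guard: fail fast on malformed samples (e.g. 0-dim
    # tensors) instead of letting collation fail later in a DataLoader
    # worker with an opaque storage-resize error.
    t = torch.load(path, map_location='cpu')
    if t.dim() == 0:
        raise ValueError(f"{path} holds a 0-dim tensor (shape []); check how it was saved")
    return t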

WSQ files not opening with Pillow/wsq when using joblib.Parallel

I am trying to preprocess a large number of WSQ images for model training using both the Pillow and wsq libraries. To speed up my code, I am trying to use Parallel, but this causes an UnidentifiedImageError.
I verified that the files are where they should be, and that the function runs without errors in a regular for-loop. Other files (e.g. CSV files) can be opened inside the function without errors, so I presume the error lies in the combination of Parallel and Pillow/wsq. All libraries are up to date. As I am just starting out with Pillow and multiprocessing, I have no idea yet how to fix this, and any help would be highly appreciated.
Code:
from joblib import Parallel, delayed
from PIL import Image
import multiprocessing
import wsq
import numpy as np

def process_image(i):
    path = "/home/user/project/wsq/image_" + str(i) + ".wsq"
    img = np.array(Image.open(path))
    # some preprocessing, saving as npz
    output_path = "/home/user/project/npz/image_" + str(i) + ".npz"
    np.savez_compressed(output_path, img)
    return None

inputs = range(100000)
num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(delayed(process_image)(i) for i in inputs)
Output:
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
  File "/home/user/.local/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
    r = call_item()
  File "/home/user/.local/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/home/user/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "preprocess_images.py", line 9, in process_image
    img = np.array(Image.open(path))
  File "/home/user/.local/lib/python3.8/site-packages/PIL/Image.py", line 2967, in open
    raise UnidentifiedImageError(
PIL.UnidentifiedImageError: cannot identify image file '/home/user/project/wsq/image_1.wsq'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "preprocess_images.py", line 18, in <module>
    Parallel(n_jobs=num_cores)(delayed(process_image)(i) for i in inputs)
  File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 1054, in __call__
    self.retrieve()
  File "/home/user/.local/lib/python3.8/site-packages/joblib/parallel.py", line 933, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/user/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 542, in wrap_future_result
    return future.result(timeout=timeout)
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 439, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 388, in __get_result
    raise self._exception
PIL.UnidentifiedImageError: cannot identify image file '/home/user/project/wsq/image_1.wsq'
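A plausible explanation, offered as an assumption rather than a confirmed diagnosis: Pillow only learns the WSQ format when `import wsq` registers its plugin, and that registration happens in the parent but apparently not in every loky worker process, so Image.open there sees an unknown format. Importing wsq inside the worker function forces the registration in each worker:

from joblib import Parallel, delayed
from PIL import Image
import multiprocessing
import numpy as np

def process_image(i):
    import wsq  # register the WSQ plugin with Pillow in this worker process
    path = "/home/user/project/wsq/image_" + str(i) + ".wsq"
    img = np.array(Image.open(path))
    # some preprocessing, saving as npz
    output_path = "/home/user/project/npz/image_" + str(i) + ".npz"
    np.savez_compressed(output_path, img)

num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(delayed(process_image)(i) for i in range(100000))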

Python3 multiprocessing shared dictionary consumed by all processes

I'm a beginner with multiprocessing; I would like to use it to run code in parallel on streaming data.
To get started, I wrote the code below and got an error.
Could you please tell me the correct way to print to the screen?
Code:
import sys
from multiprocessing import Process, Manager
import time

def producer(dic, name):
    for i in range(10000):
        dic["A"] = i
        time.sleep(2)

def consumer(dic, name):
    for i in range(10000):
        aval = dic.get("A")
        # print(f" {name} - Val = {aval}")
        sys.stdout.write(f" {name} - Val = {aval}")
        sys.stdout.flush()
        time.sleep(2.2)

if __name__ == '__main__':
    manager = Manager()
    dic = manager.dict()
    Process(target=producer, args=(dic, "TT")).start()
    time.sleep(1)
    Process(target=consumer, args=(dic, "Con1")).start()
    Process(target=consumer, args=(dic, "Con2")).start()
When I run this in the Windows console, I get the error below. How can I get the consumer's output to print in the console? Thanks.
(base) PS D:\> python .\mulpro.py
Process Process-3:
Process Process-4:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 811, in _callmethod
    conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "D:\mulpro.py", line 19, in consumer
    aval = dic.get("A")
  File "<string>", line 2, in get
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 815, in _callmethod
    self._connect()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 802, in _connect
    conn = self._Client(self._token.address, authkey=self._authkey)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 490, in Client
    c = PipeClient(address)
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 691, in PipeClient
    _winapi.WaitNamedPipe(address, 1000)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 811, in _callmethod
    conn = self._tls.connection
FileNotFoundError: [WinError 2] The system cannot find the file specified
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "D:\mulpro.py", line 19, in consumer
    aval = dic.get("A")
  File "<string>", line 2, in get
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 815, in _callmethod
    self._connect()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 802, in _connect
    conn = self._Client(self._token.address, authkey=self._authkey)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 490, in Client
    c = PipeClient(address)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 691, in PipeClient
    _winapi.WaitNamedPipe(address, 1000)
FileNotFoundError: [WinError 2] The system cannot find the file specified
Process Process-2:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "D:\mulpro.py", line 13, in producer
    dic["A"] = i
  File "<string>", line 2, in __setitem__
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 818, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 280, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed
The reason might be that the main process doesn't wait for the two child processes, so restructuring your code like this could work:
def run():
    manager = Manager()
    dic = manager.dict()
    Process(target=producer, args=(dic, "TT")).start()
    time.sleep(1)
    Process(target=consumer, args=(dic, "Con1")).start()
    Process(target=consumer, args=(dic, "Con2")).start()
    while True:
        pass

if __name__ == '__main__':
    run()
However, it is strange that if I append an endless loop at the end of main instead of wrapping the code in another function, it still raises that exception. Anyway, the code above could help.
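A note on why the original likely fails, offered as my reading rather than a confirmed diagnosis: the Manager runs a server in its own child process, and when the main process falls off the end of the module its exit handlers shut that server down before the non-daemon children finish, so their proxy connections break with the pipe errors shown above. Joining the children keeps the Manager alive without the busy `while True: pass` loop, which burns a full CPU core. A sketch under that assumption, reusing the producer and consumer from the question:

from multiprocessing import Process, Manager
import time

def run():
    manager = Manager()
    dic = manager.dict()
    procs = [Process(target=producer, args=(dic, "TT"))]
    procs[0].start()
    time.sleep(1)
    for name in ("Con1", "Con2"):
        p = Process(target=consumer, args=(dic, name))
        p.start()
        procs.append(p)
    # Joining keeps the main process, and therefore the Manager's
    # server process, alive until every child has finished.
    for p in procs:
        p.join()

if __name__ == '__main__':
    run()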

SyntaxError raised when using joblib with lxml on Python 3.5

I am trying to parallelize the task of correcting text across many documents with Python, so I naturally found joblib. I want each task to correct a single document. Here is the structure of the code:
if __name__ == '__main__':
    lexicon = build_compact_lexicon()
    from joblib import Parallel, delayed
    import multiprocessing
    num_cores = multiprocessing.cpu_count()
    results = Parallel(n_jobs=num_cores)(delayed(find_errors)('GDL', i, 1, lexicon) for i in range(1798, 1820))
I am using the function find_errors, summarized here:
def find_errors(newspaper, year, month, lexicon):
    # parse the input newspaper text data using the etree parser from lxml
    # detect errors in the text
    return found_errors_type1, found_errors_type2, found_errors_type3
This raises a few errors:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/parallel.py", line 130, in __call__
    return self.func(*args, **kwargs)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/parallel.py", line 72, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/parallel.py", line 72, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "hellowordParallel.py", line 85, in find_errors
    tree = etree.parse(xml_file_path)
  File "src/lxml/lxml.etree.pyx", line 3427, in lxml.etree.parse (src/lxml/lxml.etree.c:79801)
  File "src/lxml/parser.pxi", line 1805, in lxml.etree._parseDocument (src/lxml/lxml.etree.c:116293)
TypeError: cannot parse from 'NoneType'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/tokenize.py", line 392, in find_cookie
    line_string = line.decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb0 in position 24: invalid start byte

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/parallel.py", line 139, in __call__
    tb_offset=1)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/format_stack.py", line 373, in format_exc
    frames = format_records(records)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/format_stack.py", line 274, in format_records
    for token in generate_tokens(linereader):
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/tokenize.py", line 514, in _tokenize
    line = readline()
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/format_stack.py", line 265, in linereader
    line = getline(file, lnum[0])
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/linecache.py", line 16, in getline
    lines = getlines(filename, module_globals)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/linecache.py", line 47, in getlines
    return updatecache(filename, module_globals)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/linecache.py", line 136, in updatecache
    with tokenize.open(fullname) as fp:
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/tokenize.py", line 456, in open
    encoding, lines = detect_encoding(buffer.readline)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/tokenize.py", line 433, in detect_encoding
    encoding = find_cookie(first)
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/tokenize.py", line 397, in find_cookie
    raise SyntaxError(msg)
  File "<string>", line None
SyntaxError: invalid or missing encoding declaration for '/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/lxml/etree.so'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "hellowordParallel.py", line 160, in <module>
    results = Parallel(n_jobs=num_cores)(delayed(find_errors)('GDL', i, 1, lexicon) for i in range(1798, 1820))
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/parallel.py", line 810, in __call__
    self.retrieve()
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/joblib/parallel.py", line 727, in retrieve
    self._output.extend(job.get())
  File "/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/multiprocessing/pool.py", line 608, in get
    raise self._value
SyntaxError: invalid or missing encoding declaration for '/home/mbl/anaconda3/envs/OCR_Correction/lib/python3.5/site-packages/lxml/etree.so'
I don't understand whether this is due to something config-related or whether my function doesn't fit a parallel implementation... (I guess it should...)
Has this happened to any of you before?
I hope my question is clear and there is enough information for someone to give me some help!
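Reading the innermost traceback, the real failure is TypeError: cannot parse from 'NoneType': xml_file_path is None inside find_errors, and the SyntaxError is only joblib's old traceback formatter choking on lxml's compiled etree.so while reporting that original error. A hedged sketch of a guard, where build_xml_path is a hypothetical stand-in for however the question's code constructs the path:

import os
from lxml import etree

def find_errors(newspaper, year, month, lexicon):
    # build_xml_path is hypothetical: it stands in for the path lookup
    # that is evidently returning None inside the workers.
    xml_file_path = build_xml_path(newspaper, year, month)
    if xml_file_path is None or not os.path.exists(xml_file_path):
        raise FileNotFoundError(
            "no XML file for {} {}-{:02d}".format(newspaper, year, month))
    tree = etree.parse(xml_file_path)
    # ... detect errors in the text, as in the question's stub ...
    return found_errors_type1, found_errors_type2, found_errors_type3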
