NCCL/Gloo: freeze with cyclic isend and recv - pytorch

I've written a very simple algorithm in two versions: one on the Gloo backend and one on the NCCL backend. Both versions freeze. Could you please explain to me why these programs hang?
I assume the cause is that I send data in a circle:
process #1 sends data to process #2
process #2 sends data to process #3
process #3 sends data to process #1
But I don’t understand why it can lead to a freeze.
Here's the NCCL version of the program:
import os
import torch as th
import torch.distributed as dist
import torch.multiprocessing as mp

def run(rank: int, value: float, src: int, dst: int):
    tensor = th.FloatTensor([value,]).to(f"cuda:{rank}")
    print(f"[rk={rank}] tensor before send-recv: {tensor}")
    req = dist.isend(tensor=tensor, dst=dst)
    print(f"[rk={rank}] after isend")
    dist.recv(tensor=tensor, src=src)
    print(f"[rk={rank}] after recv")
    req.wait()
    print(f"[rk={rank}] after wait")
    print(f"[rk={rank}] tensor after send-recv: {tensor}")

def init_process(rank: int):
    dist.init_process_group(
        "nccl",
        rank=rank,
        world_size=3,
        init_method="file:///home/user/store"
    )
    if rank == 0:
        run(rank=rank, value=float(rank), src=1, dst=1)
    elif rank == 1:
        run(rank, value=float(rank), src=2, dst=2)
    elif rank == 2:
        run(rank, value=float(rank), src=0, dst=0)
    else:
        raise Exception()

if __name__ == "__main__":
    mp.set_start_method("spawn")
    processes = []
    for rank in [0, 1, 2]:
        p = mp.Process(target=init_process, args=(rank,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
And here's a Gloo version (without CUDA):
import os
import torch as th
import torch.distributed as dist
import torch.multiprocessing as mp

def run(rank: int, value: float, src: int, dst: int):
    tensor = th.FloatTensor([value,])
    print(f"[rk={rank}] tensor before send-recv: {tensor}")
    req = dist.isend(tensor=tensor, dst=dst)
    print(f"[rk={rank}] after isend")
    dist.recv(tensor=tensor, src=src)
    print(f"[rk={rank}] after recv")
    req.wait()
    print(f"[rk={rank}] after wait")
    print(f"[rk={rank}] tensor after send-recv: {tensor}")

def init_process(rank: int):
    dist.init_process_group(
        "gloo",
        rank=rank,
        world_size=3,
        init_method="file:///home/user/store"
    )
    if rank == 0:
        run(rank=rank, value=float(rank), src=1, dst=1)
    elif rank == 1:
        run(rank, value=float(rank), src=2, dst=2)
    elif rank == 2:
        run(rank, value=float(rank), src=0, dst=0)
    else:
        raise Exception()

if __name__ == "__main__":
    mp.set_start_method("spawn")
    processes = []
    for rank in [0, 1, 2]:
        p = mp.Process(target=init_process, args=(rank,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
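For reference, this is roughly how I would expect a matched ring exchange to look (each rank sends to (rank + 1) % world_size and receives from (rank - 1) % world_size into a separate buffer, and both operations are posted before either is waited on). This is only my assumption, I have not verified that it avoids the freeze:

import torch as th
import torch.distributed as dist

def ring_exchange(rank: int, world_size: int, value: float):
    # Separate send/recv buffers; with the NCCL backend these would have to be
    # CUDA tensors (e.g. .to(f"cuda:{rank}")), for Gloo CPU tensors are fine.
    send_buf = th.FloatTensor([value,])
    recv_buf = th.zeros(1)
    # Post both operations before waiting on either, with matched peers.
    send_req = dist.isend(tensor=send_buf, dst=(rank + 1) % world_size)
    recv_req = dist.irecv(tensor=recv_buf, src=(rank - 1) % world_size)
    send_req.wait()
    recv_req.wait()
    return recv_buf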
I would be glad even if you can help with just one of the programs.
Thank you for your attention!

Related

Script multiprocessing doesn't finish all tasks, and I also get 100% CPU?

I need to ask whether part of my script is correct. It works, I think, but I believe something is really wrong, because the CPU still hits 100% and it often doesn't finish all the tasks; after 50/100 tasks it seems frozen.
Any idea how to fix it? Or maybe just tell me where the error is?
Thank you
P.S. I have included all the modules that the script requires, and only the part that should be of interest for multiprocessing, just the first part of the script.
Many thanks
from __future__ import print_function
import sys
import os
import easygui
import pyautogui as py
import datetime
import pwinput
import json
from collections import Counter
import random
import string
import threading
import subprocess
import multiprocessing
import queue
from multiprocessing import cpu_count
from multiprocessing import Value, Lock, Process, Queue, current_process
import numpy as np
import grequests
import requests
from requests.exceptions import ConnectionError
from requests.exceptions import HTTPError
import time
from time import sleep

number_of_processes = cpu_count()

class Counter(object):
    def __init__(self, initval=0):
        self.val = Value('i', initval)
        self.lock = Lock()

    def increment(self):
        with self.lock:
            self.val.value += 1

    def value(self):
        with self.lock:
            return self.val.value

def updateTitle(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username):
    while True:
        hits = int(counterhits.value())
        done = int(counterdone.value())
        shtot = int(countersl.value())
        maitot = int(countml.value())
        remain_scan = number_of_task - hits
        elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
        ctypes.windll.kernel32.SetConsoleTitleW(f'Site Valid For: {number_of_task} | Started: {hits} | Complete: {done} | Remain: {remain_scan} | SL Found: {shtot} | ML Found: {maitot} | Threads: {number_of_processes} | Time elapsed: {elapsed} ! Licensed at: {username}')
        sleep(0.3)

def worker_main(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml):
    while True:
        try:
            site = tasks_to_do.get_nowait()
            if site is None:
                break
        except Queue.Empty:
            break
        except Queue.Full:
            sleep(0.5)
            continue
        counterhits.increment()
        do_work(site, counterhits, counterdone, countersl, countml)
        tasks_finished.put(site + current_process().name)
        counterdone.increment()
    return True

def main():
    global username
    number_of_task = int(len(filter_data))
    counterhits = Counter(0)
    counterdone = Counter(0)
    countersl = Counter(0)
    countml = Counter(0)
    tasks_to_do = Queue()
    tasks_finished = Queue()
    processes1 = []
    prefix = ['http://']
    # creating processes
    for w in range(number_of_processes):
        p1 = Process(target=worker_main, args=(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml))
        processes1.append(p1)
        p1.start()
    procs = [Process(target=updateTitle, args=(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username), daemon=True) for i in range(1)]
    for p in procs: p.start()
    for site_il in filter_data:
        site_or = site_il.rstrip("\n")
        if site_or.startswith("http://"):
            site_or = site_or.replace("http://", "")
        elif site_or.startswith("https://"):
            site_or = site_or.replace("https://", "")
        site_or = site_or.rstrip()
        site_or = site_or.split('/')[0]
        if 'www.' in site_or:
            site_or = site_or.replace("www.", "")
        sitexx = [sub + site_or for sub in prefix]
        for site in sitexx:
            tasks_to_do.put(site)
    # completing process
    for p1 in processes1:
        p1.join()
    for p in procs: p.join()
    # print the output
    while not tasks_finished.empty():
        print(tasks_finished.get())
    os.system('pause>nul')
    return True

if __name__ == '__main__':
    if sys.platform.startswith('win'):
        # On Windows calling this function is necessary.
        multiprocessing.freeze_support()
    main()
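For reference, a minimal sketch of the usual pattern that avoids a busy loop in the workers (placeholder names, not the original script): block on Queue.get() and stop when a None sentinel arrives, queueing one sentinel per worker:

from multiprocessing import Process, Queue, cpu_count

def worker(tasks_to_do, tasks_finished):
    while True:
        site = tasks_to_do.get()      # blocks, so the worker does not spin at 100% CPU
        if site is None:              # sentinel: no more work
            break
        tasks_finished.put(site)      # placeholder for the real do_work(...)

if __name__ == '__main__':
    tasks_to_do, tasks_finished = Queue(), Queue()
    workers = [Process(target=worker, args=(tasks_to_do, tasks_finished))
               for _ in range(cpu_count())]
    for w in workers:
        w.start()
    for site in ["http://example.com", "http://example.org"]:
        tasks_to_do.put(site)
    for _ in workers:                 # one sentinel per worker
        tasks_to_do.put(None)
    for w in workers:
        w.join()
    while not tasks_finished.empty():
        print(tasks_finished.get())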

Producer Consumer message sharing not working in multiprocessing

I am trying to run a scenario where I have a producer that captures frames from a webcam and puts them in a queue,
and a consumer that reads images from the input queue, does some processing, and puts the output image in an outgoing queue.
The issue is that the consumer's read from the queue is not blocking. Ideally it should be. Also, when it reads a value from the queue, the size is always a constant 128, which seems wrong: I am sure the size of the image I am putting in the queue is far greater.
from __future__ import print_function
import multiprocessing
import time
import logging
import sys
import cv2

class Consumer(multiprocessing.Process):
    def __init__(self, incoming_q, outgoing_q):
        multiprocessing.Process.__init__(self)
        self.outgoing_q = outgoing_q
        self.incoming_q = incoming_q

    def run(self):
        proc_name = self.name
        print(f"{proc_name} - inside process_feed..starting")
        while True:
            #print(f"size of incoming_q=>{self.incoming_q.qsize()}")
            try:
                #print(f"{proc_name} - size of B incoming_q=>{self.incoming_q.qsize()}")
                image_np = self.incoming_q.get(True)
                size_of_img = sys.getsizeof(image_np)
                #print(f"{proc_name} - size of A incoming_q=>{self.incoming_q.qsize()}")
                if size_of_img > 128:
                    print(f"{proc_name} - size image=>{size_of_img}")
                    time.sleep(1)
                    self.outgoing_q.put_nowait(image_np)
            except:
                pass
        print("inside process_feed..ending")

class Producer(multiprocessing.Process):
    def __init__(self, incoming_q, outgoing_q):
        multiprocessing.Process.__init__(self)
        self.incoming_q = incoming_q
        self.outgoing_q = outgoing_q

    def run(self):
        proc_name = self.name
        print("inside capture_feed")
        stream = cv2.VideoCapture(0)
        try:
            counter = 0
            while True:
                counter += 1
                if counter == 1:
                    if not self.incoming_q.full():
                        (grabbed, image_np) = stream.read()
                        size_of_img = sys.getsizeof(image_np)
                        print(f"{proc_name}........B.......=>{self.incoming_q.qsize()}")
                        print(f"{proc_name} - size image=>{size_of_img}")
                        self.incoming_q.put(image_np)
                        print(f"{proc_name}........A.......=>{self.incoming_q.qsize()}")
                        counter = 0
                try:
                    image_np = self.outgoing_q.get_nowait()
                    logging.info("reading value for o/p")
                    cv2.imshow('object detection', image_np)
                except:
                    pass
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
        finally:
            stream.release()
            cv2.destroyAllWindows()
            print("inside capture_feed..ending")

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    stream = cv2.VideoCapture(0)
    incoming_q = multiprocessing.Queue(maxsize=100)
    outgoing_q = multiprocessing.Queue(maxsize=100)
    logging.info("before start of thread")
    max_process = 1
    processes = []
    processes.append(Producer(incoming_q, outgoing_q))
    for i in range(max_process):
        p = Consumer(incoming_q, outgoing_q)
        p.daemon = True
        processes.append(p)
    logging.info("inside main thread..middle")
    for p in processes:
        p.start()
    logging.info("inside main thread..ending")
    logging.info("waiting in main thread too....")
    logging.info("waiting in main thread finished....")
    for p in processes:
        p.join()
    logging.info("inside main thread..ended")
I was able to figure out the issue with my approach. I had missed the whole concept of pickling (serialization).
I changed my code to serialize the NumPy array before writing it to the queue and deserialize it after reading. The code started working as expected.
Also, printing 128 as the size of the NumPy array is fine; I was misinterpreting that number.
import pickle

import numpy as np

def serialize_ndarray(arr: np.ndarray):
    serialized = pickle.dumps(arr)
    return serialized

def deserialize_ndarray(string):
    data = pickle.loads(string)
    return data
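A minimal usage sketch of the same idea (the zeroed array below is just a stand-in for a webcam frame):

import pickle
from multiprocessing import Queue

import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a captured frame

q = Queue()
q.put(pickle.dumps(frame))            # producer side: serialize before put
restored = pickle.loads(q.get())      # consumer side: deserialize after get

assert np.array_equal(frame, restored)
print(restored.shape, restored.nbytes)  # nbytes reports the size of the data buffer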

Python Not printing expected output to screen from given thread

Could someone explain to me why my code doesn't print the expected output [Thread1 36] from the thread? I am currently using Python 3.7.0 on macOS Catalina 10.15.2.
Here is my code:
import timeit, _thread

def expmod1(a, n, m):
    return (a**n) % m

# avg_time is a higher order function
# argument is the different variations of the expmod(135, 202, 53) function
def avg_time(thread_name, expmod_function):
    print("Start")
    result = expmod_function(135, 202, 53)
    print(thread_name + " " + str(result), flush=True)
    return result

# analysis of all three functions based on average timecost using a constant function as defined by avg_time
def analysis1():
    try:
        _thread.start_new_thread(avg_time, ("Thread1", expmod1))
    except:
        print("Unable to start thread")

def main():
    analysis1()

if __name__ == "__main__":
    main()
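For comparison, a minimal sketch (not the original code) that starts the worker with threading.Thread and joins it, so the program does not exit before the thread has printed its result:

import threading

def expmod1(a, n, m):
    return (a ** n) % m

def avg_time(thread_name, expmod_function):
    print("Start")
    result = expmod_function(135, 202, 53)
    print(thread_name + " " + str(result), flush=True)
    return result

if __name__ == "__main__":
    t = threading.Thread(target=avg_time, args=("Thread1", expmod1))
    t.start()
    t.join()  # wait for the thread to finish before the program exits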

multiprocessing TypeError: can't pickle _thread.lock objects

When I run it on Python 3, I get the error: TypeError: can't pickle _thread.lock objects
example:
import time
import multiprocessing
from multiprocessing import Process, freeze_support, Manager, Queue, Pool

q = Queue()
pool = Pool(3)

class A(object):
    def __init__(self, varam):
        self.varam = varam

class class_one(object):
    def __init__(self):
        self.var = {}
        self.list = []
        self.get()

    def get_result(self, varam):
        q.put(A(varam))

    def get(self):
        for i in [6, 7, 8, 9]:
            self.list.append(pool.apply_async(self.get_result, (i, )))
            # p = Process(target=self.get_result, args=(i, ))
            # process_list.append(p)
            # p.start()
        pool.close()
        pool.join()

if __name__ == '__main__':
    freeze_support()
    process_num = 2
    process_list = []
    g = class_one()
    for i in g.list:
        print(i.get())
    # for j in process_list:
    #     j.join()
    # print(q.qsize())
    # if q.empty():
    #     print("error")
    # while (not q.empty()):
    #     print(q.get().varam)
    #     g.var[q.get().varam] = q.get()
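For reference, a common restructuring (a sketch of the usual pattern, not necessarily the only fix) uses a plain module-level function with the pool and collects results from the AsyncResult objects instead of pushing instances through a global Queue:

from multiprocessing import Pool

class A(object):
    def __init__(self, varam):
        self.varam = varam

def make_a(varam):
    # Runs in a worker process; the return value is pickled back to the parent.
    return A(varam)

if __name__ == '__main__':
    with Pool(3) as pool:
        results = [pool.apply_async(make_a, (i,)) for i in [6, 7, 8, 9]]
        for r in results:
            print(r.get().varam)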

How do I reflect changes made by a function to a global variable in the main function?

from num2words import num2words
import re
from googletrans import Translator
import time, os, glob
import concurrent.futures

start_time = time.time()
translator = Translator()
src_dir = "/home/lol/patrika1"
dest_file = "/home/lol/df.txt"
counter = 1

def n2w(match):
    return translator.translate(num2words(int(match.group(1))), dest='hi').text

def clean_file(file_name):
    global counter
    fil = open(file_name, 'r')
    lines = fil.read()
    fil.close()
    # more logic
    result = re.sub(r'[\n]+', '\n', result2)
    counter += 1
    print(counter)
    print(file_name)
    cleaned.write(result)
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == '__main__':
    global cleaned
    os.chdir(src_dir)
    file_list = glob.glob("*.txt")
    cleaned = open(dest_file, 'a')
    with concurrent.futures.ProcessPoolExecutor() as executor:
        executor.map(clean_file, file_list[:10])
    print("finish " + str(counter))
    cleaned.close()
counter has a value of 1 in the main function when I print it.
How do I maintain a count of how many files have been processed by the function?
As global variable use is usually not advised (what does it count? who modifies it? and what about the case where two scripts being merged use the same "counter" for two different things?), you could use this type of construct:
class FileCleaner:
    Counter = 0

    @classmethod
    def clean(cls, file_name):
        ...
        cls.Counter += 1
        ...
and then access FileCleaner.Counter from anywhere and call the function with FileCleaner.clean( ... ).
Maybe try cleaning up the code before using the ProcessPoolExecutor model, as it does not make for easy-to-read code (hopefully this will be helped soon by subinterpreters). You would need to split the list of files, call the executor in a try/except, and add +1 to the class counter on success, all of that from a file-cleaning pool class, not from main.
import concurrent.futures
import threading
import math

PRIMES = [
    112272535095293,
    112582705942171,
    112272535095293,
    115280095190773,
    115797848077099,
    1099726899285419]

class PrimePoolTester:
    Counter = 0

    @classmethod
    def is_prime(cls, n):
        if n % 2 == 0:
            return False
        sqrt_n = int(math.floor(math.sqrt(n)))
        for i in range(3, sqrt_n + 1, 2):
            if n % i == 0:
                return False
        return True

    @classmethod
    def execute(cls, primes):
        with concurrent.futures.ProcessPoolExecutor() as executor:
            for number, prime in zip(primes, executor.map(cls.is_prime, primes)):
                cls.Counter += 1
                print('(%s)-%d : %d is prime: %s' % (threading.current_thread().name, cls.Counter, number, prime))

class Runner_interpreter:
    def __init__(self, thread_count, worker):
        self.thr = []
        for _ in range(thread_count):
            t = threading.Thread(target=worker)
            t.daemon = True
            t.start()
            self.thr.append(t)

    def join(self):
        for th in self.thr:
            th.join()

if __name__ == '__main__':
    def job():
        global worklist
        PrimePoolTester.execute(worklist.pop(0))

    worklist = [PRIMES] * 4
    # use 4 "cores"
    Runner_interpreter(4, job).join()
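Applied back to the file-cleaning case, a minimal sketch (hypothetical, assuming clean_file simply returns on success) that counts completed files in the parent process by consuming the executor.map results:

import concurrent.futures
import glob

def clean_file(file_name):
    # ... cleaning logic as in the original script ...
    return file_name

if __name__ == '__main__':
    file_list = glob.glob("*.txt")
    counter = 0
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for _ in executor.map(clean_file, file_list[:10]):
            counter += 1  # incremented in the parent, so the value is visible after the pool finishes
    print("finish " + str(counter))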
