Slow multiprocessing when parent object contains large data - python-3.x

Consider the following snippet:
import numpy as np
import multiprocessing as mp
import time

def work_standalone(args):
    return 2

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))
        # leave a trace whenever init is called
        with open('rnd-%d' % np.random.randint(100), 'a') as f:
            f.write('init called\n')

    def work_internal(self, args):
        return 2

    def _run(self, target):
        with mp.Pool() as pool:
            tasks = [[idx] for idx in range(16)]
            result = pool.imap(target, tasks)
            for res in result:
                pass

    def run_internal(self):
        self._run(self.work_internal)

    def run_standalone(self):
        self._run(work_standalone)

if __name__ == '__main__':
    t1 = time.time()
    Worker().run_standalone()
    t2 = time.time()
    print(f'Standalone took {t2 - t1:.3f} seconds')
    t3 = time.time()
    Worker().run_internal()
    t4 = time.time()
    print(f'Internal took {t3 - t4:.3f} seconds')
That is, we have an object containing a large member variable, and it uses multiprocessing to parallelize some work that has nothing to do with that variable: the workers neither read from nor write to it. Yet where the worker function is defined has a huge impact on the runtime:
Standalone took 0.616 seconds
Internal took 19.917 seconds
Why is this happening? I am completely lost. Note that __init__ is only called twice, so the random data is not created for every new process in the pool. The only reason I can think of for the slowdown is that the data is being copied around, but that would not make sense since it is never used anywhere, and Python is supposed to use copy-on-write semantics. Also note that the difference disappears if you make work_internal a static method.
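For reference, the static-method variant mentioned in the last sentence could look like the sketch below (my illustration, not code from the question): a staticmethod carries no reference to self, so handing it to the pool does not drag self.data along.

import numpy as np

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))

    @staticmethod
    def work_internal(args):
        # no self here, so pickling this callable does not pickle the instance
        return 2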

The issue is the target you pass to the pool: it is a bound method, so it carries a reference to the Worker instance.
Now, you're right that __init__() is only called twice. But remember that whenever anything is sent to or from the worker processes, Python needs to pickle it first.
So, because your target is self.work_internal, Python has to pickle the Worker instance every time imap dispatches a task. This leads to the problem you see: self.data being copied over again and again.
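A quick way to see the size of what gets pickled (my own check, run in the same module as the Worker class above, not part of the original answer): the instance serializes to roughly 10000 x 10000 x 8 bytes, about 800 MB, and the pool has to ship that for every task because the target is a bound method.

import pickle

w = Worker()
print(f"pickled instance size: {len(pickle.dumps(w)) / 1e6:.0f} MB")  # roughly 800 MB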
The following demonstrates this. I just added one input() statement and fixed the final time calculation.
import numpy as np
import multiprocessing as mp
import time

def work_standalone(args):
    return 2

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))
        # leave a trace whenever init is called
        with open('rnd-%d' % np.random.randint(100), 'a') as f:
            f.write('init called\n')

    def work_internal(self, args):
        return 2

    def _run(self, target):
        with mp.Pool() as pool:
            tasks = [[idx] for idx in range(16)]
            result = pool.imap(target, tasks)
            input("Wait for analysis")
            for res in result:
                pass

    def run_internal(self):
        self._run(self.work_internal)
        # self._run(work_standalone)

    def run_standalone(self):
        self._run(work_standalone)

def work_internal(target):
    with mp.Pool() as pool:
        tasks = [[idx] for idx in range(16)]
        result = pool.imap(target, tasks)
        for res in result:
            pass

if __name__ == '__main__':
    t1 = time.time()
    Worker().run_standalone()
    t2 = time.time()
    print(f'Standalone took {t2 - t1:.3f} seconds')
    t3 = time.time()
    Worker().run_internal()
    t4 = time.time()
    print(f'Internal took {t4 - t3:.3f} seconds')
Run the code, and when the "Wait for analysis" prompt shows up, go and check the memory usage in a process monitor.
The second time you see the prompt, press Enter and watch the memory usage climb and then drop again.
On the other hand, if you change self._run(self.work_internal) to self._run(work_standalone), you will notice that it is very fast, memory usage does not grow, and the time taken is a lot shorter than with self.work_internal.
Solution
One way to solve your issue is to make data a class attribute instead of an instance attribute. In normal cases this prevents each instance from having to copy/re-create the variable, and here it also prevents the issue from occurring, because a class attribute is not part of the pickled instance state.
class Worker:
    data = np.random.random(size=(10000, 10000))

    def __init__(self):
        pass
    ...
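Another option, if you want to keep data as an instance attribute, is to exclude it from pickling. The following is a sketch of my own, not from the original answer; the workers then simply receive a Worker without data, which is fine here because they never touch it:

import numpy as np

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))

    def __getstate__(self):
        # drop the large array before the instance is pickled for the workers
        state = self.__dict__.copy()
        state.pop('data', None)
        return state

    def work_internal(self, args):
        return 2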

Related

Python multiprocessing script partial output

I am following the principles laid down in this post to safely output the results, which will eventually be written to a file. Unfortunately, the code only prints 1 and 2, and not 3 to 6.
import os
import argparse
import pandas as pd
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist):
    for par in parlist:
        queue.put(par)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        try:
            par = queueIn.get(block=False)
            res = doCalculation(par)
            queueOut.put((res))
            queueIn.task_done()
        except:
            break

def doCalculation(par):
    return par

def write(queue):
    while True:
        try:
            par = queue.get(block=False)
            print("response:", par)
        except:
            break

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
On running the code it prints,
$ python3 tst.py
Queue size 6
response: 1
response: 2
Also, is it possible to ensure that the write function always outputs 1,2,3,4,5,6 i.e. in the same order in which the data is fed into the feed queue?
The error is with the task_done() call: multiprocessing.Queue has no task_done() method (only JoinableQueue does), so the call raises AttributeError, which your bare except swallows, and each worker exits after its first item. If you remove that call, the loop instead runs until queueIn.get(block=False) throws an exception because the queue is empty. That might be just enough for your use case, but a better way is to use sentinels (as suggested in the multiprocessing docs, see the last example there). Here's a small rewrite so your program uses sentinels:
import os
import argparse
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist, nthreads):
    for par in parlist:
        queue.put(par)
    for i in range(nthreads):
        queue.put(None)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        par = queueIn.get()
        if par is None:
            break
        res = doCalculation(par)
        queueOut.put((res))

def doCalculation(par):
    return par

def write(queue):
    while not queue.empty():
        par = queue.get()
        print("response:", par)

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod, nthreads))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
A few things to note:
The sentinel is a None put into the queue; you need one sentinel for every worker process.
For the write function you don't need sentinel handling, since there is only one consumer and no concurrency to worry about. (If you used the empty()-then-get() pattern in your calc function instead, you could run into a problem: with only one item left in the queue, both workers might see empty() return False at the same time, both call get(), and one of them blocks forever.)
You don't need to put feed and write into processes at all; just call them from your main block, since you don't want to run them in parallel anyway (see the sketch below).
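For illustration, that last point might look like this (a sketch of mine reusing the feed, calc, write and doCalculation functions defined above, not from the original answer):

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feed(workerQueue, considerperiod, nthreads)  # runs in the main process
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    write(writerQueue)  # runs in the main process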
how can I have the same order in output as in input? [...] I guess multiprocessing.map can do this
Yes, map keeps the order. Here is your program rewritten into something simpler (you don't need the workerQueue and writerQueue), with random sleeps added to show that the output is still in order:
from multiprocessing import Pool
import time
import random

def calc(val):
    time.sleep(random.random())
    return val

if __name__ == "__main__":
    considerperiod = [1, 2, 3, 4, 5, 6]
    with Pool(processes=2) as pool:
        print(pool.map(calc, considerperiod))

Processing huge CSV file using Python and multithreading

I have a function that yields lines from a huge CSV file lazily:
def get_next_line():
    with open(sample_csv, 'r') as f:
        for line in f:
            yield line

def do_long_operation(row):
    print('Do some operation that takes a long time')
I need to use threads so that for each record I get from the above function I can call do_long_operation.
Most places on the Internet have examples like this, and I am not sure whether I am on the right path.
import threading

thread_list = []
for i in range(8):
    t = threading.Thread(target=do_long_operation, args=(get_next_row from get_next_line))
    thread_list.append(t)

for thread in thread_list:
    thread.start()

for thread in thread_list:
    thread.join()
My questions are:
How do I start only a finite number of threads, say 8?
How do I make sure that each of the threads will get a row from get_next_line?
You could use a thread pool from multiprocessing and map your tasks to a pool of workers:
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
from time import sleep

def process_line(l):
    print(l, "started")
    sleep(randint(0, 3))
    print(l, "done")

def get_next_line():
    with open("sample.csv", 'r') as f:
        for line in f:
            yield line

f = get_next_line()
t = Pool(processes=8)

for i in f:
    t.map(process_line, (i,))
t.close()
t.join()
This will create eight workers and submit your lines to them, one by one. As soon as a worker is "free", it will be allocated a new task.
There is a commented-out import statement, too. If you import Pool from multiprocessing instead of ThreadPool, you will get subprocesses instead of threads, which may be more efficient in your case.
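One caveat (my own note, not from the answer above): map() blocks until its tasks finish, so calling it with a single line per iteration processes at most one line at a time. A sketch under that assumption, handing the generator straight to imap with a chunksize so all workers stay busy (it reuses process_line and get_next_line as defined above):

from multiprocessing.pool import ThreadPool as Pool

if __name__ == "__main__":
    with Pool(processes=8) as pool:
        # imap pulls lines lazily from the generator and hands them out in chunks
        for _ in pool.imap(process_line, get_next_line(), chunksize=100):
            pass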
Using a Pool/ThreadPool from multiprocessing to map tasks to a pool of workers and a Queue to control how many tasks are held in memory (so we don't read too far ahead into the huge CSV file if worker processes are slow):
from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
import time, os
from multiprocessing import Queue

def process_line(l):
    print("{} started".format(l))
    time.sleep(randint(0, 3))
    print("{} done".format(l))

def get_next_line():
    with open(sample_csv, 'r') as f:
        for line in f:
            yield line

# use for testing
# def get_next_line():
#     for i in range(100):
#         print('yielding {}'.format(i))
#         yield i

def worker_main(queue):
    print("{} working".format(os.getpid()))
    while True:
        # Get item from queue, block until one is available
        item = queue.get(True)
        if item is None:
            # Shutdown this worker and requeue the item so other workers can shutdown as well
            queue.put(None)
            break
        else:
            # Process item
            process_line(item)
    print("{} done working".format(os.getpid()))

f = get_next_line()

# Use a multiprocessing queue with maxsize
q = Queue(maxsize=5)

# Start workers to process queue items
t = Pool(processes=8, initializer=worker_main, initargs=(q,))

# Enqueue items. This blocks if the queue is full.
for l in f:
    q.put(l)

# Enqueue the shutdown message (i.e. None)
q.put(None)

# We need to first close the pool before joining
t.close()
t.join()
Hannu's answer is not the best method.
I ran that code on a CSV file with 100 million rows, and it took forever to complete.
However, prior to reading his answer, I had written the following code:
import csv
from multiprocessing import Pool
import time
import datetime

def process_row(row):
    row_to_be_printed = str(row) + str("hola!")
    print(row_to_be_printed)

def call_processing_rows_pickably(row):
    process_row(row)

class process_csv():
    def __init__(self, file_name):
        self.file_name = file_name

    def get_row_count(self):
        with open(self.file_name) as f:
            for i, l in enumerate(f):
                pass
        self.row_count = i

    def select_chunk_size(self):
        if self.row_count > 10000000:
            self.chunk_size = 100000
            return
        if self.row_count > 5000000:
            self.chunk_size = 50000
            return
        self.chunk_size = 10000
        return

    def process_rows(self):
        list_de_rows = []
        count = 0
        # text mode (not 'rb') so csv.reader works under Python 3
        with open(self.file_name, newline='') as file:
            reader = csv.reader(file)
            for row in reader:
                count += 1
                print(count)
                list_de_rows.append(row)
                if len(list_de_rows) == self.chunk_size:
                    # p is the module-level Pool created below
                    p.map(call_processing_rows_pickably, list_de_rows)
                    del list_de_rows[:]

    def start_process(self):
        self.get_row_count()
        self.select_chunk_size()
        self.process_rows()

initial = datetime.datetime.now()
p = Pool(4)
ob = process_csv("100M_primes.csv")
ob.start_process()
final = datetime.datetime.now()
print(final - initial)
This took 22 minutes. Obviously, there is still room for improvement; for comparison, fread in R takes at most 10 minutes for this task.
The difference is that I first build a chunk of 100k rows and then pass it to a function which is mapped over a pool (here, 4 workers).

python apply_async does not call method

I have a method which needs to process a large database; it would take hours or days to dig through.
The arguments are stored in a (long) list, of which at most X should be processed in one batch. The method does not need to return anything, yet I return True for "fun"...
The function works perfectly when I iterate through it linearly (generating/appending the results in other tables not shown here), yet I am unable to get apply_async or map_async to work (it worked before in other projects).
Any hint about what I might be doing wrong would be appreciated, thanks in advance!
See code below:
import multiprocessing as mp

class mainClass:
    # loads of stuff
    ...

def main():
    multiprocess = True
    batchSize = 35
    mC = mainClass()
    while True:
        # the tasks are stored in a dictionary; I refer to them by their keys,
        # which I turn into a list here for iteration
        toCheck = [key for key, value in mC.lCheckSet.items()]
        if multiprocess == False:
            # this version works perfectly fine
            for i in toCheck[:batchSize]:
                mC.check(i)
        else:
            # the async version does not, either with apply_async...
            with mp.Pool(processes=8) as pool:
                temp = [pool.apply_async(mC.check, args=(toCheck[n],)) for n in range(len(toCheck[:batchSize]))]
                results = [t.get() for t in temp]
            # ...or as map_async
            pool = mp.Pool(processes=8)
            temp = pool.map_async(mC.check, toCheck[:batchSize])
            pool.close()
            pool.join()

if __name__ == "__main__":
    main()
The "smell" here is that you are instantiating your maincClass on the main Process, just once, and then trying to call a method on it on the different processes - but note that when you pass mC.check to your process pool, it is a method already bound to the class instantiated in this process.
I'd guess there is where your problem lies. Although that could possibly work - and it does - I made this simplified version and it works as intended :
import multiprocessing as mp
import random, time

class MainClass:
    def __init__(self):
        self.value = 1

    def check(self, arg):
        time.sleep(random.uniform(0.01, 0.3))
        print(id(self), self.value, arg)

def main():
    mc = MainClass()
    with mp.Pool(processes=4) as pool:
        temp = [pool.apply_async(mc.check, (i,)) for i in range(8)]
        results = [t.get() for t in temp]

main()
(Have you tried just adding some prints to make sure the method is not running at all?)
So, the problem likely lies in some complex state in your MainClass that does not survive the trip to the parallel processes. A possible work-around is to instantiate your mainClass inside each process - that is easy to do, since multiprocessing lets you get the current process object and use it as a namespace to keep per-process data across different calls to apply_async.
So, create a new check function like the one below - and instead of instantiating your mainClass in the main process, instantiate it inside each process in the pool:
import multiprocessing as mp
import random, time

def check(arg):
    process = mp.current_process()
    if not hasattr(process, "main_class"):
        # first call in this worker process: create the per-process instance
        process.main_class = MainClass()
    process.main_class.check(arg)

class MainClass:
    def __init__(self):
        self.value = random.randrange(100)

    def check(self, arg):
        time.sleep(random.uniform(0.01, 0.3))
        print(id(self), self.value, arg)

def main():
    # mc = MainClass()  # no longer needed in the parent process
    with mp.Pool(processes=2) as pool:
        temp = [pool.apply_async(check, (i,)) for i in range(8)]
        results = [t.get() for t in temp]

main()
I got to this question with the same problem - my apply_async calls seemingly never being executed - but the reason in my case was that the number of parameters in the apply_async call was different from the number in the function declaration.
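A small illustration of that failure mode (my own example, not from the answer above): when the argument count is wrong, the TypeError is raised inside the worker and only surfaces when you call .get() on the AsyncResult, so without .get() it looks as if the call simply never ran.

import multiprocessing as mp

def check(a, b):
    return a + b

if __name__ == "__main__":
    with mp.Pool(processes=2) as pool:
        res = pool.apply_async(check, (1,))  # wrong number of arguments
        # nothing visible happens here...
        # res.get()  # ...but uncommenting this re-raises the TypeError in the parent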

Python 3: Multiprocessing consumes excessive RAM and slows down

I start multiple processes in order to create a list of new objects. htop shows me between 1 and 4 processes (I always create 3 new objects).
def foo(self):
    with multiprocessing.Pool(processes=3, maxtasksperchild=10) as pool:
        result = pool.map_async(self.new_obj, self.information)
        self.new_objs = result.get()
        pool.terminate()
    gc.collect()
I call foo() multiple times; each time it is called, the whole program runs slower, and in the end it does not even finish because it slows down too much. The program starts to eat up all my RAM, while the sequential approach does not have any significant RAM usage.
When I kill the program, this is most often the place where it was last executing:
->File "threading.py", line 293, in wait
waiter.acquire()
Edit
To give some information about my circumstances: I create a tree made of nodes. foo() is called by a parent node in order to create its child nodes. The results returned by the processes are these child nodes, which are saved in a list at the parent node. I want to parallelize the creation of those child nodes instead of creating them sequentially.
I think your issue has mainly to do with the fact that your parallelised function is a method of the object. It's hard to be certain without more information, but consider this little toy program:
import multiprocessing as mp
import numpy as np
import gc

class Object(object):
    def __init__(self, _):
        self.data = np.empty((100, 100, 100), dtype=np.float64)

class Container(object):
    def __new__(cls):
        self = object.__new__(cls)
        print("Born")
        return self

    def __init__(self):
        self.objects = []

    def foo(self):
        with mp.Pool(processes=3, maxtasksperchild=10) as pool:
            result = pool.map_async(self.new_obj, range(50))
            self.objects.extend(result.get())
            pool.terminate()
        gc.collect()

    def new_obj(self, i):
        return Object(i)

    def __del__(self):
        print("Dead")

if __name__ == '__main__':
    c = Container()
    for j in range(5):
        c.foo()
Now, Container is instantiated only once, so you'd expect a single "Born" followed by a single "Dead" to be printed; but since the code being executed by the processes is a method of the container, the whole container has to be rebuilt elsewhere! Running this, you will see a stream of intermingled "Born" and "Dead" lines as your container is rebuilt on every execution of map:
Born
Born
Born
Born
Born
Dead
Born
Dead
Dead
Born
Dead
Born
...
<MANY MORE LINES HERE>
...
Born
Dead
To convince yourself that the entire container is being copied and sent around every time, try to set some non-serialisable value:
def foo(self):
    with mp.Pool(processes=3, maxtasksperchild=10) as pool:
        result = pool.map_async(self.new_obj, range(50))
        self.fn = lambda x: x**2
        self.objects.extend(result.get())
        pool.terminate()
    gc.collect()
Which will immediately raise an AttributeError as it cannot serialise the container.
Let's sum up: when sending 1000 requests to the pool, Container will be serialised, sent to the processes, and deserialised there 1000 times. Sure, the copies will eventually be dropped (assuming there's not too much weird cross-referencing going on), but that still puts a lot of pressure on the RAM, as the object is serialised, called, updated, reserialised... for every element in your mapped inputs.
How can you solve that? Well, ideally, do not share state:
def new_obj(_):
    return Object(_)

class Container(object):
    def __new__(cls):
        self = object.__new__(cls)
        print("Born")
        return self

    def __init__(self):
        self.objects = []

    def foo(self):
        with mp.Pool(processes=3, maxtasksperchild=10) as pool:
            result = pool.map_async(new_obj, range(50))
            self.objects.extend(result.get())
            pool.terminate()
        gc.collect()

    def __del__(self):
        print("Dead")
This completes in a fraction of the time and only produces the tiniest blip in RAM usage (as only a single Container is ever built). If you need some of the internal state to be passed along, extract it and send just that:
def new_obj(tup):
    very_important_state, parameters = tup
    return Object(very_important_state=very_important_state,
                  parameters=parameters)

class Container(object):
    def __new__(cls):
        self = object.__new__(cls)
        print("Born")
        return self

    def __init__(self):
        self.objects = []

    def foo(self):
        important_state = len(self.objects)
        with mp.Pool(processes=3, maxtasksperchild=10) as pool:
            result = pool.map_async(new_obj,
                                    ((important_state, i) for i in range(50)))
            self.objects.extend(result.get())
            pool.terminate()
        gc.collect()

    def __del__(self):
        print("Dead")
This has the same behaviour as before. If you absolutely cannot avoid sharing some mutable state between the processes, check out the multiprocessing tools for doing that without having to copy everything everywhere every time.
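For example, a large read-only array can be placed in shared memory once and viewed through numpy inside each worker. The sketch below is my own illustration of that idea (init_worker and work are made-up names), not part of the original answer:

import multiprocessing as mp
import numpy as np

def init_worker(buf, shape):
    # each worker wraps the shared buffer in a numpy view; nothing is copied per task
    global data
    data = np.frombuffer(buf, dtype=np.float64).reshape(shape)

def work(i):
    # read-only use of the shared array
    return float(data[i % data.shape[0]].sum())

if __name__ == '__main__':
    shape = (100, 100)
    buf = mp.RawArray('d', shape[0] * shape[1])  # shared memory, no lock
    np.frombuffer(buf, dtype=np.float64)[:] = np.random.random(shape[0] * shape[1])
    with mp.Pool(processes=3, initializer=init_worker, initargs=(buf, shape)) as pool:
        print(pool.map(work, range(10)))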

Threaded result not giving same result as un-threaded result (python)

I have created a program to generate data points of functions that I later plot. The program takes a class which defines the function, and creates a data-outputting object which, when called, writes the data to a text file. To make the whole process faster I put the jobs in threads; however, when I do, the generated data is not always correct. I have attached a picture to show what I mean.
Here are some of the relevant bits of code:
from queue import Queue
import threading
import time

queueLock = threading.Lock()
workQueue = Queue(10)

def process_data(threadName, q, queue_window, done):
    while not done.get():
        queueLock.acquire()  # check whether or not the queue is locked
        if not workQueue.empty():
            data = q.get()
            # data is the Plot object to be run
            queueLock.release()
            data.parent_window = queue_window
            data.process()
        else:
            queueLock.release()
            time.sleep(1)

class WorkThread(threading.Thread):
    def __init__(self, threadID, q, done):
        threading.Thread.__init__(self)
        self.ID = threadID
        self.q = q
        self.done = done

    def get_qw(self, queue_window):
        # gets the queue_window object
        self.queue_window = queue_window

    def run(self):
        # this is called when thread.start() is called
        print("Thread {0} started.".format(self.ID))
        process_data(self.ID, self.q, self.queue_window, self.done)
        print("Thread {0} finished.".format(self.ID))

class Application(Frame):
    def __init__(self, etc):
        self.threads = []
        # does some things

    def makeThreads(self):
        for i in range(1, int(self.threadNum.get()) + 1):
            thread = WorkThread(i, workQueue, self.calcsDone)
            self.threads.append(thread)

    # more code which just processes the function etc, sorts out the gui stuff.
And in a separate class (I'm using tkinter, so the code that actually starts the threads is called from a different window; self.parent is the Application instance):
def run_jobs(self):
    if self.running == False:
        # threads are only initiated when jobs are to be run
        self.running = True
        self.parent.calcsDone.set(False)
        self.parent.threads = []  # make sure it is initially empty, we want new threads each time
        self.parent.makeThreads()
        self.threads = self.parent.threads
        for thread in self.threads:
            thread.get_qw(self)
            thread.start()
        # put the jobs in the workQueue
        queueLock.acquire()
        for job in self.job_queue:
            workQueue.put(job)
        queueLock.release()
    else:
        messagebox.showerror("Error", "Jobs already running")
This is all the code which relates to the threads.
I don't know why, when I run the program with multiple threads, some data points are incorrect, whilst running it with just a single thread produces perfect data. I tried looking up thread-safe approaches, but couldn't find anything.
Thanks in advance!
