python simpy memory usage with large numbers of objects/processes - python-3.x

I am using simpy to create a DES with a very large number of objects (many millions). I am running into memory issues and have been trying to figure out how to address them. It is possible to work out which objects will not undergo any more interactions with other processes, so I can delete these objects from the simulation, in theory freeing up memory. I created the test below to check this.
import psutil as ps
import simpy
import random


class MemoryUse(object):
    """a class used to output memory usage at various times within the sim"""
    def __init__(self, env, input_dict):
        self.env = env
        self.input_dict = input_dict
        self.env.process(self.before())
        self.env.process(self.during())
        self.env.process(self.after_sr())
        self.env.process(self.after())

    def before(self):
        yield self.env.timeout(0)
        print("full object list and memory events at time: ", self.env.now, " ", ps.virtual_memory())
        print(len(self.input_dict), len(self.env._queue))

    def during(self):
        yield self.env.timeout(2)
        print("full object list and events at time: ", self.env.now, " ", ps.virtual_memory())
        print(len(self.input_dict), len(self.env._queue))

    def after_sr(self):
        yield self.env.timeout(4)
        print("reduced object list and reduced events at time: ", self.env.now, " ", ps.virtual_memory())
        print(len(self.input_dict), len(self.env._queue))

    def after(self):
        yield self.env.timeout(6)
        print("no objects and no events at time: ", self.env.now, " ", ps.virtual_memory())
        print(len(self.input_dict), len(self.env._queue))
class ExObj(object):
    """a generic object"""
    def __init__(self, env, id, input_dict):
        self.env = env
        self.id = id
        self.input_dict = input_dict
        if random.randint(0, 100) < 70:
            # set as SR
            self.timeout = 2
        else:
            self.timeout = 4

    def action(self):
        yield self.env.timeout(self.timeout)
        del self.input_dict[self.id]
class StartObj(object):
    """this enables me to create the obj events after the sim has started, so as to measure memory
    usage before the events associated with the objects exist"""
    def __init__(self, env, input_dict):
        self.env = env
        self.input_dict = input_dict
        self.env.process(self.start_obj())

    def start_obj(self):
        yield self.env.timeout(1)
        for k, v in self.input_dict.items():
            self.env.process(v.action())
            yield self.env.timeout(0)
# memory usage before we do anything
print("before all: ", ps.virtual_memory())

# create simpy env
env = simpy.Environment()
obj_dict = {}

# create memory calculation events
memory = MemoryUse(env, obj_dict)

# create objects
for i in range(2500000):
    obj_dict[i] = ExObj(env, i, obj_dict)

# create process that will itself start events associated with the objects
start = StartObj(env, obj_dict)

# run
env.run()

# clear the dict if not already clear
for j in range(2500000):
    obj_dict.clear()

# final memory check
print("after all: ", ps.virtual_memory())
print(len(obj_dict))
I was expecting memory usage to drop by time 4, as many objects have been removed and their processes completed (around 70%). However, memory usage appears to stay the same (see below). Why is this? What is using this memory? Do completed processes stay in the simulation?
before all: svmem(total=42195423232, available=39684155392, percent=6.0, used=2246373376, free=38884859904, active=2390749184, inactive=441712640, buffers=263155712, cached=801034240, shared=28721152)
full object list and memory events at time: 0 svmem(total=42195423232, available=38834251776, percent=8.0, used=3096276992, free=38035181568, active=3241959424, inactive=441466880, buffers=263159808, cached=800804864, shared=28721152)
2500000 4
full object list and events at time: 2 svmem(total=42195423232, available=35121584128, percent=16.8, used=6808891392, free=34322219008, active=6947561472, inactive=441761792, buffers=263163904, cached=801148928, shared=28774400)
2500000 2500002
reduced object list and reduced events at time: 4 svmem(total=42195423232, available=35120973824, percent=16.8, used=6809530368, free=34321600512, active=6948368384, inactive=441737216, buffers=263168000, cached=801124352, shared=28745728)
767416 767417
no objects and no events at time: 6 svmem(total=42195423232, available=38448134144, percent=8.9, used=3482365952, free=37648760832, active=3627053056, inactive=441733120, buffers=263172096, cached=801124352, shared=28745728)
0 0
after all: svmem(total=42195423232, available=38825793536, percent=8.0, used=3104706560, free=38026420224, active=3250180096, inactive=441733120, buffers=263172096, cached=801124352, shared=28745728)
0
Process finished with exit code 0
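A likely part of the explanation is that freeing Python objects does not necessarily shrink the numbers reported by ps.virtual_memory(): CPython keeps freed memory in its internal allocator pools (and the C library may hold onto heap pages as well), so system-wide usage can stay flat even after the objects are collected. One way to check whether the objects themselves are gone is to compare the interpreter's view with the per-process resident set size. A minimal sketch of such a check, using only gc and the psutil already imported above (memory_snapshot is a hypothetical helper, not part of the original test):

import gc
import psutil as ps

def memory_snapshot(label):
    """Force a full collection, then report live object count and this process's RSS."""
    gc.collect()
    rss_mb = ps.Process().memory_info().rss // (1024 * 1024)
    print(label, "tracked objects:", len(gc.get_objects()), "rss (MB):", rss_mb)

Calling this from the MemoryUse processes at times 2, 4, and 6 would show whether the object count drops even while RSS stays put.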

Related

Multiprocessing to return large data sets in Python

I have 2 functions in a Python 3.7 script that search 2 separate network nodes and return very large data sets of strings in a list. The smaller data set has ~300K entries, while the larger one has ~1.5M. This script takes almost an hour to execute because of how it has to compile the data sets, and because the second data set is significantly larger. I don't have a way to shorten the run time by changing how the compilation happens; there's no easier way for me to get the data from the network nodes. But I can cut almost 10 minutes if I can run them simultaneously, so I'm trying to shorten the run time by using multiprocessing so I can run both of them at once.
I do not need them to necessarily start within the same second or finish at the same second, just want them to run at the same time.
Here's a breakdown of my first attempt at coding for multiprocessing:
def p_func(arg1, arg2, pval):
    ## Do Stuff
    return pval

def s_func(arg1, sval):
    ## Do Stuff
    return sval

# Creating variables to get return values that multiprocessing can handle
pval = multiprocessing.Value(list)
sval = multiprocessing.Value(list)

# setting up multiprocessing Processes for each function and passing arguments
p1 = multiprocessing.Process(target=p_func, args=(arg1, arg2, pval))
s1 = multiprocessing.Process(target=s_func, args=(arg3, sval))

p1.start()
s1.start()
p1.join()
s1.join()

print("Number of values in pval: ", len(pval))
print("Number of values in sval: ", len(sval))
I believe I have solved my list concerns, so....
Based on comments I've updated my code as follows:
#! python3
import multiprocessing as mp

def p_func(arg1, arg2, pval):
    # takes arg1 and arg2 and queries network node to return list of ~300K
    # values and assigns that list to pval for return to main()
    return pval

def s_func(arg1, sval):
    # takes arg1 and queries network node to return list of ~1.5M
    # values and assigns that list to sval for return to main()
    return sval

# Creating variables to get return values that multiprocessing can handle in
# main()
with mp.Manager() as mgr:
    pval = mgr.list()
    sval = mgr.list()

    # setting up multiprocessing Processes for each function and passing
    # arguments
    p1 = mp.Process(target=p_func, args=(arg1, arg2, pval))
    s1 = mp.Process(target=s_func, args=(arg3, sval))

    p1.start()
    s1.start()
    p1.join()
    s1.join()

# out of with block
print("Number of values in pval: ", len(pval))
print("Number of values in sval: ", len(sval))
Now I'm getting a TypeError: can't pickle _thread.lock objects on the p1.start() invocation. I'm guessing that one of the variables I passed in the p1 declaration is causing a problem with multiprocessing, but I'm not sure how to read the error or resolve the problem.
Use a Manager.list() instead:
import multiprocessing as mp

def p_func(pval):
    pval.extend(list(range(300000)))

def s_func(sval):
    sval.extend(list(range(1500000)))

if __name__ == '__main__':
    # Creating variables to get return values that mp can handle
    with mp.Manager() as mgr:
        pval = mgr.list()
        sval = mgr.list()

        # setting up mp Processes for each function and passing arguments
        p1 = mp.Process(target=p_func, args=(pval,))
        s2 = mp.Process(target=s_func, args=(sval,))

        p1.start()
        s2.start()
        p1.join()
        s2.join()

        print("Number of values in pval: ", len(pval))
        print("Number of values in sval: ", len(sval))
Output:
Number of values in pval: 300000
Number of values in sval: 1500000
Manager objects are slower than shared memory but more flexible. If you know an upper limit for your arrays, you could instead use a fixed-size shared memory Array plus a shared value indicating the used size, such as:
#!python3
import multiprocessing as mp

def p_func(parr, psize):
    for i in range(10):
        parr[i] = i
    psize.value = 10

def s_func(sarr, ssize):
    for i in range(5):
        sarr[i] = i
    ssize.value = 5

if __name__ == '__main__':
    # Creating variables to get return values that mp can handle
    parr = mp.Array('i', 2 << 20)  # 2M
    sarr = mp.Array('i', 2 << 20)
    psize = mp.Value('i', 0)
    ssize = mp.Value('i', 0)

    # setting up mp Processes for each function and passing arguments
    p1 = mp.Process(target=p_func, args=(parr, psize))
    s2 = mp.Process(target=s_func, args=(sarr, ssize))

    p1.start()
    s2.start()
    p1.join()
    s2.join()

    print("parr: ", parr[:psize.value])
    print("sarr: ", sarr[:ssize.value])
Output:
parr: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
sarr: [0, 1, 2, 3, 4]
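If the worker functions simply build and return ordinary lists, concurrent.futures.ProcessPoolExecutor is another option worth knowing about: return values are pickled back to the parent automatically, so neither a Manager nor a shared Array is needed. A minimal sketch of that pattern (the worker bodies are stand-ins for the real network queries):

import concurrent.futures

def p_func():
    # stand-in for the ~300K-entry network query
    return list(range(300000))

def s_func():
    # stand-in for the ~1.5M-entry network query
    return list(range(1500000))

if __name__ == '__main__':
    # each submit() runs in its own worker process; result() blocks until done
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as ex:
        p_fut = ex.submit(p_func)
        s_fut = ex.submit(s_func)
        pval, sval = p_fut.result(), s_fut.result()
    print("Number of values in pval: ", len(pval))
    print("Number of values in sval: ", len(sval))

Pickling ~1.5M strings back to the parent has a cost, but for a one-shot transfer it is usually acceptable.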

How best to parallelize grakn queries with Python?

I run Windows 10, Python 3.7, and have a 6-core CPU. A single Python thread on my machine submits 1,000 inserts per second to grakn. I'd like to parallelize my code to insert and match even faster. How are people doing this?
My only experience with parallelization is on another project, where I submit a custom function to a dask distributed client to generate thousands of tasks. Right now, this same approach fails whenever the custom function receives or generates a grakn transaction object/handle. I get errors like:
Traceback (most recent call last):
File "C:\Users\dvyd\.conda\envs\activefiction\lib\site-packages\distributed\protocol\pickle.py", line 41, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
...
File "stringsource", line 2, in grpc._cython.cygrpc.Channel.__reduce_cython__
TypeError: no default __reduce__ due to non-trivial __cinit__
I've never used Python's multiprocessing module directly. What are other people doing to parallelize their queries to grakn?
The easiest approach that I've found to execute a batch of queries is to pass a Grakn session to each thread in a ThreadPool. Within each thread you can manage transactions and of course do some more complex logic:
from grakn.client import GraknClient
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial

def write_query_batch(session, batch):
    tx = session.transaction().write()
    for query in batch:
        tx.query(query)
    tx.commit()

def multi_thread_write_query_batches(session, query_batches, num_threads=8):
    pool = ThreadPool(num_threads)
    pool.map(partial(write_query_batch, session), query_batches)
    pool.close()
    pool.join()

def generate_query_batches(my_data_entries_list, batch_size):
    batch = []
    for index, data_entry in enumerate(my_data_entries_list):
        batch.append(data_entry)
        if index % batch_size == 0 and index != 0:
            yield batch
            batch = []
    if batch:
        yield batch

# (Part 2) Somewhere in your application open a client and a session
client = GraknClient(uri="localhost:48555")
session = client.session(keyspace="grakn")

query_batches_iterator = generate_query_batches(my_data_entries_list, batch_size)
multi_thread_write_query_batches(session, query_batches_iterator, num_threads=8)

session.close()
client.close()
The above is a generic method. As a concrete example, you can use it (omitting part 2) to parallelise batches of insert statements from two files. Appending this to the code above should work:
import time  # needed for the timing below

files = [
    {
        "file_path": "/path/to/your/file.gql",
    },
    {
        "file_path": "/path/to/your/file2.gql",
    }
]

KEYSPACE = "grakn"
URI = "localhost:48555"
BATCH_SIZE = 10
NUM_BATCHES = 1000

# Entry point where migration starts
def migrate_graql_files():
    start_time = time.time()
    for file in files:
        print('==================================================')
        print(f'Loading from {file["file_path"]}')
        print('==================================================')
        open_file = open(file["file_path"], "r")  # Here we are assuming you have 1 Graql query per line!
        batches = generate_query_batches(open_file.readlines(), BATCH_SIZE)
        with GraknClient(uri=URI) as client:  # Using `with` auto-closes the client
            with client.session(KEYSPACE) as session:  # Using `with` auto-closes the session
                multi_thread_write_query_batches(session, batches, num_threads=16)  # Pick `num_threads` according to your machine
        elapsed = time.time() - start_time
        print(f'Time elapsed {elapsed:.1f} seconds')
    elapsed = time.time() - start_time
    print(f'Time elapsed {elapsed:.1f} seconds')

if __name__ == "__main__":
    migrate_graql_files()
You should also be able to see how you can load from a CSV or any other file type in this way, taking the values you find in that file and substituting them into Graql query string templates. Take a look at the migration example in the docs for more on that.
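For instance, a minimal sketch of that substitution, assuming a CSV with name and age columns (the column names and the query template are illustrative, not taken from the docs):

import csv

# hypothetical template; adjust to match your schema
TEMPLATE = 'insert $p isa person, has name "{name}", has age {age};'

def csv_to_queries(csv_path):
    # turn each CSV row into one Graql insert statement
    with open(csv_path) as f:
        for row in csv.DictReader(f):
            yield TEMPLATE.format(name=row["name"], age=row["age"])

queries = list(csv_to_queries("/path/to/people.csv"))
batches = generate_query_batches(queries, BATCH_SIZE)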
An alternative approach using multi-processing instead of multi-threading follows below.
We empirically found that multi-threading doesn't yield particularly large performance gains, compared to multi-processing. This is probably due to Python's GIL.
This piece of code assumes a file enumerating TypeQL queries that are independent of each other, so they can be parallelised freely.
from typedb.client import TypeDB, TypeDBClient, SessionType, TransactionType
import multiprocessing as mp
import queue

def batch_writer(database, kill_event, batch_queue):
    client = TypeDB.core_client("localhost:1729")
    session = client.session(database, SessionType.DATA)
    while not kill_event.is_set():
        try:
            batch = batch_queue.get(block=True, timeout=1)
            with session.transaction(TransactionType.WRITE) as tx:
                for query in batch:
                    tx.query().insert(query)
                tx.commit()
        except queue.Empty:
            continue
    print("Received kill event, exiting worker.")

def start_writers(database, kill_event, batch_queue, parallelism=4):
    processes = []
    for _ in range(parallelism):
        proc = mp.Process(target=batch_writer, args=(database, kill_event, batch_queue))
        processes.append(proc)
        proc.start()
    return processes

def batch(iterable, n=1000):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

if __name__ == '__main__':
    batch_size = 100
    parallelism = 1
    database = "<database name>"
    file_path = "<PATH TO QUERIES FILE - ONE QUERY PER NEW LINE>"

    with open(file_path, "r") as file:
        statements = file.read().splitlines()[:]

    batch_statements = batch(statements, n=batch_size)
    total_batches = int(len(statements) / batch_size)
    if len(statements) % batch_size > 0:
        total_batches += 1

    batch_queue = mp.Queue(parallelism * 4)
    kill_event = mp.Event()
    writers = start_writers(database, kill_event, batch_queue, parallelism=parallelism)

    for i, stmt_batch in enumerate(batch_statements):
        batch_queue.put(stmt_batch, block=True)
        if i * batch_size % 10000 == 0:
            print("Loaded: {0}/{1}".format(i * batch_size, total_batches * batch_size))

    kill_event.set()
    batch_queue.close()
    batch_queue.join_thread()
    for proc in writers:
        proc.join()
    print("Done loading")

Simpy: Block resource beyond its servicing a user

Consider a carwash as described in the Simpy example list. Now assume that each time a car is serviced, the corresponding washing unit has to be serviced itself, before it can clean the next car. Meanwhile, the serviced car can leave.
How would I best model the above? My current solution is to have "ghost cars" with a high priority that block the carwash during the regeneration time. I don't find this solution very elegant, and guess there is a better way.
In the below example, which represents a poor copy of the above-mentioned tutorial, the serviced car cannot leave the pump during the regeneration period. How could I fix that to mimic the intended behavior? I guess the solution is straightforward; I just don't see it.
import random
import simpy

RANDOM_SEED = 42
NUM_MACHINES = 2  # Number of machines in the carwash
WASHTIME = 5      # Minutes it takes to clean a car
REGENTIME = 3
T_INTER = 7       # Create a car every ~7 minutes
SIM_TIME = 20     # Simulation time in minutes

class Carwash(object):
    def __init__(self, env, num_machines, washtime):
        self.env = env
        self.machine = simpy.Resource(env, num_machines)
        self.washtime = washtime

    def wash(self, car):
        yield self.env.timeout(WASHTIME)
        print("Carwash removed %s's dirt at %.2f." % (car, env.now))

    def regenerateUnit(self):
        yield self.env.timeout(REGENTIME)
        print("Carwash's pump regenerated for next user at %.2f." % (env.now))

def car(env, name, cw):
    print('%s arrives at the carwash at %.2f.' % (name, env.now))
    with cw.machine.request() as request:
        yield request
        print('%s enters the carwash at %.2f.' % (name, env.now))
        yield env.process(cw.wash(name))
        yield env.process(cw.regenerateUnit())
        print('%s leaves the carwash at %.2f.' % (name, env.now))

def setup(env, num_machines, washtime, t_inter):
    # Create the carwash
    carwash = Carwash(env, num_machines, washtime)

    # Create 4 initial cars
    for i in range(4):
        env.process(car(env, 'Car %d' % i, carwash))

    # Create more cars while the simulation is running
    while True:
        yield env.timeout(random.randint(t_inter - 2, t_inter + 2))
        i += 1
        env.process(car(env, 'Car %d' % i, carwash))

# Setup and start the simulation
random.seed(RANDOM_SEED)  # This helps reproducing the results

# Create an environment and start the setup process
env = simpy.Environment()
env.process(setup(env, NUM_MACHINES, WASHTIME, T_INTER))

# Execute!
env.run(until=SIM_TIME)
Thanks a lot in advance.
What you want is to model an entity that uses your resource with a very high priority so that normal entities meanwhile cannot use it. So your "ghost car" is actually not such a bad idea.
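If you would rather avoid a separate ghost-car entity altogether, one small restructuring achieves the same effect: let the car announce its departure as soon as washing finishes, and only then run the regeneration while the car process is still holding the machine. The car is conceptually gone, but the unit stays blocked. A minimal sketch of the reordered car process (everything else as in the listing above):

def car(env, name, cw):
    print('%s arrives at the carwash at %.2f.' % (name, env.now))
    with cw.machine.request() as request:
        yield request
        print('%s enters the carwash at %.2f.' % (name, env.now))
        yield env.process(cw.wash(name))
        # the car leaves as soon as it is clean...
        print('%s leaves the carwash at %.2f.' % (name, env.now))
        # ...but the request is held until the unit has regenerated,
        # so no other car can enter during that period
        yield env.process(cw.regenerateUnit())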

google datastore put_multi didn't insert data

I am trying to insert 6000 rows/entities into Google Cloud Datastore. I am also using the Datastore emulator as the local server.
In the code, I created an insert function that inserts entities in batches using put_multi and set the batch size to 50. I use python multiprocessing to spawn processes that execute the function.
A slice function is also used to divide the workload based on how many CPU cores are used. e.g. if there are 3 cores, the workload (6000 entities) is divided into 3 parts with 2000 entities each, then each part is inserted by a spawned process that executes the insert function.
After insertion is done, I checked with Cloud Datastore Admin console, but couldn't find the kinds that have been inserted.
I am wondering what is the issue here and how to solve it.
The code snippet is as follows:
# cores_to_use is how many cpu cores are available for dividing the workload
cores_to_use = 3

# a datastore client is passed in as the argument
inserter = FastInsertGCDatastore(client)

# entities is a list of datastore entities to be inserted
# the number of entities is 6000 here
input_size = len(entities)
slice_size = int(input_size / cores_to_use)

entity_blocks = []
iterator = iter(entities)
for i in range(cores_to_use):
    entity_blocks.append([])
    for j in range(slice_size):
        entity_blocks[i].append(iterator.__next__())

for block in entity_blocks:
    p = multiprocessing.Process(target=inserter.execute, args=(block,))
    p.start()
class FastInsertGCDatastore:
    """
    batch insert entities into gc datastore based on batch_size and number_of_entities
    """

    def __init__(self, client):
        """
        initialize with datastore client
        :param client: the datastore client
        """
        self.client = client

    def execute(self, entities):
        """
        batch insert entities
        :param entities: a list of datastore entities that need to be inserted
        """
        number_of_entities = len(entities)
        batch_size = 50
        batch_documents = [0] * batch_size
        rowct = 0  # entity count as index for accessing rows
        for index in range(number_of_entities):
            try:
                batch_documents[index % batch_size] = entities[rowct]
                rowct += 1
                if (index + 1) % batch_size == 0:
                    self.client.put_multi(batch_documents)
                    index += 1
            except Exception as e:
                print('Unexpected error for index ', index, ' message reads', str(e))
                raise e
        # insert any remaining entities
        if not index % batch_size == 0:
            self.client.put_multi(batch_documents[:index % batch_size])

python3 multiprocessing.Process approach fails

I saw somewhere a hint on how to process a large dataset (say lines of text) faster with the multiprocessing module, something like:
... (form batch_set = nump batches [= lists of lines to process];
     batch_set is a list of lists of strings (batches))

nump = len(batch_set)
output = mp.Queue()
processes = [mp.Process(target=proc_lines, args=(i, output, batch_set[i]))
             for i in range(nump)]
for p in processes:
    p.start()
for p in processes:
    p.join()
results = sorted([output.get() for p in processes])

... (do something with the processed outputs, e.g. print them in order,
     given that each proc_lines function returns a couple (i, out_batch))
However, when I run the code with a small number of lines/batch it works fine (ex: './code.py -x 4:10' for nump=4 and numb=10 lines/batch), while after a certain number of lines/batch it hangs (ex: './code.py -x 4:4000'), and when I interrupt it I see a traceback hint about a _wait_for_tstate_lock and the system threading library. It seems that the code does not reach the last code line shown above...
I provide the code below, in case somebody needs it to answer why this is happening and how to fix it.
#!/usr/bin/env python3

import sys
import multiprocessing as mp

def fabl(numb, nump):
    '''
    Form And Batch Lines: form nump[roc] groups of numb[atch] indexed lines
    '<idx> my line here' with indexes from 1 to (nump x numb).
    '''
    ret = []
    idx = 1
    for _ in range(nump):
        cb = []
        for _ in range(numb):
            cb.append('%07d my line here' % idx)
            idx += 1
        ret.append(cb)
    return ret

def proc_lines(i, output, rows_in):
    ret = []
    for row in rows_in:
        row = row[0:8] + 'some other stuff\n'  # replacement for the post-idx part
        ret.append(row)
    output.put((i, ret))
    return

def mp_proc(batch_set):
    'given the batch, disperse it to the number of processes and ret the results'
    nump = len(batch_set)
    output = mp.Queue()
    processes = [mp.Process(target=proc_lines, args=(i, output, batch_set[i]))
                 for i in range(nump)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print('waiting for procs to complete...')
    results = sorted([output.get() for p in processes])
    return results

def write_set(proc_batch_set, fout):
    'write p[rocessed] batch_set'
    for _, out_batch in proc_batch_set:
        for row in out_batch:
            fout.write(row)
    return

def main():
    args = sys.argv
    if len(args) < 2:
        print('''
    run with args: -x [ NumProc:BatchSize ]
    ( ex: '-x' | '-x 4:10' (default values) | '-x 4:4000' (hangs...) )
        ''')
        sys.exit(0)

    numb = 10  # suppose we need this number of lines/batch : BatchSize
    nump = 4   # number of processes to use : NumProcs
    if len(args) > 2 and ':' in args[2]:  # use another np:bs
        nump, numb = map(int, args[2].split(':'))

    batch_set = fabl(numb, nump)  # proc-batch made in here: nump (groups) x numb (lines)
    proc_batch_set = mp_proc(batch_set)
    with open('out-min', 'wt') as fout:
        write_set(proc_batch_set, fout)
    return

if __name__ == '__main__':
    main()
The Queue has a certain capacity and can fill up if you do not empty it while the Processes are running. This does not block the execution of your processes, but you won't be able to join a Process if its put did not complete.
So I would just modify the mp_proc function such that:
def mp_proc(batch_set):
    'given the batch, disperse it to the number of processes and ret the results'
    n_process = len(batch_set)
    output = mp.Queue()
    processes = [mp.Process(target=proc_lines, args=(i, output, batch_set[i]))
                 for i in range(n_process)]
    for p in processes:
        p.start()

    # Empty the queue while the processes are running so there is no
    # issue with incomplete `put` operations.
    results = sorted([output.get() for p in processes])

    # Join the processes to make sure everything finished correctly
    for p in processes:
        p.join()
    return results
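Since proc_lines already produces one (i, out_batch) couple per batch, a multiprocessing.Pool sidesteps the queue bookkeeping entirely; a minimal sketch of that alternative (returning results instead of putting them on a Queue):

import multiprocessing as mp

def proc_lines_ret(args):
    # same transformation as proc_lines, but returns instead of using a Queue
    i, rows_in = args
    return (i, [row[0:8] + 'some other stuff\n' for row in rows_in])

def mp_proc_pool(batch_set):
    with mp.Pool(len(batch_set)) as pool:
        # map() pickles each result back to the parent, already in input order
        return pool.map(proc_lines_ret, enumerate(batch_set))

Because pool.map preserves input order, the sorted() call is no longer needed.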
