multi-threading - How to control print statements from function during thread calls in python - python-3.x

I am using ThreadPoolExecutor for parallel execution of a function which prints statements and executes SQL. I would like to manage the print statements from the function, e.g.:
def Func(host, sql):
    print('Executing for %s ' % host)
    result = Execute(host, sql)  # connecting to DB
    print(result)

def main():
    sql = 'show databases;'
    hostList = ['abc.com', 'def.com', 'ghi.com', 'jkl.com']
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = [executor.submit(Func, host, sql) for host in hostList]
For the 4 items in hostList it runs Func in parallel, but prints the results like below:
Executing for abc.com
Executing for def.com
Executing for ghi.com
Executing for jkl.com
then
SQL output 1
SQL output 2
SQL output 3
SQL output 4
What I would like the function to print instead is:
Executing for abc.com
SQL output 1
Executing for def.com
SQL output 2
Executing for ghi.com
SQL output 3
Executing for jkl.com
SQL output 4

If you just want to group your print statements together, without reflecting the pause required to execute, then you can do the following. Note that if the ONLY thing you are doing is a single print statement, you likely don't need the lock.
import concurrent.futures
import threading
import random
import time

def Func(account, host, sql, lock):
    seconds = random.randint(1, 10)
    time.sleep(seconds)
    result = "Result of executing \"{}\" took {} seconds".format(sql, seconds)
    ## -------------------------------
    ## Probably don't need to lock if you combine these into one statement
    ## -------------------------------
    with lock:
        print('Executing for %s ' % host)
        print("\t{}\n".format(result))
    ## -------------------------------

lock = threading.Lock()
hostList = ['abc.com', 'def.com', 'ghi.com', 'jkl.com']
with concurrent.futures.ThreadPoolExecutor() as executor:
    future = [executor.submit(Func, "acct", host, "somesql", lock) for host in hostList]
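An alternative, if you would rather avoid the lock entirely, is to have the worker return its output and do all the printing from the main thread as each future completes. A minimal sketch, where Execute(host, sql) stands in for your real DB call:

import concurrent.futures

def Func(host, sql):
    output = Execute(host, sql)  # Execute() is a placeholder for your DB call
    return 'Executing for %s\n%s' % (host, output)

hostList = ['abc.com', 'def.com', 'ghi.com', 'jkl.com']
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(Func, host, 'show databases;') for host in hostList]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())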

Related

How to run PostgreSQL query which starts with "DO $$ " in Python

I have a PostgreSQL query which starts with DO:
do
$$
DECLARE
    temprow record;
BEGIN
    for temprow in
        select *
        from generate_series(1, 100)
        where generate_series % 2 = 0
    loop
        with cte_input(val) as (select val from (values (temprow.generate_series)) as t(val))
        insert
        into tmp_table(input_value, value_100)
        select cte_input.val as input_value, cte_input.val::float / 100 as value_100
        from cte_input;
        commit;
    end loop;
END
$$ LANGUAGE plpgsql;
How can I run this query with Python and psycopg2?
Is it the right way to use a temporary function if I need to run this query a few times with some dynamic changes?
UPD
Thank you @erwin-brandstetter for the information about COMMIT.
I deleted COMMIT from the query block and added it in the Python code: ps_cursor.execute('COMMIT').
I wrote the code this way:
import concurrent.futures
import psycopg2 as pg
from psycopg2 import pool

features = [(1, name_of_feature_1), ...]  # list of features

list_query = []
for feature in features:
    feature_id = feature[0]
    name_feature = feature[1]
    query = f"""--Feature:{feature_id}
    create or replace procedure pg_temp.proc_feature_{feature_id}_values()
        language plpgsql
    as
    $$
    DECLARE
        temprow record;
    BEGIN
        for temprow in
            select *
            from tmp_maternal_sample
            where maternal_sample = 1000
        loop
            insert
            into tmp_feature_values(feature_id,
                                    feature_values_array,
                                    maternal_sample)
            select feature_id,
                   array_agg(t_rank.{name_feature}) f_values,
                   temprow.maternal_sample
            from t_rank
            ....
            ....
        end loop;
    end
    $$;
    call pg_temp.proc_feature_{feature_id}_values();
    """
    list_query.append(query)
def load_query(query):
    ps_connection = threaded_postgreSQL_pool.getconn()
    if ps_connection:
        print(f"Successfully received connection from connection pool for Query {query[:15]} ")
        ps_cursor = ps_connection.cursor()
        ps_cursor.execute(query)
        ps_cursor.execute('COMMIT')
        ps_cursor.close()
        result = f'Query {query[:15]} finished'
        print(result)
        return result

try:
    threaded_postgreSQL_pool = pool.ThreadedConnectionPool(1, 32, user=user, password=password,
                                                           host=host, port=port, database=database)
    if threaded_postgreSQL_pool:
        print("Connection pool created successfully using ThreadedConnectionPool")
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        future_to_sql = {executor.submit(load_query, query): query for query in list_query}
        for future in concurrent.futures.as_completed(future_to_sql):
            sql = future_to_sql[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%s generated an exception: %s' % (sql[:15], exc))
            else:
                print('%s page is %s bytes' % (sql[:15], data))
except (Exception, pg.DatabaseError) as error:
    print("Error while connecting to PostgreSQL", error)
finally:
    if threaded_postgreSQL_pool:
        threaded_postgreSQL_pool.closeall()
        print('Threaded PG connection pool is closed')
It's safe to assume Postgres 11 or later, because:
COMMIT works in one plpgsql code block, but not in another?
Your DO statement is convoluted without obvious reason. Simpler:
DO
LANGUAGE plpgsql
$do$
DECLARE
    i int;
BEGIN
    FOR i IN
        SELECT generate_series(2, 100, 2)
    LOOP
        INSERT INTO tmp_table(input_value, value_100)
        VALUES (i, i::float / 100);
        -- COMMIT;  -- ?
    END LOOP;
END
$do$;
Which boils down to just this - even including the creation of that temp table:
CREATE TEMP TABLE tmp_table AS
SELECT g AS input_value, g::float / 100 AS value_100
FROM generate_series(2, 100, 2) g;
db<>fiddle here
Some setups (like dbfiddle.uk) still don't allow transaction handling with COMMIT. Not sure you even need that?
Either way, just execute the raw SQL.
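For completeness, a minimal sketch of running such a DO block from psycopg2 (the connection parameters here are placeholders):

import psycopg2

do_block = """
DO
LANGUAGE plpgsql
$do$
BEGIN
    INSERT INTO tmp_table(input_value, value_100)
    SELECT g, g::float / 100
    FROM generate_series(2, 100, 2) g;
END
$do$;
"""

# the connection context manager commits the transaction on successful exit
with psycopg2.connect(host="localhost", dbname="mydb", user="me", password="secret") as conn:
    with conn.cursor() as cur:
        cur.execute(do_block)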

Multiprocess : Persistent Pool?

I have code like the one below :
def expensive(self, c, v):
    .....

def inner_loop(self, c, collector):
    self.db.query('SELECT ...', (c,))
    for v in self.db.cursor.fetchall():
        collector.append(self.expensive(c, v))

def method(self):
    # create a Pool
    # join the Pool ??
    self.db.query('SELECT ...')
    for c in self.db.cursor.fetchall():
        collector = []
        # RUN the whole cycle in parallel in separate processes
        self.inner_loop(c, collector)
        # do stuff with the collector
    #! close the pool ?
Both the outer and the inner loop are thousands of steps.
I think I understand how to run a Pool of a couple of processes; all the examples I found show more or less that.
But in my case I need to launch a persistent Pool and then feed it the data (the c values). Once an inner-loop process has finished, I have to supply the next available c value.
And keep the processes running and collect the results.
How do I do that?
A clunky idea I have is:
def method(self):
    ws = 4
    with Pool(processes=ws) as pool:
        cs = []
        for i, c in enumerate(..):
            cs.append(c)
            if i % ws == 0:
                res = [pool.apply(self.inner_loop, (c,)) for c in cs]
                cs = []
                collector.append(res)
Will this keep the same pool running, i.e. not launch a new process every time?
Do I need the 'if i % ws == 0' part, or can I use imap() or map_async() and the Pool object will block the loop when the available workers are exhausted and continue when some are freed?
Yes, the way that multiprocessing.Pool works is:
Worker processes within a Pool typically live for the complete duration of the Pool’s work queue.
So simply submitting all your work to the pool via imap should be sufficient:
with Pool(processes=4) as pool:
    initial_results = db.fetchall("SELECT c FROM outer")
    results = list(pool.imap(self.inner_loop, initial_results))
That said, if you really are doing this to fetch things from the DB, it may make more sense to move more processing down into that layer (bring the computation to the data rather than bringing the data to the computation).

multiprocessing: maxtasksperchild and chunksize conflict?

I am using the multiprocessing module in Python 3.7. My code is not working as expected (see this question here). Someone suggested setting maxtasksperchild, which I set to 1. Then, while reading the documentation, I figured that it was best to set the chunksize to 1 as well. This is the relevant code part:
# Parallel Entropy Calculation
# ============================
node_combinations = [(i, j) for i in g.nodes for j in g.nodes]
pool = Pool(maxtasksperchild=1)
start = datetime.datetime.now()
logging.info("Start time: %s", start)
print("Start time: ", start)
results = pool.starmap(g._log_probability_path_ij, node_combinations, chunksize=1)
end = datetime.datetime.now()
print("End time: ", end)
print("Run time: ", end - start)
logging.info("End time: %s", end)
logging.info("Total run time: %s", start)
pool.close()
pool.join()
This backfired enormously. Setting only maxtasksperchild or only chunksize got the job done in the expected time (for a smaller dataset that I am using to test the code). Setting both just wouldn't finish, and nothing was really running after a few seconds (I checked with htop to see if the cores were working).
Questions
Do maxtasksperchild and chunksize conflict when setting them together?
Do they do the same thing? maxtasksperchild at the Pool() level and chunksize at the Pool methods level?
======================================================
EDIT
I understand that debugging may be impossible from the extract of code presented; please find the full code below. The modules graph and graphfile are just small libraries written by me, available on GitHub. If you wish to run the code, you can use any of the files in the data/ directory of the mentioned GitHub repository. Short tests are better run using F2, but F1 and F3 are the ones causing trouble on the HPC.
import graphfile
import graph
from multiprocessing.pool import Pool
import datetime
import logging

def remove_i_and_f(edges):
    new_edges = dict()
    for k, v in edges.items():
        if 'i' in k:
            continue
        elif 'f' in k:
            key = (k[0], k[0])
            new_edges[key] = v
        else:
            new_edges[k] = v
    return new_edges

if __name__ == "__main__":
    import sys

    # Read data
    # =========
    graph_to_study = sys.argv[1]
    full_path = "/ComplexNetworkEntropy/"
    file = graphfile.GraphFile(full_path + "data/" + graph_to_study + ".txt")
    edges = file.read_edges_from_file()

    # logging
    # =======
    d = datetime.date.today().strftime("%Y_%m_%d")
    log_filename = full_path + "results/" + d + "_probabilities_log_" + graph_to_study + ".log"
    logging.basicConfig(filename=log_filename, level=logging.INFO, format='%(asctime)s === %(message)s')
    logging.info("Graph to study: %s", graph_to_study)
    logging.info("Date: %s", d)

    # Process data
    # ==============
    edges = remove_i_and_f(edges)
    g = graph.Graph(edges)

    # Parallel Entropy Calculation
    # ============================
    node_combinations = [(i, j) for i in g.nodes for j in g.nodes]
    pool = Pool(maxtasksperchild=1)
    start = datetime.datetime.now()
    logging.info("Start time: %s", start)
    print("Start time: ", start)
    results = pool.starmap(g._log_probability_path_ij, node_combinations, chunksize=1)
    end = datetime.datetime.now()
    print("End time: ", end)
    print("Run time: ", end - start)
    logging.info("End time: %s", end)
    logging.info("Total run time: %s", end - start)
    pool.close()
    pool.join()
maxtasksperchild ensures a worker is restarted after a certain number of tasks. In other words, it kills the process after it has run maxtasksperchild iterations of your given function. It is provided to contain resource leaks caused by poor implementations in long-running services.
chunksize groups a given collection/iterator into multiple tasks. It then ships the whole group over the internal pipe to reduce inter-process communication (IPC) overhead. The collection elements will still be processed one by one. chunksize is useful if you have a large collection of small elements and the IPC overhead is significant relative to the processing of the elements themselves. One side effect is that the same process will process a whole chunk.
Setting both parameters to 1 dramatically increases process rotation and IPC, which are both quite resource-heavy, especially on machines with a high number of cores.
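For comparison, a minimal sketch of the usual setup for many small tasks: leave maxtasksperchild unset and give starmap a larger chunksize. The work() function and the chunksize of 64 here are illustrative placeholders, not the code from the question:

from multiprocessing import Pool

def work(i, j):
    # placeholder for the real per-pair computation
    return i * j

if __name__ == "__main__":
    node_combinations = [(i, j) for i in range(200) for j in range(200)]
    # No maxtasksperchild: workers live for the whole job.
    # A larger chunksize amortises IPC over many small tasks.
    with Pool() as pool:
        results = pool.starmap(work, node_combinations, chunksize=64)
    print(len(results))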

Is there any way to put timeout in pandas read_sql function?

I connect to a DB2 server through an ODBC connection in my Python code. The DB2 server gets rebooted for maintenance or disconnects me while running specific server-side tasks; this happens 1 or 2 times a day. If at that time my code has started executing the pandas read_sql function to fetch the result of a query, it goes into an infinite wait, even when the server is back up after, let's say, 1 hour.
I want to put a timeout in the execution of read_sql and whenever that timeout occurs I want to refresh the connection with DB2 server so that a fresh connection is made again before continuing the query.
I have tried making a while loop and picking chunks of data from DB2 instead of pulling the whole result at once, but the problem is that if DB2 disconnects while pulling a chunk, the Python code still goes into an infinite wait.
chunk_size = 1000
offset = 0
while True:
    sql = "SELECT * FROM table_name limit %d offset %d" % (chunk_size, offset)
    df = pd.read_sql(sql, conn)
    df.index += (offset + 1)
    offset += chunk_size
    sys.stdout.write('.')
    sys.stdout.flush()
    if df.shape[0] < chunk_size:
        break
I need read_sql to throw some exception or return a value if the SQL execution takes more than 3 minutes. If that happens, I need the connection to DB2 to be refreshed.
You could use the package func-timeout. You can install via pip as below:
pip install func-timeout
So, for example, if you have a function doit('arg1', 'arg2') that you want to limit to running for 5 seconds, with func_timeout you can call it like this:
from func_timeout import func_timeout, FunctionTimedOut

try:
    doitReturnValue = func_timeout(5, doit, args=('arg1', 'arg2'))
except FunctionTimedOut:
    print("doit('arg1', 'arg2') could not complete within 5 seconds, hence terminated.\n")
except Exception as e:
    # Handle any exceptions that doit might raise here
    raise
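Applied to the question, a minimal sketch that wraps each pd.read_sql call with a 3-minute limit and refreshes the connection on timeout. make_connection() is a hypothetical helper standing in for however you open your DB2/ODBC connection:

import pandas as pd
from func_timeout import func_timeout, FunctionTimedOut

def make_connection():
    # hypothetical helper: open and return a fresh DB2/ODBC connection here
    ...

conn = make_connection()
sql = "SELECT * FROM table_name limit 1000 offset 0"

try:
    df = func_timeout(180, pd.read_sql, args=(sql, conn))
except FunctionTimedOut:
    # the query took more than 3 minutes: drop the old connection and reconnect
    conn = make_connection()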

scheduling a task at multiple timings (with different parameters) using celery beat, but the task runs only once (with random parameters)

What I am trying to achieve
Write a scheduler that uses a database to schedule similar tasks at different timings.
For this I am using celery beat; the code snippet below gives an idea:
try:
    reader = MongoReader()
except:
    raise
try:
    tasks = reader.get_scheduled_tasks()
except:
    raise

celerybeat_schedule = dict()
for task in tasks:
    celerybeat_schedule[task["task_id"]] = dict()
    celerybeat_schedule[task["task_id"]]["task"] = task["task_name"]
    celerybeat_schedule[task["task_id"]]["args"] = (task,)
    celerybeat_schedule[task["task_id"]]["schedule"] = get_task_schedule(task)

app.conf.update(BROKER_URL=rabbit_mq_endpoint, CELERY_TASK_SERIALIZER='json',
                CELERY_ACCEPT_CONTENT=['json'], CELERYBEAT_SCHEDULE=celerybeat_schedule)
So there are three steps:
- reading all tasks from the datastore
- creating the celery scheduler dictionary, populated with all tasks and their properties: task_name (the method to run), parameters (data to pass to the method), and schedule (when to run)
- updating the celery configuration with this dictionary
Expected scenario
Given that all entries run the same celery task (which just prints), have the same schedule (run every 5 minutes), and have different parameters specifying what to print, let's say the DB has:
task name , parameter , schedule
regular_print , Hi , {"minutes" : 5}
regular_print , Hello , {"minutes" : 5}
regular_print , Bye , {"minutes" : 5}
I expect all three of these to be printed every 5 minutes.
What happens
Only one of Hi, Hello, Bye prints (possibly at random, certainly not in sequence).
Please help,
Thanks a lot in advance :)
I was able to resolve this using version 4 of celery. Below is a sample similar to what worked for me; you can also find it in the celery documentation for version 4.
import os
from datetime import timedelta
from celery import Celery

# taking address and user/pass from the environment (you can use direct values)
ex_host_queue = os.environ["EX_HOST_QUEUE"]
ex_port_queue = os.environ["EX_PORT_QUEUE"]
ex_user_queue = os.environ["EX_USERID_QUEUE"]
ex_pass_queue = os.environ["EX_PASSWORD_QUEUE"]
broker = "amqp://" + ex_user_queue + ":" + ex_pass_queue + "@" + ex_host_queue + ":" + ex_port_queue + "//"

# celery initialization
app = Celery(__name__, backend=broker, broker=broker)
app.conf.task_default_queue = 'scheduler_queue'
app.conf.update(
    task_serializer='json',
    accept_content=['json'],  # Ignore other content
    result_serializer='json'
)

task = {"task_id": 1, "a": 10, "b": 20}

# method to update the scheduler
def add_scheduled_task(task):
    print("scheduling task")
    del task["_id"]
    print("adding task_id")
    name = task["task_name"]
    app.add_periodic_task(timedelta(minutes=1), scheduler_task.s(task), name=task["task_id"])

@app.task(name='scheduler_task')
def scheduler_task(data):
    print(str(data["a"] + data["b"]))
