I am using Airflow and I want to pass the output of the function in task 1 to task 2.
import requests
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

def create_dag(dag_id, schedule, default_args):
    def getData(**kwargs):
        res = requests.post('https://dummyURL')
        return res.json()

    def sendAlert(**kwargs):
        requests.post('https://dummyURL', params="here i want to send res.json() from task 1")

    dag = DAG(dag_id,
              schedule_interval=schedule,
              default_args=default_args)

    with dag:
        t1 = PythonOperator(task_id='task1', python_callable=getData, provide_context=True, dag=dag)
        t2 = PythonOperator(task_id='task2', python_callable=sendAlert, provide_context=True, dag=dag)

    return dag
Check out XComs; as long as the data you want to pass is relatively small, it's the best option.
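For illustration, here is a minimal sketch (the URLs are the placeholders from the question) of how sendAlert could pull what getData returned: a PythonOperator pushes its callable's return value to XCom automatically, and xcom_pull retrieves it by the upstream task_id.

def getData(**kwargs):
    res = requests.post('https://dummyURL')
    # The return value is pushed to XCom automatically (key 'return_value').
    return res.json()

def sendAlert(**kwargs):
    ti = kwargs['ti']
    # Pull the JSON that task1 returned.
    payload = ti.xcom_pull(task_ids='task1')
    requests.post('https://dummyURL', json=payload)

You also need to declare the dependency, e.g. t1 >> t2 inside the with dag: block, so task2 runs after task1's XCom exists.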
Is there a way to find the maximum, minimum, or even the average duration of all DagRun instances in Airflow? That is, all DAG runs from all DAGs, not just one single DAG.
I can't find anywhere to do this in the UI, or even a page with a programmatic/command-line example.
You can use the Airflow REST API to get all DAG runs per DAG and calculate statistics.
An example that fetches all DAG runs per DAG and calculates each run's total time:
import datetime

import requests
from requests.auth import HTTPBasicAuth

airflow_server = "http://localhost:8080/api/v1/"
auth = HTTPBasicAuth("airflow", "airflow")

get_dags_url = f"{airflow_server}dags"
get_dag_params = {
    "limit": 100,
    "only_active": "true"
}

response = requests.get(get_dags_url, params=get_dag_params, auth=auth)
dags = response.json()["dags"]

get_dag_run_params = {
    "limit": 100,
    "state": "success",
}

for dag in dags:
    dag_id = dag["dag_id"]
    dag_run_url = f"{airflow_server}dags/{dag_id}/dagRuns"
    response = requests.get(dag_run_url, params=get_dag_run_params, auth=auth)
    dag_runs = response.json()["dag_runs"]
    for dag_run in dag_runs:
        execution_date = datetime.datetime.fromisoformat(dag_run['execution_date'])
        end_date = datetime.datetime.fromisoformat(dag_run['end_date'])
        duration = end_date - execution_date
        duration_in_s = duration.total_seconds()
        print(duration_in_s)
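To get the min/max/average the question actually asks for, the loop above could append each duration_in_s to a list instead of printing it, and then summarize at the end; a minimal sketch assuming such a durations list:

durations = []  # append duration_in_s to this inside the loop above

if durations:
    print(f"min: {min(durations):.1f}s")
    print(f"max: {max(durations):.1f}s")
    print(f"avg: {sum(durations) / len(durations):.1f}s")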
The easiest way will be to query your Airflow metastore. All the scheduling, DAG runs, and task instances are stored there, and Airflow can't operate without it. I do recommend filtering on DAG and/or execution date if your use case allows; it's not obvious to me what one can do with just these three overarching numbers alone.
select
    min(runtime_seconds) as min_runtime,
    max(runtime_seconds) as max_runtime,
    avg(runtime_seconds) as avg_runtime
from (
    select extract(epoch from (d.end_date - d.start_date)) as runtime_seconds
    from public.dag_run d
    where d.execution_date between '2022-01-01' and '2022-06-30'
      and d.state = 'success'
) as runtimes
You might also consider joining to the task_instance table to get some task-level data, and perhaps use the min start and max end times of a DAG run's tasks as your timestamps.
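As a rough sketch of that idea (assuming Airflow 2.2+, where task_instance carries run_id; older versions join on execution_date instead), the per-run duration could be derived from task-level timestamps like this:

select d.dag_id,
       d.run_id,
       extract(epoch from (max(t.end_date) - min(t.start_date))) as runtime_seconds
from public.dag_run d
join public.task_instance t
  on t.dag_id = d.dag_id and t.run_id = d.run_id
where d.state = 'success'
group by d.dag_id, d.run_id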
With Spark, starting a job takes time.
For a complex workflow, it's possible to invoke a job inside a loop.
But then we pay for each 'start'.
def test_loop(spark):
    all_datas = []
    for i in ['CZ12905K01', 'CZ12809WRH', 'CZ129086RP']:
        all_datas.extend(spark.sql(f"""
            select * from data where id=='{i}'
            """).collect())  # Starts a job per iteration
    return all_datas
Sometimes it's possible to collapse the loop into one big job with 'union'.
def test_union(spark):
    full_request = None
    for i in ['CZ12905K01', 'CZ12809WRH', 'CZ129086RP']:
        q = f"""
            select '{i}' ID, * from data where id=='{i}'
        """
        partial_df = spark.sql(q)
        if full_request is None:
            full_request = partial_df
        else:
            full_request = full_request.union(partial_df)
    return full_request.collect()  # Starts a single job
For clarity, my samples are elementary (I know, I could use in (...)); the real requests will be more complex.
Is it a good idea?
With the union approach, I can drastically reduce the number of jobs submitted, at the price of a more complex job.
My tests show that:
It's possible to union more than 1,000 requests for better performance.
For 950 requests with local[6]:
with 0 unions: 1h53m
with 10 unions: 20m01s
with 100 unions: 7m12s
with 200 unions: 6m02s
with 500 unions: 6m25s
Sometimes the union version has to broadcast large data, or hits an out-of-memory error.
My final approach: set a level_of_union.
Merge that many requests, start the job, and collect the data,
then continue the loop with the next batch.
def test_union(spark, level_of_union):
    full_request = None
    all_datas = []
    todo = ['CZ12905K01', 'CZ12809WRH', 'CZ129086RP']
    for idx, i in enumerate(todo):
        q = f"""
            select '{i}' ID, * from data where id=='{i}'
        """
        partial_df = spark.sql(q)
        if full_request is None:
            full_request = partial_df
        else:
            full_request = full_request.union(partial_df)
        # Flush the current batch when it reaches level_of_union, or at the end
        if idx % level_of_union == level_of_union - 1 or idx == len(todo) - 1:
            all_datas.extend(full_request.collect())  # Starts a job per batch
            full_request = None
    return all_datas
Run a test to tune the meta-parameter level_of_union.
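One way to run that test is to time the same workload at a few candidate levels (the candidate values below are just an illustration, reusing test_union from above):

import time

for level in [10, 50, 100, 200, 500]:
    start = time.time()
    test_union(spark, level)
    print(f"level_of_union={level}: {time.time() - start:.1f}s")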
I'm trying to get the DAG name from the following output:
INFO - {'conf': <airflow.configuration.AutoReloadableProxy object at ... >, 'dag': <DAG: dag-name-i-want-to-get>, 'ds': '2021-07-29' ... N }
By the way, I got that output using the following function in Airflow:
def execute_dag_run(**kwargs):
    print(kwargs)
    dag = kwargs['dag']
    print(type(dag))
    print(dag)

get_dag_run_task = PythonOperator(
    task_id='get_dag_run',
    python_callable=execute_dag_run,
    dag=dag,
    provide_context=True
)
However, I'm getting a class if I print type(dag):
INFO - <class 'airflow.models.dag.DAG'>
Do you have any idea how to get this without doing a manual extraction?
You are printing the DAG object. If you want to get the DAG name, you need to get it from the DAG object:
def execute_dag_run(**kwargs):
    dag = kwargs['dag']
    print("dag_id from dag:")
    print(dag.dag_id)
Alternatively, you can get it from the task_instance:
def execute_dag_run(**kwargs):
    ti = kwargs['task_instance']
    print("dag_id from task instance:")
    print(ti.dag_id)
Another option is to get it from the dag_run:
def execute_dag_run(**kwargs):
    dag_run = kwargs['dag_run']
    print("dag_id from dag_run:")
    print(dag_run.dag_id)
Hi, I'm trying to run tasks asynchronously, but I can't get it to work.
What I want to do is query a DB (takes 30 s) and process the result (takes 15 s) while I make the next query.
My problem seems very simple to solve, but for some reason I can't get it to work.
Thank you very much for your help.
Here is my code so far:
import asyncio

import pandas as pd

async def query_db(date):
    sql_query = f"SELECT * FROM tablename WHERE date='{date}'"
    df = pd.read_sql(sql_query, engine)  # engine: a DB connection defined elsewhere
    df.to_csv(f"{date}_data.csv", index=False)

async def process_df(filepath):
    df = pd.read_csv(filepath)
    # Do processing stuff here and save the modified file

for dte in pd.date_range("2020-10-10", "2020-10-12", freq='D'):
    query_data_task = asyncio.create_task(query_db(dte))
    filepath = f"{dte}_data.csv"
    await query_data_task
    process_dataframe = asyncio.create_task(process_df(filepath))
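For reference, here is one way the intended overlap could be structured (a sketch under assumptions, not the original code): pd.read_sql and the processing step are blocking calls, so they need to be pushed onto worker threads, e.g. with asyncio.to_thread (Python 3.9+), before asyncio can start the next query while the previous result is still being processed. engine and process are assumed to be defined elsewhere.

import asyncio

import pandas as pd

async def pipeline(dates):
    processing_task = None
    for dte in dates:
        # Run the blocking DB query in a worker thread; the previous
        # processing task keeps running in its own thread meanwhile.
        df = await asyncio.to_thread(
            pd.read_sql, f"SELECT * FROM tablename WHERE date='{dte}'", engine)
        if processing_task is not None:
            await processing_task
        # Start processing this result while the next query runs.
        processing_task = asyncio.create_task(asyncio.to_thread(process, df))
    if processing_task is not None:
        await processing_task

# asyncio.run(pipeline(pd.date_range("2020-10-10", "2020-10-12", freq='D')))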
How can I schedule lots of APScheduler jobs (4,000+) concurrently? (I must schedule all these after certain user events.)
Iteratively calling add_job simply takes too long with many jobs. But when I try to use AsyncIOScheduler and the following async code, I don't get any performance increase either.
NOTE: my scheduler needs to connect to a SQL jobstore via SQLAlchemy.
import asyncio
import datetime
import time

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore

scheduler = AsyncIOScheduler(jobstores={"default": SQLAlchemyJobStore(url="a valid db connection str")})
scheduler.start()

def schedule_jobs_quickly():
    # init lots of (fake) jobs
    jobs = []
    for i in range(3000):
        jobs.append(i)
    send_time = datetime.datetime.now() + datetime.timedelta(days=2)

    # try to schedule jobs concurrently
    start_time = time.time()
    asyncio.get_event_loop().run_until_complete(schedule_all_jobs(jobs, send_time))
    duration = time.time() - start_time
    print(f"Created {len(jobs)} jobs in {duration} seconds")

async def schedule_all_jobs(all_jobs, send_time):
    tasks = []
    for job in all_jobs:
        task = asyncio.ensure_future(schedule_job(job, send_time))
        tasks.append(task)
    await asyncio.gather(*tasks, return_exceptions=True)

async def schedule_job(job, send_time):
    # one-off 'date' trigger that fires at send_time
    scheduler.add_job(send_email_if_needed, 'date', run_date=send_time)
The result is very slow. How can I speed this up?
>>> schedule_jobs_quickly()
...
Created 3000 jobs in 401.9982771873474 seconds
For comparison, this is how long it took with a BackgroundScheduler() using the default memory jobstore:
Created 3000 jobs in 0.9155495166778564 seconds
So, it seems to be the database connections that are so expensive. Maybe there's a way to create multiple jobs using the same connection, instead of re-connecting for each add_job?
It's not the solution I was looking for, but I decided to give up on AsyncIOScheduler and instead schedule my many tasks in a separate thread so the rest of my program could continue without being held up by all of the DB connections. Example below.
import datetime
from threading import Thread

def schedule_jobs_quickly():
    # init lots of (fake) jobs
    jobs = []
    for i in range(3000):
        jobs.append(i)
    send_time = datetime.datetime.now() + datetime.timedelta(days=2)

    # schedule jobs in a new thread so the rest of the program isn't blocked
    scheduler_thread = Thread(target=schedule_email_jobs, args=(jobs, send_time))
    scheduler_thread.start()

def schedule_email_jobs(jobs, send_time):
    for job in jobs:
        scheduler.add_job(send_email, 'date', run_date=send_time)

def send_email():
    # sends email
    ...