I submit a Spark application to Kubernetes from Airflow as a SparkKubernetesOperator task.
When I mark the task as failed, I see that the pod is not deleted in Kubernetes.
How can I fix it?
dag = DAG(
    "dag-name",
    default_args=default_args,
    description='submit dag',
    schedule_interval="30 1 * * *",
    start_date=datetime(2022, 7, 26),
)

t1 = SparkKubernetesOperator(
    task_id='sample-name',
    namespace="batch",
    application_file="k8s/sample.yaml",
    do_xcom_push=True,
    dag=dag,
    params={"processDate": process_date},
)

t1_sensor = SparkKubernetesSensor(
    task_id='sample-monitor',
    namespace="batch",
    application_name="{{ task_instance.xcom_pull(task_ids='sample-name')['metadata']['name'] }}",
    kubernetes_conn_id="kubernetes_default",
    dag=dag,
)

t1 >> t1_sensor
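A minimal sketch of one possible fix, assuming the standard spark-on-k8s-operator CRD (sparkoperator.k8s.io/v1beta2) and that the Airflow workers can reach the cluster API: delete the SparkApplication from an on_failure_callback so the operator tears down its driver and executor pods. Whether callbacks fire when a task is marked failed from the UI depends on the Airflow version, so treat this only as a starting point.

from kubernetes import client, config

def delete_spark_app(context):
    """Best-effort cleanup of the SparkApplication when the task fails or is marked failed."""
    ti = context["ti"]
    app = ti.xcom_pull(task_ids="sample-name")  # same XCom the sensor reads the name from
    if not app:
        return  # nothing was submitted, nothing to clean up
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()
    client.CustomObjectsApi().delete_namespaced_custom_object(
        group="sparkoperator.k8s.io",  # assumption: standard spark-on-k8s-operator CRD
        version="v1beta2",
        namespace="batch",
        plural="sparkapplications",
        name=app["metadata"]["name"],
    )

# then pass on_failure_callback=delete_spark_app to the SparkKubernetesOperator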
def skip_update_job_pod_name(dag):
    """
    :param dag: Airflow DAG
    :return: Dummy operator to skip update pod name
    """
    return DummyOperator(task_id="skip_update_job_pod_name", dag=dag)


def update_pod_name_branch_operator(dag: DAG, job_id: str):
    """Branch operator to update pod name."""
    return BranchPythonOperator(
        dag=dag,
        trigger_rule="all_done",
        task_id="update_pod_name",
        python_callable=update_pod_name_func,
        op_kwargs={"job_id": job_id},
    )


def update_pod_name_func(job_id: Optional[str]) -> str:
    """Function for update pod name."""
    return "update_job_pod_name" if job_id else "skip_update_pod_name"


def update_job_pod_name(dag: DAG, job_id: str, process_name: str) -> MySqlOperator:
    """
    :param dag: Airflow DAG
    :param job_id: Airflow Job ID
    :param process_name: name of the current running process
    :return: MySqlOperator to update Airflow job ID
    """
    return MySqlOperator(
        task_id="update_job_pod_name",
        mysql_conn_id="semantic-search-airflow-sdk",
        autocommit=True,
        sql=[
            f"""
            INSERT INTO airflow.Pod (job_id, pod_name, task_name)
            SELECT * FROM (SELECT '{job_id}', '{xcom_pull("pod_name")}', '{process_name}') AS temp
            WHERE NOT EXISTS (
                SELECT pod_name FROM airflow.Pod WHERE pod_name = '{{{{ ti.xcom_pull(key="pod_name") }}}}'
            ) LIMIT 1;
            """
        ],
        task_concurrency=1,
        dag=dag,
        trigger_rule="all_done",
    )


def create_k8s_pod_operator_without_volume(dag: DAG,
                                            job_id: int,
                                            ....variable) -> TaskGroup:
    """
    Create task group for k8s operator without volume
    """
    with TaskGroup(group_id="k8s_pod_operator_without_volume", dag=dag) as eks_without_volume_group:
        emit_pod_name_branch = update_pod_name_branch_operator(dag=dag, job_id=job_id)
        update_pod_name = update_job_pod_name(dag=dag, job_id=job_id, process_name=process_name)
        skip_update_pod_name = skip_update_job_pod_name(dag=dag)
        emit_pod_name_branch >> [update_pod_name, skip_update_pod_name]
    return eks_without_volume_group
I updated the code based on the comment. I am curious how the TaskGroup works with the branch operator, because I get this error when I try it:
airflow.exceptions.AirflowException: Branch callable must return valid task_ids. Invalid tasks found: {'update_job_pod_name'}
You can use a BranchPythonOperator that receives the value and returns the name of the task to run for each condition.
def choose_job_func(job_id):
    if job_id:
        return "update_pod_name_rds"


choose_update_job = BranchPythonOperator(
    task_id="choose_update_job",
    python_callable=choose_job_func,
    op_kwargs={"job_id": "{{ params.job_id }}"},
)
Or, with the TaskFlow API, it would look like this:
@task.branch
def choose_update_job(job_id):
    if job_id:
        return "update_pod_name_rds"
Full DAG example:
with DAG(
    dag_id="test_dag",
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    render_template_as_native_obj=True,
    params={
        "job_id": Param(default=None, type=["null", "string"])
    },
    tags=["test"],
) as dag:

    def update_job_pod_name(job_id: str, process_name: str):
        return MySqlOperator(
            task_id="update_pod_name_rds",
            mysql_conn_id="semantic-search-airflow-sdk",
            autocommit=True,
            sql=[
                f"""
                INSERT INTO airflow.Pod (job_id, pod_name, task_name)
                SELECT * FROM (SELECT '{job_id}', '{xcom_pull("pod_name")}', '{process_name}') AS temp
                WHERE NOT EXISTS (
                    SELECT pod_name FROM airflow.Pod WHERE pod_name = '{{{{ ti.xcom_pull(key="pod_name") }}}}'
                ) LIMIT 1;
                """
            ],
            task_concurrency=1,
            dag=dag,
            trigger_rule="all_done",
        )

    @task.branch
    def choose_update_job(job_id):
        print(job_id)
        if job_id:
            return "update_pod_name_rds"
        return "do_nothing"

    sql_task = update_job_pod_name(
        "{{ params.job_id }}",
        "process_name",
    )

    do_nothing = EmptyOperator(task_id="do_nothing")
    start_dag = EmptyOperator(task_id="start")
    end_dag = EmptyOperator(task_id="end", trigger_rule=TriggerRule.ONE_SUCCESS)

    (start_dag >> choose_update_job("{{ params.job_id }}") >> [sql_task, do_nothing] >> end_dag)
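As for the AirflowException in the question itself: task ids created inside a TaskGroup are prefixed with the group id by default, so a branch callable that targets tasks inside the group has to return the prefixed ids. A minimal sketch, using the group id from the question:

def update_pod_name_func(job_id):
    # Inside TaskGroup(group_id="k8s_pod_operator_without_volume"), the real task ids are
    # "k8s_pod_operator_without_volume.<task_id>", so return those, not the bare names.
    if job_id:
        return "k8s_pod_operator_without_volume.update_job_pod_name"
    return "k8s_pod_operator_without_volume.skip_update_job_pod_name"

Note also that the returned id must match the task's actual task_id exactly: the question's callable returns "skip_update_pod_name" while the skip task is created as "skip_update_job_pod_name".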
I have a list of lists in the following way -
[['X_API', 'Y_API',....], ['Z_API', 'P_API', ...], [....], [...] .... ]
Here, each API name corresponds to a PythonOperator.
In Airflow, I would like to create task dependencies such that, from a starting dummy task, there is a parallel branch for each inner list, and the operators inside each inner list execute in sequence:
How can I do this? I would appreciate any help!
Existing code:
args = {
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': '',
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(dag_id, default_args=args, schedule_interval=None)

with dag:
    tasks = []
    tmp, tmp2 = set(), set()

    Start = DummyOperator(
        task_id='Start',
        dag=dag
    )
    End = DummyOperator(
        task_id='End',
        dag=dag
    )

    for i in dags:
        for j in i:
            if 'APIs' in list(i.keys()):
                for l in i['APIs']:
                    tab = DummyOperator(
                        task_id=l['api'] + "_API",
                        dag=dag
                    )
                    tmp.add(tab)
            elif 'tables' in list(i.keys()):
                for k in i['tables']:
                    tab2 = DummyOperator(
                        task_id=k['table'] + "_API",
                        dag=dag
                    )
                    tmp2.add(tab2)

    tasks.append(list(tmp))
    tasks.append(list(tmp2))

    for task in tasks:
        for op in range(0, len(task) - 1):
            Start.set_downstream(task[op])
            task[op].set_downstream(task[op + 1])
            task[op + 1].set_downstream(End)
This was the solution I came up with:

with dag:
    Start = ShortCircuitOperator(
        task_id='Create_Postgres_Schema',
        python_callable=create_postgres_schema,
        ignore_downstream_trigger_rules=True
    )
    End = DummyOperator(
        task_id='End_of_Data_extraction',
        trigger_rule=TriggerRule.ALL_DONE
    )

    # Get the first and last task of each chain. This is done to enable the task flow in Airflow.
    start_task_list = list(map(itemgetter(0), tasks_master_list))
    end_tasks_list = list(map(itemgetter(-1), tasks_master_list))

    Start >> start_task_list

    for task in tasks_master_list:
        for op in range(len(task) - 1):  # iterate within each chain, not over the outer list
            task[op] >> task[op + 1]

    end_tasks_list >> End
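For reference, the same fan-out/sequence wiring can also be written with chain() from airflow.models.baseoperator. This is only a sketch; api_lists and make_task are hypothetical stand-ins for the list of lists of API names and whatever factory builds each PythonOperator.

from airflow.models.baseoperator import chain

# Hypothetical input, e.g. [['X_API', 'Y_API'], ['Z_API', 'P_API']]
for api_names in api_lists:
    ops = [make_task(name) for name in api_names]  # one PythonOperator per API name
    chain(*ops)          # run this inner list in sequence
    Start >> ops[0]      # fan out from Start to the first task of each chain
    ops[-1] >> End       # join the last task of each chain into End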
Can I pass a return value from DatabricksRunNowOperator using XCom or any other method? I just want to return a "date" after my first Databricks operator has finished and pass it on to the dependent tasks.
For example:
I want to pass the return value of verification_run to insert_run and workspace_run. Usually we can use xcom_pull and xcom_push to do this in Python, but I am not sure how to make two notebooks talk to each other.
from airflow import DAG
from airflow.providers.databricks.operators.databricks import DatabricksRunNowOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow'
}

with DAG('databricks_dag',
         start_date=days_ago(2),
         schedule_interval=None,
         default_args=default_args
         ) as dag:

    verification_run = DatabricksRunNowOperator(
        task_id='verification_task',
        databricks_conn_id='databricks_default',
        job_id='-----'
    )

    insert_run = DatabricksRunNowOperator(
        task_id='insert_task',
        databricks_conn_id='databricks_default',
        job_id='-----'
    )

    workspace_run = DatabricksRunNowOperator(
        task_id='workspace_task',
        databricks_conn_id='databricks_default',
        job_id='------'
    )

    verification_run >> [insert_run, workspace_run]
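One possible pattern, sketched under several assumptions: the verification notebook ends with dbutils.notebook.exit(date); an intermediate PythonOperator reads that exit value via the Databricks Jobs "runs/get-output" REST endpoint and pushes it to XCom; the downstream jobs are notebook jobs that accept a "date" parameter. DATABRICKS_HOST, DATABRICKS_TOKEN, and the fetch_verification_date/get_date names are placeholders, not part of the original DAG.

import requests
from airflow.operators.python import PythonOperator

def fetch_verification_date(ti, **_):
    # DatabricksRunNowOperator pushes the run_id of the triggered run to XCom (when do_xcom_push is enabled).
    run_id = ti.xcom_pull(task_ids="verification_task", key="run_id")
    # Assumption: host/token come from wherever you keep them (e.g. the Airflow connection).
    resp = requests.get(
        f"https://{DATABRICKS_HOST}/api/2.1/jobs/runs/get-output",
        headers={"Authorization": f"Bearer {DATABRICKS_TOKEN}"},
        params={"run_id": run_id},
    )
    resp.raise_for_status()
    # notebook_output.result holds whatever the notebook passed to dbutils.notebook.exit().
    return resp.json()["notebook_output"]["result"]

get_date = PythonOperator(task_id="get_date", python_callable=fetch_verification_date)

insert_run = DatabricksRunNowOperator(
    task_id="insert_task",
    databricks_conn_id="databricks_default",
    job_id="-----",
    notebook_params={"date": "{{ ti.xcom_pull(task_ids='get_date') }}"},
)

verification_run >> get_date >> [insert_run, workspace_run]

The workspace job would take the same templated notebook_params if it also needs the date.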
I'm trying to push a value with the key last_date to XCom in last_date_task and then pull it from XCom in the second task, ga_wh_task. When I test this DAG by running python dag.py it returns an error at this line:
provide_context=True, since={{ti.xcom_pull(task_ids="last_date_task", key='last_date')}})
NameError: name 'ti' is not defined
What am I doing wrong here? I've tried using task_instance instead of ti and context['ti'] instead of kwargs['ti'].
Here's the DAG file:
default_args = {
    'owner': 'me',
    'start_date': dt.datetime(2017, 10, 30),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=10),
    'provide_context': True,
}

def get_last_date(**kwargs):
    kwargs['ti'].xcom_push(key='last_date', value='2018-11-15')
    return True

with DAG('ga_mysql_dag2',
         default_args=default_args,
         schedule_interval=None,
         catchup=False,
         ) as dag:

    last_date_task = PythonOperator(task_id='last_date_task', python_callable=get_last_date, provide_context=True)

    ga_wh_task = GoogleAnalyticsReportingToMySqlOperator(task_id='ga_wh_task', google_analytics_conn_id='google_analytics', key_file=key_file,
                                                         view_id=view_id, until=until, dimensions=dimensions, metrics=metrics, database=database,
                                                         table=table, mysql_conn_id=mysql_conn_id,
                                                         provide_context=True, since={{ti.xcom_pull(task_ids="last_date_task", key='last_date')}})

    sleep = BashOperator(task_id='sleep', bash_command='sleep 10')

    # Dependencies
    last_date_task >> ga_wh_task >> sleep
Looks like GoogleAnalyticsReportingToMySqlOperator is an operator you created.
The since param should be a string, so change it to since="{{ti.xcom_pull(task_ids='last_date_task', key='last_date')}}"
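The reason is that without quotes the {{ ... }} expression is evaluated as Python when the file is parsed (hence the NameError), whereas a quoted template is rendered by Airflow at runtime. A short sketch of the fix, assuming since is declared in the custom operator's template_fields:

# In the custom operator, 'since' must be a templated field for Jinja to be rendered:
#     template_fields = ('since',)

ga_wh_task = GoogleAnalyticsReportingToMySqlOperator(
    task_id='ga_wh_task',
    # ... other arguments unchanged ...
    since="{{ ti.xcom_pull(task_ids='last_date_task', key='last_date') }}",
)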
I have a single DAG with multiple tasks. Tasks A, B, and C can run at the start without any dependencies, but task D depends on A. Here is my question:
Tasks A, B, and C run daily, but I need task D to run weekly, after A succeeds. How can I set up this DAG?
Does changing the schedule_interval of a task work? Is there a best practice for this problem?
Thanks for your help.
You can use a ShortCircuitOperator to do this.
import airflow
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.models import DAG

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'schedule_interval': '0 10 * * *'
}

dag = DAG(dag_id='example', default_args=args)

a = DummyOperator(task_id='a', dag=dag)
b = DummyOperator(task_id='b', dag=dag)
c = DummyOperator(task_id='c', dag=dag)
d = DummyOperator(task_id='d', dag=dag)

def check_trigger(execution_date, **kwargs):
    return execution_date.weekday() == 0

check_trigger_d = ShortCircuitOperator(
    task_id='check_trigger_d',
    python_callable=check_trigger,
    provide_context=True,
    dag=dag
)

a.set_downstream(b)
b.set_downstream(c)
a.set_downstream(check_trigger_d)

# Perform D only if the trigger function returns a truthy value
check_trigger_d.set_downstream(d)
In Airflow version >= 2.1.0, you can use the BranchDayOfWeekOperator which is exactly suited for your case.
See this answer for more details.
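A minimal sketch of that approach, reusing the DAG from the answer above and adding a DummyOperator named end as the target for the skip branch; adjust week_day to whichever day D should actually run:

from airflow.operators.weekday import BranchDayOfWeekOperator

end = DummyOperator(task_id="end", dag=dag)  # placeholder target for the skip branch

branch_d = BranchDayOfWeekOperator(
    task_id="branch_on_weekday",
    follow_task_ids_if_true="d",     # run D on the chosen day
    follow_task_ids_if_false="end",  # otherwise skip straight to end
    week_day="Monday",               # assumption: pick the day D should run
    use_task_execution_day=True,     # check the run's execution date, not the wall clock
    dag=dag,
)

a >> branch_d >> [d, end]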