I am using Kubernetes 1.25 (client and server) and have deployed Airflow on it using the official Helm chart. I want a KubernetesPodOperator in my Airflow DAG to run a spark-submit command that spawns a driver pod and an executor pod, which together perform the task. The job does the following: 1. read a table from MySQL, 2. dump it to a text file, 3. put that file into a MinIO bucket (S3-compatible). Currently the driver pod spawns along with the executor pod, but the driver never reaches a running state and eventually fails, which causes the executor pod to fail as well. I am authenticating the calls to the Kubernetes API with a Service Account that I pass in as a configuration.
This is my redacted DAG. Note that the spark-submit command works perfectly fine on the command line inside a container of this image and produces the expected outcome, so I suspect it is some DAG configuration I am missing. Also note that all the JARs I reference here are already part of the image under /opt/spark/connectors/ (I have verified this by exec-ing into the container).
import logging
import csv
import airflow
from airflow import DAG
from airflow.utils import dates as date
from datetime import timedelta, datetime
from airflow.providers.apache.spark.operators.spark_jdbc import SparkSubmitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from dateutil.tz import tzlocal
from airflow.kubernetes.volume import Volume
from airflow.kubernetes.volume_mount import VolumeMount
import pendulum
#from airflow.models import Variables
local_tz = pendulum.timezone("Asia/Dubai")
volume_config = {"persistentVolumeClaim": {"claimName": "nfspvc-airflow-executable"}}
air_connectors_volume_config = {"persistentVolumeClaim": {"claimName": "nfspvc-airconnectors"}}
volume_mount = VolumeMount(
"data-volume",
mount_path="/air-spark/",
sub_path=None,
read_only=False,
)
air_connectors_volume_mount = VolumeMount(
"air-connectors",
mount_path="/air-connectors/",
sub_path=None,
read_only=False,
)
volume = Volume(
name="data-volume",
configs=volume_config
)
air_connectors_volume = Volume(
name="air-connectors",
configs=air_connectors_volume_config
)
default_args = {
    'owner': 'panoiqtest',
    'depends_on_past': False,
    'start_date': datetime(2021, 5, 1, tzinfo=local_tz),
    'retries': 1,
    'retry_delay': timedelta(hours=1),
    'email': ['admin@panoiq.com'],
    'email_on_failure': False,
    'email_on_retry': False
}
dag_daily = DAG(dag_id='operator',
                default_args=default_args,
                catchup=False,
                schedule_interval='0 */1 * * *')
_config = {
    'application': '/opt/spark/connectors/spark-etl-assembly-2.0.jar',
    'num_executors': 2,
    'driver_memory': '5G',
    'executor_memory': '10G',
    'driver_class_path': '/opt/spark/connectors/mysql-connector-java-5.1.49.jar',
    'jars': '/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar',
    # 'java_class': 'com.spark.ETLHandler'
}
spark_config = {
    "spark.executor.extraClassPath": "/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar",
    "spark.driver.extraClassPath": "/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar"
}
t2 = BashOperator(
    task_id='bash_example',
    # "scripts" folder is under "/usr/local/airflow/dags"
    bash_command="ls /air-spark/ && pwd",
    dag=dag_daily)
def get_tables(table_file='/csv-directory/success-dag.csv', **kwargs):
    logging.info("#Starting get_tables()#")
    tables_list = []
    with open(table_file) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        tables_list = [row for row in reader]
    tables_list.pop(0)  # remove header
    return tables_list
def load_table(table_name, application_args, **kwargs):
    k8s_arguments = [
        '--name=datalake-pod',
        '--master=k8s://https://IP:6443',
        '--deploy-mode=cluster',
        # '--driver-cores=4',
        # '--executor-cores=4',
        # '--num-executors=1',
        # '--driver-memory=8192m',
        '--executor-memory=8192m',
        '--conf=spark.kubernetes.authenticate.driver.serviceAccountName=air-airflow-sa',
        '--driver-class-path=/opt/spark/connectors//mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar',
        '--conf=spark.driver.extraJavaOptions=-Divy.cache.dir=/tmp -Divy.home=/tmp',
        '--jars=/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar',
        '--conf=spark.kubernetes.namespace=development',
        # '--conf=spark.driver.cores=4',
        # '--conf=spark.executor.cores=4',
        # '--conf=spark.driver.memory=8192m',
        # '--conf=spark.executor.memory=8192m',
        '--conf=spark.kubernetes.container.image=image_name',
        '--conf=spark.kubernetes.container.image.pullSecrets=Secret_name',
        '--conf=spark.kubernetes.container.image.pullPolicy=Always',
        '--conf=spark.dynamicAllocation.enabled=true',
        '--conf=spark.dynamicAllocation.shuffleTracking.enabled=true',
        '--conf=spark.kubernetes.driver.volumes.persistentVolumeClaim.air-connectors.mount.path=/air-connectors/',
        '--conf=spark.kubernetes.driver.volumes.persistentVolumeClaim.air-connectors.mount.readOnly=false',
        '--conf=spark.kubernetes.driver.volumes.persistentVolumeClaim.air-connectors.options.claimName=nfspvc-airconnectors',
        '--conf=spark.kubernetes.file.upload.path=/opt/spark',
        '--class=com.spark.ETLHandler',
        '/opt/spark/connectors/spark-etl-assembly-2.0.jar'
    ]
    all_arguments = k8s_arguments + application_args
    return KubernetesPodOperator(
        dag=dag_daily,
        name="zombie-dry-run",  # "spark_submit_for_" + table_name
        # image='image_name',
        image='imagerepo.io:5050/panoiq/tools:sparktester',
        image_pull_policy='Always',
        image_pull_secrets='registry',
        namespace='development',
        cmds=['spark-submit'],
        arguments=all_arguments,
        labels={"foo": "bar"},
        task_id="dry_run_demo",  # "spark_submit_for_" + table_name
        # config_file="conf",
        volumes=[volume, air_connectors_volume],
        volume_mounts=[volume_mount, air_connectors_volume_mount],
    )
push_tables_list = PythonOperator(task_id="load_tables_list",
                                  python_callable=get_tables,
                                  dag=dag_daily)
complete = DummyOperator(task_id="complete",
                         dag=dag_daily)
for rec in get_tables():
    table_name = rec[9]
    s3_folder_name = rec[14]
    s3_object_name = rec[13]
    jdbcUrl = rec[4] + rec[8]
    lineagegraph = ",".join(rec[17].split("#"))
    entitlement = rec[10]
    remarks = rec[11]
    username = rec[5]
    password = rec[6]
    s3_folder_format = rec[16]
    select_query = rec[9]
    application_args = [select_query, s3_folder_name, jdbcUrl, lineagegraph, entitlement, remarks, username, password, s3_folder_format, s3_object_name]
    push_tables_list >> load_table(table_name, application_args) >> complete
Any help or pointers on this issue are appreciated! Thanks in advance!
I was able to fix this issue with the code below. I used the Airflow-launched pod itself as the driver, so it just spawns an executor pod, runs the job, and dies once the job flow is completed.
Below is my Python file for anyone who needs to do this again.
import logging
import csv
import airflow
from airflow import DAG
from airflow.utils import dates as date
from datetime import timedelta, datetime
from airflow.providers.apache.spark.operators.spark_jdbc import SparkSubmitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
#from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
from dateutil.tz import tzlocal
from airflow.kubernetes.volume import Volume
from airflow.kubernetes.volume_mount import VolumeMount
import pendulum
#from airflow.models import Variables
local_tz = pendulum.timezone("Asia/Dubai")
default_args = {
    'owner': 'test',
    'depends_on_past': False,
    'start_date': datetime(2021, 5, 1, tzinfo=local_tz),
    'retries': 1,
    'retry_delay': timedelta(hours=1),
    'email': ['admin@test.com'],
    'email_on_failure': False,
    'email_on_retry': False
}
dag_daily = DAG(dag_id='datapipeline',
                default_args=default_args,
                catchup=False,
                schedule_interval='@hourly')
start = DummyOperator(task_id='run_this_first', dag=dag_daily)
_config = {
    'application': '/air-spark/spark-etl-assembly-2.0.jar',
    'num_executors': 2,
    'driver_memory': '5G',
    'executor_memory': '10G',
    'driver_class_path': '/air-connectors/mysql-connector-java-5.1.49.jar',
    'jars': '/air-connectors/mysql-connector-java-5.1.49.jar,/air-connectors/aws-java-sdk-bundle-1.12.374.jar,/air-connectors/hadoop-aws-3.3.1.jar',
    # 'java_class': 'com.spark.ETLHandler'
}
spark_config = {
    "spark.executor.extraClassPath": "/air-connectors/mysql-connector-java-5.1.49.jar,/air-connectors/aws-java-sdk-bundle-1.12.374.jar,/air-connectors/hadoop-aws-3.3.1.jar",
    "spark.driver.extraClassPath": "/air-connectors/mysql-connector-java-5.1.49.jar,/air-connectors/aws-java-sdk-bundle-1.12.374.jar,/air-connectors/hadoop-aws-3.3.1.jar"
}
t2 = BashOperator(
    task_id='bash_example',
    # "scripts" folder is under "/usr/local/airflow/dags"
    bash_command="ls /air-spark/ && pwd",
    dag=dag_daily)
def get_tables(table_file='/csv-directory/success-dag.csv', **kwargs):
    logging.info("#Starting get_tables()#")
    tables_list = []
    with open(table_file) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        tables_list = [row for row in reader]
    tables_list.pop(0)  # remove header
    return tables_list
def load_table(table_name, application_args, **kwargs):
    k8s_arguments = [
        "--master", "local[*]",
        "--conf", "spark.executor.extraClassPath=/air-connectors/mysql-connector-java-5.1.49.jar",
        "--conf", "spark.driver.extraClassPath=/opt/spark/connectors/mysql-connector-java-5.1.49.jar",
        "--jars", "/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/ojdbc11-21.7.0.0.jar",
        "--conf=spark.kubernetes.container.image=imagerepo.io:5050/tools:sparktesterV0.6",
        "--conf=spark.kubernetes.container.image.pullSecrets=registry",
        "--num-executors", "5",
        "--executor-memory", "1G",
        "--driver-memory", "2G",
        "--class=com.spark.ETLHandler",
        "--name", "arrow-spark",
        "/opt/spark/connectors/spark-etl-assembly-2.0.jar"
    ]
    all_arguments = k8s_arguments + application_args
    # spark =
    return KubernetesPodOperator(
        image="imagerepo.io:5050/tools:sparktesterV0.6",
        service_account_name="air-airflow-worker",
        name="data_pipeline_k8s",
        task_id="data_pipeline_k8s",
        get_logs=True,
        dag=dag_daily,
        namespace="development",
        image_pull_secrets="registry",
        image_pull_policy="Always",
        cmds=["spark-submit"],
        arguments=all_arguments
    )
# spark.set_upstream(start)
push_tables_list = PythonOperator(task_id="load_tables_list", python_callable=get_tables, dag=dag_daily)
complete = DummyOperator(task_id="complete", dag=dag_daily)
for rec in get_tables():
    table_name = rec[9]
    s3_folder_name = rec[14]
    s3_object_name = rec[13]
    jdbcUrl = rec[4] + rec[8]
    lineagegraph = ",".join(rec[17].split("#"))
    entitlement = rec[10]
    remarks = rec[11]
    username = rec[5]
    password = rec[6]
    s3_folder_format = rec[16]
    select_query = rec[9]
    application_args = [select_query, s3_folder_name, jdbcUrl, lineagegraph, entitlement, remarks, username, password, s3_folder_format, s3_object_name]
    push_tables_list >> load_table(table_name, application_args) >> complete
Airflow 2.3.3
I have a DAG with two TaskGroups.
Each TaskGroup has two tasks:
t1: SparkKubernetesOperator >> t2: SparkKubernetesSensor
t1 submits a Spark job to the Kubernetes cluster using a spark-operator deployment YAML file; it goes into the dark-green SUCCESS state almost instantly.
t2 monitors the execution of t1; if the Spark job is running, it takes ~10 min to complete and then t2 goes into SUCCESS status.
Sometimes the submitted Spark job fails with ERROR: UnknownHostException, and that is when I want to retry, but I want to retry the whole TaskGroup, not only t2.
I know it is not possible to retry a whole TaskGroup.
How do I correctly retry a Spark job submitted to Kubernetes through Airflow 2.3.3?
from datetime import datetime, timedelta
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.utils.task_group import TaskGroup
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor
from alerts import slack_alert
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'max_active_runs': 1,
    'retries': 5,
    'retry_delay': timedelta(minutes=30),
    'on_failure_callback': slack_alert,
}
with DAG(
    "some-dag-name",
    default_args=default_args,
    description='submit some-dag-name',
    schedule_interval="30 4 * * *",
    start_date=datetime(2022, 8, 27),
) as dag:
    with TaskGroup("tg-some-task-name", default_args=default_args,) as tx_some_task_name:
        task_some_task_name = SparkKubernetesOperator(
            task_id='some-task-name',
            namespace="batch",
            application_file="k8s/some-task-name.yaml",
            do_xcom_push=True,
            dag=dag,
        )
        task_some_task_name_sensor = SparkKubernetesSensor(
            task_id='some-task-name-sensor',
            namespace="batch",
            application_name="{{ task_instance.xcom_pull(task_ids='tg-some-task-name.some-task-name')['metadata']['name'] }}",
            kubernetes_conn_id="kubernetes_default",
            dag=dag,
            retries=1,
            attach_log=True,
        )
        task_some_task_name >> task_some_task_name_sensor
    with TaskGroup("tg-some-other-task", default_args=default_args,) as tx_some_other_task:
        task_some_other_task = SparkKubernetesOperator(
            task_id='some-other-task',
            namespace="batch",
            application_file="k8s/some-other-task.yaml",
            do_xcom_push=True,
            dag=dag,
        )
        task_some_other_task_sensor = SparkKubernetesSensor(
            task_id='some-other-task-sensor',
            namespace="batch",
            application_name="{{ task_instance.xcom_pull(task_ids='tg-some-other-task.some-other-task')['metadata']['name'] }}",
            kubernetes_conn_id="kubernetes_default",
            dag=dag,
            retries=1,
            attach_log=True,
        )
    task_some_task_name_sensor >> task_some_other_task
    chain(task_some_other_task, task_some_other_task_sensor)
Airflow TaskGroup doesn't support retries, so you cannot retry t1 when t2 fails if they are in the same TaskGroup.
But there is another component that is more suitable for your use case: the SubDag. It is deprecated but still available in the latest version; I think it will only be removed once its features (like retries) are added to TaskGroup.
With a SubDag you can run a separate DAG and configure its retries and conf, and it is visible in the graph of your main DAG exactly like a TaskGroup. So you just need to create a new DAG containing your tasks t1 and t2, then replace the TaskGroup with a SubDagOperator task instance that runs this DAG, as sketched below.
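For illustration, here is a minimal sketch of that approach using the task and file names from the question. The helper name build_spark_subdag and the retry values are assumptions for the sketch, not code from the question:

from datetime import timedelta

from airflow import DAG
from airflow.operators.subdag import SubDagOperator
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor


def build_spark_subdag(parent_dag_id, child_id, default_args):
    # The child dag_id must be "<parent_dag_id>.<subdag_task_id>".
    with DAG(
        dag_id=f"{parent_dag_id}.{child_id}",
        default_args=default_args,
        schedule_interval=None,
    ) as subdag:
        submit = SparkKubernetesOperator(
            task_id="some-task-name",
            namespace="batch",
            application_file="k8s/some-task-name.yaml",
            do_xcom_push=True,
        )
        sensor = SparkKubernetesSensor(
            task_id="some-task-name-sensor",
            namespace="batch",
            application_name="{{ task_instance.xcom_pull(task_ids='some-task-name')['metadata']['name'] }}",
            kubernetes_conn_id="kubernetes_default",
            attach_log=True,
        )
        submit >> sensor
    return subdag


# In the main DAG, this single task replaces the TaskGroup; per the answer above,
# configuring retries here is what re-runs the whole submit + sensor pair.
tg_some_task_name = SubDagOperator(
    task_id="tg-some-task-name",
    subdag=build_spark_subdag("some-dag-name", "tg-some-task-name", default_args),
    retries=5,
    retry_delay=timedelta(minutes=30),
    dag=dag,
)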
Alternatively, you can handle this with a sensor callback. The concept:
the sensor exposes an on_retry_callback that calls a function; the upstream tasks to be cleared are passed to it via params and are cleared on each retry.
the sensor fails once its retry limit is reached.
utils.py
from airflow.models import taskinstance
from airflow.utils.db import provide_session
@provide_session
def clear_tasks(tis, session=None, activate_dag_runs=False, dag=None) -> None:
    taskinstance.clear_task_instances(
        tis=tis,
        session=session,
        activate_dag_runs=activate_dag_runs,
        dag=dag,
    )

def clear_upstream_task(context):
    tasks_to_clear = context["params"].get("tasks_to_clear", [])
    all_tasks = context["dag_run"].get_task_instances()
    tasks_to_clear = [ti for ti in all_tasks if ti.task_id in tasks_to_clear]
    clear_tasks(tasks_to_clear, dag=context["dag"])
then the task group:
from utils.callback_util import clear_upstream_task
with TaskGroup("tg-task", default_args=default_args) as some_task:
task1 = SparkKubernetesOperator(
task_id='task1',
namespace="batch",
application_file="k8s/task1.yaml",
do_xcom_push=True,
dag=dag,
)
task_proxy_tx_1d_parsed_sensor = SparkKubernetesSensor(
task_id='task1-sensor',
namespace="batch",
application_name="{{ task_instance.xcom_pull(task_ids='tg-task.task1')['metadata']['name'] }}",
kubernetes_conn_id="kubernetes_default",
dag=dag,
attach_log=True,
params={"tasks_to_clear": ["tg-task.task1"]},
on_retry_callback=clear_upstream_task
)
task1 >> task1_sensor
Initial question - https://stackoverflow.com/questions/70623990/retraining-a-machine-learning-model-using-airflow-pipeline
I have tried to clear a particular task and its downstream tasks using a BashOperator. However, what I see is that after the clear, this particular task is rerun in all of the previous DAG runs as well.
What I need is for the task to rerun only in the current DAG run, not in the previous ones.
When I do the same thing through the Airflow UI, the behaviour is as expected!
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
# Default settings applied to all tasks
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
}
with DAG('clear_upstream_task',
         start_date=datetime(2021, 1, 1),
         schedule_interval=None,
         default_args=default_args,
         catchup=False,
         ) as dag:
    t0 = DummyOperator(
        task_id='t0'
    )
    t1 = DummyOperator(
        task_id='t1'
    )
    t2 = DummyOperator(
        task_id='t2'
    )
    t3 = BashOperator(
        task_id='t3',
        bash_command='airflow tasks clear -t t1 -d -y clear_upstream_task'
    )
    t0 >> t1 >> t2 >> t3
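For what it's worth, a hedged sketch of one way to scope the clear to the current run only: bound the CLI call with the run's logical timestamp via the templated {{ ts }} value, using the Airflow 2 CLI's -s/--start-date and -e/--end-date options, which filter cleared task instances by execution date. This is untested against the question's setup:

t3 = BashOperator(
    task_id='t3',
    # -s/-e restrict the clear to task instances whose execution date falls in
    # this range; {{ ts }} is the current run's logical timestamp, so only the
    # task instances of the current DAG run should be cleared and rerun.
    bash_command='airflow tasks clear -t t1 -d -y '
                 '-s "{{ ts }}" -e "{{ ts }}" clear_upstream_task'
)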
I have a scenario like the one below. Task3 should be triggered on successful completion of Task1 and Task2. I have shared my code below; can you let me know what is missing in it for this scenario?
My code
from airflow import DAG
from airflow.contrib.sensors.aws_glue_catalog_partition_sensor import AwsGlueCatalogPartitionSensor
from datetime import datetime, timedelta
from airflow.operators.postgres_operator import PostgresOperator
from utils import FAILURE_EMAILS
yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time())
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': yesterday,
    'email': FAILURE_EMAILS,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}
dag = DAG('trigger_job', default_args=default_args, schedule_interval='@daily')
wait_for_task1 = AwsGlueCatalogPartitionSensor(
    task_id='Task1',
    database_name='db',
    table_name='table1',
    expression='load_date={{ ds_nodash }}',
    timeout=60,
    dag=dag)
wait_for_task2 = AwsGlueCatalogPartitionSensor(
    task_id='Task2',
    database_name='db',
    table_name='table2',
    expression='load_date={{ ds_nodash }}',
    timeout=60,
    dag=dag)
execute_sql = PostgresOperator(
    task_id='Task3',
    postgres_conn_id='REDSHIFT_CONN',
    sql="schema_do_lines.sql",
    params={'limit': '50'},
    dag=dag
)
execute_sql.set_upstream(wait_for_task1)
How can this be done in Airflow using Python?
You need to do one simple thing:
Set the dependencies right. At the moment you have only coded that execute_sql depends on wait_for_task1. You have to specify that execute_sql also depends on wait_for_task2 by adding the line execute_sql.set_upstream(wait_for_task2) at the end of your code.
BONUS: You may also want to look at the trigger_rule parameter in the task definition; you can read more about it in the documentation. In your specific case there is no need to set it explicitly, because it defaults to all_success (i.e. execute the task only if all its parents have succeeded), so execute_sql will only trigger once both of the tasks it depends on have succeeded.
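A minimal sketch of that wiring (both forms are equivalent; the bitshift form is just the more common idiom today):

# Task3 must wait for both sensors; with the default all_success trigger_rule,
# execute_sql runs only after wait_for_task1 and wait_for_task2 both succeed.
execute_sql.set_upstream(wait_for_task1)
execute_sql.set_upstream(wait_for_task2)

# Equivalent bitshift syntax:
# [wait_for_task1, wait_for_task2] >> execute_sql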
I am trying to understand how dynamic DAGs are created in Apache Airflow, as I need this to create dynamic DAGs in my project.
Below is the link I am following: Dynamic DAG creation in Apache Airflow.
Below is the code block for creating sample hello-world dynamic DAGs (dynamic DAG creation based on input parameters).
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
def create_dag(dag_id,
               schedule,
               dag_number,
               default_args):
    def hello_world_py(*args):
        print('Hello World')
        print('This is DAG: {}'.format(str(dag_number)))

    dag = DAG(dag_id,
              schedule_interval=schedule,
              default_args=default_args)

    with dag:
        t1 = PythonOperator(
            task_id='hello_world',
            python_callable=hello_world_py,
            dag_number=dag_number)

    return dag


# build a dag for each number in range(10)
for n in range(1, 10):
    dag_id = 'hello_world_{}'.format(str(n))
    default_args = {'owner': 'airflow',
                    'start_date': datetime(2018, 1, 1)
                    }
    schedule = '@daily'
    dag_number = n
    globals()[dag_id] = create_dag(dag_id,
                                   schedule,
                                   dag_number,
                                   default_args)
The expectation is to create 9 such DAGs. But what I see is that once I run the above code block with python3 code_sample.py, it does create 9 DAGs; however, the code shown for each DAG is the entire sample code.
To my understanding, each created DAG should contain only the code block below, which is the part inside the create_dag method of the sample above.
Expected DAG code:
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
def hello_world_py(*args):
    print('Hello World')
    print('This is DAG: {}'.format(str(dag_number)))

dag = DAG(dag_id,
          schedule_interval=schedule,
          default_args=default_args)

with dag:
    t1 = PythonOperator(
        task_id='hello_world',
        python_callable=hello_world_py,
        dag_number=dag_number)
Actual DAG code:
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
def create_dag(dag_id,
               schedule,
               dag_number,
               default_args):
    def hello_world_py(*args):
        print('Hello World')
        print('This is DAG: {}'.format(str(dag_number)))

    dag = DAG(dag_id,
              schedule_interval=schedule,
              default_args=default_args)

    with dag:
        t1 = PythonOperator(
            task_id='hello_world',
            python_callable=hello_world_py,
            dag_number=dag_number)

    return dag


# build a dag for each number in range(10)
for n in range(1, 10):
    dag_id = 'hello_world_{}'.format(str(n))
    default_args = {'owner': 'airflow',
                    'start_date': datetime(2018, 1, 1)
                    }
    schedule = '@daily'
    dag_number = n
    globals()[dag_id] = create_dag(dag_id,
                                   schedule,
                                   dag_number,
                                   default_args)
Can you let me know what is causing the above behaviour?
The code that you see in the Airflow UI when you click the "Code" tab is simply the source of the whole .py file. You can see how this view is implemented here:
https://github.com/apache/airflow/blob/master/airflow/www/views.py#L437
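In other words, every DAG object produced by that loop records the same source file, and the "Code" tab renders that file. A quick sketch to see this yourself (assuming the script above is saved as code_sample.py in your dags folder; dag.fileloc is the attribute Airflow uses to locate a DAG's source):

# Each generated DAG keeps a reference to the file in which it was created.
for n in range(1, 10):
    generated_dag = globals()['hello_world_{}'.format(n)]
    print(generated_dag.dag_id, generated_dag.fileloc)
# All 9 lines print the same path (code_sample.py), which is exactly the file
# the "Code" tab displays for each of the generated DAGs.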