Python operator is not being called in dynamic subdag in Airflow - python-3.x

I have created a SubDAG dynamically. Everything appears to work: main_dag runs fine and its PythonOperator callable is called. But the Python callables inside the SubDAG are never called. Kindly help me. I am new to Airflow, so I pieced this code together from different sources.
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
from copy import deepcopy
import airflow
main_default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime(2019, 12, 16),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
}
def sub_dag_method_a():
"""
sub dag method a
"""
import pdb;pdb.set_trace()
print('Subdag method a')
return 'a'
def sub_dag_method_b():
"""
sub dag method a
"""
print('Subdag method b')
return 'b'
# sub dag arguments
def create_subdag(dag_parent, dag_id_child_prefix, db_name, dag_child_id, start_date, schedule_interval):
# dag params
# import pdb;pdb.set_trace()
dag_id_child = '%s.%s_%s' % (dag_parent,dag_child_id,dag_id_child_prefix)
# main default
default_args_copy = deepcopy(main_default_args)
subdag = DAG(dag_id=dag_id_child, schedule_interval=schedule_interval,
start_date=start_date, default_args=default_args_copy)
# operators
tid_check = 'dummy_task_start_%s' % dag_id_child_prefix
print(tid_check)
method_start = DummyOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy)
tid_check = 'get_from_facebook_and_save_to_db_%s' % dag_id_child_prefix
print(tid_check)
method_a = PythonOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy,
python_callable=sub_dag_method_a)
tid_check = 'save_to_es_fetch_from_db_%s' % dag_id_child_prefix
print(tid_check)
method_b = PythonOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy,
provide_context=True,
python_callable=sub_dag_method_b)
tid_check = 'dummy_task_end_%s' % dag_id_child_prefix
print(tid_check)
method_end = DummyOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy)
method_start >> method_a
method_a >> method_b
method_b >> method_end
return subdag
# main default arguments
# main dag
main_dag = DAG('main_dag', default_args=deepcopy(main_default_args), schedule_interval=timedelta(hours=1),
start_date=datetime(2019, 12, 16))
# hello_world
def hello_world():
"""
Hello world
"""
i=0
subdag = create_subdag('main_dag', str(i), 'db_name'+str(i), 'task_dag',
main_dag.start_date, main_dag.schedule_interval)
# import pdb;pdb.set_trace()
sd_op = SubDagOperator(task_id='task_dag_'+str(i), subdag=subdag, dag=main_dag)
return subdag
# main task
main_task = PythonOperator(task_id='main_task', python_callable=hello_world, dag=main_dag)
# hello_world()
The output of running the command
airflow test 'main_dag' 'main_task' 2019/12/16
is:
(alphavu3711_1) Noamans-MacBook-Pro-2:python3 noamanfaisalbinbadar$ airflow test 'main_dag' 'main_task' 2019/12/16
[2019-12-16 21:56:10,312] {settings.py:252} INFO - settings.configure_orm(): Using pool settings. pool_size=5, max_overflow=10, pool_recycle=1800, pid=4100
[2019-12-16 21:56:11,119] {__init__.py:51} INFO - Using executor SequentialExecutor
[2019-12-16 21:56:11,119] {dagbag.py:92} INFO - Filling up the DagBag from /Users/noamanfaisalbinbadar/code/alphavu/production/python3/fb_messenger_airflow/dags
[2019-12-16 21:56:11,415] {taskinstance.py:630} INFO - Dependencies all met for <TaskInstance: main_dag.main_task 2019-12-16T00:00:00+00:00 [success]>
[2019-12-16 21:56:11,433] {taskinstance.py:630} INFO - Dependencies all met for <TaskInstance: main_dag.main_task 2019-12-16T00:00:00+00:00 [success]>
[2019-12-16 21:56:11,433] {taskinstance.py:841} INFO -
--------------------------------------------------------------------------------
[2019-12-16 21:56:11,433] {taskinstance.py:842} INFO - Starting attempt 2 of 1
[2019-12-16 21:56:11,433] {taskinstance.py:843} INFO -
--------------------------------------------------------------------------------
[2019-12-16 21:56:11,433] {taskinstance.py:862} INFO - Executing <Task(PythonOperator): main_task> on 2019-12-16T00:00:00+00:00
[2019-12-16 21:56:11,455] {python_operator.py:105} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_ID=main_dag
AIRFLOW_CTX_TASK_ID=main_task
AIRFLOW_CTX_EXECUTION_DATE=2019-12-16T00:00:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=scheduled__2019-12-16T00:00:00+00:00
dummy_task_start_0
get_from_facebook_and_save_to_db_0
save_to_es_fetch_from_db_0
dummy_task_end_0
[2019-12-16 21:56:11,459] {python_operator.py:114} INFO - Done. Returned value was: <DAG: main_dag.task_dag_0>
The new approach, after your answer, is this:
from fb_messenger.airflow_helpers.get_conversation_ids_page_wise import GetConversationIdsPageWise
from fb_messenger.airflow_helpers.get_conversation_messages_info import GetConversationMessagesInfo
from fb_messenger.airflow_helpers.save_to_es import SaveToES
from copy import deepcopy
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import airflow
main_default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
}
def create_subdag(dag_name, dag_name_prefix, start_date, schedule_interval, conversation_info):
# dag params
# import pdb;pdb.set_trace()
dag_name_processed = '%s_%s' % (dag_name, dag_name_prefix)
# main default
default_args_copy = deepcopy(main_default_args)
subdag = DAG(dag_name_processed, schedule_interval=schedule_interval, start_date=start_date,
default_args=deepcopy(main_default_args))
def sub_dag_method_a(**kwargs):
"""
sub dag method a
"""
print('Subdag method a')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'a'
def sub_dag_method_b(**kwargs):
"""
sub dag method a
"""
print('Subdag method b')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'b'
with subdag:
# operators
tid_check = 'dummy_task_start_%s' % dag_name_prefix
# print(tid_check)
method_start = DummyOperator(task_id=tid_check, dag=subdag)
# new tid
tid_check = 'get_from_facebook_and_save_to_db_%s' % dag_name_prefix
# print(tid_check)
method_a = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_a,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time':conversation_info['updated_time']})
# new tid
tid_check = 'save_to_es_fetch_from_db_%s' % dag_name_prefix
# print(tid_check)
method_b = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_b,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time':conversation_info['updated_time']})
# new tid
tid_check = 'dummy_task_end_%s' % dag_name_prefix
# print(tid_check)
method_end = DummyOperator(task_id=tid_check, dag=subdag)
# dependencies
method_start >> method_a
method_a >> method_b
method_b >> method_end
# return subdag
return subdag
start_date_ = datetime.now() + timedelta(minutes=-1)
# getting list of dictionaries
conversation_infos = GetConversationIdsPageWise().get_all()
print(conversation_infos)
print(len(conversation_infos))
for conversation_info in conversation_infos:
print(conversation_info)
i = conversation_info['id']
subdag_name = 'main_dag'
sub_dag = create_subdag(subdag_name, str(i), start_date_, timedelta(minutes=2), conversation_info)
print(sub_dag)
But I am still unable to create multiple DAGs.

It's impossible to create a SubDAG dynamically in the execute method of another operator, which is effectively what you are trying to do.
DAGs and their dependencies (including SubDAGs) are created while the python code is parsed and the objects at the top level of the python file are constructed. In this case that means creating the DAG and assigning it to the main_dag variable, then creating the PythonOperator and assigning it to main_task. This is all that happens during scheduling; the PythonOperator's callable is not called then.
When the task is executed and the callable is called, it is already too late to create a DAG. By that time the whole DAG structure and its dependencies have already been created and scheduling is done.
Basically, you can only create new DAGs (including SubDAGs) in the scheduler: the scheduler parses all the python code and creates the DAGs and their tasks. Individual tasks (for example the PythonOperator you mentioned) are then executed, when their time and dependencies are due, in one of the workers (not in the scheduler), and even if they create DAGs there, it does not affect the scheduler and the created DAGs are never scheduled.
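To make this concrete, here is a minimal sketch (reusing create_subdag, main_dag and main_default_args from your question): the SubDagOperator has to be instantiated at the top level of the DAG file, where the scheduler sees it while parsing, not inside the hello_world callable.
# Minimal sketch, assuming create_subdag() and main_dag as defined in the question.
# This runs at parse time, so the scheduler registers both the SubDagOperator
# and its subdag; nothing is created inside a PythonOperator callable.
from airflow.operators.subdag_operator import SubDagOperator

i = 0
subdag = create_subdag('main_dag', str(i), 'db_name' + str(i), 'task_dag',
                       main_dag.start_date, main_dag.schedule_interval)

# The subdag's dag_id ('main_dag.task_dag_0' here) must match
# '<parent_dag_id>.<subdag_task_id>' for the scheduler to pick it up.
sd_op = SubDagOperator(task_id='task_dag_' + str(i), subdag=subdag, dag=main_dag)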

I think you are trying to create SubDAGs dynamically based on the conversation info. I found a few issues in your updated code:
There should be a main DAG object, which needs to be passed to the subdag function.
The subdag function needs to be called through a SubDagOperator, which is missing in your code.
The SubDAG name needs to match the "parent_dag_name"."child_dag_name" pattern, not "parent_dag_name"_"child_dag_name".
The code below works for me:
from fb_messenger.airflow_helpers.get_conversation_ids_page_wise import GetConversationIdsPageWise
from fb_messenger.airflow_helpers.get_conversation_messages_info import GetConversationMessagesInfo
from fb_messenger.airflow_helpers.save_to_es import SaveToES
from copy import deepcopy
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import airflow
main_default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
}
def create_subdag(dag_name, dag_name_prefix, start_date, schedule_interval, conversation_info):
# dag params
# import pdb;pdb.set_trace()
dag_name_processed = '%s.%s' % (dag_name, dag_name_prefix)
# main default
default_args_copy = deepcopy(main_default_args)
subdag = DAG(dag_name_processed, schedule_interval=schedule_interval, start_date=start_date,
default_args=deepcopy(main_default_args))
def sub_dag_method_a(**kwargs):
"""
sub dag method a
"""
print('Subdag method a')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'a'
def sub_dag_method_b(**kwargs):
"""
sub dag method a
"""
print('Subdag method b')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'b'
with subdag:
# operators
tid_check = 'dummy_task_start_%s' % dag_name_prefix
# print(tid_check)
method_start = DummyOperator(task_id=tid_check, dag=subdag)
# new tid
tid_check = 'get_from_facebook_and_save_to_db_%s' % dag_name_prefix
# print(tid_check)
method_a = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_a,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time':conversation_info['updated_time']})
# new tid
tid_check = 'save_to_es_fetch_from_db_%s' % dag_name_prefix
# print(tid_check)
method_b = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_b,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time': conversation_info['updated_time']})
# new tid
tid_check = 'dummy_task_end_%s' % dag_name_prefix
# print(tid_check)
method_end = DummyOperator(task_id=tid_check, dag=subdag)
# dependencies
method_start >> method_a
method_a >> method_b
method_b >> method_end
# return subdag
return subdag
sd = datetime.now()
main_dag = DAG('main_dag', default_args=deepcopy(main_default_args), schedule_interval=timedelta(hours=1),
start_date = sd)
# getting list of dictionaries
conversation_infos = GetConversationIdsPageWise().get_all()
print(conversation_infos)
print(len(conversation_infos))
for conversation_info in conversation_infos:
print(conversation_info)
i = conversation_info['id']
subdag_name = 'main_dag'
t_sub_dag = SubDagOperator(
subdag=create_subdag(subdag_name, str(i), sd, timedelta(minutes=2), conversation_info),
task_id=str(i),
dag=main_dag
)

Related

I am relatively new to Airflow and Spark and want to use the KubernetesPodOperator in an Airflow DAG to run a spark-submit command.

I am using Kubernetes 1.25 (client and server) and have deployed Airflow using the official Helm charts in this environment. I want the KubernetesPodOperator in my Airflow DAG, which triggers the spark-submit operation, to spawn a driver pod and an executor pod that run the spark-submit command and perform the task. The DAG performs the following steps: 1. take a table from MySQL, 2. dump it to a text file, 3. put that file into a MinIO bucket (similar to AWS S3). Currently the driver pod spawns along with the executor pod, but the driver pod never reaches a running state and eventually fails, which causes the executor pod to fail as well. I am authenticating the calls to the Kubernetes API using a service account that I pass in as configuration.
This is my redacted DAG. Note that the spark-submit command works perfectly fine on the command line inside the image's container and produces the expected outcome, so I suspect it's some DAG configuration I am missing here. Also note that all the jars I am referring to are already part of the image and are referenced from /opt/spark/connectors/; I have verified this by doing exec inside the container image.
import logging
import csv
import airflow
from airflow import DAG
from airflow.utils import dates as date
from datetime import timedelta, datetime
from airflow.providers.apache.spark.operators.spark_jdbc import SparkSubmitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from dateutil.tz import tzlocal
from airflow.kubernetes.volume import Volume
from airflow.kubernetes.volume_mount import VolumeMount
import pendulum
#from airflow.models import Variables
local_tz = pendulum.timezone("Asia/Dubai")
volume_config = {"persistentVolumeClaim": {"claimName": "nfspvc-airflow-executable"}}
air_connectors_volume_config = {"persistentVolumeClaim": {"claimName": "nfspvc-airconnectors"}}
volume_mount = VolumeMount(
"data-volume",
mount_path="/air-spark/",
sub_path=None,
read_only=False,
)
air_connectors_volume_mount = VolumeMount(
"air-connectors",
mount_path="/air-connectors/",
sub_path=None,
read_only=False,
)
volume = Volume(
name="data-volume",
configs=volume_config
)
air_connectors_volume = Volume(
name="air-connectors",
configs=air_connectors_volume_config
)
default_args = {
'owner': 'panoiqtest',
'depends_on_past': False,
'start_date': datetime(2021, 5, 1, tzinfo=local_tz),
'retries': 1,
'retry_delay': timedelta(hours=1),
'email': ['admin@panoiq.com'],
'email_on_failure': False,
'email_on_retry': False
}
dag_daily = DAG(dag_id='operator',
default_args=default_args,
catchup=False,
schedule_interval='0 */1 * * *')
_config = {
'application': '/opt/spark/connectors/spark-etl-assembly-2.0.jar',
'num_executors': 2,
'driver_memory': '5G',
'executor_memory': '10G',
'driver_class_path':'/opt/spark/connectors/mysql-connector-java-5.1.49.jar',
'jars':'/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar',
#'java_class': 'com.spark.ETLHandler'
}
spark_config = {
"spark.executor.extraClassPath":"/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar",
"spark.driver.extraClassPath":"/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar"
}
t2 = BashOperator(
task_id='bash_example',
# "scripts" folder is under "/usr/local/airflow/dags"
bash_command="ls /air-spark/ && pwd",
dag=dag_daily)
def get_tables(table_file='/csv-directory/success-dag.csv', **kwargs):
logging.info("#Starting get_tables()#")
tables_list=[]
with open(table_file) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
tables_list= [row for row in reader]
tables_list.pop(0) #remove header
return tables_list
def load_table(table_name, application_args, **kwargs):
k8s_arguments = [
'--name=datalake-pod',
'--master=k8s://https://IP:6443',
'--deploy-mode=cluster',
# '--driver-cores=4',
# '--executor-cores=4',
# '--num-executors=1',
# '--driver-memory=8192m',
'--executor-memory=8192m',
'--conf=spark.kubernetes.authenticate.driver.serviceAccountName=air-airflow-sa',
'--driver-class-path=/opt/spark/connectors//mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar',
'--conf=spark.driver.extraJavaOptions=-Divy.cache.dir=/tmp -Divy.home=/tmp',
'--jars=/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/aws-java-sdk-bundle-1.12.374.jar,/opt/spark/connectors/hadoop-aws-3.3.1.jar',
'--conf=spark.kubernetes.namespace=development',
# '--conf=spark.driver.cores=4',
# '--conf=spark.executor.cores=4',
# '--conf=spark.driver.memory=8192m',
# '--conf=spark.executor.memory=8192m',
'--conf=spark.kubernetes.container.image=image_name',
'--conf=spark.kubernetes.container.image.pullSecrets=Secret_name',
'--conf=spark.kubernetes.container.image.pullPolicy=Always',
'--conf=spark.dynamicAllocation.enabled=true',
'--conf=spark.dynamicAllocation.shuffleTracking.enabled=true',
'--conf=spark.kubernetes.driver.volumes.persistentVolumeClaim.air-connectors.mount.path=/air-connectors/',
'--conf=spark.kubernetes.driver.volumes.persistentVolumeClaim.air-connectors.mount.readOnly=false',
'--conf=spark.kubernetes.driver.volumes.persistentVolumeClaim.air-connectors.options.claimName=nfspvc-airconnectors',
'--conf=spark.kubernetes.file.upload.path=/opt/spark',
'--class=com.spark.ETLHandler',
'/opt/spark/connectors/spark-etl-assembly-2.0.jar'
];
all_arguments = k8s_arguments + application_args
return KubernetesPodOperator(
dag=dag_daily,
name="zombie-dry-run", #spark_submit_for_"+table_name
# image='image_name',
image='imagerepo.io:5050/panoiq/tools:sparktester',
image_pull_policy = 'Always',
image_pull_secrets = 'registry',
namespace='development',
cmds=['spark-submit'],
arguments=all_arguments,
labels={"foo": "bar"},
task_id="dry_run_demo", #spark_submit_for_"+table_name
# config_file="conf",
volumes=[volume, air_connectors_volume],
volume_mounts=[volume_mount, air_connectors_volume_mount],
)
push_tables_list = PythonOperator(task_id= "load_tables_list",
python_callable=get_tables,
dag=dag_daily)
complete = DummyOperator(task_id="complete",
dag=dag_daily)
for rec in get_tables():
table_name = rec[9]
s3_folder_name = rec[14]
s3_object_name = rec[13]
jdbcUrl = rec[4] + rec[8]
lineagegraph = ",".join(rec[17].split("#"))
entitlement = rec[10]
remarks = rec[11]
username = rec[5]
password = rec[6]
s3_folder_format = rec[16]
select_query = rec[9]
application_args= [select_query, s3_folder_name, jdbcUrl, lineagegraph,entitlement, remarks,username,password,s3_folder_format,s3_object_name]
push_tables_list >> load_table(table_name, application_args) >> complete
Any help or pointers on this issue are appreciated! Thanks in advance!
I was able to fix this issue with the code below. I used the Airflow pod itself as the driver, so it just spawns an executor pod, runs the job, and terminates once the job flow is complete.
Below is my Python file for anyone who needs to do this:
import logging
import csv
import airflow
from airflow import DAG
from airflow.utils import dates as date
from datetime import timedelta, datetime
from airflow.providers.apache.spark.operators.spark_jdbc import SparkSubmitOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
#from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
from dateutil.tz import tzlocal
from airflow.kubernetes.volume import Volume
from airflow.kubernetes.volume_mount import VolumeMount
import pendulum
#from airflow.models import Variables
local_tz = pendulum.timezone("Asia/Dubai")
default_args = {
'owner': 'test',
'depends_on_past': False,
'start_date': datetime(2021, 5, 1, tzinfo=local_tz),
'retries': 1,
'retry_delay': timedelta(hours=1),
'email': ['admin@test.com'],
'email_on_failure': False,
'email_on_retry': False
}
dag_daily = DAG(dag_id='datapipeline',
default_args=default_args,
catchup=False,
schedule_interval='@hourly')
start = DummyOperator(task_id='run_this_first', dag=dag_daily)
_config = {
'application': '/air-spark/spark-etl-assembly-2.0.jar',
'num_executors': 2,
'driver_memory': '5G',
'executor_memory': '10G',
'driver_class_path':'/air-connectors/mysql-connector-java-5.1.49.jar',
'jars':'/air-connectors/mysql-connector-java-5.1.49.jar,/air-connectors/aws-java-sdk-bundle-1.12.374.jar,/air-connectors/hadoop-aws-3.3.1.jar',
#'java_class': 'com.spark.ETLHandler'
}
spark_config = {
"spark.executor.extraClassPath":"/air-connectors/mysql-connector-java-5.1.49.jar,/air-connectors/aws-java-sdk-bundle-1.12.374.jar,/air-connectors/hadoop-aws-3.3.1.jar",
"spark.driver.extraClassPath":"/air-connectors/mysql-connector-java-5.1.49.jar,/air-connectors/aws-java-sdk-bundle-1.12.374.jar,/air-connectors/hadoop-aws-3.3.1.jar"
}
t2 = BashOperator(
task_id='bash_example',
# "scripts" folder is under "/usr/local/airflow/dags"
bash_command="ls /air-spark/ && pwd",
dag=dag_daily)
def get_tables(table_file='/csv-directory/success-dag.csv', **kwargs):
logging.info("#Starting get_tables()#")
tables_list=[]
with open(table_file) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
tables_list= [row for row in reader]
tables_list.pop(0) #remove header
return tables_list
def load_table(table_name, application_args, **kwargs):
k8s_arguments = [ "--master", "local[*]", "--conf", "spark.executor.extraClassPath=/air-connectors/mysql-connector-java-5.1.49.jar",
"--conf", "spark.driver.extraClassPath=/opt/spark/connectors/mysql-connector-java-5.1.49.jar", "--jars",
"/opt/spark/connectors/mysql-connector-java-5.1.49.jar,/opt/spark/connectors/ojdbc11-21.7.0.0.jar",
"--conf=spark.kubernetes.container.image=imagerepo.io:5050/tools:sparktesterV0.6",
"--conf=spark.kubernetes.container.image.pullSecrets=registry",
"--num-executors", "5", "--executor-memory", "1G", "--driver-memory", "2G", "--class=com.spark.ETLHandler",
"--name", "arrow-spark", "/opt/spark/connectors/spark-etl-assembly-2.0.jar" ];
all_arguments = k8s_arguments + application_args
# spark =
return KubernetesPodOperator(
image="imagerepo.io:5050/tools:sparktesterV0.6",
service_account_name="air-airflow-worker",
name="data_pipeline_k8s",
task_id="data_pipeline_k8s",
get_logs=True,
dag=dag_daily,
namespace="development",
image_pull_secrets="registry",
image_pull_policy="Always",
cmds=["spark-submit"],
arguments=all_arguments
)
# spark.set_upstream(start)
push_tables_list = PythonOperator(task_id= "load_tables_list",python_callable=get_tables,dag=dag_daily)
complete = DummyOperator(task_id="complete",dag=dag_daily)
for rec in get_tables():
table_name = rec[9]
s3_folder_name = rec[14]
s3_object_name = rec[13]
jdbcUrl = rec[4] + rec[8]
lineagegraph = ",".join(rec[17].split("#"))
entitlement = rec[10]
remarks = rec[11]
username = rec[5]
password = rec[6]
s3_folder_format = rec[16]
select_query = rec[9]
application_args= [select_query, s3_folder_name, jdbcUrl, lineagegraph,entitlement, remarks,username,password,s3_folder_format,s3_object_name]
push_tables_list >> load_table(table_name, application_args) >> complete

Clear tasks in Airflow

Initial question - https://stackoverflow.com/questions/70623990/retraining-a-machine-learning-model-using-airflow-pipeline
I have tried to clear a particular task and its downstream tasks using a BashOperator. However, what I see is that after the clear, the task is rerun in all the previous DAG runs.
What I need is for the task to rerun in the current DAG run only, not in the previous ones.
When I do this through the Airflow UI, the behaviour is as expected.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
# Default settings applied to all tasks
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
}
with DAG('clear_upstream_task',
start_date=datetime(2021, 1, 1),
schedule_interval=None,
default_args=default_args,
catchup=False,
) as dag:
t0 = DummyOperator(
task_id='t0'
)
t1 = DummyOperator(
task_id='t1'
)
t2 = DummyOperator(
task_id='t2'
)
t3 = BashOperator(
task_id='t3',
bash_command='airflow tasks clear -t t1 -d -y clear_upstream_task'
)
t0 >> t1 >> t2 >> t3
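One way to limit the clear to the current run (a sketch, not verified against your setup): bound the clear to the run's logical date with the -s/--start-date and -e/--end-date options of airflow tasks clear, using the templated {{ ds }} value in bash_command:
t3 = BashOperator(
    task_id='t3',
    # {{ ds }} renders to this run's logical date, so only the current DAG run is cleared
    bash_command='airflow tasks clear -t t1 -d -y -s {{ ds }} -e {{ ds }} clear_upstream_task'
)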

Cannot run a spider successfully after finishing scraping data by another spider through running a script

I am following code from these previous Stack Overflow posts:
How to schedule Scrapy crawl execution programmatically
Running Scrapy multiple times in the same process
The following script works well when using one spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
# from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode
with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
PList = yaml.load(file, Loader=yaml.FullLoader)
Purl= []
for k, v in PList.items():
arg = v['M_title']
args = {"k": arg}
amazon_url= 'https://www.amazon.com/s?{}'.format(urlencode(args))
Purl.append(amazon_url)
print(Purl)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings = get_project_settings())
@inlineCallbacks
def loop_urls(urls):
for url in urls:
yield runner.crawl(AmazonfeedSpider, url)
# reactor.stop()
loop_urls(Purl)
reactor.run()
But this script doesn't even scrape successfully with the first spider, and never reaches the second spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode
# def crawl_job():
# """
# Job to start spiders.
# Return Deferred, which will execute after crawl has completed.
# """
# settings = get_project_settings()
# runner = CrawlerRunner(settings)
# return runner.crawl(AmazonfeedSpider)
def CrawlProduct():
settings = get_project_settings()
runner2 = CrawlerRunner(settings)
yield runner2.crawl(ProductfeedSpider)
reactor.stop()
def schedule_next_crawl(null, sleep_time):
"""
Schedule the next crawl
"""
reactor.callLater(sleep_time, CrawlProduct)
@inlineCallbacks
def loop_urls(urls):
"""
# Job to start spiders.
# Return Deferred, which will execute after crawl has completed.
# """
settings = get_project_settings()
runner = CrawlerRunner(settings)
for url in urls:
yield runner.crawl(AmazonfeedSpider, url)
# reactor.stop()
def crawl(Purl):
"""
A function that schedules a crawl 30 seconds after
each successful crawl.
"""
# loop_urls() returns a Deferred
d = loop_urls(Purl)
# call schedule_next_crawl(<scrapy response>, n) after crawl job is complete
d.addCallback(schedule_next_crawl, 30)
d.addErrback(catch_error)
def catch_error(failure):
print(failure.value)
if __name__=="__main__":
with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
PList = yaml.load(file, Loader=yaml.FullLoader)
Purl= []
for k, v in PList.items():
arg = v['M_title']
args = {"k": arg}
amazon_url= 'https://www.amazon.com/s?{}'.format(urlencode(args))
Purl.append(amazon_url)
print(Purl)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
crawl(Purl)
reactor.run()
Is it because the inlineCallbacks function is not being executed properly? I would appreciate any suggestions and solutions; please look at the Stack Overflow questions and solutions linked above before answering my question.
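For reference, the sequential-crawl pattern from the Scrapy documentation that the code above seems to be aiming for is a single @defer.inlineCallbacks generator that chains all crawls and only stops the reactor at the end (a sketch reusing AmazonfeedSpider, ProductfeedSpider and Purl from the script above):
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())

@defer.inlineCallbacks
def crawl(urls):
    # crawl each search URL with the first spider, one after another
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # then run the second spider once all of the above have finished
    yield runner.crawl(ProductfeedSpider)
    reactor.stop()

crawl(Purl)
reactor.run()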

No module named airflow.gcp - how to run a Dataflow job that uses Python 3 / Beam 2.15?

When I go to use operators/hooks like the BigQueryHook, I see a message that these operators are deprecated and that I should use the airflow.gcp... operator version. However, when I try to use it in my DAG, it fails and says no module named airflow.gcp. I have the most up-to-date Airflow Composer version with beta features and Python 3. Is it possible to install these operators somehow?
I am trying to run a Dataflow job in Python 3 using Beam 2.15. I have tried the virtualenv operator, but that doesn't work because it only allows Python 2.7. How can I do this?
The newest Airflow version available in Composer is either 1.10.2 or 1.10.3 (depending on the region). By then, those operators were in the contrib section.
Focusing on how to run Python 3 Dataflow jobs with Composer, you'd need to wait for a new version to be released. However, if you need an immediate solution, you can try to back-port the fix.
In this case I defined a DataFlow3Hook, which extends the normal DataFlowHook but does not hard-code python2 in the start_python_dataflow method:
class DataFlow3Hook(DataFlowHook):
def start_python_dataflow(
...
py_interpreter: str = "python3"
):
...
self._start_dataflow(variables, name, [py_interpreter] + py_options + [dataflow],
label_formatter)
Then we'll have our custom DataFlowPython3Operator calling the new hook:
class DataFlowPython3Operator(DataFlowPythonOperator):
def execute(self, context):
...
hook = DataFlow3Hook(gcp_conn_id=self.gcp_conn_id,
delegate_to=self.delegate_to,
poll_sleep=self.poll_sleep)
...
hook.start_python_dataflow(
self.job_name, formatted_options,
self.py_file, self.py_options, py_interpreter="python3")
Finally, in our DAG we just use the new operator:
task = DataFlowPython3Operator(
py_file='/home/airflow/gcs/data/main.py',
task_id=JOB_NAME,
dag=dag)
See full code here. Job runs with Python 3.6:
Environment details and dependencies used (Beam job was a minimal example):
softwareConfig:
imageVersion: composer-1.8.0-airflow-1.10.3
pypiPackages:
apache-beam: ==2.15.0
google-api-core: ==1.14.3
google-apitools: ==0.5.28
google-cloud-core: ==1.0.3
pythonVersion: '3'
Let me know if that works for you. If so, I'd recommend moving the code to a plugin for code readability and to reuse it across DAGs.
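A rough sketch of that plugin approach, as an Airflow 1.10-style plugin (the file path and class names here are just examples); it registers the DataFlow3Hook and DataFlowPython3Operator defined above so any DAG can reuse them:
# plugins/dataflow_python3_plugin.py (illustrative path)
from airflow.plugins_manager import AirflowPlugin

# DataFlow3Hook and DataFlowPython3Operator are the classes defined above.

class DataflowPython3Plugin(AirflowPlugin):
    name = "dataflow_python3_plugin"
    hooks = [DataFlow3Hook]
    operators = [DataFlowPython3Operator]
With Airflow 1.10 plugins, the operator then becomes importable in DAG files as from airflow.operators.dataflow_python3_plugin import DataFlowPython3Operator.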
As an alternative, you can use the PythonVirtualenvOperator on older airflow versions. Given some beam pipeline (wrapped in a function) saved as dataflow_python3.py:
def main():
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
import argparse
import logging
class ETL(beam.DoFn):
def process(self, row):
#do data processing
def run(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://bucket/input/input.txt',
help='Input file to process.'
)
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
'--runner=DataflowRunner',
'--project=project_id',
'--region=region',
'--staging_location=gs://bucket/staging/',
'--temp_location=gs://bucket/temp/',
'--job_name=job_id',
'--setup_file=./setup.py'
])
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = True
with beam.Pipeline(options=pipeline_options) as p:
rows = (p | 'read rows' >> beam.io.ReadFromText(known_args.input))
etl = (rows | 'process data' >> beam.ParDo(ETL()))
logging.getLogger().setLevel(logging.DEBUG)
run()
You can run it using the following DAG file:
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonVirtualenvOperator
import sys
import dataflow_python3 as py3 #import your beam pipeline file here
default_args = {
'owner': 'John Smith',
'depends_on_past': False,
'start_date': datetime(2016, 1, 1),
'email': ['email@gmail.com'],
'email_on_failure': True,
'email_on_retry': False,
'retries': 3,
'retry_delay': timedelta(minutes=1),
}
CONNECTION_ID = 'proj_id'
with DAG('Dataflow_Python3', schedule_interval='@once', template_searchpath=['/home/airflow/gcs/dags/'], max_active_runs=15, catchup=True, default_args=default_args) as dag:
dataflow_python3 = PythonVirtualenvOperator(
task_id='dataflow_python3',
python_callable=py3.main, #this is your beam pipeline callable
requirements=['apache-beam[gcp]', 'pandas'],
python_version=3,
dag=dag
)
dataflow_python3
I have run Python 3 Beam 2.17 using DataflowTemplateOperator and it worked like a charm.
Use the command below to create the template:
python3 -m scriptname --runner DataflowRunner --project project_id --staging_location staging_location --temp_location temp_location --template_location template_location/script_metadata --region region --experiments use_beam_bq_sink --no_use_public_ips --subnetwork=subnetwork
scriptname is the name of your Dataflow Python file (without the .py extension).
--template_location - the location where the Dataflow template will be created; don't add an extension like .json to it. Simply scriptname_metadata works.
--experiments use_beam_bq_sink - use this parameter if your sink is BigQuery; otherwise you can remove it.
import datetime as dt
import time
from airflow.models import DAG
from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator
lasthour = dt.datetime.now() - dt.timedelta(hours=1)
args = {
'owner': 'airflow',
'start_date': lasthour,
'depends_on_past': False,
'dataflow_default_options': {
'project': "project_id",
'staging_location': "staging_location",
'temp_location': "temp_location",
'region': "region",
'runner': "DataflowRunner",
'job_name': 'job_name' + str(time.time()),
},
}
dag = DAG(
dag_id='employee_dataflow_dag',
schedule_interval=None,
default_args=args
)
Dataflow_Run = DataflowTemplateOperator(
task_id='dataflow_pipeline',
template='template_location/script_metadata',
parameters ={
'input':"employee.csv",
'output':'project_id:dataset_id.table',
'region':"region"
},
gcp_conn_id='google_cloud_default',
poll_sleep=15,
dag=dag
)
Dataflow_Run

Dynamic DAG creation not working as expected in Apache Airflow

I was trying to understand how dynamic DAGs are created in Apache Airflow, as I need this to create dynamic DAGs in my project.
Below is the link I am following: Dynamic DAG creation in Apache airflow
Below is the code block for creating sample hello-world dynamic DAGs (dynamic DAG creation based on input parameters).
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
def create_dag(dag_id,
schedule,
dag_number,
default_args):
def hello_world_py(*args):
print('Hello World')
print('This is DAG: {}'.format(str(dag_number)))
dag = DAG(dag_id,
schedule_interval=schedule,
default_args=default_args)
with dag:
t1 = PythonOperator(
task_id='hello_world',
python_callable=hello_world_py,
dag_number=dag_number)
return dag
# build a dag for each number in range(10)
for n in range(1, 10):
dag_id = 'hello_world_{}'.format(str(n))
default_args = {'owner': 'airflow',
'start_date': datetime(2018, 1, 1)
}
schedule = '@daily'
dag_number = n
globals()[dag_id] = create_dag(dag_id,
schedule,
dag_number,
default_args)
The expectation is to create 9 such DAGs. But what I can see is that once I compile the above code block with python3 code_sample.py, it creates 9 DAGs, yet the code embedded in each DAG is the entire sample code.
To my understanding, the created DAGs should contain only the code block below, which is what is available inside the create_dag method in the sample code above.
Expected DAG code:
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
def hello_world_py(*args):
print('Hello World')
print('This is DAG: {}'.format(str(dag_number)))
dag = DAG(dag_id,
schedule_interval=schedule,
default_args=default_args)
with dag:
t1 = PythonOperator(
task_id='hello_world',
python_callable=hello_world_py,
dag_number=dag_number)
Actual DAG code:
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
def create_dag(dag_id,
schedule,
dag_number,
default_args):
def hello_world_py(*args):
print('Hello World')
print('This is DAG: {}'.format(str(dag_number)))
dag = DAG(dag_id,
schedule_interval=schedule,
default_args=default_args)
with dag:
t1 = PythonOperator(
task_id='hello_world',
python_callable=hello_world_py,
dag_number=dag_number)
return dag
# build a dag for each number in range(10)
for n in range(1, 10):
dag_id = 'hello_world_{}'.format(str(n))
default_args = {'owner': 'airflow',
'start_date': datetime(2018, 1, 1)
}
schedule = '@daily'
dag_number = n
globals()[dag_id] = create_dag(dag_id,
schedule,
dag_number,
default_args)
Let me know what is causing the above problem.
The code that you see in the Airflow UI when clicking on the "Code" tab is simply the whole .py file's source code. See how this function is implemented:
https://github.com/apache/airflow/blob/master/airflow/www/views.py#L437
