I installed Apache Airflow on Azure Kubernetes Service (AKS) and I want to run a DAG as a specific Unix user.
I added run_as_user to my DAG, but it's not working:
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'run_as_user': 'ssaxen3',
}

with DAG(
    dag_id='sanity_check',
    schedule_interval=None,
    start_date=days_ago(1),
    tags=['platform'],
    default_args=default_args,
) as dag:
    sanity_check = BashOperator(
        task_id='sanity_check',
        bash_command='echo "Hello"',
    )

    sanity_check

if __name__ == "__main__":
    dag.cli()
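For context, run_as_user relies on Unix impersonation: Airflow re-runs the task command through sudo, so (as far as I understand) the target user has to exist inside the worker/pod image and the airflow user needs passwordless sudo there, otherwise the task fails or the setting has no visible effect. A minimal sketch that sets it per task instead of via default_args ('ssaxen3' is simply the username from the question):

sanity_check = BashOperator(
    task_id='sanity_check',
    bash_command='whoami',    # quick way to see which Unix user actually runs the command
    run_as_user='ssaxen3',    # this user must exist inside the worker image
)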
How can I use Django's validators as a standalone component in non-Django applications?
Currently I use the Django ORM in my applications.
manager.py
import os
from dotenv import load_dotenv

load_dotenv()

def init_django():
    import django
    from django.conf import settings

    if settings.configured:
        return

    settings.configure(
        TIME_ZONE=False,
        INSTALLED_APPS=[
            'db',
        ],
        DATABASES={
            'default': {
                'ENGINE': 'django.db.backends.postgresql',
                'NAME': os.environ.get('db_name'),
                'USER': os.environ.get('db_user'),
                'PASSWORD': os.environ.get('db_password'),
                'HOST': os.environ.get('db_host'),
                'PORT': os.environ.get('db_port'),
            }
        }
    )
    django.setup()

if __name__ == "__main__":
    from django.core.management import execute_from_command_line
    init_django()
    execute_from_command_line()
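For what it's worth, once settings are configured (for example via the init_django() helper above), the validators in django.core.validators can be used on their own; a minimal sketch:

# validators_demo.py -- assumes the init_django() helper from manager.py above
from manager import init_django

init_django()

from django.core.exceptions import ValidationError
from django.core.validators import MaxValueValidator, validate_email

try:
    validate_email("not-an-email")      # raises ValidationError
except ValidationError as exc:
    print(exc.messages)

MaxValueValidator(10)(5)                # passes silently
# MaxValueValidator(10)(42)             # would raise ValidationError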
I have a DAG that uses a TriggerDagRunOperator to re-trigger itself. The DAG code is shared below.
from datetime import timedelta, datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dagrun_operator import TriggerDagRunOperator

default_args = {
    'owner': 'ownername',
    'depends_on_past': False,
    'start_date': datetime(2021, 3, 2, 10, 1),
    'email': ['***#mail.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG('self_trigger_dag', schedule_interval=None, max_active_runs=1, catchup=False, default_args=default_args) as dag:

    sleep_task = BashOperator(
        task_id='sleep_task',
        bash_command='sleep 180',
        dag=dag,
    )

    bash_command = BashOperator(
        task_id='run_command',
        bash_command="my bash_command",
        dag=dag,
    )

    dag_trigger = TriggerDagRunOperator(
        task_id='trigger_self',
        trigger_dag_id='self_trigger_dag',
        dag=dag)

    sleep_task >> bash_command >> dag_trigger
The requirement is that the DAG should run only from 8 AM to 9 PM. I cannot use an expression like '* 8-21 * * *' because this is a self-triggering DAG. Kindly help me with the correct crontab expression or any other alternative.
Thanks in advance.
I was able to achieve the requirement using a time_check DAG that controls my main DAG. Here I am triggering the main DAG at 12 PM and switching it off at 1 AM using the time_check DAG.
The time_check DAG code is shared below:
from datetime import timedelta, datetime, timezone, date

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.email_operator import EmailOperator
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.models import Variable

default_args = {
    'owner': 'owner',
    'depends_on_past': False,
    'start_date': datetime(2021, 3, 2, 10, 1),
    'email': ['xxx#xxx.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

airflow_variable = 'stream_dag_status'

def check_current_time(**context):
    now_utc = datetime.now(timezone.utc)
    start_utc = now_utc.replace(hour=12, minute=0, second=0, microsecond=0)
    end_utc = now_utc.replace(hour=1, minute=0, second=0, microsecond=0) + timedelta(days=1)
    if now_utc >= start_utc and now_utc < end_utc:
        Variable.set(airflow_variable, 'START')
        start_stream = 'start_stream'
        return start_stream
    else:
        update_variable = 'update_variable'
        return update_variable

def set_airflow_variable(**context):
    Variable.set(airflow_variable, 'STOP')

with DAG('time_check', schedule_interval='0 12,1 * * *', max_active_runs=1, catchup=False,
         default_args=default_args) as dag:

    check_current_time = BranchPythonOperator(task_id='check_current_time', python_callable=check_current_time,
                                              provide_context=True,
                                              dag=dag)

    start_stream = TriggerDagRunOperator(
        task_id='start_stream',
        trigger_dag_id='STREAMING_TEST',
        dag=dag)

    update_variable = PythonOperator(task_id='update_variable', python_callable=set_airflow_variable,
                                     provide_context=True,
                                     dag=dag)

    stop_stream_email = EmailOperator(task_id='stop_stream_email', to='xxx#xxx.com',
                                      subject='Streaming DAG is OFF now',
                                      html_content="<p>Hi,<br><br>Turning streaming DAG to OFF state<br>", dag=dag)

    check_current_time >> start_stream
    check_current_time >> update_variable >> stop_stream_email
The self-triggering DAG code is shared below:
from datetime import timedelta, datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.email_operator import EmailOperator
from airflow.models import Variable

default_args = {
    'owner': 'owner',
    'depends_on_past': False,
    'start_date': datetime(2021, 3, 2, 10, 1),
    'email': ['xxx#xxx.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

airflow_variable = 'stream_dag_status'

def check_airflow_variable(**context):
    status = Variable.get(airflow_variable)
    if status == 'START':
        sleep_task = 'sleep_task'
        return sleep_task
    else:
        send_email = 'email_notify'
        return send_email

with DAG('STREAMING_TEST', schedule_interval=None, max_active_runs=1, catchup=False, default_args=default_args) as dag:

    check_airflow_variable = BranchPythonOperator(task_id='check_airflow_variable', python_callable=check_airflow_variable,
                                                  provide_context=True,
                                                  dag=dag)

    sleep_task = BashOperator(
        task_id='sleep_task',
        bash_command='sleep 60',
        dag=dag,
    )

    start_group = DummyOperator(task_id='start_split', depends_on_past=False)

    dag_trigger = TriggerDagRunOperator(
        task_id='trigger_self',
        trigger_dag_id='STREAMING_TEST',
        dag=dag)

    email_notify = EmailOperator(task_id='email_notify', to='xxx#xxx#tegna.com',
                                 subject='Variable value is STOP',
                                 html_content="<p>Hi,<br><br>Streaming is stopped<br>", dag=dag)

    check_airflow_variable >> sleep_task >> start_group >> dag_trigger
    check_airflow_variable >> email_notify
I'm very new to working with Jenkins. So far I was able to run a simple pipeline with a simple pip install, but now I need to pass global credentials from Jenkins into a Python script (test.py) invoked by the Jenkinsfile.
pipeline {
    options {
        timeout(time: 30, unit: 'MINUTES')
        buildDiscarder(logRotator(numToKeepStr: '30', artifactNumToKeepStr: '30'))
    }
    agent { label 'ops_slave' }
    stages {
        stage('Environment Build') {
            steps {
                echo "Hello World!"
                sh "echo Hello from the shell"
                sh "hostname"
                sh "uptime"
                sh "python3 -m venv test_env"
                sh "source ./test_env/bin/activate"
                sh "pip3 install pandas psycopg2"
                sh """echo the script is working"""
                withCredentials([[
                    $class: 'UsernamePasswordMultiBinding',
                    credentialsId: 98,
                    usernameVariable: 'user',
                    passwordVariable: 'pw',
                ]])
                sh """python3 bartek-jenkins-testing/python/test.py"""
            }
        }
    }
}
I've seen implementations that use argparse, but that's above my level at this point, and I believe there is a way to reference the credentials from Jenkins and pass them to the Python script directly. I've been googling for some time now, but I'm not sure the questions I'm asking are correct.
My Python script should be able to get the username and password from the Jenkins global credentials with ID 98:
print('Hello World this is python')
import pandas as pd
print(pd.__version__)
import pyodbc
import psycopg2
# can pass environment variables
connection = psycopg2.connect(
host="saturn-dv",
database="saturn_dv",
port='8080',
user='saturn_user_bartek_malysz',
password='')
connection.set_session(readonly=True)
query = """
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_schema,table_name;"""
data = pd.read_sql(query, connection)
print(data)
A straightforward way is to leverage environment variables, as follows:
// Jenkinsfile
withCredentials([[
    $class: 'UsernamePasswordMultiBinding',
    credentialsId: 98,
    usernameVariable: 'user',
    passwordVariable: 'pw',
]]) {
    sh """
    export DB_USERNAME="${user}"
    export DB_PASSWORD="${pw}"
    python3 bartek-jenkins-testing/python/test.py
    """
}
# test.py
import os

import psycopg2

connection = psycopg2.connect(
    host="saturn-dv",
    database="saturn_dv",
    port='8080',
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'))
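As a side note, withCredentials already exposes the bound names ('user' and 'pw' above) as environment variables for any process started inside its block, so the export step can be skipped and test.py can read them directly; a minimal sketch, assuming the script is still called from inside the withCredentials block:

# test.py (alternative sketch) -- assumes the Jenkinsfile keeps the default
# binding names 'user' and 'pw' and calls the script inside withCredentials
import os

username = os.getenv('user')      # bound by usernameVariable in withCredentials
password = os.getenv('pw')        # bound by passwordVariable in withCredentials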
I am trying to create a Snowflake connection in Airflow programmatically using a DAG. However, after running the DAG file in Airflow, the connection is created without a password and connection type. Please advise. The Python code is below:
import json
from datetime import timedelta

import airflow
from airflow import DAG, settings
from airflow.models import Connection
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['me#me.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'MakeConnection',
    default_args=default_args,
    description='Test',
    schedule_interval=timedelta(days=1),
)

def BuildConnection():
    conn = Connection(
        conn_id='SNOWFLAKECONNECTION',
        conn_type='Snowflake',
        host='abc.snowflakecomputing.com',
        login='AIRFLOW',
        password='123',
        schema='PUBLIC',
        port=443,
        extra=json.dumps(dict(Account='abc')),
    )
    session = settings.Session()  # get the session
    session.add(conn)
    session.commit()

run_Rule_SOW = PythonOperator(
    task_id='BuildConnection',
    python_callable=BuildConnection,
    dag=dag,
)
Assuming this is using the Snowflake Python connector, I think you might have the wrong parameter for login; it should be user. It's hard to tell from your post what it is that you are seeing, though. Are you getting an error message that you could share?
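If it helps with debugging, here is a minimal sketch (reusing the same settings.Session() approach as the DAG above; SNOWFLAKECONNECTION is the conn_id from the question) that prints what was actually stored for that connection:

from airflow import settings
from airflow.models import Connection

session = settings.Session()
conn = session.query(Connection).filter(Connection.conn_id == 'SNOWFLAKECONNECTION').first()
if conn:
    # print the password only as a boolean so the secret is not echoed
    print(conn.conn_id, conn.conn_type, conn.host, conn.login, bool(conn.password))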
I'm playing around with Google Cloud Composer, trying to create a DAG that creates a Dataproc cluster, runs a simple Spark job, then tears down the cluster. I am trying to run the Spark Pi example job.
I understand that when calling DataProcSparkOperator I can choose to define only the main_jar or the main_class property. When I define main_class, the job fails with the error:
java.lang.ClassNotFoundException: org.apache.spark.examples.SparkPi
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.util.Utils$.classForName(Utils.scala:239)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:851)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
When I choose to define the main_jar property, the job fails with the error:
Error: No main class set in JAR; please specify one with --class
Run with --help for usage help or --verbose for debug output
I'm at a bit of a loss as to how to resolve this, as I'm fairly new to both Spark and Dataproc.
My DAG:
import datetime as dt

from airflow import DAG, models
from airflow.contrib.operators import dataproc_operator as dpo
from airflow.utils import trigger_rule

MAIN_JAR = 'file:///usr/lib/spark/examples/jars/spark-examples.jar'
MAIN_CLASS = 'org.apache.spark.examples.SparkPi'
CLUSTER_NAME = 'quickspark-cluster-{{ ds_nodash }}'

yesterday = dt.datetime.combine(
    dt.datetime.today() - dt.timedelta(1),
    dt.datetime.min.time())

default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit', schedule_interval='0 17 * * *',
         default_args=default_dag_args) as dag:

    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id=default_dag_args['project_id'],
        task_id='create_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        zone=models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id='run_spark_job',
        # main_jar=MAIN_JAR,
        main_class=MAIN_CLASS,
        cluster_name=CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id=default_dag_args['project_id'],
        task_id='delete_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE
    )

    create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
I compared it with a successful job submitted via the CLI and saw that, even when the class was populating the Main class or jar field, the path to the jar was still specified in the Jar files field.
Checking the operator, I noticed there is also a dataproc_spark_jars parameter, which is not mutually exclusive with main_class:
run_spark_job = dpo.DataProcSparkOperator(
    task_id='run_spark_job',
    dataproc_spark_jars=[MAIN_JAR],
    main_class=MAIN_CLASS,
    cluster_name=CLUSTER_NAME
)
Adding it did the trick.