I have a DAG like the one below:
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.python_operator import PythonVirtualenvOperator
import platform
def get_info():
    from modelsFile import model_config
    print("version: ")

dag = DAG('sample', description='Sample DAG',
          schedule_interval=None,
          start_date=datetime(2022, 5, 1), catchup=False)

get_info_operator = PythonVirtualenvOperator(task_id='get_info_task', python_callable=get_info, dag=dag)
get_info_operator
Since I am using PythonVirtualenvOperator, I need to put all of the dependencies (imports) inside the python_callable function.
But when I try to import the class model_config from the file modelsFile, it throws the error "ModuleNotFoundError: No module named 'modelsfile'".
However, when I switch from PythonVirtualenvOperator to PythonOperator, it works fine.
Can anyone help me solve this issue with Airflow's PythonVirtualenvOperator?
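Not an authoritative fix, but a hedged sketch of a common workaround: PythonVirtualenvOperator serializes the callable and runs it in a separate virtualenv process, where the folder containing modelsFile.py is not automatically on sys.path. Adding that folder inside the callable may get the local import working; the /opt/airflow/dags path and the system_site_packages choice below are only assumptions to adapt.

def get_info():
    import sys
    # The DAG folder is not on sys.path inside the virtualenv worker,
    # so add it before importing the local module.
    # "/opt/airflow/dags" is a placeholder; use the real folder that
    # contains modelsFile.py.
    sys.path.insert(0, "/opt/airflow/dags")
    from modelsFile import model_config
    print("version: ")

get_info_operator = PythonVirtualenvOperator(
    task_id='get_info_task',
    python_callable=get_info,
    system_site_packages=True,  # assumed: reuse packages already installed for Airflow
    dag=dag)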
Related
I am trying to use pg8000 in my Glue script. The following are the parameters in the Glue job:
--extra-py-files s3://mybucket/pg8000libs.zip  # NOTE: my zip contains __init__.py
Some insights into the code:
import sys
import os
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
from pyspark.sql import Row
from datetime import datetime, date
zip_path = os.path.join('/tmp', 'pg8000libs.zip')
sys.path.insert(0, zip_path)
def dump_python_path():
    print("python path:", sys.path)
    for path in sys.path:
        if os.path.isdir(path):
            print(f"dir: {path}")
            print("\t" + str(os.listdir(path)))
        print(path)
    print(os.listdir('/tmp'))

dump_python_path()
# Import the library
import pg8000
Dump in CloudWatch:
python path: ['/tmp/pg8000libs.zip', '/opt/amazon/bin', '/tmp/pg8000libs.zip', '/opt/amazon/spark/jars/spark-core_2.12-3.1.1-amzn-0.jar', '/opt/amazon/spark/python/lib/pyspark.zip', '/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip', '/opt/amazon/lib/python3.6/site-packages', '/usr/lib64/python37.zip', '/usr/lib64/python3.7', '/usr/lib64/python3.7/lib-dynload', '/home/spark/.local/lib/python3.7/site-packages', '/usr/lib64/python3.7/site-packages', '/usr/lib/python3.7/site-packages']
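One thing worth checking (a minimal sketch, not a definitive answer): since the zip is already on sys.path, a frequent cause of "cannot import pg8000" is the layout inside the archive. For import pg8000 to succeed, the pg8000 package directory (and any pure-Python dependencies it needs) must sit at the root of the zip, not nested under another folder. Listing the archive locally makes that easy to verify; the local file name below is an assumption.

import zipfile

# Hypothetical local copy of the archive that is shipped via --extra-py-files.
with zipfile.ZipFile("pg8000libs.zip") as archive:
    for name in archive.namelist():
        print(name)

# Expected entries look like "pg8000/__init__.py", "pg8000/core.py", ...
# If everything is nested under e.g. "pg8000libs/pg8000/...", the import fails.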
I want to save a PySpark DataFrame directly into an S3 bucket. I tried some options but keep getting an error. Can someone help me solve my problem?
I created a sample PySpark DataFrame and tried to save it directly to the S3 bucket.
I tried the code below:
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import last
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from functools import reduce
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import max
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import dense_rank
from pyspark.sql.functions import abs, lit
#from __future__ import division
import sys
import mysql.connector
import traceback
import json
#from sqlalchemy import create_engine
import os
import math
import os.path
import datetime
from os import getpid
import pymysql.cursors
import time
import signal
from bs4 import BeautifulSoup
import pandas as pd
from pyspark.context import SparkConf
from collections import OrderedDict
import multiprocessing
import multiprocessing as mp
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from threading import Thread
from functools import partial
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email import encoders
import smtplib
import shutil
import glob
from datetime import datetime, date
spark = SparkSession.builder.appName("app_name").getOrCreate()
print(spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
sc = spark.sparkContext
aws_access_key_id="*******"
aws_secret_access_key="********"
spark._jsc.hadoopConfiguration().set("fs.s3.awsAccessKeyId", aws_access_key_id)
spark._jsc.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", aws_secret_access_key)
spark._jsc.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
spark._jsc.hadoopConfiguration().set('fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.DefaultAWSCredentialsProviderChain')
df = spark.createDataFrame([Row(a=1, b=4., c='GFG1', d=date(2000, 8, 1),e=datetime(2000, 8, 1, 12, 0)),
Row(a=2, b=8., c='GFG2', d=date(2000, 6, 2),e=datetime(2000, 6, 2, 12, 0)),
Row(a=4, b=5., c='GFG3', d=date(2000, 5, 3),e=datetime(2000, 5, 3, 12, 0))])
print(df.show())
print(df.printSchema())
df.write.format('csv').option('header','true').save('s3a://******/testing_s3/emp.csv',mode='overwrite')
After running this code, I get the error below:
py4j.protocol.Py4JJavaError: An error occurred while calling o48.save.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 403, AWS Service: Amazon S3, AWS Request ID: RNKTVM6JMDACZ16W, AWS Error Code: null, AWS Error Message: Forbidden, S3 Extended Request ID: MS8lToBlzqSmn1YDdq6SPh7JC6aCKSROuldEz5x9LbsnQdxhKVEQriOpJz5KkCJPBnlk4KgsCkQ=
Please tell me what I am missing in my script. Thanks in advance!
After creating the Spark context, use these lines to set the credentials:
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", AWS_ACCESS_KEY_ID)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
or
import pyspark

conf = (
    pyspark.SparkConf()
    .setAppName('app_name')
    .setMaster(SPARK_MASTER)
    .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
    .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
)
sc = pyspark.SparkContext(conf=conf)
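Putting the pieces together, here is a minimal sketch. The bucket name and credentials are placeholders, and it assumes the hadoop-aws / AWS SDK jars matching your Hadoop version are on the classpath and that the credentials actually allow s3:PutObject on the bucket (the 403 Forbidden usually points at permissions or at mismatched fs.s3 vs fs.s3a properties).

from pyspark.sql import SparkSession

AWS_ACCESS_KEY_ID = "*******"       # placeholder
AWS_SECRET_ACCESS_KEY = "********"  # placeholder

spark = (SparkSession.builder
         .appName("app_name")
         .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
         .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
         .getOrCreate())

df = spark.createDataFrame([(1, "GFG1"), (2, "GFG2")], ["a", "c"])

# Write with the s3a:// scheme so the s3a keys configured above are picked up;
# "my-bucket" is a placeholder for your bucket.
df.write.mode("overwrite").option("header", "true").csv("s3a://my-bucket/testing_s3/emp")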
Airflow is running, but the task is stuck with its status as "queued".
I ran the airflow scheduler.
Here are my code and a snapshot of the Airflow UI.
Can anyone explain to me what the problem might be?
import datetime as dt
from datetime import timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
import pandas as pd
def CSVToJson():
    df = pd.read_csv('/Users/daeyong/Desktop/Projects/Python/airflow2/file.csv')
    for i, r in df.iterrows():
        print(r['name'])
    df.to_json('fromAirflow.json', orient='records')

default_args = {
    'owner': 'paulcrickard',
    'start_date': dt.datetime(2022, 3, 10),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5)
}

with DAG('MyCSVDAG',
         default_args=default_args,
         schedule_interval=timedelta(minutes=5),
         # '0 * * * *',
         ) as dag:

    print_starting = BashOperator(task_id='starting',
                                  bash_command='echo "I am reading the CSV now....."')

    CSVJson = PythonOperator(task_id='convertCSVtoJson', python_callable=CSVToJson)

    print_starting >> CSVJson
airflow_screenshot_1
airflow_screenshot_2
Two possible reasons, without more context:
Your default pool does not have any slots assigned or available.
Your declaration of tasks needs to be indented so that it falls within the "with DAG" block (see the sketch below).
Scheduler logs and an image of your Pools page would help more.
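To illustrate the second point, a minimal sketch using the names from the question; only the indentation relative to the "with" statement matters here.

with DAG('MyCSVDAG',
         default_args=default_args,
         schedule_interval=timedelta(minutes=5)) as dag:

    # Everything indented under the "with" block is registered with MyCSVDAG;
    # tasks declared at module level (no indentation) are not attached to it.
    print_starting = BashOperator(
        task_id='starting',
        bash_command='echo "I am reading the CSV now....."')

    CSVJson = PythonOperator(
        task_id='convertCSVtoJson',
        python_callable=CSVToJson)

    print_starting >> CSVJson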
Below is my main code, which I want to unit test.
get_data.py
from pyspark.sql import SparkSession
from pyspark_llap.sql.session import HiveWarehouseSession

def get_hive_data(query):
    hive_data = hive.executeQuery(query)
    return hive_data

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("HiveApp")\
        .getOrCreate()
    hive = HiveWarehouseSession.session(spark).build()
    data = get_hive_data()
Below is my unittest code. I have written only the imports here, since I get the error as soon as I do "from get_data import *".
test.py
import unittest
import pyspark
import pyspark.sql.functions as f
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from get_data import *
ERROR
ImportError: No module named pyspark_llap
But if I run just get_data.py, it runs successfully.
I am running it on an edge node!
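One hedged sketch of a way around this: pyspark_llap (the Hive Warehouse Connector) is only installed on the cluster/edge node, so for a local unit test you can stub the module in sys.modules before importing get_data, then patch the module-level hive object. This assumes pyspark itself is installed locally; the query string and return value are made up for the test.

import sys
import unittest
from unittest import mock

# Stub out pyspark_llap before the module under test is imported,
# since the Hive Warehouse Connector only exists on the cluster.
sys.modules.setdefault("pyspark_llap", mock.MagicMock())
sys.modules.setdefault("pyspark_llap.sql", mock.MagicMock())
sys.modules.setdefault("pyspark_llap.sql.session", mock.MagicMock())

import get_data

class TestGetHiveData(unittest.TestCase):
    def test_get_hive_data_runs_query(self):
        fake_hive = mock.MagicMock()
        fake_hive.executeQuery.return_value = "rows"
        # get_hive_data reads a module-level "hive" object, so patch it in.
        with mock.patch.object(get_data, "hive", fake_hive, create=True):
            self.assertEqual(get_data.get_hive_data("SELECT 1"), "rows")
        fake_hive.executeQuery.assert_called_once_with("SELECT 1")

if __name__ == "__main__":
    unittest.main()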
I have been trying to write some simple text to a local .txt file through a DAG script. Even though the task runs successfully, I cannot seem to find the file anywhere. Is it because I am using WSL on Windows?
Here is my simple script:
import os
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2020, 12, 5),
    "retries": 0,
}

dag = DAG(
    "simple_dag",
    default_args=default_args,
    schedule_interval="@once",
)

t1 = BashOperator(
    task_id="print_file",
    bash_command='echo "pipeline" > opDE.txt',
    dag=dag)

t1
You will need to define an explicit path for the output file.
When Airflow executes your code, it copies it to a temporary location and runs it from there, so the file is written to that location. You can also see this in the task log.
So the fix is to write to the desired path, for example as shown below.
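A minimal sketch; the target directory is whatever location you actually want, and it must be writable by the user running the Airflow worker.

t1 = BashOperator(
    task_id="print_file",
    # Write to an absolute path instead of the temporary working directory
    # that Airflow runs the command from.
    bash_command='echo "pipeline" > $HOME/opDE.txt',
    dag=dag)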