DatabricksRunNowOperator returning value - databricks

Can I pass a return value from DatabricksRunNowOperator using XCom or any other method? I just want to return a "date" after my first Databricks operator has finished and pass it on to the dependent tasks.
For example, I want to pass the return value of the verification_run to the insert_run and workspace_run. Normally we can use xcom_push and xcom_pull to do this in Python, but I'm not sure how to make two notebooks talk to each other.
from airflow import DAG
from airflow.providers.databricks.operators.databricks import DatabricksRunNowOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow'
}

with DAG('databricks_dag',
         start_date=days_ago(2),
         schedule_interval=None,
         default_args=default_args
         ) as dag:

    verification_run = DatabricksRunNowOperator(
        task_id='verification_task',
        databricks_conn_id='databricks_default',
        job_id='-----'
    )

    insert_run = DatabricksRunNowOperator(
        task_id='insert_task',
        databricks_conn_id='databricks_default',
        job_id='-----'
    )

    workspace_run = DatabricksRunNowOperator(
        task_id='workspace_task',
        databricks_conn_id='databricks_default',
        job_id='------'
    )

    verification_run >> [insert_run, workspace_run]
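There is no accepted answer recorded here, but one commonly used pattern, offered only as a sketch and not tested against these jobs, looks like this: have the verification notebook finish with dbutils.notebook.exit("<the date>"), let the operator push its run_id to XCom (it does this when do_xcom_push is left at its default of True), and read the notebook output in a small Python task that the downstream tasks can pull from. This assumes a recent apache-airflow-providers-databricks release that exposes DatabricksHook.get_run_output; on older versions you would call the Jobs API runs/get-output endpoint yourself.

from airflow.operators.python import PythonOperator
from airflow.providers.databricks.hooks.databricks import DatabricksHook

def fetch_verification_date(ti):
    # run_id is pushed to XCom by DatabricksRunNowOperator
    run_id = ti.xcom_pull(task_ids='verification_task', key='run_id')
    hook = DatabricksHook(databricks_conn_id='databricks_default')
    # get_run_output wraps the Jobs API runs/get-output call (assumed to exist in
    # your provider version); notebook_output.result is whatever the notebook
    # passed to dbutils.notebook.exit()
    run_output = hook.get_run_output(run_id)
    return run_output['notebook_output']['result']

# defined inside the existing `with DAG(...)` block
get_verification_date = PythonOperator(
    task_id='get_verification_date',
    python_callable=fetch_verification_date,
)

Wired as verification_run >> get_verification_date >> [insert_run, workspace_run], the downstream jobs can then receive the value through templated notebook_params, e.g. notebook_params={'date': "{{ ti.xcom_pull(task_ids='get_verification_date') }}"}.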

Related

How can I query a BigQuery table once per DAG run instead of every time Airflow refreshes?

I have a DAG that queries a table and pulls data from it, and that also uses a ShortCircuitOperator, again based on a BQ table, to check whether the DAG needs to run. The issue is that this currently queries the table every time Airflow refreshes. The table is small (each query is less than 1 kB), but I'm concerned this will get more expensive as it's scaled up. Is there a way to query this table only once per DAG run instead?
Here's a code snippet to show what's going on:
client = bigquery.Client()

def create_query_lists(list_type=None):
    query_job = client.query(
        """
        SELECT filename
        FROM `requests`
        """
    )
    results = query_job.result()
    results_list = []
    for row in results:
        results_list.append(row.filename)
    return results_list

def check_contents():
    if len(create_query_lists()) == 0:
        raise ValueError('Nothing to do')
        return False
    else:
        print("There's stuff to do")
        return True

# Create task to check if the data being pulled is empty; if so, fail so other tasks don't run
check_list = ShortCircuitOperator(
    task_id="check_column_not_empty",
    provide_context=True,
    python_callable=check_contents
)

check_list  # do subsequent tasks which use the same function
If I correctly understood your need, you want to execute the downstream tasks only if the result of the SQL query is not empty.
In this case you can also use a BranchPythonOperator, for example:
import airflow
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import BranchPythonOperator
from google.cloud import bigquery

client = bigquery.Client()

def create_query_lists(list_type=None):
    query_job = client.query(
        """
        SELECT filename
        FROM `requests`
        """
    )
    results = query_job.result()
    results_list = []
    for row in results:
        results_list.append(row.filename)
    return results_list

def check_contents(**kwargs):
    if len(create_query_lists()) == 0:
        return 'KO'
    else:
        return 'OK'

with airflow.DAG(
        "your_dag",
        schedule_interval=None) as dag:

    branching = BranchPythonOperator(
        task_id='file_exists',
        python_callable=check_contents,
        provide_context=True,
        op_kwargs={
            'param': 'param'
        },
        dag=dag
    )

    ok = DummyOperator(task_id='OK', dag=dag)
    ko = DummyOperator(task_id='KO', dag=dag)
    fake_task = DummyOperator(task_id='fake_task', dag=dag)

    (branching >>
     ok >>
     fake_task)
    branching >> ko
The BranchPythonOperator executes the query; if the result is not empty it returns OK, otherwise KO.
We create 2 DummyOperators, one for OK and the other for KO (2 branches).
Depending on the result, we go to the OK or the KO branch.
The KO branch finishes the DAG without running any other tasks.
The OK branch continues the DAG with the tasks that follow (fake_task in my example).
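Worth adding for the original concern: the table was being queried on every refresh because any code at the top level of a DAG file runs each time the scheduler parses the file. As long as the bigquery.Client() call and the query live inside the python_callable, they only run when the task itself executes, once per DAG run. A minimal sketch of that shape, assuming the same `requests` table:

from airflow.operators.python import BranchPythonOperator

def check_contents(**kwargs):
    # Imported and constructed inside the task so nothing touches BigQuery at parse time
    from google.cloud import bigquery
    client = bigquery.Client()
    rows = client.query("SELECT filename FROM `requests`").result()
    return 'OK' if rows.total_rows > 0 else 'KO'

branching = BranchPythonOperator(
    task_id='file_exists',
    python_callable=check_contents,
)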

Inserting pandas dataframe into django model

I am having an issue writing a DataFrame to the Django model defined in my models.py.
The file is long, but its methodology is quite simple:
- import modules
- create the Django database
- requests.get the necessary data
- alter the data a bit to fit my goals, save it as a DataFrame
- connect to the Django db and insert the DataFrame
My models.py is the following:
from django.db import models
import requests
import pandas as pd
from datetime import timezone
from datetime import datetime
from datetime import date
from datetime import timedelta
import time
from django.conf import settings
from sqlalchemy.engine import create_engine


class cryptoData(models.Model):
    coin = models.CharField(max_length=10)
    asset_id = models.SmallIntegerField()
    time = models.DateTimeField()
    close = models.FloatField()
    volume = models.BigIntegerField()
    market_cap = models.FloatField()
    reddit_posts = models.IntegerField()
    reddit_comments = models.IntegerField()
    tweets = models.IntegerField()
    tweet_favorites = models.IntegerField()
    social_volume = models.IntegerField()


lunarcrush_key = 'fakekey1234'


def top_coins():
    lc_market = requests.get(
        url='https://api.lunarcrush.com/v2?data=market&',
        params={
            'key': lunarcrush_key,
        }
    )
    all_coins = []
    for entry in lc_market.json().get('data'):
        coin = []
        coin.append(entry.get('s'))
        coin.append(entry.get('mc'))
        all_coins.append(coin)
    all_coins.sort(key=lambda x: x[1], reverse=True)
    top_ten_coins = all_coins[:10]
    return(top_ten_coins)


top_coins_lst = top_coins()
top_coin_names_lst = [x[0] for x in top_coins_lst]


def get_coin_data(key, coin, date_diff, start_date, end_date):
    lc = requests.get(
        url='https://api.lunarcrush.com/v2?data=assets&',
        params={
            'key': lunarcrush_key,
            'symbol': coin,
            'interval': 'day',
            'data_points': date_diff,
            'start': int(start_date.replace(tzinfo=timezone.utc).timestamp()),
            'end': int(end_date.replace(tzinfo=timezone.utc).timestamp())
        }
    )
    metric_names = []
    for entry in lc.json().get('data')[0].get('timeSeries'):
        for key in entry:
            metric_names.append(key) if key not in metric_names else metric_names
    metrics_list = []
    for entry in lc.json().get('data')[0].get('timeSeries'):
        row_list = []
        for key in entry:
            row_list.append(entry.get(key))
        metrics_list.append(row_list)
    metrics_df = pd.DataFrame(metrics_list, columns=metric_names)
    metrics_df['time'] = metrics_df['time'].apply(lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
    metrics_df['coin'] = coin
    cols = list(metrics_df)
    cols.insert(0, cols.pop(cols.index('coin')))
    metrics_df = metrics_df.loc[:, cols]
    return(metrics_df)


def get_all_coins_data(coins_list):
    appended_data = []
    end_date = datetime.now()
    start_date = end_date - timedelta(days=700)
    date_diff = (end_date - start_date).days
    for coin in coins_list:
        appended_data.append(get_coin_data(lunarcrush_key, coin, date_diff, start_date, end_date))
        time.sleep(.1)
    output = pd.concat(appended_data)
    return(output)


df = get_all_coins_data(top_coin_names_lst)
focused_df = df[['coin', 'asset_id', 'time', 'close', 'volume', 'market_cap', 'reddit_posts', 'reddit_comments', 'tweets', 'tweet_favorites', 'social_volume']]

user = settings.DATABASES['default']['USER']
password = settings.DATABASES['default']['PASSWORD']
database_name = settings.DATABASES['default']['NAME']

database_url = 'sqlite://{user}:{password}#localhost:5432/{database_name}'.format(
    user=user,
    password=password,
    database_name=database_name,
)

engine = create_engine(database_url, echo=False)
focused_df.to_sql(cryptoData, con=engine)
When I run the manage.py runserver command, I get the following error:
sqlalchemy.exc.ArgumentError: Invalid SQLite URL: sqlite://user:password#localhost:5432/C:\Users\user\Programming\django_crypto_v6\source\db.sqlite3
Valid SQLite URL forms are:
sqlite:///:memory: (or, sqlite://)
sqlite:///relative/path/to/file.db
sqlite:////absolute/path/to/file.db
I'm struggling to resolve this issue. Any thoughts?
You are using the wrong URL pattern for the SQLite database_url.
See the docs at https://docs.sqlalchemy.org/en/14/core/engines.html#sqlite
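Concretely, a sketch of what that means here, assuming the default Django SQLite backend where settings.DATABASES['default']['NAME'] is already the full path to the .sqlite3 file: SQLite URLs carry no user, password, host or port, and DataFrame.to_sql wants the table name as a string rather than the model class.

from django.conf import settings
from sqlalchemy import create_engine

# sqlite:/// followed directly by the database file path; no credentials, no host
database_url = 'sqlite:///{}'.format(settings.DATABASES['default']['NAME'])
engine = create_engine(database_url, echo=False)

# Django names the table <app_label>_<modelname>; read it from the model's meta
# instead of passing the class itself
focused_df.to_sql(cryptoData._meta.db_table, con=engine, if_exists='append', index=False)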

Initialize Model Class Variable At Runtime

I am trying to import student data from an Excel workbook. I have to set the column_name of the fields in StudentMasterResource dynamically, based on the columns present in the file. The constants module has a dictionary named column_name that holds all the column names. The first time I run an import it works, but after that it fails.
constants.py
column_name = dict()
resource.py
from common_account import constants
from import_export import widgets, fields, resources


def getClassName(key):
    if key in constants.column_name:
        return constants.column_name[key]
    return key


class StudentMasterResource(resources.ModelResource):
    organisation_id = fields.Field(
        column_name=getClassName('organisation_id'),
        attribute='organisation_id',
        widget=widgets.ForeignKeyWidget(OrganisationMaster, 'organisation_name'),
        saves_null_values=True
    )
    name = fields.Field(
        column_name=getClassName('Name'),
        attribute='name',
        saves_null_values=True,
        widget=widgets.CharWidget()
    )
    date_of_birth = fields.Field(
        column_name=getClassName('date'),
        attribute='date_of_birth',
        saves_null_values=True,
        widget=widgets.DateWidget()
    )
views.py
from common_account import constants
from tablib import Dataset


@api_view(['POST'])
@permission_classes([IsAuthenticated])
def student_import(request):
    if request.method == 'POST':
        context_data = dict()
        data_set = Dataset()
        file = request.FILES['myfile']
        extension = file.name.split(".")[-1].lower()
        column_data = request.data
        is_import = column_name['is_import']
        constants.valid_data.clear()
        constants.invalid_data.clear()
        if extension == 'csv':
            data = data_set.load(file.read().decode('utf-8'), format=extension)
        else:
            data = data_set.load(file.read(), format=extension)
        constants.column_name = {
            'date': column_data.get('birth'),
            'name': column_data.get('name'),
        }
        if is_import == 'No':
            result = student_resource.import_data(data_set, organisation_id=request.user.organisation_id,
                                                  offering_id=offering_id, all_invalid_data=False, dry_run=True, raise_errors=True)
            context_data['valid_data'] = constants.valid_data
            context_data['invalid_data'] = constants.invalid_data
            context_data[constants.RESPONSE_RESULT] = {"Total records": student_resource.total_cnt,
                                                       "skip records": len(constants.invalid_data),
                                                       "Records imported": len(constants.valid_data),
                                                       }
            return JsonResponse(context_data)
        elif is_import == 'Yes':
            result = student_resource.import_data(data_set, organisation_id=request.user.organisation_id,
                                                  offering_id=offering_id, all_invalid_data=False, dry_run=False, raise_errors=False)
            context_data[constants.RESPONSE_ERROR] = False
            context_data[constants.RESPONSE_MESSAGE] = 'Data Imported !!!'
            context_data[constants.RESPONSE_RESULT] = {"Total records": student_resource.total_cnt,
                                                       "skip records": len(constants.invalid_data),
                                                       "Records imported": len(constants.valid_data),
                                                       }
            return JsonResponse(context_data)
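No answer is recorded for this one, but the behaviour described fits how the resource is built: the fields.Field(column_name=getClassName(...)) calls run once, when resource.py is first imported, so anything the view writes into constants.column_name afterwards never reaches the Field objects that already exist. A sketch of one way around that, assuming django-import-export's usual behaviour of keeping a resource's fields in self.fields: pass the mapping in when the resource is instantiated and rewrite the column names per instance, instead of going through a module-level dictionary.

import copy

from import_export import fields, resources, widgets

class StudentMasterResource(resources.ModelResource):
    # Meta and the other fields stay as in the original resource
    name = fields.Field(column_name='Name', attribute='name',
                        saves_null_values=True, widget=widgets.CharWidget())
    date_of_birth = fields.Field(column_name='date', attribute='date_of_birth',
                                 saves_null_values=True, widget=widgets.DateWidget())

    def __init__(self, column_map=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Work on copies of the declared fields so one request's headers
        # don't leak into another request's resource
        self.fields = copy.deepcopy(self.fields)
        for attr, header in (column_map or {}).items():
            if attr in self.fields and header:
                self.fields[attr].column_name = header

In the view this would become something like student_resource = StudentMasterResource(column_map={'name': column_data.get('name'), 'date_of_birth': column_data.get('birth')}), so the per-request header names travel with the resource instance rather than through the shared constants module.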

I've been looking for a way to return multiple SQL results in JSON format in Python 3

I've been looking for a way to output multiple SQL results in JSON format in Python 3. My API receives request parameters (luid), passes them to BigQuery in SQL, and returns the result to the client in JSON format. So far I have succeeded in building an API that can handle only one parameter: if you pass multiple parameters, it returns only the first parameter's result. I'm guessing this problem is caused by using return jsonify({request_luid: str(row[0]) }) inside the outer loop. I have no idea how I should change my code and logic. Could anyone tell me? I'm open to any ideas.
Output of my code:
{
    "XXXXXXX5e30ab17f6b536879d25555": "True"  ⬅︎ my SQL seems to work correctly
}
Ideal output:
{
    "XXXXXXX5e30ab17f6b536879d25555": "True",
    "XXXXXXX8r30ab17f6b536879d25555": "False",
    "XXXXXXX9t30ab17f6b536879d25555": "True",
}
Endpoint
https://test-project-galvanic-ripsaw-281806.df.r.appspot.com?luid=XXXXXXX5e30ab17f6b536879d25555&luid=XXXXXXX8r30ab17f6b536879d25555&luid=XXXXXXX9t30ab17f6b536879d25555
main.py
@app.route('/')
def get_request():
    request_luids = request.args.getlist('luid') or ''
    for i in range(len(request_luids)):
        request_luid = request_luids[i]
        client = bigquery.Client()
        query = """SELECT EXISTS(
            SELECT 1
            FROM `test-project-281806.hitobito_test.test3` as p
            WHERE p.luid = '{}'
            AND p.cv_date IS NOT NULL limit 1000)""".format(request_luid)
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("request_luid", "STRING", request_luid)
            ]
        )
        query_job = client.query(query)
        query_res = query_job.result()
        for row in query_res:
            return jsonify({request_luid: str(row[0])})


if __name__ == "__main__":
    app.run()
I think the problem is here:
for row in query_res:
    return jsonify({request_luid: str(row[0])})
You can use this piece of code:

@app.route('/')
def get_request():
    request_luids = request.args.getlist('luid') or ''
    result = {}  # define an empty dictionary for the final result
    for i in range(len(request_luids)):
        request_luid = request_luids[i]
        client = bigquery.Client()
        query = """SELECT EXISTS(
            SELECT 1
            FROM `test-project-281806.hitobito_test.test3` as p
            WHERE p.luid = '{}'
            AND p.cv_date IS NOT NULL limit 1000)""".format(request_luid)
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("request_luid", "STRING", request_luid)
            ]
        )
        query_job = client.query(query)
        query_res = query_job.result()
        # only the first row is needed, so break out of the loop after reading it
        for row in query_res:
            temp_result = {request_luid: str(row[0])}
            break
        result.update(temp_result)  # add temp_result to the final result
    # then return the final result
    return jsonify(result)


if __name__ == "__main__":
    app.run()
My final code is below. Thank you so much for the good advice, Maryam Abdoli!
from flask import Flask, request, jsonify
from google.cloud import bigquery
import json

app = Flask(__name__)


@app.route('/')
def get_request():
    request_luids = request.args.getlist('luid') or ''
    result = {}
    for i in range(len(request_luids)):
        request_luid = str(request_luids[i])
        client = bigquery.Client()
        query = """SELECT EXISTS(
            SELECT 1
            FROM `test-project-281806.hitobito_test.test3` as p
            WHERE p.luid = '{}'
            AND p.cv_date IS NOT NULL)""".format(request_luid)
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("request_luid", "STRING", request_luid)
            ]
        )
        query_job = client.query(query)
        query_res = query_job.result()
        for row in query_res:
            temp_result = {request_luid: str(row[0])}
            break
        result.update(temp_result)
    return json.dumps(result)


if __name__ == "__main__":
    app.run()
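One small follow-up, offered as a sketch rather than a required change: both versions build a job_config with a ScalarQueryParameter but never hand it to client.query(), and the luid is still spliced into the SQL with format(). Wiring the parameter in lets BigQuery handle the quoting:

from google.cloud import bigquery

client = bigquery.Client()
query = """SELECT EXISTS(
    SELECT 1
    FROM `test-project-281806.hitobito_test.test3` AS p
    WHERE p.luid = @request_luid
    AND p.cv_date IS NOT NULL)"""
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("request_luid", "STRING", request_luid)
    ]
)
# pass job_config so the @request_luid parameter is actually used
query_res = client.query(query, job_config=job_config).result()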

Why does my Spark Streaming app not show any output?

This is a follow-up to my earlier Stack Overflow question, for which I did not get a response.
I have tried writing this, which does not throw any error, but it also does not show any output.
My goal is to evaluate the DStream objects against a historical data RDD. I could not find any PySpark example like this (checking a streaming RDD against a static RDD created beforehand). I appreciate your response. Thanks.
"""
Created on Thu May 05 16:23:15 2016
#author: bghosh
"""
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext,functions as func,Row
sc = SparkContext("local[2]", "realtimeApp")
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc,10)
files = ssc.textFileStream("hdfs://RealTimeInputFolder/")
########Lets get the data from the db which is relavant for streaming ###
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
dataurl = "jdbc:sqlserver://devserver:1433"
db = "devDB"
table = "stream_helper"
credential = "dev_credential"
########basic data for evaluation purpose ########
#base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data.registerTempTable("base_data")
######
files_count = files.flatMap(lambda file: file.split( ))
#pattern = '(TranAmount=Decimal.{2})(.[0-9]*.[0-9]*)(\\S+ )(TranDescription=u.)([a-zA-z\\s]+)([\\S\\s]+ )(dSc=u.)([A-Z]{2}.[0-9]+)'
tranfiles = "wasb://vanspark01#vanspark01.blob.core.windows.net/RealTimeInputFolder01/"
def getSqlContextInstance(sparkContext):
if ('sqlContextSingletonInstance' not in globals()):
globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
return globals()['sqlContextSingletonInstance']
def preparse(logline):
#match = re.search(pattern,logline)
pre = logline.split(",")
return(
Row(
Customer_id = pre[-1],
trantype = pre[-4],
amount = float(pre[-5]))
)
def parse():
parsed_tran = ssc.textFileStream(tranfiles).map(preparse)
#success = parsed_tran.filter(lambda s: s[1] == 1).map(lambda x:x[0])
#fail = parsed_tran.filter(lambda s:s[1] == 0).map(lambda x:x[0])
"""if fail.count() > 0:
print "no of non parsed file : %d",fail.count()
"""
return parsed_tran#success
def check_historic(rdd):
#checking with the historical table #
try:
streamSqlcontext = getSqlContextInstance(rdd)
stream_df = streamSqlcontext.createDataFrame(rdd)
stream_df.registerTempTable("stream_df")
result_data_frame = streamSqlcontext.sql("select * from stream_df LEFT OUTER JOIN base_data on stream_df.Customer_id= base_data.Customer_id" )
result_data_frame.show()
except:
pass
#return result_data_frame.rdd
success = parse()
success.foreachRDD(check_historic)
ssc.start()
ssc.awaitTermination()
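No answer is recorded here either, but two things in this script commonly produce exactly this "runs but prints nothing" behaviour, so a hedged suggestion: textFileStream only picks up files that arrive in the directory after the stream has started, so pre-existing files never create a batch; and the bare except: pass in check_historic swallows every failure, including the "table not found" error you would likely get because base_data is registered on the top-level sqlContext while getSqlContextInstance builds a different SQLContext. A small sketch of a more talkative check_historic, with the rest of the job unchanged:

import traceback

def check_historic(rdd):
    # Skip empty micro-batches explicitly rather than via an exception
    if rdd.isEmpty():
        return
    try:
        # foreachRDD runs on the driver, so the module-level sqlContext (where
        # base_data was registered) can be used directly; a second SQLContext
        # would not see the base_data temp table
        stream_df = sqlContext.createDataFrame(rdd)
        stream_df.registerTempTable("stream_df")
        sqlContext.sql(
            "select * from stream_df LEFT OUTER JOIN base_data "
            "on stream_df.Customer_id = base_data.Customer_id"
        ).show()
    except Exception:
        # Surface the real failure instead of silently dropping it
        traceback.print_exc()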
