How to use Prefect's resource manager with a spark cluster - apache-spark

I have been messing around with Prefect for workflow management, but got stuck with
building up and breaking down a Spark session within Prefect's resource manager.
I browsed Prefect's docs and an example with Dask is available:
from prefect import resource_manager
from dask.distributed import Client
@resource_manager
class DaskCluster:
    """Resource manager that owns a temporary local Dask cluster."""

    def __init__(self, n_workers):
        # Worker count handed to the Client created in setup().
        self.n_workers = n_workers

    def setup(self):
        """Create a local dask cluster"""
        return Client(n_workers=self.n_workers)

    def cleanup(self, client):
        """Cleanup the local dask cluster"""
        client.close()
# Flow and Parameter are used below but were not imported in this snippet.
from prefect import Flow, Parameter

with Flow("example") as flow:
    n_workers = Parameter("n_workers")

    # `client` stands for whatever DaskCluster.setup() returns; the same
    # object is later handed to DaskCluster.cleanup().
    with DaskCluster(n_workers=n_workers) as client:
        # some_task / some_other_task are placeholder tasks not defined here.
        some_task(client)
        some_other_task(client)
However I couldn't work out how to do the same with a spark session.

The simplest way to do this is with Spark in local mode:
from prefect import task, Flow, resource_manager
from pyspark import SparkConf
from pyspark.sql import SparkSession
@resource_manager
class SparkCluster:
    """Resource manager that creates a Spark session and stops it afterwards."""

    def __init__(self, conf: "SparkConf | None" = None):
        # A `SparkConf()` default in the signature is evaluated once, when the
        # class is defined, and then shared by every instance. Build a fresh
        # configuration per instance instead.
        self.conf = SparkConf() if conf is None else conf

    def setup(self) -> SparkSession:
        """Return the session built (or reused) from ``self.conf``."""
        return SparkSession.builder.config(conf=self.conf).getOrCreate()

    def cleanup(self, spark: SparkSession):
        """Stop the session that ``setup`` returned."""
        spark.stop()
@task
def get_data(spark: SparkSession):
    """Build a small single-column ('word') DataFrame on the given session."""
    words = ['look', 'spark', 'tutorial', 'spark', 'look', 'python']
    # createDataFrame is given one 1-tuple per row.
    return spark.createDataFrame([(word,) for word in words], ['word'])
@task(log_stdout=True)
def analyze(df):
    """Count the rows of ``df`` per distinct 'word' and print the result."""
    word_count = df.groupBy('word').count()
    word_count.show()
with Flow("spark_flow") as flow:
    # Spark master is set to local mode for this example.
    local_conf = SparkConf().setMaster('local[*]')
    with SparkCluster(local_conf) as spark:
        # The DataFrame produced by get_data feeds straight into analyze.
        analyze(get_data(spark))

if __name__ == '__main__':
    flow.run()
Your setup() method returns the resource being managed and the cleanup() method accepts the same resource returned by setup(). In this case, we create and return a Spark session, and then stop it. You don't need spark-submit or anything (though I find it a bit harder managing dependencies this way).
Scaling it up gets harder and is something I'm still working on figuring out. For example, Prefect won't know how to serialize Spark DataFrames for output caching or persisting results. Also, you have to be careful about using the Dask executor with Spark sessions because they can't be pickled, so you have to set the executor to use scheduler='threads' (see here).

Related

pyspark called from an imported method that calls another method gives empty dataframe

I have a module called test_pyspark_module with following code:
class SparkTest:
    """Thin wrapper that runs SQL text through a caller-supplied callable."""

    def __init__(self, spark, sql):
        # `spark` is stored but not used by any method of this class.
        self.spark = spark
        # Callable taking SQL text; its result must provide `.toPandas()`.
        self.sql = sql

    def fetch_data(self, sql_text):
        """Run ``sql_text`` and print the number of rows that came back."""
        data = self.sql(sql_text).toPandas()
        print(len(data))

    def call_fetch_data(self):
        """Run the hard-coded query through :meth:`fetch_data`."""
        # NOTE(review): `${date-15}` / `${date-1}` are not Python syntax, so
        # from this module they reach `self.sql` as literal text. If the
        # notebook front end substitutes them in cell source, that would
        # explain 0 rows here versus 43000 in the notebook -- TODO confirm.
        sql_text = """
SELECT *
FROM
<TABLENAME>
WHERE date BETWEEN '${date-15}' and '${date-1}'
and app_id=1233
"""
        return self.fetch_data(sql_text)
def fetch_data(sql, sql_text):
    """Run ``sql_text`` through ``sql`` and print the resulting row count."""
    frame = sql(sql_text).toPandas()
    row_count = len(frame)
    print(row_count)
I have a pyspark kernel running and I have following code in my jupyter notebook:
from pyspark.sql import SQLContext
from pyspark import SparkContext
# `spark` is not defined in this snippet; presumably the pyspark kernel
# provides it.
sqlContext = SQLContext(spark)
sql = sqlContext.sql
sql_text = """
SELECT *
FROM
<TABLENAME>
WHERE date BETWEEN '${date-15}' and '${date-1}'
and app_id=1233
"""
# Import the module's public names explicitly rather than with `import *`, so
# it is clear which definitions the notebook is actually using.
from test_pyspark_module import SparkTest, fetch_data
st = SparkTest(spark, sql)
Now when I run st.fetch_data(sql_text) I get 43000. However, when I run st.call_fetch_data() I get 0.
I wanted to see if something is going wrong with import so I implemented a duplicate of SparkTest locally calling it SparkTest2. However, this works as I expect with both functions returning 43000.
class SparkTest2:
    """Notebook-local copy of the wrapper, defined without any import."""

    def __init__(self, spark, sql):
        self.spark, self.sql = spark, sql

    def fetch_data(self, sql_text):
        """Execute ``sql_text`` and print how many rows were returned."""
        print(len(self.sql(sql_text).toPandas()))

    def call_fetch_data(self):
        """Send the built-in query to :meth:`fetch_data`."""
        query = """
SELECT *
FROM
<TABLE_NAME>
WHERE date BETWEEN '${date-15}' and '${date-1}'
and app_id=1233
"""
        return self.fetch_data(query)
# Built from the same `spark` and `sql` objects that were passed to `st`.
st2 = SparkTest2(spark, sql)
st2.fetch_data(sql_text) gives output 43000 and st2.call_fetch_data() gives output 43000
So it seems that if I try to run a class method that calls another method, and the class is imported, it fails to give correct results. Note that there is no error or exception; I just get 0 rows (I do get the correct number of columns, i.e. 28).

Not able to list dataproc jobs for certain clusters in google cloud

We have Dataproc clusters, and I am trying to list the jobs in a cluster using a service account. I am able to list the jobs in certain clusters but not in a few others within the same project. There are no errors, but the jobs are not showing up. I am using the code below. Please let me know if you have any thoughts.
from google.cloud import dataproc_v1 as dataproc
from google.cloud import bigquery
import google.cloud as resources
from google.oauth2 import service_account as sa
from google.cloud.dataproc_v1 import JobControllerClient
# NOTE(review): `Clients` and `ClientType` are not imported in this snippet;
# presumably project helpers that return authenticated Dataproc clients.
client = Clients.get_client(ClientType.DATAPROC, name="x")
client_cluster = Clients.get_client(ClientType.DATAPROC_CLUSTER, name="x")
clus_collec = []
try:
    cl_seq = client_cluster.list_clusters(project_id='y',
                                          region='us-central1')
    clus_collec.extend(k.cluster_name for k in cl_seq)
    for m in clus_collec:
        # NOTE(review): the ListJobs reference describes `filter` in terms of
        # `status.state` and `labels.[KEY]`; confirm that `clusterName` is
        # honoured here (the request also has a separate `cluster_name` field).
        data = client.list_jobs(project_id='y', region='us-central1', filter=f"clusterName={m} AND labels.execution_date=2022-08-25")
        # Iterate the result instead of printing the result object itself, so
        # each matching job is printed individually.
        for job in data:
            print(job)
except Exception as e:
    # Top-level boundary of the script: report the failure and stop.
    print(e)

Python: Callback on the worker-queue not working

Apologies for the long post. I am trying to subscribe to rabbitmq queue and then trying to create a worker-queue to execute tasks. This is required since the incoming on the rabbitmq would be high and the processing task on the item from the queue would take 10-15 minutes to execute each time. Hence necessitating the need for a worker-queue. Now I am trying to initiate only 4 items in the worker-queue, and register a callback method for processing the items in the queue. The expectation is that my code handles the part when all the 4 instances in the worker-queue are busy, the new incoming would be blocked until a free slot is available.
The rabbitmq piece is working well. The problem is I cannot figure out why the items from my worker-queue are not executing the task, i.e the callback is not working. In fact, the item from the worker queue gets executed only once when the program execution starts. For the rest of the time, tasks keep getting added to the worker-queue without being consumed. Would appreciate it if somebody could help out with the understanding on this one.
I am attaching the code for rabbitmqConsumer, driver, and slaveConsumer. Some information has been redacted in the code for privacy issues.
# This is the driver
#!/usr/bin/env python
import time
from rabbitmqConsumer import BasicMessageReceiver
# Build the receiver once, make sure the queue exists, then keep consuming.
receiver = BasicMessageReceiver()
receiver.declare_queue()
while True:
    receiver.consume_message()
    # Pause before calling consume_message() again.
    time.sleep(2)
#This is the rabbitmqConsumer
#!/usr/bin/env python
import pika
import ssl
import json
from slaveConsumer import slave
class BasicMessageReceiver:
    """Consume RabbitMQ messages and hand each one to the worker queue."""

    def __init__(self):
        # SSL Context for TLS configuration of Amazon MQ for RabbitMQ
        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
        # Redacted in the original post; put the real broker URL here.
        url = "<url for the queue>"
        parameters = pika.URLParameters(url)
        parameters.ssl_options = pika.SSLOptions(context=ssl_context)
        self.connection = pika.BlockingConnection(parameters)
        self.channel = self.connection.channel()
        # worker-queue object
        self.slave_object = slave()
        self.slave_object.start_task()

    def declare_queue(self, queue_name="abc"):
        """Declare ``queue_name`` as a durable queue on the channel."""
        print(f"Trying to declare queue inside consumer({queue_name})...")
        self.channel.queue_declare(queue=queue_name, durable=True)

    def close(self):
        """Close the channel, then the connection."""
        print("Closing Receiver")
        self.channel.close()
        self.connection.close()

    def _consume_message_setup(self, queue_name):
        """Register the per-message callback for ``queue_name``."""
        def message_consume(ch, method, properties, body):
            print("I am inside the message_consume")
            message = json.loads(body)
            # Hand the decoded message to the worker queue, then acknowledge.
            self.slave_object.execute_task(message)
            ch.basic_ack(delivery_tag=method.delivery_tag)

        self.channel.basic_qos(prefetch_count=1)
        self.channel.basic_consume(on_message_callback=message_consume,
                                   queue=queue_name)

    def consume_message(self, queue_name="abc"):
        """Register the consumer and start consuming from ``queue_name``."""
        print("I am starting the rabbitmq start_consuming")
        self._consume_message_setup(queue_name)
        self.channel.start_consuming()
#This is the slaveConsumer
#!/usr/bin/env python
import pika
import ssl
import json
import requests
import threading
import queue
import os
class slave:
    """In-process worker queue: a bounded job queue drained by daemon threads."""

    def __init__(self):
        # At most 3 jobs wait in the queue; put() blocks while it is full.
        self.job_queue = queue.Queue(maxsize=3)
        # Most recent object passed to execute_task().
        self.job_item = ""

    def start_task(self, num_workers=1):
        """Start ``num_workers`` daemon threads that process queued jobs.

        The default of one worker matches the original behaviour.
        """
        def _worker():
            while True:
                json_body = self.job_queue.get()
                try:
                    self._parse_object_from_queue(json_body)
                except Exception as exc:
                    # Thread boundary: report the failure but keep consuming,
                    # otherwise one bad message stops all further processing.
                    print(f"Worker failed to process job: {exc!r}")
                finally:
                    # Always mark the job finished so join() callers can
                    # make progress.
                    self.job_queue.task_done()

        for _ in range(num_workers):
            threading.Thread(target=_worker, daemon=True).start()

    def execute_task(self, obj):
        """Queue ``obj`` for processing; blocks while the queue is full."""
        print("Inside execute_task")
        self.job_item = obj
        self.job_queue.put(self.job_item)
        # print(self.job_queue.queue)

    def _parse_object_from_queue(self, json_body):
        """Process one job taken from the queue (runs on a worker thread)."""
        # .get() tolerates a message without an 'entity' key.
        entity = json_body.get('entity')
        if entity == 'Hello':
            print("Inside Slave: Hello")
        elif entity == 'World':
            print("Inside Slave: World")
        # Deliberately no self.job_queue.join() here: join() waits for
        # task_done() on every queued job, including the one this worker is
        # still processing, so calling it from the worker blocks forever.

Streaming Pipeline in Dataflow to Bigtable Python

I wanted to read the pubsub topic and write data to BigTable with the dataflow code written in Python. I could find the sample code in JAVA but not in Python.
How can we assign columns in a row from pubsub to different column families and write the data to Bigtable?
To write to Bigtable in a Dataflow pipeline, you'll need to create direct rows and pass them to the WriteToBigTable doFn. Here is a brief example that just passes in the row keys and adds one cell for each key, nothing too fancy:
import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud.bigtable import row
class MyOptions(PipelineOptions):
    """Pipeline options naming the Bigtable project, instance and table."""

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register the three ``--bigtable-*`` arguments on ``parser``."""
        parser.add_argument(
            '--bigtable-project',
            help='The Bigtable project ID, this can be different than your '
            'Dataflow project',
            default='bigtable-project')
        parser.add_argument(
            '--bigtable-instance',
            help='The Bigtable instance ID',
            default='bigtable-instance')
        parser.add_argument(
            '--bigtable-table',
            help='The Bigtable table ID in the instance.',
            default='bigtable-table')
class CreateRowFn(beam.DoFn):
    """Turn each incoming row key into a ``DirectRow`` holding one cell."""

    def process(self, key):
        """Return a one-element list with the row built for ``key``."""
        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            "stats_summary",
            b"os_build",
            b"android",
            # Timezone-aware UTC instead of naive local time, so the cell
            # timestamp does not depend on the worker's local time zone.
            datetime.datetime.now(datetime.timezone.utc))
        return [direct_row]
def run(argv=None):
    """Build and run the pipeline."""
    options = MyOptions(argv)
    row_keys = ["phone#4c410523#20190501",
                "phone#4c410523#20190502"]
    with beam.Pipeline(options=options) as p:
        # Row keys -> DirectRow objects -> Bigtable writes.
        rows = p | beam.Create(row_keys) | beam.ParDo(CreateRowFn())
        rows | WriteToBigTable(
            project_id=options.bigtable_project,
            instance_id=options.bigtable_instance,
            table_id=options.bigtable_table)


if __name__ == '__main__':
    run()
I am just starting to explore this now and can link to a more polished version on GitHub once it's complete. Hope this helps you get started.
Building on top of what was proposed and adding PubSub, here’s a working version..
Pre requisites
GCS Bucket created (for Dataflow temp/staging files)
PubSub topic created
PubSub subscription created
BigTable instance created
BigTable table created
BigTable column family must be created (no visible error otherwise !)
Example of the latter with cbt:
cbt -instance test-instance createfamily test-table cf1
Code
Define and run the Dataflow pipeline.
# Packages
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud import pubsub_v1
# Classes
class CreateRowFn(beam.DoFn):
    """Turn each incoming key into a ``DirectRow`` with one 'cf1:field1' cell."""

    def __init__(self, pipeline_options):
        super().__init__()
        self.instance_id = pipeline_options.bigtable_instance
        self.table_id = pipeline_options.bigtable_table

    def process(self, key):
        """Yield the row built for ``key``."""
        # Imports are kept inside process(), as in the original snippet.
        from google.cloud.bigtable import row
        import datetime
        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            'cf1',
            'field1',
            'value1',
            # Timezone-aware UTC instead of naive local time, so the cell
            # timestamp does not depend on the worker's local time zone.
            timestamp=datetime.datetime.now(datetime.timezone.utc))
        yield direct_row
# Options
class XyzOptions(PipelineOptions):
    """Pipeline options carrying the Bigtable project, instance and table."""

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register the three ``--bigtable_*`` arguments on ``parser``."""
        # The stray trailing commas after the first two calls are removed;
        # they only wrapped each call's result in a throwaway tuple.
        parser.add_argument('--bigtable_project', default='nested')
        parser.add_argument('--bigtable_instance', default='instance')
        parser.add_argument('--bigtable_table', default='table')
# Options shared by the pipeline and by CreateRowFn below.
# NOTE(review): PROJECT, REGION, TEMP_LOCATION, STAGING_LOCATION,
# REQUIREMENTS_FILE, INSTANCE and TABLE are not defined in this snippet;
# presumably constants the reader must set before running -- confirm.
pipeline_options = XyzOptions(
save_main_session=True, streaming=True,
runner='DataflowRunner',
project=PROJECT,
region=REGION,
temp_location=TEMP_LOCATION,
staging_location=STAGING_LOCATION,
requirements_file=REQUIREMENTS_FILE,
bigtable_project=PROJECT,
bigtable_instance=INSTANCE,
bigtable_table=TABLE)
# Pipeline
def run(argv=None):
    """Read messages from Pub/Sub and write one Bigtable row per message.

    ``argv`` is accepted but not used; the options come from the
    module-level ``pipeline_options``.
    """
    with beam.Pipeline(options=pipeline_options) as p:
        input_subscription = f"projects/{PROJECT}/subscriptions/{SUBSCRIPTION}"
        messages = p | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
            subscription=input_subscription).with_output_types(bytes)
        keys = messages | 'Conversion UTF-8 bytes to string' >> beam.Map(
            lambda msg: msg.decode('utf-8'))
        rows = keys | 'Conversion string to row object' >> beam.ParDo(
            CreateRowFn(pipeline_options))
        _ = rows | 'Writing row object to BigTable' >> WriteToBigTable(
            project_id=pipeline_options.bigtable_project,
            instance_id=pipeline_options.bigtable_instance,
            table_id=pipeline_options.bigtable_table)


if __name__ == '__main__':
    run()
Publish a message b"phone#1111" to PubSub topic (e.g. using the Python PublisherClient()).
Table content (using happybase)
b'phone#1111': {b'cf1:field1': b'value1'}
Row length: 1

Spark Streaming - updateStateByKey and caching data

I have a problem with using the updateStateByKey function and caching some big data at the same time. Here is an example.
Let's say I get data (lastname,age) from Kafka. I want to keep the current age for every person, so I use updateStateByKey. I also want to know the name of every person, so I join the output with an external table (lastname,name), e.g. from Hive. Let's assume it's a really big table, so I don't want to load it in every batch. And that is where the problem is.
All works well, when I load table in every batch, but when I try to cache table, StreamingContext doesn't start. I also tried to use registerTempTable and later join data with sql but i got the same error.
Seems like the problem is checkpoint needed by updateStateByKey. When I remove updateStateByKey and leave checkpoint i got error, but when I remove both it works.
Error I'm getting: pastebin
Here is code:
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
# function to keep actual state
def updateFunc(channel, actualChannel):
    """Return the newest value for a key, or the previous state if none.

    ``channel`` holds the values that arrived for the key (possibly empty or
    ``None``); ``actualChannel`` is the previously stored state (``None`` on
    first sight of the key).
    """
    # The original wrapped `channel[-1]` in a catch-all try/except and then
    # assigned to `channel` without ever reading it again. Checking for a
    # non-empty `channel` gives the same result without hiding other errors.
    if channel:
        return channel[-1]
    return actualChannel
def splitFunc(row):
    """Split a whitespace-separated record into an ``(lname, age)`` tuple.

    Raises ValueError (from the unpacking) unless the record has exactly two
    fields.
    """
    # split() with no arguments already ignores leading/trailing whitespace,
    # so the separate strip() was redundant.
    lname, age = row.split()
    return (lname, age)
def createContext(brokers,topics):
    """Build the StreamingContext for the Kafka -> state -> join pipeline.

    Reads ``appName`` from module scope. ``brokers`` is passed to Kafka as
    ``metadata.broker.list``; ``topics`` is the single topic to read.
    Returns the StreamingContext without starting it.
    """
    # some conf
    conf = (SparkConf().setAppName(appName)
            .set("spark.streaming.stopGracefullyOnShutdown", "true")
            .set("spark.dynamicAllocation.enabled", "false")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.sql.shuffle.partitions", '100'))
    # create SparkContext
    sc = SparkContext(conf=conf)
    # create HiveContext
    sqlContext = HiveContext(sc)
    # create Streaming Context
    # NOTE(review): no ssc.checkpoint(...) call is visible in this function;
    # confirm where the checkpoint directory for updateStateByKey is set.
    ssc = StreamingContext(sc, 5)
    # read big_df and cache (not work, Streaming Context not start)
    # NOTE: big_df is created here and then captured by joinTable below; this
    # is the arrangement the post reports as failing to start.
    big_df = sqlContext.sql('select lastname,name from `default`.`names`')
    big_df.cache().show(10)

    # join table
    def joinTable(time,rdd):
        if not rdd.isEmpty():
            df = HiveContext.getOrCreate(SparkContext.getOrCreate()).createDataFrame(rdd,['lname','age'])
            # read big_df (work)
            #big_df = HiveContext.getOrCreate(SparkContext.getOrCreate()).sql('select lastname,name from `default`.`names`')
            # join DMS
            df2 = df.join(big_df,df.lname == big_df.lastname,"left_outer")
            return df2.map(lambda row:row)

    # streaming
    kvs = KafkaUtils.createDirectStream(ssc, [topics], {'metadata.broker.list': brokers})
    # `lambda (k,v): ...` is Python-2-only syntax; indexing the (key, value)
    # pair works on both Python 2 and Python 3.
    kvs.map(lambda kv: splitFunc(kv[1])).updateStateByKey(updateFunc).transform(joinTable).pprint()
    return ssc
if __name__ == "__main__":
    # Read by createContext() as a module-level name.
    appName="SparkCheckpointUpdateSate"
    if len(sys.argv) != 3:
        print("Usage: SparkCheckpointUpdateSate.py <broker_list> <topic>")
        # sys.exit is the documented way to exit a program; the bare `exit`
        # helper is added by the `site` module for interactive use.
        sys.exit(-1)
    brokers, topics = sys.argv[1:]
    # getOrCreate Context
    checkpoint = 'SparkCheckpoint/checkpoint'
    ssc = StreamingContext.getOrCreate(checkpoint,lambda: createContext(brokers,topics))
    # start streaming
    ssc.start()
    ssc.awaitTermination()
Can you tell me how to properly cache data when checkpoint is enabled? Maybe there is some workaround I don't know.
Spark ver. 1.6
I got this working using a lazily instantiated global instance of big_df. Something like that is done in recoverable_network_wordcount.py.
def getBigDf():
    """Return the shared big_df, running the Hive query only on first use."""
    module_globals = globals()
    if 'bigdf' not in module_globals:
        hive = HiveContext.getOrCreate(SparkContext.getOrCreate())
        module_globals['bigdf'] = hive.sql('select lastname,name from `default`.`names`')
    return module_globals['bigdf']
def createContext(brokers,topics):
    ...
    def joinTable(time,rdd):
        ...
        # read big_df (work)
        # The helper is defined as getBigDf; the original call spelled it
        # getBigDF, which would raise NameError.
        big_df = getBigDf()
        # join DMS
        df2 = df.join(big_df,df.lname == big_df.lastname,"left_outer")
        return df2.map(lambda row:row)
    ...
It seems that in streaming, all data must be cached inside the streaming processing, not before it.

Resources