Delete HdfsTarget before running a SparkSubmitTask - apache-spark

Community, I'd like to delete the HdfsTarget folder before running a SparkSubmitTask. What is the best practice? So far I have tried the two options marked (1) and (2) in the attached code, without success:
(1) The dependent/required job doesn't get executed if the HdfsTarget already exists.
(2) The tasks are executed in parallel when called with yield.
import luigi
import luigi.format
import luigi.contrib.hdfs
from luigi.contrib.spark import SparkSubmitTask


class CleanUp(luigi.Task):
    path = luigi.Parameter()

    def run(self):
        self.target = luigi.contrib.hdfs.HdfsTarget(self.path, format=luigi.format.Gzip)
        if self.target.exists():
            self.target.remove(skip_trash=True)


class MySparkTask(SparkSubmitTask):
    output = luigi.Parameter()

    driver_memory = '8g'
    executor_memory = '3g'
    num_executors = 5

    app = 'my-app.jar'
    entry_class = 'com.company.MyJob'

    def app_options(self):
        return ['/input', self.output]

    def requires(self):
        (1)

    def output(self):
        return luigi.contrib.hdfs.HdfsTarget(self.output, format=luigi.format.Gzip)


class RunAll(luigi.Task):
    ''' Dummy task that triggers execution of the other tasks '''
    result_dir = '/output'

    def requires(self):
        (2)
        return MySparkTask(self.result_dir)
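One possible pattern, sketched below under assumptions not stated in the original post, is to do the cleanup inside run() and then hand off to the Spark task as a luigi dynamic dependency: yielding a task from run() suspends the current task until the yielded task completes, so the removal and the Spark job run sequentially rather than in parallel. The RunAllSequential name and the absence of an output() (so the wrapper re-runs every time) are illustrative choices, not the asker's code:

# A minimal sketch (not from the original post): clean up first, then yield
# the Spark task as a dynamic dependency so it only starts afterwards.
class RunAllSequential(luigi.Task):
    result_dir = luigi.Parameter(default='/output')

    def run(self):
        target = luigi.contrib.hdfs.HdfsTarget(self.result_dir, format=luigi.format.Gzip)
        if target.exists():
            target.remove(skip_trash=True)
        # Dynamic dependency: luigi suspends this task, runs MySparkTask,
        # and only then considers RunAllSequential complete.
        yield MySparkTask(self.result_dir)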

Related

How to publish to Pub/Sub from Dataflow in batch (efficiently)?

I want to publish messages to a Pub/Sub topic with some attributes from a Dataflow job in batch mode.
My Dataflow pipeline is written with Python 3.8 and apache-beam 2.27.0.
It works with @Ankur's solution here: https://stackoverflow.com/a/55824287/9455637
But I think it could be more efficient with a shared Pub/Sub client: https://stackoverflow.com/a/55833997/9455637
However, an error occurred:
return StockUnpickler.find_class(self, module, name)
AttributeError: Can't get attribute 'PublishFn' on <module 'dataflow_worker.start' from '/usr/local/lib/python3.8/site-packages/dataflow_worker/start.py'>
Questions:
Would the shared publisher implementation improve Beam pipeline performance?
Is there another way to avoid the pickling error on my shared publisher client?
My Dataflow pipeline:
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from google.cloud.pubsub_v1 import PublisherClient
import json
import argparse
import re
import logging


class PubsubClient(PublisherClient):
    def __reduce__(self):
        return self.__class__, (self.batch_settings,)


# The DoFn to perform on each element in the input PCollection.
class PublishFn(beam.DoFn):
    def __init__(self):
        from google.cloud import pubsub_v1

        batch_settings = pubsub_v1.types.BatchSettings(
            max_bytes=1024,  # One kilobyte
            max_latency=1,   # One second
        )
        self.publisher = PubsubClient(batch_settings)
        super().__init__()

    def process(self, element, **kwargs):
        future = self.publisher.publish(
            topic=element["topic"],
            data=json.dumps(element["data"]).encode("utf-8"),
            **element["attributes"],
        )
        return future.result()


def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source_table_id",
        dest="source_table_id",
        default="",
        help="BigQuery source table <project>.<dataset>.<table> with columns (topic, attributes, data)",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    bq_source_table = known_args.source_table_id
    bq_table_regex = r"^(?P<PROJECT_ID>[a-zA-Z0-9_-]*)[\.|\:](?P<DATASET_ID>[a-zA-Z0-9_]*)\.(?P<TABLE_ID>[a-zA-Z0-9_-]*)$"

    regex_match = re.search(bq_table_regex, bq_source_table)
    if not regex_match:
        raise ValueError(
            f"Bad BigQuery table id : `{bq_source_table}` please match {bq_table_regex}"
        )

    table_ref = bigquery.TableReference(
        projectId=regex_match.group("PROJECT_ID"),
        datasetId=regex_match.group("DATASET_ID"),
        tableId=regex_match.group("TABLE_ID"),
    )

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromBqTable"  #
            >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)  # Each row contains: topic / attributes / data
            | "PublishRowsToPubSub" >> beam.ParDo(PublishFn())
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run()
After fussing with this a bit, I think I have an answer that works consistently and is, if not world-beatingly performant, at least tolerably usable:
import logging

import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage

from google.cloud.pubsub_v1 import PublisherClient
from google.cloud.pubsub_v1.types import (
    BatchSettings,
    LimitExceededBehavior,
    PublishFlowControl,
    PublisherOptions,
)


class PublishClient(PublisherClient):
    """
    You have to override __reduce__ to make PublisherClient pickleable 😡 😤 🤬
    Props to 'Ankur' and 'Benjamin' on SO for figuring this part out; god knows
    I would not have...
    """

    def __reduce__(self):
        return self.__class__, (self.batch_settings, self.publisher_options)


class PubsubWriter(beam.DoFn):
    """
    beam.io.gcp.pubsub does not yet support batch operations, so
    we do this the hard way. it's not as performant as the native
    pubsubio but it does the job.
    """

    def __init__(self, topic: str):
        self.topic = topic
        self.window = beam.window.GlobalWindow()
        self.count = 0

    def setup(self):
        batch_settings = BatchSettings(
            max_bytes=1e6,  # 1MB
            # by default it is 10 ms; it should be less than the timeout used
            # in future.result() to avoid timeouts
            max_latency=1,
        )

        publisher_options = PublisherOptions(
            enable_message_ordering=False,
            # better to be slow than to drop messages during a recovery...
            flow_control=PublishFlowControl(limit_exceeded_behavior=LimitExceededBehavior.BLOCK),
        )

        self.publisher = PublishClient(batch_settings, publisher_options)

    def start_bundle(self):
        self.futures = []

    def process(self, element: PubsubMessage, window=beam.DoFn.WindowParam):
        self.window = window
        self.futures.append(
            self.publisher.publish(
                topic=self.topic,
                data=element.data,
                **element.attributes,
            )
        )

    def finish_bundle(self):
        """Iterate over the list of async publish results and block
        until all of them have either succeeded or timed out. Yield
        a WindowedValue of the success/fail counts."""
        results = []
        self.count = self.count + len(self.futures)
        for fut in self.futures:
            try:
                # future.result() blocks until success or timeout;
                # with the 1s max_latency set upstairs in BatchSettings,
                # we should never spend much time waiting here.
                results.append(fut.result(timeout=60))
            except Exception as ex:
                results.append(ex)

        res_count = {"success": 0}
        for res in results:
            if isinstance(res, str):
                res_count["success"] += 1
            else:
                # if it's not a string, it's an exception
                msg = str(res)
                if msg not in res_count:
                    res_count[msg] = 1
                else:
                    res_count[msg] += 1

        logging.info(f"Pubsub publish results: {res_count}")

        yield beam.utils.windowed_value.WindowedValue(
            value=res_count,
            timestamp=0,
            windows=[self.window],
        )

    def teardown(self):
        logging.info(f"Published {self.count} messages")
The trick is that if you call future.result() inside the process() method, you block until that single message is successfully published. Instead, collect a list of futures and, at the end of the bundle, make sure they have all either been published or definitively timed out. Some quick testing with one of our internal pipelines suggested that this approach can publish 1.6M messages in ~200s.
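For context, here is a sketch of how this DoFn might be wired into the question's pipeline. The table string and topic path are placeholders, the row fields follow the question's (topic, attributes, data) schema, and the single fixed output topic reflects this DoFn's design; none of this wiring is part of the original answer.

# Hypothetical wiring of PubsubWriter into a batch pipeline; placeholders in
# angle brackets must be replaced with real project/dataset/table/topic names.
import json

import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.options.pipeline_options import PipelineOptions


def run_publish(pipeline_args=None):
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        (
            p
            | "ReadFromBqTable" >> bigquery.ReadFromBigQuery(
                table="<project>:<dataset>.<table>", use_json_exports=True
            )
            | "ToPubsubMessage" >> beam.Map(
                lambda row: PubsubMessage(
                    data=json.dumps(row["data"]).encode("utf-8"),
                    attributes=row["attributes"],
                )
            )
            | "PublishRowsToPubSub" >> beam.ParDo(
                PubsubWriter(topic="projects/<project>/topics/<topic>")
            )
        )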

Pytest object created by object assert_called_once_with

I know how to test whether an injected object was called with a specific argument. But in my case the injected object creates an object, that object creates another object, and I want to test whether that last object was called with the right argument.
In the example below, the question is whether c.drive was called with 100 as argument:
class car:
    def drive(self, distance):
        print("so fast")


class car_shop:
    def buy_car(self):
        return car()


class shop_shop:
    def buy_shop(self):
        return car_shop()


class processor:
    def __init__(self, sshop):
        self.sshop = sshop

    def run(self):
        cshop = self.sshop.buy_shop()
        c = cshop.buy_car()
        c.drive(100)


def main():
    sshop = shop_shop()
    proc = processor(sshop)
    proc.run()


if __name__ == "__main__":
    main()
Is there a way to test that?
Since this was requested, here is my approach for testing these objects:
import pytest
from unittest.mock import Mock
from object_returns_object_test_for_arguments import processor, shop_shop


@pytest.fixture
def mock_shop_shop():
    return Mock(spec=shop_shop)


def test_processor_car_called_with_100(mock_shop_shop):
    proc = processor(mock_shop_shop)
    proc.run()
    assert mock_shop_shop.car_shop.car.drive.assert_called_once_with(100)
    assert mock_shop_shop.car_shop.car.drive.call_count == 1
If using just the code shown in the question, you only have to mock car.drive. This could be done, for example, like this:
from unittest import mock
from object_returns_object_test_for_arguments import processor, shop_shop


@mock.patch('object_returns_object_test_for_arguments.car.drive')
def test_processor_car_called_with_100(drive_mock):
    proc = processor(shop_shop())
    proc.run()
    drive_mock.assert_called_once_with(100)
As I don't know your real code, you may have to mock more stuff.
As an aside: by convention, Python class names are written in CapWords (CamelCase) style.
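If you want to keep the Mock(spec=shop_shop) fixture approach from the question, a hedged alternative sketch is to follow the return_value chain when asserting, since each call on a mock returns a child mock. The test name below is illustrative; the module path mirrors the question's.

# Sketch: assert through the return_value chain instead of attribute names.
from unittest.mock import Mock
from object_returns_object_test_for_arguments import processor, shop_shop


def test_processor_drive_called_with_100():
    sshop = Mock(spec=shop_shop)
    proc = processor(sshop)
    proc.run()
    # buy_shop() returns a mock; its buy_car() returns another mock whose
    # drive() call we can inspect via the return_value chain.
    car_mock = sshop.buy_shop.return_value.buy_car.return_value
    car_mock.drive.assert_called_once_with(100)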

How to save python luigi terminal output in log file with timestamp in log file name

I have a simple luigi pipeline.
import luigi
import subprocess
import row_count_test


class TaskOne(luigi.Task):
    def requires(self):
        return None

    def run(self):
        output = row_count_test()
        if output:
            with self.output().open('w') as open_file:
                open_file.write('{}'.format(output))

    def output(self):
        return luigi.LocalTarget('TaskOne.txt')


class TaskTwo(luigi.Task):
    def requires(self):
        return TaskOne()

    def run(self):
        subprocess.call('rm *.txt', shell=True)


if __name__ == "__main__":
    luigi.run()
I run the following code through command line:
python luigi_demo.py --scheduler-host localhost TaskTwo
I want to be able to save the terminal output to a log file. I also want to be able to add a time stamp to the log file name. I know there's a way to do it through bash commands. Is there a way to do this using luigi? I looked at the luigi.cfg documentation and it wasn't too helpful. A simple example would be greatly appreciated.
You just have to make the following changes to your TaskTwo.
import datetime as dt


class TaskTwo(luigi.Task):
    date = luigi.DateSecondParameter(default=dt.datetime.now())

    def output(self):
        # Here you create a file with your date in its name.
        return luigi.LocalTarget('path/to/your/log/file%s.txt' % self.date)

    def requires(self):
        return TaskOne()

    def run(self):
        with self.output().open('w') as f:
            subprocess.call('rm *.txt', shell=True, stdout=f)
Also, on a side note: if you want to delete the file created in TaskOne, you can remove all the code in run() and just call self.input().remove():
class TaskTwo(luigi.Task):
    date = luigi.DateSecondParameter(default=dt.datetime.now())

    def output(self):
        return luigi.LocalTarget('path/to/your/log/file%s.txt' % self.date)

    def requires(self):
        return TaskOne()

    def run(self):
        # this should remove the file created in TaskOne
        self.input().remove()
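The answer above timestamps the file written by TaskTwo rather than capturing luigi's own console output. A different, hedged option (not from the original answer) is to configure Python logging with a timestamped filename before calling luigi.run(), and tell luigi not to set up its own logging (the [core] no_configure_logging option in luigi.cfg); whether this captures everything you see in the terminal depends on your luigi version and configuration.

# Sketch: write log output to a timestamped file; assumes
# [core] no_configure_logging=true in luigi.cfg so luigi keeps this config.
import datetime as dt
import logging

import luigi

if __name__ == "__main__":
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    logging.basicConfig(
        filename="luigi_run_{}.log".format(timestamp),
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    luigi.run()  # e.g. python luigi_demo.py --scheduler-host localhost TaskTwo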

Coroutine to mimic an OS's scheduler

I am following "A Curious Course on Coroutines and Concurrency" to learn coroutines, and ran into a problem getting the following code to run.
The code mimics an operating system scheduling tasks:
from queue import Queue


class Task:
    taskid = 0

    def __init__(self, target):
        Task.taskid += 1  # count the tasks
        self.tid = Task.taskid
        self.target = target
        self.sendval = None

    def run(self):
        return self.target.send(self.sendval)


class Scheduler:
    def __init__(self):
        self.ready = Queue()  # a queue of tasks that are ready to run
        self.taskmap = {}     # dictionary that keeps track of all active tasks (each task has a unique integer task ID)

    def new(self, target):  # introduce a new task to the scheduler
        newtask = Task(target)
        self.taskmap[newtask.tid] = newtask

    def schedule(self, task):
        self.ready.put(task)

    def mainloop(self):
        while self.taskmap:          # I think the problem is here
            task = self.ready.get()  # I think it should be while self.ready
            result = task.run()
            self.schedule(task)
Test it with
def foo():
    while True:
        print("I'm foo")
        yield


def bar():
    while True:
        print("I'm bar")
        yield
It hangs instead of returning a value:
In [85]: schedule.new(foo())
In [86]: schedule.new(bar())
In [87]: schedule.mainloop()
^C---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
I reviewed the code and found a problem with:
def mainloop(self):
    while self.taskmap:          # I think the problem is here
        task = self.ready.get()  # I think it should be while self.ready
        result = task.run()
        self.schedule(task)
With while self.taskmap, there is no code that ever removes elements from taskmap, so it is an infinite loop.
I changed it to:
def mainloop(self):
    while self.taskmap:          # I think the problem is here
        task = self.ready.get()  # I think it should be while self.ready
        result = task.run()
        self.schedule(task)
However, it still does not work.
What is the problem with my code?
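Not part of the original question, but for reference, here is a sketch of the two fixes that make this scheduler behave as intended: new() must also put the task on the ready queue (without this, ready.get() blocks forever with no output, which is the hang observed above), and exhausted generators must be removed from taskmap so the while loop can end. With the infinite foo/bar generators the loop will still alternate their output until interrupted, which is the intended behaviour of the example.

# Sketch of a corrected Scheduler (Task is as defined in the question).
from queue import Queue


class Scheduler:
    def __init__(self):
        self.ready = Queue()
        self.taskmap = {}

    def new(self, target):
        newtask = Task(target)
        self.taskmap[newtask.tid] = newtask
        self.schedule(newtask)       # without this, ready.get() blocks forever
        return newtask.tid

    def exit(self, task):
        del self.taskmap[task.tid]   # forget finished tasks

    def schedule(self, task):
        self.ready.put(task)

    def mainloop(self):
        while self.taskmap:
            task = self.ready.get()
            try:
                task.run()
            except StopIteration:    # the generator ran to completion
                self.exit(task)
                continue
            self.schedule(task)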

Why are my class functions executed when importing the class?

It's probably a very basic question, but I was unable to find an answer that I could thoroughly understand.
In my main program main_program.py, I'm importing a class that itself imports another class:
in main_program.py:
from createTest import *
in createTest.py:
print("TEST")
from recordRecallQused import *
print("TEST")
now in recordRecallQused.py:
import csv


class recordRecallQused:
    def __init__(self, path):
        self.path = path
        try:
            with open(self.path, 'r', newline='') as question_used:
                question_used.closed
        except IOError:
            # if the file doesn't exist
            print("the file doesn't exist")
            with open(self.path, 'w', newline='') as question_used:
                question_used.closed

    def recallQused(self):
        list_Qused = []
        print("I'm being executed")
        with open(self.path, 'r', newline='') as question_used:
            questionused = csv.reader(question_used)
            for item in questionused:
                if len(item) > 0:
                    list_Qused.append(item[0])
            question_used.closed
        return list_Qused
What I obtain in the kernel:
>TEST
>I'm being executed
>TEST
So functions inside the class are executed even though they are not called. I have read that this is "normal", since def is not a mere declaration but a "live" statement that is executed when the module is loaded.
Still, I have tried something much simpler:
in main_program_TEST.py
from class1 import *
a = class1()
in class1.py:
print("t")
from class2 import *
print("t")
class class1:
def __init__(self):
pass
def message(self):
print("prout")
in class2.py:
class class2:
    def __init__(self):
        pass

    def message(self):
        print("prout2")
When executing main_program_TEST.py the kernel displays
>t
>t
So this time the functions in class2.py have not been executed; otherwise the kernel would instead show:
>t
>prout2
>t
I really wonder why.
Stephen Rauch, you are right: part of my code in recordRecallQused.py was calling the function.
"""#load all list
print("loading questions info")
# questions info: answers, category
list_AllQ = []
with open('questionsInfoTo130.csv', newline = '') as csvfile:
questionsInfo = csv.reader(csvfile)
# loop over the questions information rows
for (i,row) in enumerate(questionsInfo):
if(i!=0):
list_AllQ.append(row)
csvfile.close()
path = 'question_used.csv'"""
list_AllQ = [[0,1,2,1,"que"],[0,1,2,2,"que"],[0,1,2,3,"que"],[0,1,2,4,"que"],[0,1,2,55,"que"],[0,1,2,6,"que"],[0,1,2,7,"que"],[0,1,2,8,"que"],[0,1,2,9,"que"]]
a = recordRecallQused('question_used.csv')
list_Qused = a.recallQused()
list_avQ = a.createListavQ(list_Qused, list_AllQ)
list_Qtest = a.createListQtest(list_avQ)
a.recordQused(list_Qtest)
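Not part of the original post, but the standard remedy is worth sketching: module-level statements run on every import, so wrapping them in an if __name__ == "__main__": guard keeps from recordRecallQused import * from executing them. The snippet below simply reuses the names from the block above.

# Sketch: the module-level test-drive code only runs when the file is
# executed directly, not when it is imported by createTest.py.
if __name__ == "__main__":
    a = recordRecallQused('question_used.csv')
    list_Qused = a.recallQused()
    list_avQ = a.createListavQ(list_Qused, list_AllQ)
    list_Qtest = a.createListQtest(list_avQ)
    a.recordQused(list_Qtest)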