I am writing Apache Beam Python code that reads data from a Pub/Sub subscription and prints it to the console, but it is getting stuck and never completes.
import argparse
import logging
import ast
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class FlattenJson(beam.DoFn):
    def process(self, element, *args, **kwargs):
        print(f"Element: {element}")


class DecodeMsgs(beam.DoFn):
    def process(self, element):
        print("############ Before Decode: element", element)
        logging.info(f"Before Decode: {element}")
        return [element]


class PubsubStreamingToBq:
    def __init__(self):
        pass

    def run(self, subscription, pipeline_args=None):
        pipeline_options = PipelineOptions(pipeline_args, streaming=True, save_main_session=True)
        print(f"Args: {pipeline_args}")
        with beam.Pipeline(options=pipeline_options) as p:
            pubsub_logs = (
                p
                | "Read Pubsub Msg" >> beam.io.ReadFromPubSub(subscription=subscription)
                | "Decoding" >> beam.ParDo(DecodeMsgs())
            )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--subscription', required=True)
    known_args, pipeline_args = parser.parse_known_args()
    print(f"Known Args: {known_args}")
    PubsubStreamingToBq_obj = PubsubStreamingToBq()
    PubsubStreamingToBq_obj.run(known_args.subscription, pipeline_args)
Could anyone let me know what the issue is?
I am using Beam version 2.27.0 and Python 3.6.
I want to create a SunRPC client in Python using the xdrlib library; the SunRPC server is already implemented in C. I have implemented an RPC client in Python over UDP by referencing the following link:
https://svn.python.org/projects/stackless/trunk/Demo/rpc/rpc.py
It is giving a timeout error as well as a "cannot unpack NoneType object" error.
Can anyone guide me on how this can be done? There is very little information available on this online. Has anyone implemented this type of code? Please help, I have been struggling with this for about a week now.
Here is my client code:
import rpc
import rpc_new
from tq_const import *
from tq_type import *
import tq_pack
import socket
import os


class PartialTQClient:
    def __init__(self):
        pass

    def addpackers(self):
        self.packer = tq_pack.TQPacker(self)
        self.unpacker = tq_pack.TQUnpacker(self, '')

    def unpack_month_temperatures(self):
        return self.unpacker.unpack_array(self.unpacker.unpack_uint)

    def call(self, month):
        res = self.make_call(0, month, self.packer.pack_uint, self.unpack_month_temperatures)
        return res


class UDPTQClient(PartialTQClient, rpc.RawUDPClient):
    def __init__(self, host):
        rpc.RawUDPClient.__init__(self, host, TQ_PROGRAM, TQ_VERSION, TQ_PORT)
        PartialTQClient.__init__(self)


if __name__ == "__main__":
    tqcl = UDPTQClient("127.0.0.1")
    print(tqcl)
    res = tqcl.call(12)
    #print("Got result", res)
Here is my server code:
import rpc
import rpc_new
from tq_const import *
from tq_type import *
import tq_pack
import socket
import os


class TQServer(rpc.UDPServer):
    print("Inside TQServer")

    def handle_0(self):
        print("Got request")
        m = self.unpacker.unpack_uint()
        print("Arguments was", m)
        self.turn_around()
        self.packer.pack_array([1, 2, 3], self.packer.pack_int)
        #res = PFresults(self, status=TRUE, phone="555-12345")
        #res.pack()


if __name__ == "__main__":
    s = TQServer("", TQ_PROGRAM, TQ_VERSION, TQ_PORT)
    print("Service started...", s)
    try:
        print("Trying")
        s.loop()
    finally:
        print("Service interrupted.")
When I run the client and server on localhost I get the following error: TypeError: cannot unpack non-iterable NoneType object
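For what it's worth, the XDR layer itself can be exercised on its own with the standard-library xdrlib module, independent of the RPC transport and of the tq_pack module above; here is a minimal round-trip sketch of the types being exchanged (a uint argument and an array of uints):

import xdrlib

# Pack the values the way the client/server above do.
packer = xdrlib.Packer()
packer.pack_uint(12)                           # the "month" argument sent by the client
packer.pack_array([1, 2, 3], packer.pack_int)  # the reply array packed by the server
wire_bytes = packer.get_buffer()

# Unpack them again, mirroring unpack_month_temperatures().
unpacker = xdrlib.Unpacker(wire_bytes)
month = unpacker.unpack_uint()
temperatures = unpacker.unpack_array(unpacker.unpack_uint)
print(month, temperatures)  # 12 [1, 2, 3]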
I want to download videos from YouTube in parallel, but my code ends with a "PicklingError" exception. Can you guys help me with the code and how it should look, please?
Here is another (fixed) variant:
import sys
#from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Pool
from pytube import YouTube
from youtubeMultiDownloader import UiMainWindow
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import QFileDialog


class YouTubeInstance:
    def __init__(self, path):
        self.youtube = YouTube
        self.path = path
        #self.ui_obj = ui_obj

    def download_file(self, url):
        self.youtube(url).streams.get_highest_resolution().download(self.path)
        #self.ui.ui.youtube_outputs.setText(f'Video \'{self.youtube.title}\' has been downloaded successfully!')


class YouTubeMultiDownloader(QtWidgets.QMainWindow):
    def __init__(self):
        super().__init__()
        self.pool = Pool
        self.ui = UiMainWindow()
        self.ui.setup_ui(self)
        self.path_to_dir = None
        self.urls = None

    def _get_urls_from_form(self):
        self.urls = self.ui.youtube_urls.toPlainText().split('\n')
        return len(self.urls)

    def choose_directory(self):
        self.path_to_dir = str(QFileDialog.getExistingDirectory(self, "Select Directory"))

    def run_multi_downloads(self):
        youtube = YouTubeInstance(self.path_to_dir)
        self.pool(self._get_urls_from_form()).map(youtube.download_file, self.urls)


if __name__ == "__main__":
    app = QtWidgets.QApplication([])
    application = YouTubeMultiDownloader()
    application.show()
    sys.exit(app.exec_())
Updated: screenshots were attached of my UI, error 1 (fixed), error 2 (fixed), and error 3 (the current one).
You've got hold of the wrong end of the stick. Take a look at the multiprocessing module documentation. As it explains, a Pool runs multiple instances of the same function simultaneously (in parallel): you create a Pool with as many workers as you want and pass map the function plus an iterable of arguments, for example:
with Pool(5) as p:
    print(p.map(download_file, urls))
This creates 5 parallel workers. You can change the code along these lines and work through your errors.
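The PicklingError itself usually means multiprocessing tried to pickle something tied to the Qt objects (for example a bound method of the window, or the Pool stored on it). Here is a minimal sketch of one common workaround, keeping the worker at module level so it can be pickled; the download_videos helper and the names used here are illustrative, not part of the original code:

# Hypothetical sketch: a picklable, module-level worker plus a small driver.
from multiprocessing import Pool
from pytube import YouTube


def download_file(args):
    # args is a (url, path) tuple; this runs in a separate process.
    url, path = args
    YouTube(url).streams.get_highest_resolution().download(path)


def download_videos(urls, path, workers=5):
    # Fan the downloads out over a pool of worker processes.
    with Pool(workers) as pool:
        pool.map(download_file, [(url, path) for url in urls])


if __name__ == "__main__":
    # Placeholder URL; replace with the URLs collected from the UI.
    download_videos(["https://www.youtube.com/watch?v=VIDEO_ID"], ".")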
How can I make a progress bar for cloning a git repository in PyQt5?
git.Repo.clone_from('https://github.com/addddd123/Osdag', '_new_update')
You have to execute the task in another thread, connect to the callback and send the progress information through signals:
import threading
import sys
from dataclasses import dataclass, field
from typing import List, Optional, Any, Mapping, Dict

from PyQt5 import QtCore, QtWidgets
import git


class GitReply(QtCore.QObject):
    pass


@dataclass
class GitCloneReply(GitReply):
    progress_changed = QtCore.pyqtSignal(int)
    started = QtCore.pyqtSignal()
    finished = QtCore.pyqtSignal()

    url: str
    path: str
    env: Optional[Mapping[str, Any]] = None
    multi_options: Optional[List[str]] = None
    kwargs: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        super().__init__()

    def start(self):
        threading.Thread(target=self._execute, daemon=True).start()

    def _execute(self):
        self.started.emit()
        repo = git.Repo.clone_from(
            self.url,
            self.path,
            self.callback,
            self.env,
            self.multi_options,
            **self.kwargs
        )
        self.finished.emit()

    def callback(self, op_code, cur_count, max_count=None, message=""):
        # max_count can be None or empty on some progress events.
        if max_count:
            self.progress_changed.emit(int((cur_count / max_count) * 100))


@dataclass
class RepoManager(QtCore.QObject):
    _replies: List[GitReply] = field(init=False, default_factory=list)

    def __post_init__(self):
        super().__init__()

    def clone_from(self, url, path, env=None, multi_options=None, **kwargs):
        reply = GitCloneReply(url, path, env, multi_options, kwargs)
        reply.finished.connect(self.handle_finished)
        reply.start()
        self._replies.append(reply)
        return reply

    def handle_finished(self):
        reply = self.sender()
        if reply in self._replies:
            self._replies.remove(reply)


def main():
    app = QtWidgets.QApplication(sys.argv)

    progressbar = QtWidgets.QProgressBar()
    progressbar.show()

    manager = RepoManager()
    reply = manager.clone_from("https://github.com/addddd123/Osdag", "_new_update")
    reply.progress_changed.connect(progressbar.setValue)

    ret = app.exec_()
    sys.exit(ret)


if __name__ == "__main__":
    main()
I want to publish messages to a Pub/Sub topic with some attributes from a Dataflow job in batch mode.
My Dataflow pipeline is written with Python 3.8 and apache-beam 2.27.0.
It works with @Ankur's solution here: https://stackoverflow.com/a/55824287/9455637
But I think it could be more efficient with a shared Pub/Sub client: https://stackoverflow.com/a/55833997/9455637
However, an error occurred:
return StockUnpickler.find_class(self, module, name)
AttributeError: Can't get attribute 'PublishFn' on <module 'dataflow_worker.start' from '/usr/local/lib/python3.8/site-packages/dataflow_worker/start.py'>
Questions:
Would the shared publisher implementation improve Beam pipeline performance?
Is there another way to avoid the pickling error on my shared publisher client?
My Dataflow pipeline:
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

from google.cloud.pubsub_v1 import PublisherClient

import json
import argparse
import re
import logging


class PubsubClient(PublisherClient):
    def __reduce__(self):
        return self.__class__, (self.batch_settings,)


# The DoFn to perform on each element in the input PCollection.
class PublishFn(beam.DoFn):
    def __init__(self):
        from google.cloud import pubsub_v1

        batch_settings = pubsub_v1.types.BatchSettings(
            max_bytes=1024,  # One kilobyte
            max_latency=1,   # One second
        )
        self.publisher = PubsubClient(batch_settings)
        super().__init__()

    def process(self, element, **kwargs):
        future = self.publisher.publish(
            topic=element["topic"],
            data=json.dumps(element["data"]).encode("utf-8"),
            **element["attributes"],
        )
        return future.result()


def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source_table_id",
        dest="source_table_id",
        default="",
        help="BigQuery source table <project>.<dataset>.<table> with columns (topic, attributes, data)",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    bq_source_table = known_args.source_table_id
    bq_table_regex = r"^(?P<PROJECT_ID>[a-zA-Z0-9_-]*)[\.|\:](?P<DATASET_ID>[a-zA-Z0-9_]*)\.(?P<TABLE_ID>[a-zA-Z0-9_-]*)$"

    regex_match = re.search(bq_table_regex, bq_source_table)

    if not regex_match:
        raise ValueError(
            f"Bad BigQuery table id : `{bq_source_table}` please match {bq_table_regex}"
        )

    table_ref = bigquery.TableReference(
        projectId=regex_match.group("PROJECT_ID"),
        datasetId=regex_match.group("DATASET_ID"),
        tableId=regex_match.group("TABLE_ID"),
    )

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromBqTable"  #
            >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)  # Each row contains: topic / attributes / data
            | "PublishRowsToPubSub" >> beam.ParDo(PublishFn())
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run()
After fussing with this a bit, I think I have an answer that works consistently and is, if not world-beatingly performant, at least tolerably usable:
import logging

import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage

from google.cloud.pubsub_v1 import PublisherClient
from google.cloud.pubsub_v1.types import (
    BatchSettings,
    LimitExceededBehavior,
    PublishFlowControl,
    PublisherOptions,
)


class PublishClient(PublisherClient):
    """
    You have to override __reduce__ to make PublisherClient pickleable 😡 😤 🤬
    Props to 'Ankur' and 'Benjamin' on SO for figuring this part out; god knows
    I would not have...
    """

    def __reduce__(self):
        return self.__class__, (self.batch_settings, self.publisher_options)


class PubsubWriter(beam.DoFn):
    """
    beam.io.gcp.pubsub does not yet support batch operations, so
    we do this the hard way. it's not as performant as the native
    pubsubio but it does the job.
    """

    def __init__(self, topic: str):
        self.topic = topic
        self.window = beam.window.GlobalWindow()
        self.count = 0

    def setup(self):
        batch_settings = BatchSettings(
            max_bytes=1_000_000,  # 1 MB
            # by default it is 10 ms; it should be less than the timeout used
            # in future.result() to avoid timeouts
            max_latency=1,
        )

        publisher_options = PublisherOptions(
            enable_message_ordering=False,
            # better to be slow than to drop messages during a recovery...
            flow_control=PublishFlowControl(limit_exceeded_behavior=LimitExceededBehavior.BLOCK),
        )

        self.publisher = PublishClient(batch_settings, publisher_options)

    def start_bundle(self):
        self.futures = []

    def process(self, element: PubsubMessage, window=beam.DoFn.WindowParam):
        self.window = window
        self.futures.append(
            self.publisher.publish(
                topic=self.topic,
                data=element.data,
                **element.attributes,
            )
        )

    def finish_bundle(self):
        """Iterate over the list of async publish results and block
        until all of them have either succeeded or timed out. Yield
        a WindowedValue of the success/fail counts."""
        results = []
        self.count = self.count + len(self.futures)
        for fut in self.futures:
            try:
                # future.result() blocks until success or timeout; with a 1 s
                # max_latency in BatchSettings and a 60 s timeout here, we
                # should never spend much time waiting.
                results.append(fut.result(timeout=60))
            except Exception as ex:
                results.append(ex)

        res_count = {"success": 0}
        for res in results:
            if isinstance(res, str):
                res_count["success"] += 1
            else:
                # if it's not a string, it's an exception
                msg = str(res)
                if msg not in res_count:
                    res_count[msg] = 1
                else:
                    res_count[msg] += 1

        logging.info(f"Pubsub publish results: {res_count}")

        yield beam.utils.windowed_value.WindowedValue(
            value=res_count,
            timestamp=0,
            windows=[self.window],
        )

    def teardown(self):
        logging.info(f"Published {self.count} messages")
The trick is that if you call future.result() inside the process() method, you will block until that single message is successfully published, so instead collect a list of futures and then at the end of the bundle make sure they're all either published or definitively timed out. Some quick testing with one of our internal pipelines suggested that this approach can publish 1.6M messages in ~200s.
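For completeness, here is a hedged sketch of how this DoFn could be wired into a batch pipeline; the topic path, the sample row, and the to_pubsub_message helper are illustrative, not part of the answer above:

# Illustrative usage only: the topic path and the input data are placeholders,
# and the mapping assumes each row has "data" and "attributes" keys.
import json

import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage


def to_pubsub_message(row):
    # Encode the row as a PubsubMessage; attributes must be str -> str.
    return PubsubMessage(
        data=json.dumps(row["data"]).encode("utf-8"),
        attributes=row["attributes"],
    )


with beam.Pipeline() as p:
    (
        p
        | "CreateRows" >> beam.Create([{"data": {"id": 1}, "attributes": {"source": "demo"}}])
        | "ToPubsubMessage" >> beam.Map(to_pubsub_message)
        | "WriteToPubsub" >> beam.ParDo(PubsubWriter(topic="projects/my-project/topics/my-topic"))
    )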
I want to read from a Pub/Sub topic and write the data to Bigtable with Dataflow code written in Python. I could find sample code in Java but not in Python.
How can we assign columns in a row from Pub/Sub to different column families and write the data to Bigtable?
To write to Bigtable in a Dataflow pipeline, you'll need to create direct rows and pass them to the WriteToBigTable DoFn. Here is a brief example that just passes in the row keys and adds one cell for each key, nothing too fancy:
import datetime

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud.bigtable import row


class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--bigtable-project',
            help='The Bigtable project ID, this can be different than your '
                 'Dataflow project',
            default='bigtable-project')
        parser.add_argument(
            '--bigtable-instance',
            help='The Bigtable instance ID',
            default='bigtable-instance')
        parser.add_argument(
            '--bigtable-table',
            help='The Bigtable table ID in the instance.',
            default='bigtable-table')


class CreateRowFn(beam.DoFn):
    def process(self, key):
        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            "stats_summary",
            b"os_build",
            b"android",
            datetime.datetime.now())
        return [direct_row]


def run(argv=None):
    """Build and run the pipeline."""
    options = MyOptions(argv)

    with beam.Pipeline(options=options) as p:
        p | beam.Create(["phone#4c410523#20190501",
                         "phone#4c410523#20190502"]) | beam.ParDo(
            CreateRowFn()) | WriteToBigTable(
            project_id=options.bigtable_project,
            instance_id=options.bigtable_instance,
            table_id=options.bigtable_table)


if __name__ == '__main__':
    run()
I am just starting to explore this now and can link to a more polished version on GitHub once it's complete. Hope this helps you get started.
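For the multiple-column-family part of the question, a row simply gets one set_cell call per cell, each naming its own family. A hypothetical variation of CreateRowFn is sketched below; the families cf_profile/cf_stats and the columns are made-up names and must already exist on the table:

# Hypothetical sketch: write one cell into each of two column families.
import datetime

import apache_beam as beam
from google.cloud.bigtable import row


class CreateMultiFamilyRowFn(beam.DoFn):
    def process(self, element):
        # element is assumed to be a (row_key, name, count) tuple of strings.
        key, name, count = element
        direct_row = row.DirectRow(row_key=key)
        now = datetime.datetime.now()
        # One cell in each column family; values must be bytes (or int).
        direct_row.set_cell("cf_profile", b"name", name.encode("utf-8"), now)
        direct_row.set_cell("cf_stats", b"count", count.encode("utf-8"), now)
        yield direct_row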
Building on top of what was proposed and adding Pub/Sub, here's a working version.
Prerequisites (example commands for creating some of these resources are sketched after the list):
GCS Bucket created (for Dataflow temp/staging files)
PubSub topic created
PubSub subscription created
BigTable instance created
BigTable table created
BigTable column family must be created (there is no visible error otherwise!)
Example of the latter with cbt:
cbt -instance test-instance createfamily test-table cf1
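For reference, the other prerequisites can be created with commands along these lines (resource names are placeholders; the Bigtable instance itself is easiest to create from the Cloud Console):

gsutil mb gs://my-bucket                                    # Dataflow temp/staging bucket
gcloud pubsub topics create my-topic
gcloud pubsub subscriptions create my-subscription --topic=my-topic
cbt -instance test-instance createtable test-table
cbt -instance test-instance createfamily test-table cf1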
Code
Define and run the Dataflow pipeline.
# Packages
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud import pubsub_v1


# Classes
class CreateRowFn(beam.DoFn):
    def __init__(self, pipeline_options):
        self.instance_id = pipeline_options.bigtable_instance
        self.table_id = pipeline_options.bigtable_table

    def process(self, key):
        from google.cloud.bigtable import row
        import datetime

        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            'cf1',
            'field1',
            'value1',
            timestamp=datetime.datetime.now())
        yield direct_row


# Options
class XyzOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--bigtable_project', default='nested')
        parser.add_argument('--bigtable_instance', default='instance')
        parser.add_argument('--bigtable_table', default='table')


pipeline_options = XyzOptions(
    save_main_session=True, streaming=True,
    runner='DataflowRunner',
    project=PROJECT,
    region=REGION,
    temp_location=TEMP_LOCATION,
    staging_location=STAGING_LOCATION,
    requirements_file=REQUIREMENTS_FILE,
    bigtable_project=PROJECT,
    bigtable_instance=INSTANCE,
    bigtable_table=TABLE)


# Pipeline
def run(argv=None):
    with beam.Pipeline(options=pipeline_options) as p:
        input_subscription = f"projects/{PROJECT}/subscriptions/{SUBSCRIPTION}"

        _ = (p
             | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(subscription=input_subscription).with_output_types(bytes)
             | 'Conversion UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
             | 'Conversion string to row object' >> beam.ParDo(CreateRowFn(pipeline_options))
             | 'Writing row object to BigTable' >> WriteToBigTable(project_id=pipeline_options.bigtable_project,
                                                                   instance_id=pipeline_options.bigtable_instance,
                                                                   table_id=pipeline_options.bigtable_table))


if __name__ == '__main__':
    run()
Publish a message b"phone#1111" to the Pub/Sub topic (e.g. using the Python PublisherClient()).
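A minimal sketch of that publish step with the Pub/Sub client library (project and topic names are placeholders and must match the resources used above):

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path("my-project", "my-topic")

# The row key the pipeline will write to Bigtable.
future = publisher.publish(topic_path, b"phone#1111")
print(future.result())  # message ID once the publish has succeeded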
Table content (using happybase)
b'phone#1111': {b'cf1:field1': b'value1'}
Row length: 1
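For reference, a rough sketch of how that row can be read back, assuming the google-cloud-happybase package (project, instance, and table IDs are placeholders; cbt read also works for a quick check):

from google.cloud import bigtable
from google.cloud import happybase

client = bigtable.Client(project="my-project", admin=True)
instance = client.instance("test-instance")

connection = happybase.Connection(instance=instance)
table = connection.table("test-table")

row_data = table.row(b"phone#1111")
print(row_data)               # e.g. {b'cf1:field1': b'value1'}
print("Row length:", len(row_data))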