Apache Beam DirectRunner code failing with AttributeError (no 'checkpoint') - python-3.x

I am running an Apache Beam Python pipeline with the DirectRunner. It fails with an AttributeError raised inside a worker thread:
AttributeError: '_SDFBoundedSourceRestrictionTracker' object has no attribute 'checkpoint'
Here is the relevant piece of the code:
def run(argv=None):
    """Main entry point; defines and runs the barc records pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        type=str,
                        dest='input',
                        default='gs://{Bucket name}/Week28 - Weekly.xlsb',
                        help='Input file to process')
    args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        if args.input and args.week_num:
            # Read master from BigQuery
            channel_master = (p
                | 'ReadMaster' >> beam.io.Read(beam.io.BigQuerySource(
                    query="SELECT * FROM DATASET.MASTER_TABLE"))
                | "Map on name" >> beam.Map(lambda elem: (elem['name'], elem)))
            # Read name
            gc = (p
                | 'ReadGC' >> beam.io.Read(beam.io.BigQuerySource(
                    query="SELECT Display_Name FROM DEST.TABLE"))
                | 'yieldvals' >> beam.ParDo(PrintValsDoFn()))
            fa_data_rows = (p
                | 'ReadFaData' >> ReadFromText(args.fa.format(args.week_num))
                | 'ConvertFaToDict' >> beam.ParDo(ConvertFAToDictFn(
                    gracenoteEvent.GracenoteEventType('fa_input').get_dict_keys()))
                | 'FilterWritableRows' >> beam.Filter(
                    lambda row: str(row['FA_CODE?']).lower() == "true"
                    and row['GN_ID'] != '-')
                | "Map master on channel" >> beam.Map(
                    lambda x: (str(x['NAME']), x)))
The results are then written to BigQuery.
Traceback:
Exception in thread Thread-2:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 1254, in run
    self.function(*self.args, **self.kwargs)
  File "/Users/kshitijbhadage/gracenote/lib/python3.8/site-packages/apache_beam/runners/direct/sdf_direct_runner.py", line 467, in initiate_checkpoint
    checkpoint_state.residual_restriction = tracker.checkpoint()
AttributeError: '_SDFBoundedSourceRestrictionTracker' object has no attribute 'checkpoint'
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 1254, in run
    self.function(*self.args, **self.kwargs)
  File "/Users/kshitijbhadage/gracenote/lib/python3.8/site-packages/apache_beam/runners/direct/sdf_direct_runner.py", line 467, in initiate_checkpoint
    checkpoint_state.residual_restriction = tracker.checkpoint()
AttributeError: '_SDFBoundedSourceRestrictionTracker' object has no attribute 'checkpoint'
I'm not exactly sure why this error occurs. I tried debugging line by line, but the issue persists.
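Two things worth trying (assumptions on my part, not confirmed fixes): first, upgrade the apache-beam package, since the traceback shows the DirectRunner's own SDF wrapper calling a checkpoint() method the tracker no longer has, which looks like a version mismatch inside the SDK itself. Second, read through the newer ReadFromBigQuery transform (Beam 2.25.0+) instead of the deprecated beam.io.Read(beam.io.BigQuerySource(...)) pattern, which avoids that SDF wrapper entirely. A minimal sketch of the second option:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Sketch only: ReadFromBigQuery replaces the deprecated
# beam.io.Read(beam.io.BigQuerySource(...)) pattern that the DirectRunner
# wraps in the SDF restriction tracker raising above. Note it exports via
# GCS, so a temp_location (the bucket below is a placeholder) is needed.
with beam.Pipeline(options=PipelineOptions(temp_location='gs://{Bucket name}/tmp')) as p:
    channel_master = (
        p
        | 'ReadMaster' >> beam.io.ReadFromBigQuery(
            query='SELECT * FROM DATASET.MASTER_TABLE')
        | 'Map on name' >> beam.Map(lambda elem: (elem['name'], elem)))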

Related

Cloud Video Intelligence API error 400 & 504

When I call the Cloud Video Intelligence API to detect subtitles in a local video file, it always returns error 400 or 504, but the same video works fine from GCS. I have tried adjusting the timeout in the Cloud Video Intelligence config, but it still shows error 400 with "invalid argument".
This is my Python code for detecting video subtitles:
"""This application demonstrates detection subtitles in video using the Google Cloud API.
Usage Examples:
use video in google cloud storge:
python analyze.py text_gcs gs://"video path"
use video in computer:
python analyze.py text_file video.mp4
"""
import argparse
import io
from google.cloud import videointelligence
from google.cloud.videointelligence import enums
def video_detect_text_gcs(input_uri):
# [START video_detect_text_gcs]
"""Detect text in a video stored on GCS."""
from google.cloud import videointelligence
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.enums.Feature.TEXT_DETECTION]
config = videointelligence.types.TextDetectionConfig(language_hints=["zh-TW","en-US"])
video_context = videointelligence.types.VideoContext(text_detection_config=config)
operation = video_client.annotate_video(input_uri=input_uri, features=features, video_context=video_context)
print("\nSubtitle detecting......")
result = operation.result(timeout=300)
# The first result is retrieved because a single video was processed.
annotation_result = result.annotation_results[0]
subtitle_data=[ ]
for text_annotation in annotation_result.text_annotations:
text_segment = text_annotation.segments[0]
start_time = text_segment.segment.start_time_offset
frame = text_segment.frames[0]
vertex=frame.rotated_bounding_box.vertices[0]
if text_segment.confidence > 0.95 and vertex.y >0.7:
lists=[text_annotation.text,start_time.seconds+ start_time.nanos * 1e-9,vertex.y]
subtitle_data=subtitle_data+[lists]
length=len(subtitle_data)
subtitle_sort=sorted(subtitle_data,key = lambda x: (x[1],x[2]))
i=0
subtitle=[ ]
while i<length :
subtitle=subtitle+[subtitle_sort[i][0]]
i=i+1
with open("subtitle.txt",mode="w",encoding="utf-8") as file:
for x in subtitle:
file.write(x+'\n')
def video_detect_text(path):
# [START video_detect_text]
"""Detect text in a local video."""
from google.cloud import videointelligence
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.enums.Feature.TEXT_DETECTION]
video_context = videointelligence.types.VideoContext()
with io.open(path, "rb") as file:
input_content = file.read()
operation = video_client.annotate_video(
input_content=input_content, # the bytes of the video file
features=features,
video_context=video_context
)
print("\nSubtitle detecting......")
result = operation.result(timeout=300)
# The first result is retrieved because a single video was processed.
annotation_result = result.annotation_results[0]
subtitle_data=[ ]
for text_annotation in annotation_result.text_annotations:
text_segment = text_annotation.segments[0]
start_time = text_segment.segment.start_time_offset
frame = text_segment.frames[0]
vertex=frame.rotated_bounding_box.vertices[0]
if text_segment.confidence > 0.95 and vertex.y >0.7:
lists=[text_annotation.text,start_time.seconds+ start_time.nanos * 1e-9,vertex.y]
subtitle_data=subtitle_data+[lists]
length=len(subtitle_data)
subtitle_sort=sorted(subtitle_data,key = lambda x: (x[1],x[2]))
i=0
subtitle=[ ]
while i<length :
subtitle=subtitle+[subtitle_sort[i][0]]
i=i+1
with open("subtitle.txt",mode="w",encoding="utf-8") as file:
for x in subtitle:
file.write(x+'\n')
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
subparsers = parser.add_subparsers(dest="command")
detect_text_parser = subparsers.add_parser(
"text_gcs", help=video_detect_text_gcs.__doc__
)
detect_text_parser.add_argument("path")
detect_text_file_parser = subparsers.add_parser(
"text_file", help=video_detect_text.__doc__
)
detect_text_file_parser.add_argument("path")
args = parser.parse_args()
if args.command == "text_gcs":
video_detect_text_gcs(args.path)
if args.command == "text_file":
video_detect_text(args.path)
This is the error report:
Ghuang#/Users/Ghuang/Documents/GitHub/Video-subtitles-detection$ python3 analyze.py text_file video.mp4
Traceback (most recent call last):
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/google/api_core/grpc_helpers.py", line 57, in error_remapped_callable
    return callable_(*args, **kwargs)
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/grpc/_channel.py", line 826, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/grpc/_channel.py", line 729, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    status = StatusCode.DEADLINE_EXCEEDED
    details = "Deadline Exceeded"
    debug_error_string = "{"created":"@1587691109.677447000","description":"Error received from peer ipv4:172.217.24.10:443","file":"src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Deadline Exceeded","grpc_status":4}"
>
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "analyze.py", line 144, in <module>
    video_detect_text(args.path)
  File "analyze.py", line 90, in video_detect_text
    video_context=video_context
  File "/Library/Python/3.7/site-packages/google/cloud/videointelligence_v1/gapic/video_intelligence_service_client.py", line 303, in annotate_video
    request, retry=retry, timeout=timeout, metadata=metadata
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/google/api_core/gapic_v1/method.py", line 143, in __call__
    return wrapped_func(*args, **kwargs)
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/google/api_core/retry.py", line 286, in retry_wrapped_func
    on_error=on_error,
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/google/api_core/retry.py", line 184, in retry_target
    return target()
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/google/api_core/timeout.py", line 214, in func_with_timeout
    return func(*args, **kwargs)
  File "/Users/Ghuang/Library/Python/3.7/lib/python/site-packages/google/api_core/grpc_helpers.py", line 59, in error_remapped_callable
    six.raise_from(exceptions.from_grpc_error(exc), exc)
  File "<string>", line 3, in raise_from
google.api_core.exceptions.DeadlineExceeded: 504 Deadline Exceeded
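Note that the deadline is exceeded by the annotate_video RPC itself, not by operation.result(): with a local file the whole video is uploaded inside that one gRPC call, and the default deadline can be too short. Two hedged workarounds (assumptions on my part, not confirmed fixes): pass a longer per-call timeout to annotate_video (the GAPIC method accepts one, as the traceback's "request, retry=retry, timeout=timeout" line shows), or upload the video to GCS and use the text_gcs path instead. A sketch of the first, reusing the names from video_detect_text above:
# Sketch: raise the RPC deadline for the upload-and-annotate call itself.
# 600 seconds is an assumption; size it to your video and connection speed.
operation = video_client.annotate_video(
    input_content=input_content,  # the bytes of the video file
    features=features,
    video_context=video_context,
    timeout=600,
)
result = operation.result(timeout=600)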

Python3 multiprocessing shared dictionary consumed by all processes

I'm a beginner with multiprocessing, and I would like to use it to run code in parallel on streaming data.
To get started, I wrote the code below and got an error.
Could you please tell me the correct way to print to the screen?
Code:
import sys
from multiprocessing import Process, Manager
import time

def producer(dic, name):
    for i in range(10000):
        dic["A"] = i
        time.sleep(2)

def consumer(dic, name):
    for i in range(10000):
        aval = dic.get("A")
        #print(f" {name} - Val = {aval}")
        sys.stdout.write(f" {name} - Val = {aval}")
        sys.stdout.flush()
        time.sleep(2.2)

if __name__ == '__main__':
    manager = Manager()
    dic = manager.dict()
    Process(target=producer, args=(dic, "TT")).start()
    time.sleep(1)
    Process(target=consumer, args=(dic, "Con1")).start()
    Process(target=consumer, args=(dic, "Con2")).start()
When I run this in the Windows console, I get the error below. How can I get the consumer's output to print in the console? Thanks.
(base) PS D:\> python .\mulpro.py
Process Process-3:
Process Process-4:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 811, in _callmethod
    conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "D:\mulpro.py", line 19, in consumer
    aval = dic.get("A")
  File "<string>", line 2, in get
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 815, in _callmethod
    self._connect()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 802, in _connect
    conn = self._Client(self._token.address, authkey=self._authkey)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 490, in Client
    c = PipeClient(address)
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 691, in PipeClient
    _winapi.WaitNamedPipe(address, 1000)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 811, in _callmethod
    conn = self._tls.connection
FileNotFoundError: [WinError 2] The system cannot find the file specified
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "D:\mulpro.py", line 19, in consumer
    aval = dic.get("A")
  File "<string>", line 2, in get
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 815, in _callmethod
    self._connect()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 802, in _connect
    conn = self._Client(self._token.address, authkey=self._authkey)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 490, in Client
    c = PipeClient(address)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 691, in PipeClient
    _winapi.WaitNamedPipe(address, 1000)
FileNotFoundError: [WinError 2] The system cannot find the file specified
Process Process-2:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "D:\mulpro.py", line 13, in producer
    dic["A"] = i
  File "<string>", line 2, in __setitem__
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\managers.py", line 818, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\connection.py", line 280, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed
The reason might be that the main process doesn't wait for the two child processes, so the Manager's server process is torn down while the children are still using the shared dict. Changing your code like this could work:
def run():
    manager = Manager()
    dic = manager.dict()
    Process(target=producer, args=(dic, "TT")).start()
    time.sleep(1)
    Process(target=consumer, args=(dic, "Con1")).start()
    Process(target=consumer, args=(dic, "Con2")).start()
    while True:
        pass

if __name__ == '__main__':
    run()
Strangely, though, if I append the same busy loop directly in the main block instead of wrapping everything in another function, it still raises that exception. Anyway, the code above should help.
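A cleaner variant of the same idea (my own sketch, not part of the original answer) keeps references to the Process objects and joins them instead of spinning in a busy loop. The point is identical: the main process, and with it the Manager's server process, must stay alive while the children still use the shared dict:
from multiprocessing import Process, Manager
import time

# Assumes producer() and consumer() are defined as in the question.
if __name__ == '__main__':
    manager = Manager()
    dic = manager.dict()
    procs = [Process(target=producer, args=(dic, "TT"))]
    procs[0].start()
    time.sleep(1)  # let the producer write a value first, as in the original
    for name in ("Con1", "Con2"):
        p = Process(target=consumer, args=(dic, name))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()  # block until the children finish, keeping the Manager alive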

Does `concurrent.futures.ProcessPoolExecutor()` have any restriction on the number of processes?

I wrote a simple piece of code using concurrent.futures.ProcessPoolExecutor(), which you can see below. I'm using Python 3.7.4 on Windows 10 (64-bit) on a Core i7 laptop.
import time
import concurrent.futures

def f(x):
    lo = 0
    for i in range(x):
        lo += i
    return lo

n = 7
if __name__ == '__main__':
    t1 = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor() as executor:
        Ans = [executor.submit(f, 10**7 - i) for i in range(n)]
        for f in concurrent.futures.as_completed(Ans):
            print(f.result())
    t2 = time.perf_counter()
    print('completed at', t2 - t1, 'seconds')
The variable n determines how many tasks are submitted to the pool. When I set n to 1, 2, 4, or 7, everything works fine. For example, the output for n=7 is
49999995000000
49999955000010
49999965000006
49999985000001
49999975000003
49999945000015
49999935000021
completed at 2.0607623 seconds
However, for n=10 it gives the following error!
49999945000015
49999955000010
49999965000006
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
  File "E:\Python37\lib\multiprocessing\queues.py", line 236, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "E:\Python37\lib\multiprocessing\reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function f at 0x00000285BFC4E0D8>: it's not the same object as __main__.f
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "e:/Python37/Python files/Parallel struggle/Python_20191219_parallel_3.py", line 23, in <module>
    print(f.result())
  File "E:\Python37\lib\concurrent\futures\_base.py", line 428, in result
    return self.__get_result()
  File "E:\Python37\lib\concurrent\futures\_base.py", line 384, in __get_result
    raise self._exception
  File "E:\Python37\lib\multiprocessing\queues.py", line 236, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "E:\Python37\lib\multiprocessing\reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function f at 0x00000285BFC4E0D8>: it's not the same object as __main__.f
Why do some of the tasks complete if something is wrong with the code? What happened to make this error come up? Is it specific to Python on Windows? Is it about the number of CPUs?
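What seems to be happening (my reading of the traceback, not an authoritative answer): the loop `for f in concurrent.futures.as_completed(Ans)` rebinds the module-level name f to a Future. Tasks are pickled for dispatch to worker processes by reference, as __main__.f, and pickle checks that looking the name up again yields the same object. With a small n, every submission is pickled by the feeder thread before the loop starts; with n=10, the feeder is still pickling submissions when f gets rebound, so the check fails with "it's not the same object as __main__.f", and only the already-dispatched tasks print results. Renaming the loop variable sidesteps the problem, a sketch:
import time
import concurrent.futures

def f(x):
    lo = 0
    for i in range(x):
        lo += i
    return lo

n = 10
if __name__ == '__main__':
    t1 = time.perf_counter()
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(f, 10**7 - i) for i in range(n)]
        # 'fut' instead of 'f', so the module-level function f is never rebound
        for fut in concurrent.futures.as_completed(futures):
            print(fut.result())
    t2 = time.perf_counter()
    print('completed at', t2 - t1, 'seconds')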

Python threading causing issues with Google API

I'm running through a list of locations and trying to find places along my route. This is my first attempt at threading, so any tips would be appreciated! When I run this it works fine for the first few iterations, but then I start getting a KeyError, and the API response says the route is not found (even though it should be). If I search along a shorter route, everything runs fine. When I extend the route past a couple of hours of drive time, I start getting these errors. Is it possible that I'm overloading the API, or does my code look off?
import datetime
import time
import pandas as pd
from threading import Thread
import threading
import requests

start_input = input("start: ")
end_input = input("end: ")
out_way = input("out of the way: ")
out_way_secs = int(out_way) * 60
thread_local = threading.local()

def get_session():
    if not getattr(thread_local, "session", None):
        thread_local.session = requests.Session()
    return thread_local.session

def get_routes(url, start, end, waypoint, idx):
    session = get_session()
    with session.get(url, params={'origins': f'{start}|{waypoint}',
                                  'destinations': f'{start}|{end}',
                                  'key': '# key'}) as response:
        route = response.json()
        if route['rows'][1]['elements'][0]['status'] != 'OK':
            results[idx] = {'# info'}
        else:
            nonstop_route = route['rows'][0]['elements'][1]['duration']['value']
            leg1 = route['rows'][1]['elements'][0]['duration']['value']
            leg2 = route['rows'][1]['elements'][1]['duration']['value']
            time_added = (leg1 + leg2) - nonstop_route
            time_added_mins = str(datetime.timedelta(seconds=(leg1 + leg2) - nonstop_route))
            more_time = time_added_mins.split(':')
            added_time_str = str(f'{more_time[0]}:{more_time[1]}:{more_time[2]} away!')
            if time_added < allowable_time:
                results[idx] = {# info to return}
    return results[idx]

if __name__ == "__main__":
    start_time = time.time()
    output_df = pd.DataFrame(columns=['Location', 'Added Time', 'Notes'])
    threads = [None] * coords[0]
    results = [None] * coords[0]
    for i in range(len(threads)):
        threads[i] = Thread(target=get_routes,
                            args=('https://maps.googleapis.com/maps/api/distancematrix/json',
                                  start_input, end_input, stops[i], i))
        threads[i].start()
    for i in range(len(threads)):
        threads[i].join()
    for x in range(len(results)):
        output_df = output_df.append(results[x], ignore_index=True)
    output_df = output_df.sort_values(['Added Time'], ascending=True)
    output_df.to_csv('output.csv', index=False)
There are 3 errors that it gets; the first one pops up by itself and the last 2 come together. The code is the same every time I run it, so I'm not sure why I'm getting different errors.
This is the most common error, and it comes by itself (the routing duration works fine when run individually):
Exception in thread Thread-171:
Traceback (most recent call last):
  File "C:\Python37-32\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\Python37-32\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:program.py", line 46, in get_routes
    nonstop_route = route['rows'][0]['elements'][1]['duration']['value']
KeyError: 'duration'
The two below I get together and are less common:
Exception in thread Thread-436:
Traceback (most recent call last):
  File "C:\Python37-32\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\Python37-32\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:/program.py", line 40, in get_routes
    route = response.json()
  File "C:\requests\models.py", line 897, in json
    return complexjson.loads(self.text, **kwargs)
  File "C:\Python37-32\lib\json\__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "C:\Python37-32\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Python37-32\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Second error:
Exception in thread Thread-196:
Traceback (most recent call last):
  File "C:\site-packages\urllib3\response.py", line 360, in _error_catcher
    yield
  File "C:\urllib3\response.py", line 442, in read
    data = self._fp.read(amt)
  File "C:\Python37-32\lib\http\client.py", line 447, in read
    n = self.readinto(b)
  File "C:\Python37-32\lib\http\client.py", line 491, in readinto
    n = self.fp.readinto(b)
  File "C:\Python37-32\lib\socket.py", line 589, in readinto
    return self._sock.recv_into(b)
  File "C:\Python37-32\lib\ssl.py", line 1052, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Python37-32\lib\ssl.py", line 911, in read
    return self._sslobj.read(len, buffer)
ConnectionAbortedError: [WinError 10053] An established connection was aborted by the software in your host machine
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:\site-packages\requests\models.py", line 750, in generate
    for chunk in self.raw.stream(chunk_size, decode_content=True):
  File "C:\site-packages\urllib3\response.py", line 494, in stream
    data = self.read(amt=amt, decode_content=decode_content)
  File "C:\site-packages\urllib3\response.py", line 459, in read
    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
  File "C:\Python37-32\lib\contextlib.py", line 130, in __exit__
    self.gen.throw(type, value, traceback)
  File "C:\site-packages\urllib3\response.py", line 378, in _error_catcher
    raise ProtocolError('Connection broken: %r' % e, e)
urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None)", ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))
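For what it's worth, here is a defensive sketch (my assumptions: stops holds the waypoints and API_KEY is your key; neither name is in the original). The KeyError fits elements whose status is not OK (e.g. ZERO_RESULTS or OVER_QUERY_LIMIT) having no 'duration' field, since the code only checks rows[1].elements[0]; the JSON and connection errors fit the service throttling a large burst of parallel requests, so a capped thread pool is used instead of one thread per waypoint:
import datetime
from concurrent.futures import ThreadPoolExecutor
import requests

def get_route(start, end, waypoint):
    resp = requests.get(
        'https://maps.googleapis.com/maps/api/distancematrix/json',
        params={'origins': f'{start}|{waypoint}',
                'destinations': f'{start}|{end}',
                'key': API_KEY})  # assumption: API_KEY holds your key
    resp.raise_for_status()  # surface HTTP errors instead of bad-JSON crashes
    route = resp.json()
    elements = [route['rows'][0]['elements'][1],   # nonstop route
                route['rows'][1]['elements'][0],   # leg 1
                route['rows'][1]['elements'][1]]   # leg 2
    # Check every element's status before touching 'duration', which is
    # absent whenever the status is not 'OK'.
    if any(e.get('status') != 'OK' for e in elements):
        return None  # skip waypoints the API couldn't route
    nonstop, leg1, leg2 = (e['duration']['value'] for e in elements)
    return waypoint, datetime.timedelta(seconds=(leg1 + leg2) - nonstop)

# Cap concurrency so the burst of requests stays within API rate limits;
# 8 workers is an assumption, tune to your quota.
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(lambda w: get_route(start_input, end_input, w), stops))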

Using threading.Timer to delay a sub-procedure

def emailCheck(self):
    n = 0
    (retcode, messages) = mail.search(None, '(UNSEEN)')
    if retcode == 'OK':
        for num in messages[0].split():
            n = n + 1
            typ, data = mail.fetch(num, '(RFC822)')
            for response_part in data:
                if isinstance(response_part, tuple):
                    original = email.message_from_bytes(response_part[1])
                    print(original['From'])
                    print(original['Subject'])
                    typ, data = mail.store(num, '+FLAGS', '\\Seen')
                    print(n)
    t = threading.Timer(10.0, emailCheck)
    t.start()
I am trying to delay the sub-procedure using threading.Timer(), but I think the error has to do with how self is passed. I am using PyQt, so all of this is contained within the MainWindow class.
The error:
Exception in thread Thread-1:
Traceback (most recent call last):
  File "C:\Python33\lib\threading.py", line 637, in _bootstrap_inner
    self.run()
  File "C:\Python33\lib\threading.py", line 823, in run
    self.function(*self.args, **self.kwargs)
TypeError: emailCheck() missing 1 required positional argument: 'self'
t = threading.Timer(10.0, self.emailCheck)
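That last line is indeed the fix: inside the class, the timer must be given the bound method self.emailCheck rather than the bare name emailCheck, otherwise the Timer thread calls the function without a self argument. A minimal sketch of the pattern (mail handling elided; a plain class standing in for the PyQt MainWindow):
import threading

class MainWindow:
    def emailCheck(self):
        # ... check mail as above ...
        # Reschedule using the bound method, which carries self with it.
        t = threading.Timer(10.0, self.emailCheck)
        t.daemon = True  # assumption: don't keep the app alive just for the timer
        t.start()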
