kedro dynamic catalog creation only for specific nodes before their run - hook

I have several thousand files of different types to process. I am creating catalog entries dynamically with hooks. I first tried the after_catalog_created hook, but it runs too early: I need those entries only for specific nodes. My current attempt uses before_node_run for specific node tags, returning a dictionary with just the dynamically created entries. The node function takes **kwargs only. It works in the sense that I can see the node receiving the updated inputs, but the problem is that the node specification requires an already existing catalog entry, so I use a fake one. I then use it to build a dictionary of the same length as the dictionary returned by the hook.
Pipeline code
for doc in docs["Type1_documents"]:
    item = doc["name"]
    item_name, _ = os.path.splitext(item)
    type1_datasets_dict[item_name] = "brace_dictionary"
return Pipeline(
    [
        node(
            func=func1,
            inputs=type1_datasets_dict,
            outputs=[
                "output1",
                "output2",
            ],
            name="type1_eta",
            tags=["dynamic-catalog", "type1", "data-engineering"],
        )
    ]
)
Hook code
@hook_impl
def before_node_run(
    self, node: Node, catalog: DataCatalog
) -> Optional[Dict[str, Any]]:
    self.node = node
    self.catalog = catalog
    if "dynamic-catalog" in node.tags:
        input_catalog_name = node.name
        catalog_string = f"params:{input_catalog_name}.full_name"
        if self.catalog.exists(catalog_string):
            true_datasets_dict = {}
            catalog_properties = self.catalog.load(f"params:{input_catalog_name}")
            catalog_name = catalog_properties["full_name"]
            type = catalog_properties["type"]
            subtype = catalog_properties["subtype"]
            datasets_dict = self.catalog.load(f"params:{catalog_name}")
            for dataset in datasets_dict:
                doc_name, _ = os.path.splitext(dataset["name"])
                self.add_text_dataset(
                    name=doc_name,
                    folder=f"parsed/{type}/{subtype}",
                )
                true_datasets_dict[doc_name] = doc_name
            return true_datasets_dict
    return true_datasets_dict
But I am getting a ValueError for this:
line 487, in _run_with_dict
raise ValueError(
ValueError: Node type1_eta: func1([brace_dictionary,brace_dictionary,brace_dictionary,..,brace_dictionary]) -> [output1, output2] expected 1 input(s) ['brace_dictionary'], but got the following 1497 input(s) instead: ['file1', 'file2', ...].
Is there another way to do this conditionally?
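For reference, the before_node_run contract is that the keys of the returned dictionary must match the node's declared inputs; Kedro only replaces inputs the node already declares, which is why returning 1497 new keys against one declared input raises the ValueError above. A minimal sketch of one way to satisfy that contract, assuming the node keeps its single fake input brace_dictionary and the dynamic dataset names were collected earlier (self._dynamic_names is a hypothetical attribute holding them):

@hook_impl
def before_node_run(
    self, node: Node, catalog: DataCatalog
) -> Optional[Dict[str, Any]]:
    if "dynamic-catalog" not in node.tags:
        return None
    # Key the result by the *declared* input name, not by the dynamic
    # dataset names, so the node's input count stays unchanged.
    loaded = {name: catalog.load(name) for name in self._dynamic_names}
    return {"brace_dictionary": loaded}

The node function then receives one input, a dict of all loaded documents, rather than 1497 separate inputs.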

Related

IndexError: array index out of range

def __init__(self):
    super().__init__('object_tracking')
    # Declare ROS parameters
    self.declare_parameters(namespace='',
                            parameters=[('qos_length', 0),
                                        ('topic.untracked_obj', ''),
                                        ('topic.rgb_image', ''),
                                        ('topic.tracked_obj', ''),
                                        ('obj_class.id', []),
                                        ('obj_class.name', []),
                                        ('display', True),
                                        ('frame_id.tracked_obj', '')])
    self.nodeParams()
    qos_length = self.get_parameter('qos_length').get_parameter_value().integer_value
    qos_profile = QoSProfile(depth=qos_length,
                             history=QoSHistoryPolicy.KEEP_LAST,
                             reliability=QoSReliabilityPolicy.RELIABLE)
    # Load cv_bridge
    self.bridge = CvBridge()
    # Create instance of SORT
    self.mot_tracker = Sort()
    # Create Subscribers
    obj_topic = self.get_parameter('topic.untracked_obj').get_parameter_value().string_value
    self.obj_sub = mf.Subscriber(self, ObjectArray, obj_topic, qos_profile=qos_profile)
    rgb_topic = self.get_parameter('topic.rgb_image').get_parameter_value().string_value
    self.rgb_sub = mf.Subscriber(self, Image, rgb_topic, qos_profile=qos_profile)
    # Apply message filter
    self.timestamp_sync = mf.TimeSynchronizer([self.obj_sub, self.rgb_sub], queue_size=qos_length)
    self.timestamp_sync.registerCallback(self.objCallback)
    # Create Publishers
    obj_topic = self.get_parameter('topic.tracked_obj').get_parameter_value().string_value
    self.obj_pub = self.create_publisher(ObjectArray, obj_topic, qos_profile)

def nodeParams(self):
    self.display = self.get_parameter('display').get_parameter_value().bool_value
    class_id = self.get_parameter('obj_class.id').get_parameter_value().integer_array_value
    class_name = self.get_parameter('obj_class.name').get_parameter_value().integer_array_value
    self.class_dict = {}
    for i, id_ in enumerate(class_id):
        self.class_dict[int(id_)] = class_name[i]
I'm not sure what's going on. I'd like to try object tracking in Carla 0.9.13 with ROS 2 Foxy in Python 3.8. Could you please help me?
[object_tracking.py-3] self.nodeParams()
[object_tracking.py-3] File "/home/smit/ros2_ws/install/carla_simulation/lib/carla_simulation/object_tracking.py", line 64, in nodeParams
[object_tracking.py-3] self.class_dict[int(id_)] = class_name[i]
[object_tracking.py-3] IndexError: array index out of range
[ERROR] [object_tracking.py-3]: process has died [pid 623526, exit code 1, cmd '/home/smit/ros2_ws/install/carla_simulation/lib/carla_simulation/object_tracking.py --ros-args --params-file /home/smit/ros2_ws/install/carla_simulation/share/carla_simulation/config/params.yaml'].
You are probably using the returned hierarchy variable wrong.
According to the specification:
In Python, hierarchy is nested inside a top level array. Use hierarchy[0][i] to access hierarchical elements of i-th contour.
https://docs.opencv.org/4.x/d3/dc0/group__imgproc__shape.html#gadf1ad6a0b82947fa1fe3c3d497f260e0
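For instance, with cv2.findContours (a minimal sketch of the indexing the docs describe; the image content is arbitrary):

import cv2
import numpy as np

# A dummy binary image with one filled rectangle, just to produce contours.
img = np.zeros((100, 100), dtype=np.uint8)
cv2.rectangle(img, (20, 20), (80, 80), 255, -1)

contours, hierarchy = cv2.findContours(img, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
for i in range(len(contours)):
    # hierarchy is nested inside a top-level array, so index hierarchy[0][i],
    # which yields [next, previous, first_child, parent] for contour i.
    next_c, prev_c, first_child, parent = hierarchy[0][i]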

Test the function using unittest's Mock method

def command(self) -> None:
    """
    This command will create the latest tag
    :param args: arguments for setting up make_client; it also contains
        the name of the microservice for which the tag needs to be created
    """
    gitlab_url = "https://gitlab.com"
    gl = gitlab.Gitlab(url=gitlab_url, private_token=self.args.privatetoken)
    project = gl.projects.get(f'pa/{self.args.service}')
    tags = project.tags.list(get_all=False)
    # keep only tags for the requested environment
    tags = [tag for tag in tags if self.args.environment in tag.name]
    if not tags:
        self.log(f"No tag found for environment {self.args.environment}")
        return
    tag = tags[0].name.split('-')
    newtag = f"{tag[0]}-{tag[1]}-{int(tag[2]) + 1}"
    git("tag", "-a", newtag, "-m", newtag)
    git("push", "origin", newtag)
    self.log(f"New tag is pushed... {newtag}")
I am unable to use patch. I have used @patch("arke.commands.create_tag_command.CreateTagCommand.gitlab"), but it keeps saying that <class 'arke.commands.create_tag_command.CreateTagCommand'> does not have the attribute 'gitlab'.
The test method I wrote:
class TestCreateTagCommand:
    @patch("sys.argv", [__file__, "--createtag", "True", "--environment", "b",
                        "--privatetoken", "testtoken", "--service", "c"])
    @patch("arke.commands.create_tag_command.CreateTagCommand")
    @patch("arke.commands.create_tag_command.CreateTagCommand.gitlab")
    @patch("logging.Logger.info")
    def test_git_create_tag(
            self, mock_logger, mock_api_call, mock_create_tag_command):
        mock_api_call.return_value = MagicMock(spec=Response, status_code=200,
                                               response=json.dumps(['<ProjectTag name:impl-59>']))
        ct = CreateTagCommand()
        response = ct.command()
        assert(response, '200')
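The error occurs because patch looks the attribute up on the target before replacing it, and CreateTagCommand has no attribute named gitlab; the gitlab name is a module-level import inside create_tag_command. A sketch of patching the name where it is looked up instead (assuming create_tag_command imports gitlab and git at the top level, and that CreateTagCommand() can be constructed as in your test; the fake tag name is chosen to match environment "b"):

from unittest.mock import MagicMock, patch

@patch("arke.commands.create_tag_command.git")     # assumed module-level import
@patch("arke.commands.create_tag_command.gitlab")  # assumed module-level import
def test_git_create_tag(mock_gitlab, mock_git):
    # Make projects.get(...).tags.list(...) return one fake tag.
    fake_tag = MagicMock()
    fake_tag.name = "b-impl-59"
    mock_gitlab.Gitlab.return_value.projects.get.return_value.tags.list.return_value = [fake_tag]
    ct = CreateTagCommand()
    ct.command()
    # command() should bump the trailing number and push the new tag.
    mock_git.assert_any_call("push", "origin", "b-impl-60")

Note the argument order: patch decorators are applied bottom-up, so the bottom-most patch supplies the first mock argument.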

Python list add variables in rows

I'm trying to add variables to a list that I created. I got a result from a session.execute.
I've done this:
def machine_id(session, machine_serial):
    stmt_raw = '''
        SELECT
            id
        FROM
            machine
        WHERE
            machine.serial = :machine_serial_arg
    '''
    utc_now = datetime.datetime.utcnow()
    utc_now_iso = pytz.utc.localize(utc_now).isoformat()
    utc_start = datetime.datetime.utcnow() - datetime.timedelta(days=30)
    utc_start_iso = pytz.utc.localize(utc_start).isoformat()
    stmt_args = {
        'machine_serial_arg': machine_serial,
    }
    stmt = text(stmt_raw).columns(
        # ts_insert = ISODateTime
    )
    result = session.execute(stmt, stmt_args)
    ts = utc_now_iso
    ts_start = utc_start_iso
    ID = []
    for row in result:
        ID.append({
            'id': row[0],
            'ts': ts,
            'ts_start': ts_start,
        })
    return ID
I'm trying to get the result over the API like this:
def form_response(response, session):
    result_machine_id = machine_id(session, machine_serial)
    if not result_machine_id:
        response['Error'] = 'Serial number not present/found'
        return
    response['id_timerange'] = result_machine_id
The output looks fine:
{
    "id_timerange": [
        {
            "id": 1,
            "ts": "2020-08-13T08:32:25.835055+00:00",
            "ts_start": "2020-07-14T08:32:25.835089+00:00"
        }
    ]
}
Now I only want the id from it as a parameter for another function. The problem is, I think, that it's not a list: I can't select the first element, and result_machine_id[0] gives the same result as the posted output. I think in my first function I only add ts & ts_start to the first row? Is it possible to add empty rows and then add 'ts': ts as a value?
Help would be nice.
If I have understood your question correctly:
Your output looks like a dict, so access its id_timerange key, which gives you a list. Access the first element, which gives you another dict. On this dict you have an id key:
result_machine_id["id_timerange"][0]["id"]
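For example, to feed every id on to another function (a small usage sketch based on the output above; process_machine is a hypothetical stand-in for your next function):

ids = [row["id"] for row in response["id_timerange"]]
for machine_id_value in ids:
    process_machine(machine_id_value)  # process_machine is hypothetical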

Doing feature generation in serving_input_fn for Tensorflow model

I've been playing around with BERT and TensorFlow following the example here and have a trained working model.
I then wanted to save and deploy the model, so used the export_saved_model function, which requires you build a serving_input_fn to handle any incoming requests when the model is reloaded.
I wanted to be able to pass a single string to the deployed model for sentiment analysis, rather than having a theoretical client-side application do the tokenisation and feature generation, so I tried to write an input function that would handle that and pass the constructed features to the model. Is this possible? I wrote the following, which I feel should do what I want:
import json
import base64

def plain_text_serving_input_fn():
    input_string = tf.placeholder(dtype=tf.string, shape=None, name='input_string_text')
    # What format to expect input in.
    receiver_tensors = {'input_text': input_string}
    # here, 0 is just a dummy label
    input_examples = [run_classifier.InputExample(guid="", text_a=str(input_string),
                                                  text_b=None, label=0)]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    variables = {}
    for i in input_features:
        variables["input_ids"] = i.input_ids
        variables["input_mask"] = i.input_mask
        variables["segment_ids"] = i.segment_ids
        variables["label_id"] = i.label_id
    feature_spec = {
        "input_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
        "input_mask": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
        "segment_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64)
    }
    string_variables = json.dumps(variables)
    encode_input = base64.b64encode(string_variables.encode('utf-8'))
    encode_string = base64.decodestring(encode_input)
    features_to_input = tf.parse_example([encode_string], feature_spec)
    return tf.estimator.export.ServingInputReceiver(features_to_input, receiver_tensors)
I would expect that this would allow me to call predict on my deployed model with
variables = {"input_text" : "This is some test input"}
predictor.predict(variables)
I've tried a range of variations of this (putting it in an array, converting to base64, etc.), but I get a range of errors, either telling me:
"error": "Failed to process element: 0 of 'instances' list. Error: Invalid argument: JSON Value: {\n \"input_text\": \"This is some test input\"\n} not formatted correctly for base64 data" }"
or
Object of type 'bytes' is not JSON serializable
I suspect I'm formatting my requests incorrectly, but I also can't find any examples of something similar being done in a serving_input_fn. Has anyone ever done something similar?
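For comparison, the pattern tf.parse_example is designed for (a reference sketch in the TF 1.x Estimator style, not the in-graph feature generation you are after) receives serialized tf.Example protos from the client and parses them with the same feature_spec:

import tensorflow as tf

def example_proto_serving_input_fn():
    # Clients send a batch of serialized tf.Example protos.
    serialized = tf.placeholder(dtype=tf.string, shape=[None], name="examples")
    feature_spec = {
        "input_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
        "input_mask": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
        "segment_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
    }
    features = tf.parse_example(serialized, feature_spec)
    return tf.estimator.export.ServingInputReceiver(features, {"examples": serialized})

The JSON/base64 round-trip in your version hands tf.parse_example a base64-decoded JSON string rather than a serialized Example proto, which is likely one reason the parse fails at request time.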

Google Matrix API - python return Nonetype error

"Update"
*Finally resolved the issue, changed the try except to include TypeError and also use pass instead of continue in the except.
"End of update"
I wrote code to search for distance between two locations using Google Distance Matrix API. The origin location are fixed, however for the destination, I get it from an xlsx file. I was expecting to get Dictionary with Destination as the Key and the distance as value. When I run the code below, after certain loop I'm stumbled with this error code:
TypeError: Expected a lat/lng dict or tuple, but got NoneType
Can you help me understand the cause of the error? Here is the code (pygmap.py):
import googlemaps
import openpyxl

# get origin and destination locations
def cleanWB(file_path):
    destination = list()
    wb = openpyxl.load_workbook(filename=file_path)
    ws = wb.get_sheet_by_name('Sheet1')
    for i in range(ws.max_row):
        cellValueLocation = ws.cell(row=i + 2, column=1).value
        destination.append(cellValueLocation)
    # remove duplicates from destination list
    unique_location = list(set(destination))
    return unique_location

def getDistance(origin, destination):
    # Google distance matrix API key
    gmaps = googlemaps.Client(key='INSERT API KEY')
    distance = gmaps.distance_matrix(origin, destination)
    distance_status = distance['rows'][0]['elements'][0]['status']
    if distance_status != 'ZERO_RESULTS':
        jDistance = distance['rows'][0]['elements'][0]
        distance_location = jDistance['distance']['value']
    else:
        distance_location = 0
    return distance_location
And I run it using this code:
import pygmap

unique_location = pygmap.cleanWB('C:/Users/an_id/Documents/location.xlsx')
origin = 'alam sutera'
result = {}
for i in range(len(unique_location)):
    try:
        result[unique_location[i]] = pygmap.getDistance(origin, unique_location[i])
    except (KeyError, TypeError):
        pass
If I print result, it shows that I successfully got 46 results:
result
{'Pondok Pinang': 25905, 'Jatinegara Kaum': 40453, 'Serdang': 1623167,
 'Jatiasih': 44737, 'Tanah Sereal': 77874, 'Jatikarya': 48399, 'Duri Kepa': 20716,
 'Mampang Prapatan': 31880, 'Pondok Pucung': 12592, 'Johar Baru': 46791,
 'Karet': 26889, 'Bukit Duri': 34039, 'Sukamaju': 55333,
 'Pasir Gunung Selatan': 42140, 'Pinangsia': 30471, 'Pinang Ranti': 38099,
 'Bantar Gebang': 50778, 'Sukabumi Utara': 20441, 'Kembangan Utara': 17708,
 'Kwitang': 25860, 'Kuningan Barat': 31231, 'Cilodong': 58879,
 'Pademangan Barat': 32585, 'Kebon Kelapa': 23452, 'Mekar Jaya': 53810,
 'Kampung Bali': 1188894, 'Pajang': 30008, 'Sukamaju Baru': 53708,
 'Benda Baru': 19965, 'Sukabumi Selatan': 19095, 'Gandaria Utara': 28429,
 'Setia Mulya': 63534, 'Rawajati': 31724, 'Cireundeu': 28220, 'Cimuning': 55712,
 'Lebak Bulus': 27361, 'Kayuringin Jaya': 47560, 'Kedaung Kali Angke': 19171,
 'Pagedangan': 16791, 'Karang Anyar': 171165, 'Petukangan Selatan': 18959,
 'Rawabadak Selatan': 42765, 'Bojong Sari Baru': 26978, 'Padurenan': 53216,
 'Jati Mekar': 2594703, 'Jatirangga': 51119}
I resolved the issue by including TypeError in the try/except and by using pass instead of continue:
import pygmap

unique_location = pygmap.cleanWB('C:/Users/an_id/Documents/location.xlsx')
origin = 'alam sutera'
result = {}
# get getPlace
for i in range(len(unique_location)):
    try:
        result[unique_location[i]] = pygmap.getDistance(origin, unique_location[i])
    except (KeyError, TypeError):
        pass
I skipped some locations using this solution though.
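Since the NoneType error most likely comes from empty cells in the sheet being passed as destinations (iterating up to ws.max_row can yield None values), an alternative to swallowing TypeError is to filter them out when reading the workbook. A sketch of the cleanWB loop with that guard, using the same openpyxl calls as above:

def cleanWB(file_path):
    destination = list()
    wb = openpyxl.load_workbook(filename=file_path)
    ws = wb.get_sheet_by_name('Sheet1')
    for i in range(ws.max_row):
        cellValueLocation = ws.cell(row=i + 2, column=1).value
        if cellValueLocation is not None:  # skip empty cells instead of erroring later
            destination.append(cellValueLocation)
    return list(set(destination))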
