iterate nested list in json msg by cql stream analytics - azure

I have a JSON message coming from IoT Hub like:
{
    "deviceId": "abc",
    "topic": "data",
    "data": {
        "varname1": [{
            "t": "timestamp1",
            "v": "value1",
            "f": "respondFrame1"
        },
        {
            "t": "timestamp2",
            "v": "value2",
            "f": "respondFrame2"
        }],
        "varname2": [{
            "t": "timestamp1",
            "v": "value1",
            "f": "respondFrame1"
        },
        {
            "t": "timestamp2",
            "v": "value2",
            "f": "respondFrame2"
        }]
    }
}
and want to store this via an Azure Stream Analytics job into a Transact-SQL table like this:
ID | deviceId | varname | timestamp | respondFrame | value
-----+------------+-----------+-------------+----------------+--------
1 | abc | varname1 | timestamp1 | respondFrame1 | value1
2 | abc | varname1 | timestamp2 | respondFrame2 | value2
3 | abc | varname2 | timestamp1 | respondFrame1 | value1
4 | abc | varname2 | timestamp2 | respondFrame2 | value2
Does anybody know how to handle these stacked iterations and combine them (CROSS APPLY)?
Something like this "phantomCode":
deviceId = msg.deviceId
for varname, entries in msg.data.items():
    for entry in entries:
        timestamp = entry.t
        frame = entry.f
        value = entry.v
UPDATE regarding the answer from JS (Azure Stream Analytics):
With the following query
WITH datalist AS
(
    SELECT
        iotHubAlias.deviceId,
        data.PropertyName as varname,
        data.PropertyValue as arrayData
    FROM [iotHub] as iotHubAlias
    CROSS APPLY GetRecordProperties(iotHubAlias.data) AS data
    WHERE iotHubAlias.topic = 'data'
)
SELECT
    datalist.deviceId,
    datalist.varname,
    arrayElement.ArrayValue.t as [timestamp],
    arrayElement.ArrayValue.f as respondFrame,
    arrayElement.ArrayValue.v as value
INTO [temporary]
FROM datalist
CROSS APPLY GetArrayElements(datalist.arrayData) AS arrayElement
I always get an error:
{
"channels": "Operation",
"correlationId": "f9d4437b-707e-4892-a37b-8ad721eb1bb2",
"description": "",
"eventDataId": "ef5a5f2b-8c2f-49c2-91f0-16213aaa959d",
"eventName": {
"value": "streamingNode0",
"localizedValue": "streamingNode0"
},
"category": {
"value": "Administrative",
"localizedValue": "Administrative"
},
"eventTimestamp": "2018-08-21T18:23:39.1804989Z",
"id": "/subscriptions/46cd2f8f-b46b-4428-8f7b-c7d942ff745d/resourceGroups/fieldtest/providers/Microsoft.StreamAnalytics/streamingjobs/streamAnalytics4fieldtest/events/ef5a5f2b-8c2f-49c2-91f0-16213aaa959d/ticks/636704726191804989",
"level": "Error",
"operationId": "7a38a957-1a51-4da1-a679-eae1c7e3a65b",
"operationName": {
"value": "Process Events: Processing events Runtime Error",
"localizedValue": "Process Events: Processing events Runtime Error"
},
"resourceGroupName": "fieldtest",
"resourceProviderName": {
"value": "Microsoft.StreamAnalytics",
"localizedValue": "Microsoft.StreamAnalytics"
},
"resourceType": {
"value": "Microsoft.StreamAnalytics/streamingjobs",
"localizedValue": "Microsoft.StreamAnalytics/streamingjobs"
},
"resourceId": "/subscriptions/46cd2f8f-b46b-4428-8f7b-c7d942ff745d/resourceGroups/fieldtest/providers/Microsoft.StreamAnalytics/streamingjobs/streamAnalytics4fieldtest",
"status": {
"value": "Failed",
"localizedValue": "Failed"
},
"subStatus": {
"value": "",
"localizedValue": ""
},
"submissionTimestamp": "2018-08-21T18:24:34.0981187Z",
"subscriptionId": "46cd2f8f-b46b-4428-8f7b-c7d942ff745d",
"properties": {
"Message Time": "2018-08-21 18:23:39Z",
"Error": "- Unable to cast object of type 'Microsoft.EventProcessing.RuntimeTypes.ValueArray' to type 'Microsoft.EventProcessing.RuntimeTypes.IRecord'.\r\n",
"Message": "Runtime exception occurred while processing events, - Unable to cast object of type 'Microsoft.EventProcessing.RuntimeTypes.ValueArray' to type 'Microsoft.EventProcessing.RuntimeTypes.IRecord'.\r\n, : OutputSourceAlias:temporary;",
"Type": "SqlRuntimeError",
"Correlation ID": "f9d4437b-707e-4892-a37b-8ad721eb1bb2"
},
"relatedEvents": []
}
And here is an example of a real JSON message coming from a device:
{
    "topic": "data",
    "data": {
        "ExternalFlowTemperatureSensor": [{
            "t": "2018-08-22T11:00:11.955381",
            "v": 16.64103,
            "f": "Q6ES8KJIN1NX2DRGH36RX1WDT"
        }],
        "AdaStartsP2": [{
            "t": "2018-08-22T11:00:12.863383",
            "v": 382.363138,
            "f": "9IY7B4DFBAMOLH3GNKRUNUQNUX"
        },
        {
            "t": "2018-08-22T11:00:54.172501",
            "v": 104.0,
            "f": "IUJMP20CYQK60B"
        }],
        "s_DriftData[4].c32_ZeitLetzterTest": [{
            "t": "2018-08-22T11:01:01.829568",
            "v": 348.2916,
            "f": "MMTPWQVLL02CA"
        }]
    },
    "deviceId": "test_3c27db"
}
And, for completeness, the creation script for the SQL table:
create table temporary (
id int NOT NULL IDENTITY PRIMARY KEY,
deviceId nvarchar(20) NOT NULL,
timestamp datetime NOT NULL,
varname nvarchar(100) NOT NULL,
value float,
respondFrame nvarchar(50)
)

The following query will give you the expected output:
WITH step1 AS
(
    SELECT
        event.deviceId,
        data.PropertyName as varname,
        data.PropertyValue as arrayData
    FROM blobtest as event
    CROSS APPLY GetRecordProperties(event.data) AS data
)
SELECT
    event.deviceId,
    event.varname,
    arrayElement.ArrayValue.t as [timestamp],
    arrayElement.ArrayValue.f as frame,
    arrayElement.ArrayValue.v as value
FROM step1 as event
CROSS APPLY GetArrayElements(event.arrayData) AS arrayElement
You can find more info about JSON parsing on our documentation page "Parse JSON and Avro data in Azure Stream Analytics"
Let me know if you have any other questions.
JS (Azure Stream Analytics)

Related

Sending Messages to Topics On Azure Service Bus [duplicate]

I have one topic DemoTopic with 2 subscriptions, 'sub1' and 'sub2'.
My message payload is like this:
{
"data": [
{
"id": "1",
"name": "a",
"pid": "p1"
},
{
"id": "2",
"name": "b",
"pid": "p2"
},
{
"id": "3",
"name": "c",
"pid": "p3"
}
]
}
If the pid value is p1 or p2, the message should be sent to sub1.
How do I create a filter for this using the p1 and p2 values?
Subscriptions can only filter messages on some system properties and user/custom properties (aka headers). If your message contains the data required for filtering, you should promote those property values to the headers when dispatching messages.
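A minimal sketch of that approach with the Python azure-servicebus SDK (the connection string placeholder, the rule name, and the reuse of DemoTopic/sub1 from the question are assumptions): the pid is copied from the body into the application properties when sending, and sub1 gets a SQL filter rule on that property.

import json
from azure.servicebus import ServiceBusClient, ServiceBusMessage
from azure.servicebus.management import ServiceBusAdministrationClient, SqlRuleFilter

CONN_STR = "<service-bus-connection-string>"  # assumption: replace with your namespace connection string

# One-time setup: add a SQL filter rule on sub1 that matches the promoted property.
# Note: a subscription's default rule ($Default) matches everything and may need to be removed.
with ServiceBusAdministrationClient.from_connection_string(CONN_STR) as admin:
    admin.create_rule(
        topic_name="DemoTopic",
        subscription_name="sub1",
        rule_name="pid-p1-or-p2",
        filter=SqlRuleFilter("pid IN ('p1', 'p2')"),
    )

# Sending: promote pid into application_properties (headers), because
# subscription filters cannot look inside the JSON body.
item = {"id": "1", "name": "a", "pid": "p1"}
with ServiceBusClient.from_connection_string(CONN_STR) as client:
    with client.get_topic_sender(topic_name="DemoTopic") as sender:
        message = ServiceBusMessage(json.dumps(item), application_properties={"pid": item["pid"]})
        sender.send_messages(message)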

Syntax for JSONPath filtering to not return array

I'm new to JSONPath and want to write a JSONPath expression that retrieves a property value only if a certain condition is met. The value I'm after is not part of an array, but I've managed to make filtering work in the following JSONPath tool: https://www.site24x7.com/tools/json-path-evaluator.html
Given the following JSON, I only want to extract the value of column2.dimValue if column2.attributeId equals B0:
{
"batchId": 279,
"companyId": "40",
"period": 202208,
"taxCode": "1",
"taxSystem": "",
"transactionDate": "2022-08-05T00:00:00.000",
"transactionNumber": 222006089,
"transactionType": "IF",
"year": 2022,
"accountingInformation": {
"account": "4010",
"column1": {
"attributeId": "H9",
"dimValue": "76"
},
"column2": {
"attributeId": "B0",
"dimValue": "2170103"
},
"column3": {
"attributeId": "",
"dimValue": ""
},
"column4": {
"attributeId": "BF",
"dimValue": "217010330"
},
"column5": {
"attributeId": "10",
"dimValue": "3101"
},
"column6": {
"attributeId": "06",
"dimValue": ""
},
"column7": {
"attributeId": "19",
"dimValue": "K"
}
},
"categories": {
"cat1": "H9",
"cat2": "B0",
"cat3": "",
"cat4": "BF",
"cat5": "10",
"cat6": "06",
"cat7": "19",
"dim1": "76",
"dim2": "2170103",
"dim3": "",
"dim4": "217010330",
"dim5": "3101",
"dim6": "",
"dim7": "K"
},
"amounts": {
"amount": 48.24,
"amount3": 0.0,
"amount4": 0.0,
"currencyAmount": 48.24,
"currencyCode": "NOK",
"debitCreditFlag": 1
},
"invoice": {
"customerOrSupplierId": "58118",
"description": "",
"externalArchiveReference": "",
"externalReference": "2170103",
"invoiceNumber": "220238522",
"ledgerType": "P"
},
"additionalInformation": {
"number": 0,
"orderLineNumber": 0,
"orderNumber": 0,
"sequenceNumber": 1,
"status": "",
"value": 0.0,
"valueDate": "2022-08-05T00:00:00.000"
},
"lastUpdated": {
"updatedAt": "2022-09-05T10:59:11.633",
"updatedBy": "HELVES"
}
}
I've used this JSONPath expression:
$['accountingInformation']['column2'][?(@.attributeId=='B0')].dimValue
This gives the following result:
[
"2170103"
]
I'm using this result in an Azure Data Factory mapping, and it doesn't work because the result is an array.
Can anyone help me with the syntax so that it returns only the actual value? Is that even possible?
I reproduced the same scenario, and below is the approach.
The sample JSON file is taken as the source in a Lookup activity.
An If Condition activity is used to check whether column2 has attributeId = 'B0'. The expression is:
@equals(activity('Lookup1').output.value[0].accountingInformation.column2.attributeId, 'B0')
In the true case of the If activity, a Set Variable activity is added. A new variable of type String is set using the expression:
@activity('Lookup1').output.value[0].accountingInformation.column2.dimValue
Then a Copy activity is added after the If activity. A dummy dataset is taken as the source, and +New is clicked under additional columns:
Name: col1
Value: @variables('v2')
In Mapping, Import schemas is clicked, and all columns except the additional column added in the source are deleted.
The pipeline is debugged and the data is copied to the sink without error.
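For reference, a JSONPath filter expression always returns a list of matches, which is why the tool shows ["2170103"]. A minimal plain-Python sketch of the equivalent scalar extraction (assuming the variable payload holds the JSON document shown above):

import json

doc = json.loads(payload)  # assumption: `payload` is the JSON document from the question

# Check the condition and take the single value directly, instead of a one-element array.
column2 = doc["accountingInformation"]["column2"]
dim_value = column2["dimValue"] if column2["attributeId"] == "B0" else None
print(dim_value)  # 2170103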

PySpark Dataframe to Json - grouping data

We are trying to create a json from a dataframe. Please find the dataframe below,
+----------+--------------------+----------+--------------------+-----------------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+
| CustId| TIN|EntityType| EntityAttributes|AddressPreference| AddressDetails|EmailPreference| EmailDetails|PhonePreference| PhoneDetails| MemberDetails|
+----------+--------------------+----------+--------------------+-----------------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+
|1234567890|XXXXXXXXXXXXXXXXXX...| Person|[{null, PRINCESS,...| Alternate|[{Home, 460 M XXX...| Primary|[{Home, HEREBY...| Alternate|[{Home, {88888888...|[{7777777, 999999...|
|1234567890|XXXXXXXXXXXXXXXXXX...| Person|[{null, PRINCESS,...| Alternate|[{Home, 460 M XXX...| Primary|[{Home, HEREBY...| Primary|[{Home, {88888888...|[{7777777, 999999...|
|1234567890|XXXXXXXXXXXXXXXXXX...| Person|[{null, PRINCESS,...| Primary|[{Home, PO BOX 695020...| Primary|[{Home, HEREBY...| Alternate|[{Home, {88888888...|[{7777777, 999999...|
|1234567890|XXXXXXXXXXXXXXXXXX...| Person|[{null, PRINCESS,...| Primary|[{Home, PO BOX 695020...| Primary|[{Home, HEREBY...| Primary|[{Home, {88888888...|[{7777777, 999999...|
+----------+--------------------+----------+--------------------+-----------------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+
So the initial columns CustId, TIN, EntityType, and EntityAttributes will be the same for a particular customer, say 1234567890 in our example, but there might be multiple addresses/phones/emails. Could you please help us with how to group them under one JSON?
Expected structure:
{
    "CustId": 1234567890,
    "TIN": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    "EntityType": "Person",
    "EntityAttributes": [
        {
            "FirstName": "PRINCESS",
            "LastName": "XXXXXX",
            "BirthDate": "xxxx-xx-xx",
            "DeceasedFlag": "False"
        }
    ],
    "Address": [
        {
            "AddressPreference": "Alternate",
            "AddressDetails": {
                "AddressType": "Home",
                "Address1": "460",
                "City": "XXXX",
                "State": "XXX",
                "Zip": "XXXX"
            }
        },
        {
            "AddressPreference": "Primary",
            "AddressDetails": {
                "AddressType": "Home",
                "Address1": "PO BOX 695020",
                "City": "XXX",
                "State": "XXXX",
                "Zip": "695020"
            }
        }
    ],
    "Phone": [
        {
            "PhonePreference": "Primary",
            "PhoneDetails": {
                "PhoneType": "Home",
                "PhoneNumber": "xxxxx",
                "FormatPhoneNumber": "xxxxxx"
            }
        },
        {
            "PhonePreference": "Alternate",
            "PhoneDetails": {
                "PhoneType": "Home",
                "PhoneNumber": "xxxx",
                "FormatPhoneNumber": "xxxxx"
            }
        }
    ],
    "Email": [
        {
            "EmailPreference": "Primary",
            "EmailDetails": {
                "EmailType": "Home",
                "EmailAddress": "xxxxxxx@GMAIL.COM"
            }
        }
    ]
}
UPDATE:
I tried the group-by method recommended below, and it ended up giving one customer's details, but the email is repeated 4 times in the list; ideally it should contain only 1 email. Also, for the address preference, Alternate has 1 address and Primary has 1 address, but Alternate shows 2 entries and Primary shows 2. Could you please help with an ideal solution?
Probably this should work. id is like the CustId in your example, which has repeating values.
>>> df.show()
+----+------------+----------+
| id| address| email|
+----+------------+----------+
|1001| address-a| email-a|
|1001| address-b| email-b|
|1002|address-1002|email-1002|
|1003|address-1003|email-1002|
|1002| address-c| email-2|
+----+------------+----------+
Aggregate on those repeating columns and then convert to JSON
>>> from pyspark.sql.functions import collect_list
>>> results = df.groupBy("id").agg(collect_list("address").alias("address"), collect_list("email").alias("email")).toJSON().collect()
>>> for i in results: print(i)
...
{"id":"1003","address":["address-1003"],"email":["email-1002"]}
{"id":"1002","address":["address-1002","address-c"],"email":["email-1002","email-2"]}
{"id":"1001","address":["address-a","address-b"],"email":["email-a","email-b"]}

python dictionary how can create (structured) unique dictionary list if the key contains list of values of other keys

I have the unstructured dictionary list below, which contains the values of other keys in a list.
I am not sure if the question I ask is strange; this is the actual dictionary payload we receive from the source, and it is not aligned with the respective entries.
[
{
"dsply_nm": [
"test test",
"test test",
"",
""
],
"start_dt": [
"2021-04-21T00:01:00-04:00",
"2021-04-21T00:01:00-04:00",
"2021-04-21T00:01:00-04:00",
"2021-04-21T00:01:00-04:00"
],
"exp_dt": [
"2022-04-21T00:01:00-04:00",
"2022-04-21T00:01:00-04:00",
"2022-04-21T00:01:00-04:00",
"2022-04-21T00:01:00-04:00"
],
"hrs_pwr": [
"14",
"12",
"13",
"15"
],
"make_nm": "test",
"model_nm": "test",
"my_yr": "1980"
}
]
"the length of list cannot not be expected and it could be more than 4 sometimes or less in some keys"
#Expected:
i need to check if the above dictionary are in proper structure or not and based on that it should return the proper dictionary list associate with each item
for eg:
def get_dict_list(items):
    if not is_proper_structure(items):  # pseudocode check
        result = get_associated_dict_items_mapped(items)
        return result
    else:
        return items
#Final result
expected_dict_list=
[{"dsply_nm":"test test","start_dt":"2021-04-21T00:01:00-04:00","exp_dt":"2022-04-21T00:01:00-04:00","hrs_pwr":"14"},
{"dsply_nm":"test test","start_dt":"2021-04-21T00:01:00-04:00","exp_dt":"2022-04-21T00:01:00-04:00","hrs_pwr":"12","make_nm": "test",model_nm": "test","my_yr": "1980"},
{"dsply_nm":"","start_dt":"2021-04-21T00:01:00-04:00","exp_dt":"2022-04-21T00:01:00-04:00","hrs_pwr":"13"},
{"dsply_nm":"","start_dt":"2021-04-21T00:01:00-04:00","exp_dt":"2022-04-21T00:01:00-04:00","hrs_pwr":"15"}
]
In the above dictionary payload, the part below is associated with the second dictionary item and has to be mapped accordingly:
"make_nm": "test",
"model_nm": "test",
"my_yr": "1980"
}
Can anyone help on this?
Thanks
Since customer_details is a list:
dict(zip(customer_details[0], list(customer_details[0].values())))
this yields:
{'insured_details': ['asset', 'asset', 'asset'],
'id': ['213', '214', '233'],
'dept': ['account', 'sales', 'market'],
'salary': ['12', '13', '14']}
I think a couple of list comprehensions will get you going. If you would like me to unwind them into more traditional for loops, just let me know.
import json
def get_dict_list(item):
    first_value = list(item.values())[0]
    if not isinstance(first_value, list):
        return [item]
    return [{key: item[key][i] for key in item.keys()} for i in range(len(first_value))]
customer_details = [
    {
        "insured_details": "asset",
        "id": "xxx",
        "dept": "account",
        "salary": "12"
    },
    {
        "insured_details": ["asset", "asset", "asset"],
        "id": ["213", "214", "233"],
        "dept": ["account", "sales", "market"],
        "salary": ["12", "13", "14"]
    }
]

customer_details_cleaned = []
for detail in customer_details:
    customer_details_cleaned.extend(get_dict_list(detail))

print(json.dumps(customer_details_cleaned, indent=4))
That should give you:
[
    {
        "insured_details": "asset",
        "id": "xxx",
        "dept": "account",
        "salary": "12"
    },
    {
        "insured_details": "asset",
        "id": "213",
        "dept": "account",
        "salary": "12"
    },
    {
        "insured_details": "asset",
        "id": "214",
        "dept": "sales",
        "salary": "13"
    },
    {
        "insured_details": "asset",
        "id": "233",
        "dept": "market",
        "salary": "14"
    }
]

Update the Nested Json with another Nested Json using Python

For example, I have one full set of nested JSON, and I need to update it with the latest values from another nested JSON.
Can anyone help me with this?
I want to implement this in PySpark.
The full-set JSON looks like this:
{
"email": "abctest#xxx.com",
"firstName": "name01",
"id": 6304,
"surname": "Optional",
"layer01": {
"key1": "value1",
"key2": "value2",
"key3": "value3",
"key4": "value4",
"layer02": {
"key1": "value1",
"key2": "value2"
},
"layer03": [
{
"inner_key01": "inner value01"
},
{
"inner_key02": "inner_value02"
}
]
},
"surname": "Required only$uid"
}
The latest JSON looks like this:
{
"email": "test#xxx.com",
"firstName": "name01",
"surname": "Optional",
"id": 6304,
"layer01": {
"key1": "value1",
"key2": "value2",
"key3": "value3",
"key4": "value4",
"layer02": {
"key1": "value1_changedData",
"key2": "value2"
},
"layer03": [
{
"inner_key01": "inner value01"
},
{
"inner_key02": "inner_value02"
}
]
},
"surname": "Required only$uid"
}
In the above, for id=6304 we have received updates for the layer01.layer02.key1 and email address fields.
So I need to apply these updated values to the full JSON. Kindly help me with this.
You can load the 2 JSON files into Spark data frames and do a left join to get the updates from the latest JSON data:
from pyspark.sql import functions as F
full_json_df = spark.read.json(full_json_path, multiLine=True)
latest_json_df = spark.read.json(latest_json_path, multiLine=True)
updated_df = full_json_df.alias("full").join(
    latest_json_df.alias("latest"),
    F.col("full.id") == F.col("latest.id"),
    "left"
).select(
    F.col("full.id"),
    *[
        F.when(F.col("latest.id").isNotNull(), F.col(f"latest.{c}")).otherwise(F.col(f"full.{c}")).alias(c)
        for c in full_json_df.columns if c != 'id'
    ]
)
updated_df.show(truncate=False)
#+----+------------+---------+-----------------------------------------------------------------------------------------------------+--------+
#|id |email |firstName|layer01 |surname |
#+----+------------+---------+-----------------------------------------------------------------------------------------------------+--------+
#|6304|test@xxx.com|name01   |[value1, value2, value3, value4, [value1_changedData, value2], [[inner value01,], [, inner_value02]]]|Optional|
#+----+------------+---------+-----------------------------------------------------------------------------------------------------+--------+
Update:
If the schema changes between full and latest JSONs, you can load the 2 files into the same data frame (this way the schemas are being merged) and then deduplicate per id:
from pyspark.sql import Window
from pyspark.sql import functions as F
merged_json_df = spark.read.json("/path/to/{full_json.json,latest_json.json}", multiLine=True)
# order priority: latest file then full
w = Window.partitionBy(F.col("id")).orderBy(F.when(F.input_file_name().like('%latest%'), 0).otherwise(1))
updated_df = merged_json_df.withColumn("rn", F.row_number().over(w)) \
    .filter("rn = 1") \
    .drop("rn")
updated_df.show(truncate=False)
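To persist the merged result, a minimal sketch (the output path is an assumption) that writes it back out as JSON:

# Write the merged records back out as JSON; coalesce(1) produces a single output
# file, which is convenient for small data but limits write parallelism.
updated_df.coalesce(1).write.mode("overwrite").json("/path/to/updated_json_output")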
