Spark Avro record namespace generation for nested structures

I'd like to write Avro records with Spark 2.2.0 where the schema has a
namespace and some nested records inside.
{
  "type": "record",
  "name": "userInfo",
  "namespace": "my.example",
  "fields": [
    {
      "name": "username",
      "type": "string"
    },
    {
      "name": "address",
      "type": [
        "null",
        {
          "type": "record",
          "name": "address",
          "fields": [
            {
              "name": "street",
              "type": [
                "null",
                "string"
              ],
              "default": null
            },
            {
              "name": "box",
              "type": [
                "null",
                {
                  "type": "record",
                  "name": "box",
                  "fields": [
                    {
                      "name": "id",
                      "type": "string"
                    }
                  ]
                }
              ],
              "default": null
            }
          ]
        }
      ],
      "default": null
    }
  ]
}
I need to write out records like:
{
  "username": "tom taylor",
  "address": {
    "my.example.address": {
      "street": {
        "string": "unknown"
      },
      "box": {
        "my.example.box": {
          "id": "id1"
        }
      }
    }
  }
}
However, when I read some Avro GenericRecords with spark-avro (4.0.0), do some conversion (e.g. adding a namespace), and then want to write out the output:
df.foreach {
  ...
    .write
    .option("recordName", "userInfo")
    .option("recordNamespace", "my.example")
    ...
}
then in the resulting GenericRecord the namespace of the nested records contains the "full path" to that element from its parents.
I.e. instead of my.example.box I get my.example.address.box. When I then try to read this record back with the schema above, there is of course a mismatch.
What is the right way to define the namespace for the writer?
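From what I can tell, the nested record namespaces are built from recordNamespace plus the parent field path (hence my.example.address.box), so the writer options alone don't seem able to produce my.example.box. One workaround I'm considering, sketched below rather than a spark-avro feature, is to bypass spark-avro's schema derivation entirely: parse the exact target schema and write each partition with the plain Avro API. The Row column names (username, street, boxId), the schema file name and the output path are assumptions made up for this sketch:
import java.io.File
import org.apache.avro.Schema
import org.apache.avro.file.DataFileWriter
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}

// Parse the exact target schema (the one shown above) instead of letting
// spark-avro derive it, so "box" stays in the my.example namespace.
val schemaJson = scala.io.Source.fromFile("userInfo.avsc").mkString

df.rdd.foreachPartition { rows =>
  // Schema is not serializable, so parse it inside the task.
  val userSchema = new Schema.Parser().parse(schemaJson)
  val addressSchema = userSchema.getField("address").schema().getTypes.get(1) // union branch 1 = the record
  val boxSchema = addressSchema.getField("box").schema().getTypes.get(1)

  // Hypothetical output location: one local file per partition.
  val out = new File(s"/tmp/userInfo-${java.util.UUID.randomUUID()}.avro")
  val writer = new DataFileWriter[GenericRecord](new GenericDatumWriter[GenericRecord](userSchema))
  writer.create(userSchema, out)

  rows.foreach { row =>
    // The Row field names below are assumptions about the converted DataFrame.
    val box = new GenericData.Record(boxSchema)
    box.put("id", row.getAs[String]("boxId"))

    val address = new GenericData.Record(addressSchema)
    address.put("street", row.getAs[String]("street"))
    address.put("box", box)

    val user = new GenericData.Record(userSchema)
    user.put("username", row.getAs[String]("username"))
    user.put("address", address)
    writer.append(user)
  }
  writer.close()
}
Because the writer is fed the hand-written schema, the union branches keep exactly the namespaces declared in it; the trade-off is losing spark-avro's managed output (paths, committer), so this is only a fallback if the writer options can't be made to match.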

Related

Azure data factory json data conversion null value

I have a JSON feed in the format below. I need to update the data in a NoSQL collection that has a different schema, as shown below. Using Azure Data Factory, how can I transform the input JSON schema into the target schema?
Since currentValue can have a different data type (array, number, complex type, string, etc.) for each record, the Azure Data Flow task gives a null value for the 'Derived Column' schema modifier as well as for the 'Flatten' formatter.
Input Json
[
{
"type": "UPDATE",
"key": { "id": "112710876" },
"doc": [
{
"property": "org.numberOfEmployees",
"currentValue": [
{
"value": 2256,
"scope": "Consolidated"
},
{
"value": 516,
"scope": "Individual"
}
]
}
]
},
{
"type": "UPDATE",
"key": { "id": "081243215" },
"doc": [
{
"property": "org.startDate",
"currentValue": "1979-09-14T06:08:51Z"
}
]
},
{
"type": "UPDATE",
"key": { "id": "081243216" },
"doc": [
{
"property": "org.employeeCount",
"currentValue": "20000"
}
]
},
{
"type": "UPDATE",
"key": { "id": "081243216" },
"doc": [
{
"property": "org.headOffice",
"currentValue": {
"city": "NY",
"country": "US"
}
}
]
}
]
Target Schema
{
"$schema": "http://json-schema.org/draft-04/schema#",
"type": "object",
"properties": {
"id": {
"type": "integer"
},
"startDate": {
"type": "string"
},
"numberOfEmployees": {
"type": "array",
"items": [
{
"type": "object",
"properties": {
"value": {
"type": "integer"
},
"scope": {
"type": "string"
}
}
}
]
},
"employeeCount": {
"type": "integer"
},
"headOffice": {
"type": "object",
"properties": {
"city": {
"type": "string"
},
"country": {
"type": "string"
}
}
}
}
}
Is there any way I can stringify currentValue in the Data Flow task, if there is no direct way to transform the input data to the target schema?
Any help would be appreciated.
You can stringify it in a Derived Column using toString(), or you can wait for our new Stringify transformation in October :)
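If pre-processing the feed outside Data Factory is an option, the same stringify idea can be sketched with Jackson; the file names below are placeholders, not anything ADF-specific. The point is simply to replace the heterogeneous currentValue with its JSON text so every record fits one string-typed column:
import java.io.File
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.node.ObjectNode

val mapper = new ObjectMapper()
// Hypothetical local copy of the input feed shown above.
val root = mapper.readTree(new File("feed.json"))

root.forEach { change =>
  change.get("doc").forEach { doc =>
    // Serialize whatever shape currentValue has (array, object, number, string)
    // back to JSON text, so downstream it is always a plain string.
    val asText = mapper.writeValueAsString(doc.get("currentValue"))
    doc.asInstanceOf[ObjectNode].put("currentValue", asText)
  }
}

mapper.writerWithDefaultPrettyPrinter().writeValue(new File("feed-stringified.json"), root)
After that, a Copy activity or a much simpler Data Flow can map the records onto the target schema, since no column changes type from record to record.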

JSON Schema validation (Draft 7): two types of data for one field

I need help creating a JSON schema for a value that could be an object, or an array of objects.
lib: jsonschema==3.2.0
py: 3.8
I have 2 responses from the server:
first:
{
"result": [
{
"brand": "Test"
}
]}
second:
{
"result":
{
"brand": "Test"
}
}
As you can see, the difference is that in the first case "result" is an array of objects, while in the second it is just an object.
my schema:
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "http://example.com/example.json",
"type": "object",
"required": [
"result"
],
"properties": {
"result": {
"$id": "#/properties/result",
"type": ["array", "object"],
"additionalItems": true,
"items": {
"$id": "#/properties/result/items",
"anyOf": [
{
"$id": "#/properties/result/items/anyOf/0",
"type": "object",
"required": [
"brand"
],
"properties": {
"brand": {
"$id": "#/properties/result/items/anyOf/0/properties/brand",
"type": "string"
}
},
"additionalProperties": true
}
]
}
}
},
"additionalProperties": true}
In the first case, when an array is returned, the schema checks the "brand" type; in the second case, when an object is returned, it does not.
How can I set up two types for the one field "result" so that the brand type is checked in both cases?
Your schema can be fixed as follows:
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "http://example.com/example.json",
"type": "object",
"required": [
"result"
],
"properties": {
"result": {
"$id": "#/properties/result",
"anyOf": [
{
"$id": "#/properties/result/items/brand",
"type": "object",
"properties": {
"brand": {
"$id": "#/properties/result/items/anyOf/0/properties/brand",
"type": "string"
}
},
"required": [
"brand"
],
"additionalProperties": true
},
{
"$id": "#/properties/result/items/array",
"type": "array",
"items": {
"$ref": "#/properties/result/items/brand"
}
}
]
}
},
"additionalProperties": true
}
However, it is customary to extract reusable portions of a schema into a separate "definitions" section, like so:
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "http://example.com/example.json",
"definitions": {
"brand": {
"type": "object",
"properties": {
"brand": {
"$id": "#/properties/result/items/anyOf/0/properties/brand",
"type": "string"
}
},
"required": [
"brand"
],
"additionalProperties": true
}
},
"type": "object",
"required": [
"result"
],
"properties": {
"result": {
"$id": "#/properties/result",
"anyOf": [
{
"$ref": "#/definitions/brand"
},
{
"$id": "#/properties/result/items/array",
"type": "array",
"items": {
"$ref": "#/definitions/brand"
}
}
]
}
},
"additionalProperties": true
}
Notes:
To express that the property "result" may be of two different types, use the "anyOf" keyword for the property's schema. The value of "anyOf" should be an array whose items are the schemas for each possible type (here the "brand" object, or an array of "brand" objects).
See: Multiple Types.
To avoid duplicating the definition of the "brand" object, you can use "$ref" when defining the schema for the array's items to refer back to the previously given schema for "brand". As noted above, it is customary to place reused subschemas in a "definitions" section, but it is not required; "$ref" can refer to any schema item via JSON Pointer syntax.
See: Reuse.
When the items of a list have a single schema, "additionalItems" should not be used.
See: List validation.
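The question itself validates with Python's jsonschema, but the fixed anyOf schema can be exercised with any Draft-07 validator. Below is a small sketch using the everit-org json-schema library (a library choice made for this example, not taken from the question); both response shapes should pass, and a shape matching neither branch should throw:
import org.everit.json.schema.ValidationException
import org.everit.json.schema.loader.SchemaLoader
import org.json.JSONObject

// schemaJson holds the Draft-07 schema with "definitions" and "anyOf" shown above.
val schemaJson = scala.io.Source.fromFile("result-schema.json").mkString
val schema = SchemaLoader.load(new JSONObject(schemaJson))

// Both server responses from the question satisfy one of the anyOf branches.
schema.validate(new JSONObject("""{"result": {"brand": "Test"}}"""))
schema.validate(new JSONObject("""{"result": [{"brand": "Test"}]}"""))

// A "result" that matches neither branch is rejected.
try schema.validate(new JSONObject("""{"result": [{"name": "no brand"}]}"""))
catch { case e: ValidationException => println(e.getMessage) }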

How to traverse all Fields in all nested Records in an Avro file and check a certain property in their Types?

I have an Avro file which contains records; in their fields (which have union types) there are other records, which also have fields with union types, and some types carry a property connect.name which I need to check for equality with io.debezium.time.NanoTimestamp. I'm doing this in Apache NiFi using an ExecuteScript processor with a Groovy script.
A shortened example of the Avro schema:
{
"type": "record",
"name": "Envelope",
"namespace": "data.none.bpm.pruitsmdb_nautilus_dbo.fast_frequency_tables.avro.test",
"fields": [
{
"name": "before",
"type": [
"null",
{
"type": "record",
"name": "Value",
"fields": [
{
"name": "Id",
"type": {
"type": "string",
"connect.parameters": {
"__debezium.source.column.type": "UNIQUEIDENTIFIER",
"__debezium.source.column.length": "36"
}
}
},
{
"name": "CreatedOn",
"type": [
"null",
{
"type": "long",
"connect.version": 1,
"connect.parameters": {
"__debezium.source.column.type": "DATETIME2",
"__debezium.source.column.length": "27",
"__debezium.source.column.scale": "7"
},
"connect.name": "io.debezium.time.NanoTimestamp"
}
],
"default": null
},
{
"name": "CreatedById",
"type": [
"null",
{
"type": "string",
"connect.parameters": {
"__debezium.source.column.type": "UNIQUEIDENTIFIER",
"__debezium.source.column.length": "36"
}
}
],
"default": null
}
],
"connect.name": "data.none.bpm.pruitsmdb_nautilus_dbo.fast_frequency_tables.avro.test.Value"
}
],
"default": null
},
{
"name": "after",
"type": [
"null",
"Value"
],
"default": null
},
{
"name": "source",
"type": {
"type": "record",
"name": "Source",
"namespace": "io.debezium.connector.sqlserver",
"fields": [
{
"name": "version",
"type": "string"
},
{
"name": "ts_ms",
"type": "long"
},
{
"name": "snapshot",
"type": [
{
"type": "string",
"connect.version": 1,
"connect.parameters": {
"allowed": "true,last,false"
},
"connect.default": "false",
"connect.name": "io.debezium.data.Enum"
},
"null"
],
"default": "false"
}
],
"connect.name": "io.debezium.connector.sqlserver.Source"
}
},
{
"name": "op",
"type": "string"
},
{
"name": "ts_ms",
"type": [
"null",
"long"
],
"default": null
}
],
"connect.name": "data.none.bpm.pruitsmdb_nautilus_dbo.fast_frequency_tables.avro.test.Envelope"
}
My Groovy code, which obviously checks only the top-level records; I'm also not sure whether I'm checking the connect.name property correctly:
reader.forEach{ GenericRecord record ->
    record.getSchema().getFields().forEach{ Schema.Field field ->
        try {
            field.schema().getTypes().forEach{ Schema typeSchema ->
                if (typeSchema.getProp("connect.name") == "io.debezium.time.NanoTimestamp") {
                    record.put(field.name(), Long(record.get(field.name()).toString().substring(0, 13)))
                    typeSchema.addProp("logicalType", "timestamp-millis")
                }
            }
        } catch(Exception ex) {
            println("Catching the exception")
        }
    }
    writer.append(record)
}
My question is: how do I traverse all nested records in the Avro file (there are top-level fields of type record, with further records inside)? And when traversing their fields, how do I correctly check whether one of their types (which may be part of a union) has the property connect.name == io.debezium.time.NanoTimestamp, and if so, transform the field value and add a logicalType property to the field's type?
I think you are looking for recursion here: there should be a function that accepts a Record as a parameter. When you hit a field that is a nested record, you call this function recursively.
Jiri's suggestion worked; a recursive function was used. Here's the full code:
import org.apache.avro.*
import org.apache.avro.file.*
import org.apache.avro.generic.*

// define input and output files
DataInputStream inputStream = new File('input.avro').newDataInputStream()
DataOutputStream outputStream = new File('output.avro').newDataOutputStream()

DataFileStream<GenericRecord> reader = new DataFileStream<>(inputStream, new GenericDatumReader<GenericRecord>())
DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())

def contentSchema = reader.schema // source Avro schema
def records = [] // list used to temporarily store the processed records

// function that traverses all records (including nested ones)
def convertAvroNanosecToMillisec(record) {
    record.getSchema().getFields().forEach{ Schema.Field field ->
        if (record.get(field.name()) instanceof org.apache.avro.generic.GenericData.Record) {
            convertAvroNanosecToMillisec(record.get(field.name()))
        }
        if (field.schema().getType().getName() == "union") {
            field.schema().getTypes().forEach{ Schema unionTypeSchema ->
                if (unionTypeSchema.getProp("connect.name") == "io.debezium.time.NanoTimestamp") {
                    record.put(field.name(), Long.valueOf(record.get(field.name()).toString().substring(0, 13)))
                    unionTypeSchema.addProp("logicalType", "timestamp-millis")
                }
            }
        } else {
            if (field.schema().getProp("connect.name") == "io.debezium.time.NanoTimestamp") {
                record.put(field.name(), Long.valueOf(record.get(field.name()).toString().substring(0, 13)))
                field.schema().addProp("logicalType", "timestamp-millis")
            }
        }
    }
    return record
}

// read all records from the incoming file and add them to the temporary list
reader.forEach{ GenericRecord contentRecord ->
    records.add(convertAvroNanosecToMillisec(contentRecord))
}

// create a file writer object with the adjusted schema
writer.create(contentSchema, outputStream)

// append records from the temporary list to the output file and close the writer
records.forEach{ GenericRecord contentRecord ->
    writer.append(contentRecord)
}
writer.close()

JSON:API Matching Collections with their respective Includes

What exactly is the best practice for matching JSON:API data collections with their respective includes? Consider the code below.
What if I wanted to loop through each venue and display the owner's full information for each venue record? Does JSON:API expect me to just search the included array for the matching owner record?
find(included, data[$i].relationships.owner.data.id);
Would find() loop through the included array looking for the owner whose id matches the collection item's owner in the relationships object?
$(data).each(function(item){
    var owner = find(included, 'owner', item.relationships.owner.data.id)
})
I have not found a resource that explains this, or perhaps I am misunderstanding the point of JSON:API. If someone can explain this or point me to a resource that addresses my question, I would appreciate it.
{
"links": {
"self": "http://127.0.0.1/api/venues?include=owner"
},
"data": [
{
"id": "5c5b49188fd33c7a989ba9b6",
"type": "venues",
"attributes": {
"name": "Kreiger - Smith",
"address": "69675 Reilly Vista",
"location": {
"type": "Point",
"coordinates": [
-112.110492,
36.098948
]
},
"events": [
{
"_id": "ad52825a8f4812e92f87b8c6",
"name": "Cool Awesome Event!",
"user": "b3daa77b4c04a9551b8781d0",
"id": "ad52825a8f4812e92f87b8c6"
}
],
"created_at": "2019-02-07T14:27:13.207Z",
"updated_at": "2019-02-07T14:27:13.207Z"
},
"relationships": {
"owner": {
"data": {
"id": "b3daa77b4c04a9551b8781d0",
"type": "users"
}
}
}
},
{
"id": "5c5b49188fd33c7a989ba9b7",
"type": "venues",
"attributes": {
"name": "Oberbrunner Inc",
"address": "1132 Kenyon Stravenue",
"location": {
"type": "Point",
"coordinates": [
-112.110492,
36.098948
]
},
"events": [
{
"_id": "ad52825a8f4812e92f87b8c6",
"name": "Cool Awesome Event!",
"user": "b3daa77b4c04a9551b8781d0",
"id": "ad52825a8f4812e92f87b8c6"
}
],
"created_at": "2019-02-07T14:27:13.207Z",
"updated_at": "2019-02-07T14:27:13.207Z"
},
"relationships": {
"owner": {
"data": {
"id": "b3daa77b4c04a9551b8781d0",
"type": "users"
}
}
}
},
{
"id": "5c5b49188fd33c7a989ba9b8",
"type": "venues",
"attributes": {
"name": "Gibson - Muller",
"address": "8457 Hailie Canyon",
"location": {
"type": "Point",
"coordinates": [
-112.110492,
36.098948
]
},
"events": [
{
"_id": "ad52825a8f4812e92f87b8c6",
"name": "Cool Awesome Event!",
"user": "b3daa77b4c04a9551b8781d0",
"id": "ad52825a8f4812e92f87b8c6"
}
],
"created_at": "2019-02-07T14:27:13.208Z",
"updated_at": "2019-02-07T14:27:13.208Z"
},
"relationships": {
"owner": {
"data": {
"id": "a1881c06eec96db9901c7bbf",
"type": "users"
}
}
}
}
],
"included": [
{
"id": "b3daa77b4c04a9551b8781d0",
"type": "users",
"attributes": {
"username": "killerjohn",
"firstname": "John",
"lastname": "Chapman"
}
},
{
"id": "a1881c06eec96db9901c7bbf",
"type": "users",
"attributes": {
"username": "numerical25",
"firstname": "Billy",
"lastname": "Gordon"
}
}
]
}
This is my best possible solution, but is there a better way? It seems like a lot of extra code just to find a collection's associated included data.
axios.get('http://127.0.0.1:3000/api/venues?include=owner').then(function(response) {
    var venues = response.data.data;
    var data = response.data;
    for (x in venues) {
        var owner = data.included.find(function(element) {
            if (element.id == venues[x].relationships.owner.data.id) {
                return element;
            }
        });
    }
});

Kafka Cassandra Connector nested column from avro schema

How do I access a nested field from an Avro schema?
For example, I have the following schema:
{
"type": "record",
"name": "Person",
"namespace": "com.datamountaineer.kcql.avro",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "address",
"type": {
"type": "record",
"name": "Address",
"fields": [
{
"name": "street",
"type": {
"type": "record",
"name": "Street",
"fields": [
{
"name": "name",
"type": "string"
}
]
}
},
{
"name": "street2",
"type": [
"null",
"Street"
]
},
{
"name": "city",
"type": "string"
},
{
"name": "state",
"type": "string"
},
{
"name": "zip",
"type": "string"
},
{
"name": "country",
"type": "string"
}
]
}
}
]
}
I want to access the nested fields here, so I have tried the query below:
SELECT name, address.street.*, address.street2.name as streetName2 FROM topic
But I get the following error:
Address.street.* not available in schema
Can anyone help me? How do I get this to work?
