Find and convert values in nested dict and update db collection - python-3.x

I have converted many XML files to JSON using xmltodict and inserted them into ArangoDB.
Now I will loop over the collection and change some values in the database, like day, month and year, from string to int. The documents can be deeply nested, and the values that I will change can be in different places.
This is the code I have so far:
# Get the API wrapper for "FORM16" collection.
FORM16 = db.collection('FORM16')

def recursive_items(dictionary):
    for key, value in dictionary.items():
        if type(value) is dict:
            yield from recursive_items(value)
        else:
            yield (key, value)

search_key = 'LOW_VALUE'

for item in FORM16:
    for key, value in recursive_items(item):
        if search_key in list(key):
            item[search_key] = int(item[search_key])
        else:
            pass

FORM16.update(item)
{'_id': 'FORM16/2098312',
'_key': '2098312',
'_rev': '_blGxlRi---',
'_old_rev': '_blGvpVO---'}
The code runs, but it doesn't update the database, and the only changed document I get back is the last one in the collection.
What do I have to change in the code to convert values in keys like day, month and year to int?
EDIT:
This is one of the nested JSON documents that I will update:
{
  "DOFFIN_ESENDERS": {
    "DOFFIN_APPENDIX": {
      "AUTHORITY_ORGANISATION_NR": "986 105 174",
      "DOFFIN_FORM_TYPE": {
        "NATIONAL": {
          "EXPRESSION_OF_INTEREST_URL": "https://kgv.doffin.no/ctm/Supplier/Notice/260549",
          "EXTERNAL_DOCUMENT_URL": "https://kgv.doffin.no/ctm/Supplier/Documents/Folder/124452",
          "LOCATION": {
            "NATIONWIDE": null
          },
          "PUBLISH_TO_TED": null
        }
      }
    },
    "FORM_SECTION": {
      "PRIOR_INFORMATION_DEFENCE": {
        "CATEGORY": "ORIGINAL",
        "FD_PRIOR_INFORMATION_DEFENCE": {
          "AUTHORITY_PRIOR_INFORMATION_DEFENCE": {
            "NAME_ADDRESSES_CONTACT_PRIOR_INFORMATION": {
              "CA_CE_CONCESSIONAIRE_PROFILE": {
                "ADDRESS": "Postboks 800, Postmottak",
                "ATTENTION": "Ole Jan Skoglund",
                "CONTACT_POINT": "Forsvarets logistikkorganisasjon",
                "COUNTRY": {
                  "VALUE": "NO"
                },
                "E_MAILS": {
                  "E_MAIL": "olskoglund@mil.no"
                },
                "FAX": "+47 67863799",
                "ORGANISATION": {
                  "NATIONALID": "986105174",
                  "OFFICIALNAME": "Forsvarets logistikkorganisasjon"
                },
                "PHONE": "+47 67863787",
                "POSTAL_CODE": "LILLEHAMMER",
                "TOWN": "N-2617"
              },
              "FURTHER_INFORMATION": {
                "IDEM": null
              },
              "INTERNET_ADDRESSES_PRIOR_INFORMATION": {
                "URL_BUYER": "https://kgv.doffin.no/ctm/Supplier/CompanyInformation/Index/1127",
                "URL_GENERAL": "http://www.forsvaret.no"
              }
            },
            "TYPE_AND_ACTIVITIES_OR_CONTRACTING_ENTITY_AND_PURCHASING_ON_BEHALF": {
              "PURCHASING_ON_BEHALF": {
                "PURCHASING_ON_BEHALF_NO": null
              },
              "TYPE_AND_ACTIVITIES": {
                "TYPE_OF_ACTIVITY": {
                  "VALUE": "DEFENCE"
                },
                "TYPE_OF_CONTRACTING_AUTHORITY": {
                  "VALUE": "MINISTRY"
                }
              }
            }
          },
          "CTYPE": "SUPPLIES",
          "LEFTI_PRIOR_INFORMATION": null,
          "OBJECT_WORKS_SUPPLIES_SERVICES_PRIOR_INFORMATION": {
            "ADDITIONAL_INFORMATION": {
              "P": "Konkurransen vil bli utført som en forhandlet prosedyre etter en planlagt kunngjøring ultimo 2015 i henhold til “Forskrift 4. oktober 2013 nr. 1185 om forsvars og sikkerhetsanskaffelser“ basert på Eu direktiv 2009/81/EC fra Europa Parlamentet."
            },
            "CPV": {
              "CPV_ADDITIONAL": [
                { "CPV_CODE": { "CODE": "18900000" } },
                { "CPV_CODE": { "CODE": "18930000" } },
                { "CPV_CODE": { "CODE": "18937000" } },
                { "CPV_CODE": { "CODE": "33000000" } },
                { "CPV_CODE": { "CODE": "33120000" } },
                { "CPV_CODE": { "CODE": "33124000" } },
                { "CPV_CODE": { "CODE": "33140000" } },
                { "CPV_CODE": { "CODE": "33141000" } },
                { "CPV_CODE": { "CODE": "33141100" } },
                { "CPV_CODE": { "CODE": "33141200" } },
                { "CPV_CODE": { "CODE": "33141300" } },
                { "CPV_CODE": { "CODE": "50400000" } }
              ],
              "CPV_MAIN": {
                "CPV_CODE": {
                  "CODE": "33100000"
                }
              }
            },
            "FRAMEWORK_AGREEMENT": {
              "VALUE": "YES"
            },
            "QUANTITY_SCOPE_WORKS_DEFENCE": {
              "COSTS_RANGE_AND_CURRENCY": {
                "CURRENCY": "NOK",
                "RANGE_VALUE_COST": {
                  "HIGH_VALUE": "200000000",
                  "LOW_VALUE": "150000000"
                }
              },
              "F16_DIVISION_INTO_LOTS": {
                "DIV_INTO_LOT_NO": null
              },
              "TOTAL_QUANTITY_OR_SCOPE": {
                "P": "Forsvarets logistikkorganisasjon planlegger å skifte ut Forsvarets prehospitale sanitetssystem. Vi ser derfor etter en systemleverandør som kan levere test moduler, store initielle systemleveranser og ta ansvar for effektiv etterforsyning til Forsvaret på rammeavtaler med inntil syv års varighet."
              }
            },
            "SCHEDULED_DATE_PERIOD": {
              "PERIOD_WORK_DATE_STARTING": {
                "MONTHS": "84"
              }
            },
            "TITLE_CONTRACT": {
              "P": "RFI P9346 -Nytt Prehospital Sanitetssystem til Forsvaret"
            },
            "TYPE_CONTRACT_PLACE_DELIVERY_DEFENCE": {
              "SITE_OR_LOCATION": {
                "LABEL": "N-2055 Nordkisa",
                "NUTS": {
                  "CODE": "NO"
                }
              },
              "TYPE_CONTRACT_PI_DEFENCE": {
                "TYPE_CONTRACT": {
                  "VALUE": "SUPPLIES"
                }
              }
            }
          },
          "OTH_INFO_PRIOR_INFORMATION": {
            "ADDITIONAL_INFORMATION": {
              "P": "Vi ønsker svar både fra Systemleverandører og Underleverandører på denne RFI."
            },
            "INFORMATION_REGULATORY_FRAMEWORK": {
              "TAX_LEGISLATION": {
                "TAX_LEGISLATION_VALUE": "www.lovdata.no"
              }
            },
            "NOTICE_DISPATCH_DATE": {
              "DAY": "28",
              "MONTH": "11",
              "YEAR": "2014"
            },
            "RELATES_TO_EU_PROJECT_NO": null
          }
        },
        "FORM": "16",
        "LG": "NB",
        "VERSION": "R2.0.8.S02"
      }
    },
    "VERSION": "V2.0.0",
    "http://www.w3.org/2001/XMLSchema-instance:noNamespaceSchemaLocation": "DOFFIN_ESENDERS.xd",
    "xmlns": {
      "xsi": "http://www.w3.org/2001/XMLSchema-instance"
    }
  }
}

It looks like your code is correct, assuming the JSON blob at the bottom is a representation of item. Just make sure the data you're passing to .update() includes a valid _key and/or _id attribute.
However, it looks like your update statement is not indented properly and/or out of order. I would put the update inline, when you make the change:
FORM16 = db.collection('FORM16')

for item in FORM16:
    for key, value in recursive_items(item):
        if search_key in list(key):
            item[search_key] = int(item[search_key])
            FORM16.update(item)
        else:
            pass
or in the top-level for loop:
FORM16 = db.collection('FORM16')

for item in FORM16:
    for key, value in recursive_items(item):
        if search_key in list(key):
            item[search_key] = int(item[search_key])
        else:
            pass
    FORM16.update(item)
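Note also that item[search_key] = int(item[search_key]) only ever touches a top-level key, so deeply nested values like DAY, MONTH and YEAR won't be converted either way. A minimal sketch of a recursive in-place conversion instead (assuming python-arango's update() as in your code; the key set is illustrative, not exhaustive):

# Keys whose string values should be cast to int, wherever they appear.
INT_KEYS = {'DAY', 'MONTH', 'YEAR', 'LOW_VALUE', 'HIGH_VALUE'}

def convert_in_place(node):
    """Walk nested dicts/lists and cast matching string values to int."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key in INT_KEYS and isinstance(value, str):
                node[key] = int(value)
            else:
                convert_in_place(value)
    elif isinstance(node, list):
        for element in node:
            convert_in_place(element)

for item in FORM16:
    convert_in_place(item)
    FORM16.update(item)  # update each document while it is still in scope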

I did find a function for converting strings to int and float in JSON files.
def _decode(o):
    # Note: the "unicode" check is only needed on Python 2.
    if isinstance(o, str):
        try:
            return int(o)
        except ValueError:
            try:
                return float(o)
            except ValueError:
                return o
    elif isinstance(o, dict):
        return {k: _decode(v) for k, v in o.items()}
    elif isinstance(o, list):
        return [_decode(v) for v in o]
    else:
        return o
import json
import os

path = 'C:/doffin/test/'

for filename in os.listdir(path):
    if not filename.endswith('.json'):
        continue
    fullname = os.path.join(path, filename)
    with open(fullname, 'rb') as f:
        jsonstr = f.read()
    json_string = json.loads(jsonstr, object_hook=_decode)
    json_str2 = json.dumps(json_string)
    # Overwrite the original file with the converted JSON.
    with open(fullname, 'w') as f:
        f.write(json_str2)
and after that I use arangoimport from the shell. It works better than the API.
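For reference, here is what _decode does to a small, made-up sample (hypothetical values):

import json

sample = '{"DAY": "28", "MONTH": "11", "AMOUNT": "1000.23", "NAME": "ABCD"}'
print(json.loads(sample, object_hook=_decode))
# {'DAY': 28, 'MONTH': 11, 'AMOUNT': 1000.23, 'NAME': 'ABCD'}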

Related

Unable to retrieve ordered job list from Google Transcoder API

I'm using the Node.js client library of the Google Transcoder API. I'm able to retrieve a paginated list of jobs, but I'm not able to order the elements by start date. Here is my code:
const { TranscoderServiceClient } = require('@google-cloud/video-transcoder').v1;

class TranscoderApiController {
    constructor() {
        this.projectId = process.env.GOOGLE_CLOUD_PROJECT;
        this.location = process.env.TASK_LOCATION;
    }

    async getEntries(req, res, next) {
        const params = {
            pageSize: req.query.pageSize ? parseInt(req.query.pageSize) : 10,
            pageToken: req.query.pageToken,
            filter: req.query.filter,
            orderBy: req.query.orderBy
        };
        const client = new TranscoderServiceClient();
        const result = await client.listJobs({
            parent: client.locationPath(this.projectId, this.location),
            pageSize: params.pageSize,
            orderBy: 'createTime.seconds'
        }, {
            autoPaginate: false
        });
        if (result.length == 3 && result[2] != undefined) {
            return result[2];
        } else {
            return result[1];
        }
    }
}

module.exports = new TranscoderApiController();
When I call the getEntries method I receive the following error:
"3 INVALID_ARGUMENT: The request was invalid: sort order \"createTime.seconds\" is unsupported"
If I remove the orderBy: 'createTime.seconds' line, the API works but the results are not ordered as I want. The result is something like this (I have abbreviated the JSON):
{
  "jobs": [
    {
      "labels": {},
      "name": "projects/<id>/locations/europe-west1/jobs/<uuid>",
      "inputUri": "",
      "outputUri": "",
      "state": "SUCCEEDED",
      "createTime": {
        "seconds": "1656602896",
        "nanos": 386772728
      },
      "startTime": {
        "seconds": "1656602900",
        "nanos": 755000000
      },
      "endTime": {
        "seconds": "1656603062",
        "nanos": 428000000
      },
      "ttlAfterCompletionDays": 30,
      "error": null,
      "config": {
        "inputs": [
          {
            "key": "input0",
            "uri": "gs://<url>/render_md.mp4",
            "preprocessingConfig": null
          }
        ],
        "editList": [...],
        "elementaryStreams": [...],
        "muxStreams": [...],
        "manifests": [],
        "adBreaks": [],
        "spriteSheets": [],
        "overlays": [],
        "output": {
          "uri": "gs://<url>/md.mp4/"
        },
        "pubsubDestination": {
          "topic": "projects/<id>/topics/transcoder_api"
        }
      },
      "jobConfig": "config"
    },
    ...
  ],
  "unreachable": [],
  "nextPageToken": "Co8BCjgKDGV1cm9wZS13ZXN0MRIZdHJhbnNjb2Rlci5nb29nbGVhcGlzLmNvbRgBII..."
}
As you can see, each job has the startTime.seconds property. I followed the syntax described here:
https://google.aip.dev/132#ordering
Any help solving this ordering issue is appreciated.

How to perform a filter on a json result using python

I have the JSON response below returned from an API call.
{
  "custAnalysis": [
    {
      "custPermId": "1234",
      "custType": "Business",
      "taxId": "8888",
      "custAddr": {
        "fullName": "Testing LIMITED",
        "addr1": "6734 APAPA RD"
      }
    },
    {
      "custPermId": "5678",
      "custType": "Business",
      "taxId": "9999",
      "custAddr": {
        "fullName": "SUPERMAN LLC",
        "addr1": "6734 APAPA RD"
      }
    },
    {
      "custPermId": "9234",
      "custType": "Business",
      "taxId": "8888",
      "custAddr": {
        "fullName": "DONALD LLC",
        "addr1": "6734 APAPA RD"
      }
    }
  ]
}
I want to be able to search the JSON result above for a taxId of 8888. For every record where taxId = 8888,
I want to return another JSON in the format below with the result:
{
  "custQueryResult": {
    "custPermId": 1234,
    "custPermId": 9234
  }
}
I am very new to python. How can I achieve this in Python?
search_taxId = 8888
search_result = {"custQueryResult": []}

for obj in response_dict["custAnalysis"]:
    if int(obj["taxId"]) == search_taxId:
        search_result["custQueryResult"].append({"custPermId": int(obj["custPermId"])})

print(search_result)
A dictionary can't hold two keys with the same name, so the matching custPermId values are collected in a list under the custQueryResult key.
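Running that snippet against the sample response above prints:

{'custQueryResult': [{'custPermId': 1234}, {'custPermId': 9234}]}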

How to use filter expressions on aws using python3 for nested map attribute?

I have been trying to scan DynamoDB to check for a particular value in a nested map attribute named deliverables. However, using scan with filter expressions results in an empty result.
import boto3

result = []
dynamo_client = boto3.client("dynamodb")
paginator = dynamo_client.get_paginator("scan")
operation_parameters = {
    'FilterExpression': "#Deliverable = :deliverable",
    'ExpressionAttributeNames': {
        '#Deliverable': 'deliverables.fc986523-a666-478e-8303-2a1c3c1dc4ba'
    },
    'ExpressionAttributeValues': {
        ':deliverable': {
            "M": {
                "read": {
                    "BOOL": True
                },
                "upload": {
                    "BOOL": True
                },
                "write": {
                    "BOOL": True
                }
            }
        }
    }
}

for page in paginator.paginate(TableName="TableName", **operation_parameters):
    result.append(page["Items"])

print(result)
The items in the DynamoDB table look like this:
[
  [
    {
      "deliverables": {
        "M": {
          "7397d832-fefb-4ba2-97a1-0f6e73d611d9": {
            "M": {
              "read": { "BOOL": true },
              "upload": { "BOOL": true },
              "write": { "BOOL": true }
            }
          },
          "fc986523-a666-478e-8303-2a1c3c1dc4ba": {
            "M": {
              "read": { "BOOL": true },
              "upload": { "BOOL": true },
              "write": { "BOOL": true }
            }
          }
        }
      },
      "username": { "S": "username1" },
      "deniedReferences": { "L": [] }
    },
    {
      "deliverables": {
        "M": {
          "7397d832-fefb-4ba2-97a1-0f6e73d611d9": {
            "M": {
              "read": { "BOOL": true },
              "upload": { "BOOL": false },
              "write": { "BOOL": false }
            }
          },
          "fc986523-a666-478e-8303-2a1c3c1dc4ba": {
            "M": {
              "read": { "BOOL": true },
              "upload": { "BOOL": false },
              "write": { "BOOL": false }
            }
          }
        }
      },
      "username": { "S": "repositoryadmin" },
      "deniedReferences": { "L": [] }
    }
  ]
]
Please let me know if you can help me solve this issue.
The problem is the dot here: 'ExpressionAttributeNames': { '#Deliverable': 'deliverables.fc986523-a666-478e-8303-2a1c3c1dc4ba' }
From the expressions docs: DynamoDB interprets a dot in an expression attribute name as a character within an attribute's name.
operation_parameters = {
    "FilterExpression": "#D0.#D1 = :deliverable",  # the dot goes here!
    "ExpressionAttributeNames": {
        "#D0": "deliverables",
        "#D1": "fc986523-a666-478e-8303-2a1c3c1dc4ba"
    },
    # ExpressionAttributeValues stays exactly as in the question.
    "ExpressionAttributeValues": {
        ":deliverable": {
            "M": {
                "read": {"BOOL": True},
                "upload": {"BOOL": True},
                "write": {"BOOL": True}
            }
        }
    }
}

elasticsearch node.js API remove an object from an array on a document using painless script results in array Index Out of Bounds

I want to remove items (objects) from an array on a document in Elasticsearch. However, whenever I try to run my update script using Painless, I receive an array index out of bounds exception.
I'm using the JavaScript elasticsearch npm package to search Elasticsearch for the relevant documents, which returns data like:
"_index": "centres",
"_type": "doc",
"_id": "51bc77d1-b514-4f4e-85fa-412def6829f5",
"_score": 1,
"_source": {
"id": "cbaa7daa-f1a2-4ac3-8d7c-fc981245d21c",
"name": "Five House",
"openDays": [
{
"title": "new open Day",
"endDate": "2022-03-22T00:00:00.000Z",
"id": "82be934b-eeb1-419c-96ed-a58808b30df7"
},
{
"title": "last open Day",
"endDate": "2020-12-24T00:00:00.000Z",
"id": "8cc339b9-d2f8-4252-b68a-ed0a49cbfabd"
}
]
}
I then want to go through and remove certain items from the openDays array. I've created an array of the items I want to remove, so for the above example:
[
  {
    id: '51bc77d1-b514-4f4e-85fa-412def6829f5',
    indexes: [
      {
        "title": "last open Day",
        "endDate": "2020-12-24T00:00:00.000Z",
        "id": "8cc339b9-d2f8-4252-b68a-ed0a49cbfabd"
      }
    ]
  }
]
I'm then trying to run an update via the elasticsearch node client like this:
for (const centre of updates) {
    if (centre.indexes.length) {
        await Promise.all(centre.indexes.map(async (theIndex) => {
            const updated = await client.update({
                index: 'centres',
                type: 'doc',
                id: centre.id,
                body: {
                    script: {
                        lang: 'painless',
                        source: "ctx._source.openDays.remove(ctx._source.openDays.indexOf('openDayID'))",
                        params: {
                            "openDayID": theIndex.id
                        }
                    }
                }
            }).catch((err) => { throw err; });
        }))
        .catch((err) => { throw err; });
        await client.indices.refresh({ index: 'centres' }).catch((err) => { throw err; });
    }
}
When I run this though, it returns a 400 with an "array_index_out_of_bounds_exception" error:
-> POST http://localhost:9200/centres/doc/51bc77d1-b514-4f4e-85fa-412def6829f5/_update
{
  "script": {
    "lang": "painless",
    "source": "ctx._source.openDays.remove(ctx._source.openDays.indexOf(\u0027openDayID\u0027))",
    "params": {
      "openDayID": "8cc339b9-d2f8-4252-b68a-ed0a49cbfabd"
    }
  }
}
<- 400
{
  "error": {
    "root_cause": [
      {
        "type": "remote_transport_exception",
        "reason": "[oSsa7mn][172.17.0.2:9300][indices:data/write/update[s]]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "failed to execute script",
    "caused_by": {
      "type": "script_exception",
      "reason": "runtime error",
      "script_stack": [],
      "script": "ctx._source.openDays.remove(ctx._source.openDays.indexOf(\u0027openDayID\u0027))",
      "lang": "painless",
      "caused_by": {
        "type": "array_index_out_of_bounds_exception",
        "reason": null
      }
    }
  },
  "status": 400
}
I'm not quite sure where I'm going wrong with this. Am I using the indexOf painless script correctly? Does indexOf allow for the searching of properties on objects in arrays?
I stumbled across this question and answer: Elasticsearch: Get object index with Painless script
The body of the update script needs changing like so:
Promise.all(...
    const inline = `
        def openDayID = '${theIndex.id}';
        def openDays = ctx._source.openDays;
        def openDayIndex = -1;
        for (int i = 0; i < openDays.length; i++)
        {
            if (openDays[i].id == openDayID)
            {
                openDayIndex = i;
            }
        }
        if (openDayIndex != -1) {
            ctx._source.openDays.remove(openDayIndex);
        }
    `;
    const updated = await client.update({
        index: 'centres',
        type: 'doc',
        id: centre.id,
        body: {
            script: {
                lang: 'painless',
                inline: inline,
            },
        }
    }).catch((err) => { throw err; });
    await client.indices.refresh({ index: 'centres' }).catch((err) => { throw err; });
})).catch(... // end of Promise.all
I am not au fait with Painless scripting, so there are most likely better ways of writing this, e.g. breaking out of the loop once the index of the ID is found.
I have also had to move the refresh statement into the Promise.all since if you're trying to remove more than one item from the array of objects, you'll be changing the document and changing the index. There is probably a better way of dealing with this too.
'openDayID' in the original script is a string literal; it should be params.openDayID.
And use removeIf:
"ctx._source.openDays.removeIf(el -> (el.id == params.openDayID))"

Groovy: Convert Json to Text

I would like to convert the JSON record below into text using Groovy:
import groovy.json.*

def js = """{
    "title": {
        "titleid": "222",
        "titlename": "ABCD",
        "titledesc": null
    },
    "customer": {
        "customerDetail": {
            "customerid": 878378743,
            "customerstatus": "ACTIVE",
            "customersystems": {
                "customersystem1": "SYS01",
                "customersystem2": null
            },
            "sysid": null
        },
        "store": {
            "storeid": "LOS002",
            "storename": "LAStore",
            "areacode": "JDHJ8K988"
        },
        "persons": {
            "person1": {
                "personid": "123",
                "personname": "IIISKDJKJSD"
            },
            "person2": {
                "personid": "456",
                "personname": "IUDFIDIKJK"
            }
        },
        "order": {
            "orderdetail": {
                "orderid": "4291026",
                "ordername": "ORD93999"
            }
        },
        "product": {
            "orderdate": "20190101",
            "currency": "USD",
            "amount": 1000.23
        }
    }
}
"""
def data = new JsonSlurper().parseText(js)
The expected output should be as below, with proper header names:
customerId,customerstatus,customersystem1,sysid,storeid,storename,person1.personid,person1.personname,orderid,orderdate,currency,amount,titlename
878378743,ACTIVE,SYS01,null,LOS002,LAStore,123,IIISKDJKJSD,4291026,20190101,USD,1000.23
This is just a single JSON record, so how would I convert all my JSON records using Groovy?
The following code:
import groovy.json.*

def js = """
[
    {
        "title": {
            "titleid": "222",
            "titlename": "ABCD",
            "titledesc": null
        },
        "customer": {
            "customerDetail": {
                "customerid": 878378743,
                "customerstatus": "ACTIVE",
                "customersystems": {
                    "customersystem1": "SYS01",
                    "customersystem2": null
                },
                "sysid": null
            },
            "store": {
                "storeid": "LOS002",
                "storename": "LAStore",
                "areacode": "JDHJ8K988"
            },
            "persons": {
                "person1": {
                    "personid": "123",
                    "personname": "IIISKDJKJSD"
                },
                "person2": {
                    "personid": "456",
                    "personname": "IUDFIDIKJK"
                }
            },
            "order": {
                "orderdetail": {
                    "orderid": "4291026",
                    "ordername": "ORD93999"
                }
            },
            "product": {
                "orderdate": "20190101",
                "currency": "USD",
                "amount": 1000.23
            }
        }
    }
]
"""

/*
customerId,customerstatus,customersystem1,sysid,storeid,storename,person1.personid,person1.personname,orderid,orderdate,currency,amount,titlename
878378743,ACTIVE,SYS01,null,LOS002,LAStore,123,IIISKDJKJSD,4291026,20190101,USD,1000.23
*/

def data = new JsonSlurper().parseText(js)

def mappings = [
    customerId:           { n -> n.customer.customerDetail.customerid },
    customerstatus:       { n -> n.customer.customerDetail.customerstatus },
    customersystem1:      { n -> n.customer.customerDetail.customersystems.customersystem1 },
    sysid:                { n -> n.customer.customerDetail.sysid },
    storeid:              { n -> n.customer.store.storeid },
    storename:            { n -> n.customer.store.storename },
    'person1.personid':   { n -> n.customer.persons.person1.personid },
    'person1.personname': { n -> n.customer.persons.person1.personname },
    orderid:              { n -> n.customer.order.orderdetail.orderid },
    orderdate:            { n -> n.customer.product.orderdate },
    currency:             { n -> n.customer.product.currency },
    amount:               { n -> n.customer.product.amount },
    titlename:            { n -> n.title.titlename }
]

def headers = mappings.keySet().join(',') // edited thanks to comment
println headers

data.each { item ->
    def row = mappings.collect { k, v -> v(item) }.join(',')
    println row
}
does what you ask for. Note that I made the JSON a list of items instead of a single item, since it seemed from your text that that is what you were after.
Running the above code produces:
~> groovy solution.groovy
customerId,customerstatus,customersystem1,sysid,storeid,storename,person1.personid,person1.personname,orderid,orderdate,currency,amount,titlename
878378743,ACTIVE,SYS01,null,LOS002,LAStore,123,IIISKDJKJSD,4291026,20190101,USD,1000.23,ABCD
~>
Note that if this is going into some critical system and is not just a one-off, ad hoc piece of code, you should probably do things like check the return value of v(item) and log an error or otherwise handle the case where there is no value for a certain path in the JSON.
It should also be noted that the above code relies on the fact that a map literal in Groovy (i.e. def mappings = [:]) creates an instance of Java's LinkedHashMap, which has a predictable iteration order for things like keySet() and collect { }.
<< edit >>
For a single-item JSON blob, you would change the code as follows:
def js = """
{
...
}
"""
def item = new JsonSlurper().parseText(js)
def mappings = ...
def headers = mappings.keySet().join(',') //edited thanks to comment
println headers
def row = mappings.collect { k, v -> v(item) }.join(',')
println row
where ... denotes that the block is unchanged from the example above.
