How to Parse Nested JSON with Python? - python-3.x

I am trying to parse this nested JSON file and I am having trouble getting every element I need.
Here is the json example:
{
  "sensor-time" : {
    "timezone" : "New_York",
    "time" : "2020-07-15T12:45:02-04:00"
  },
  "status" : {
    "code" : "OK"
  },
  "content" : {
    "element" : [ {
      "element-id" : 0,
      "element-name" : "Line 0",
      "sensor-type" : "SINGLE_SENSOR",
      "data-type" : "LINE",
      "from" : "2020-07-15T12:30:00-04:00",
      "to" : "2020-07-15T12:45:00-04:00",
      "resolution" : "FIVE_MINUTES",
      "measurement" : [ {
        "from" : "2020-07-15T12:30:00-04:00",
        "to" : "2020-07-15T12:35:00-04:00",
        "value" : [ {
          "value" : 1,
          "label" : "fw"
        }, {
          "value" : 2,
          "label" : "bw"
        } ]
      }, {
        "from" : "2020-07-15T12:35:00-04:00",
        "to" : "2020-07-15T12:40:00-04:00",
        "value" : [ {
          "value" : 3,
          "label" : "fw"
        }, {
          "value" : 4,
          "label" : "bw"
        } ]
      }, {
        "from" : "2020-07-15T12:40:00-04:00",
        "to" : "2020-07-15T12:45:00-04:00",
        "value" : [ {
          "value" : 5,
          "label" : "fw"
        }, {
          "value" : 6,
          "label" : "bw"
        } ]
      } ]
    }, {
      "element-id" : 1,
      "element-name" : "Test Line",
      "sensor-type" : "SINGLE_SENSOR",
      "data-type" : "LINE",
      "from" : "2020-07-15T12:30:00-04:00",
      "to" : "2020-07-15T12:45:00-04:00",
      "resolution" : "FIVE_MINUTES",
      "measurement" : [ {
        "from" : "2020-07-15T12:30:00-04:00",
        "to" : "2020-07-15T12:35:00-04:00",
        "value" : [ {
          "value" : 7,
          "label" : "fw"
        }, {
          "value" : 8,
          "label" : "bw"
        } ]
      }, {
        "from" : "2020-07-15T12:35:00-04:00",
        "to" : "2020-07-15T12:40:00-04:00",
        "value" : [ {
          "value" : 9,
          "label" : "fw"
        }, {
          "value" : 10,
          "label" : "bw"
        } ]
      }, {
        "from" : "2020-07-15T12:40:00-04:00",
        "to" : "2020-07-15T12:45:00-04:00",
        "value" : [ {
          "value" : 11,
          "label" : "fw"
        }, {
          "value" : 12,
          "label" : "bw"
        } ]
      } ]
    } ]
  },
  "sensor-info" : {
    "serial-number" : "D7:40:1:7F:4A:72",
    "ip-address" : "192.168.130.44",
    "name" : "DemoNew",
    "group" : "Internal Test Devices",
    "device-type" : "PC2"
  }
}
What I am trying to get is measurement data for each element name. Please see the example below:
Here is what I tried:
from datetime import datetime
from time import mktime

data = {} # element-name ↦ direction ↦ {readings ↦ (timestamp × value) list, meta ↦ name ↦ value}
for element in json_data['content']['element']:
    element_name = element['element-name']
    element_data = {}
    # collect
    for measurement in element['measurement']:
        dt = datetime.strptime(measurement['to'][:-3]+'00', '%Y-%m-%dT%H:%M:%S%z')
        t = mktime(dt.timetuple())
        for pair in measurement['value']:
            direction = pair['label']
            value = pair['value']
            if direction not in element_data: element_data[direction] = []
            element_data[direction].append( (t, value) )
    # insert
    metadata = {}
    for key in element:
        if key not in ['measurement', 'from', 'to']:
            metadata[key] = element[key]
    data[element_name] = {}
    for direction in element_data:
        data[element_name][direction] = {'readings': element_data[direction], 'meta': metadata}

camera_metadata = {}
for key in json_data:
    if key not in ['content']:
        camera_metadata[key] = json_data[key]
And here is what I get as a result:
{'Line 0': {'fw': {'readings': [(1594830900.0, 1),
(1594831200.0, 3),
(1594831500.0, 5)],
'meta': {'element-id': 0,
'element-name': 'Line 0',
'sensor-type': 'SINGLE_SENSOR',
'data-type': 'LINE',
'resolution': 'FIVE_MINUTES'}},
'bw': {'readings': [(1594830900.0, 2), (1594831200.0, 4), (1594831500.0, 6)],
'meta': {'element-id': 0,
'element-name': 'Line 0',
'sensor-type': 'SINGLE_SENSOR',
'data-type': 'LINE',
'resolution': 'FIVE_MINUTES'}}},
'GP Test CL.01': {'fw': {'readings': [(1594830900.0, 7),
(1594831200.0, 9),
(1594831500.0, 11)],
'meta': {'element-id': 1,
'element-name': 'GP Test CL.01',
'sensor-type': 'SINGLE_SENSOR',
'data-type': 'LINE',
'resolution': 'FIVE_MINUTES'}},
'bw': {'readings': [(1594830900.0, 8),
(1594831200.0, 10),
(1594831500.0, 12)],
'meta': {'element-id': 1,
'element-name': 'GP Test CL.01',
'sensor-type': 'SINGLE_SENSOR',
'data-type': 'LINE',
'resolution': 'FIVE_MINUTES'}}}}
What do I need to adjust to get the result to look like the screenshot example above?

You were trying to get the information one piece at a time, but to parse your JSON into a DataFrame you need to do it all in one nested loop.
import pandas as pd

result = []
for element in json_data['content']['element']:
    for m in element['measurement']:
        data = {}
        for val in m['value']:
            data['SERIAL_NUMBER'] = json_data['sensor-info']['serial-number']
            data['IP'] = json_data['sensor-info']['ip-address']
            data['name'] = json_data['sensor-info']['name']
            data['Group'] = json_data['sensor-info']['group']
            data['Device Type'] = json_data['sensor-info']['device-type']
            data['element-id'] = element['element-id']
            data['Line name'] = element['element-name']
            data['From time'] = m['from']
            data['to time'] = m['to']
            data[val['label']] = val['value']
        result.append(data)  # one row per measurement window, with fw/bw as columns
df = pd.DataFrame(result)
Output:
SERIAL_NUMBER IP name Group \
0 D7:40:1:7F:4A:72 192.168.130.44 DemoNew Internal Test Devices
1 D7:40:1:7F:4A:72 192.168.130.44 DemoNew Internal Test Devices
2 D7:40:1:7F:4A:72 192.168.130.44 DemoNew Internal Test Devices
3 D7:40:1:7F:4A:72 192.168.130.44 DemoNew Internal Test Devices
4 D7:40:1:7F:4A:72 192.168.130.44 DemoNew Internal Test Devices
5 D7:40:1:7F:4A:72 192.168.130.44 DemoNew Internal Test Devices
Device Type element-id Line name From time \
0 PC2 0 Line 0 2020-07-15T12:30:00-04:00
1 PC2 0 Line 0 2020-07-15T12:35:00-04:00
2 PC2 0 Line 0 2020-07-15T12:40:00-04:00
3 PC2 1 Test Line 2020-07-15T12:30:00-04:00
4 PC2 1 Test Line 2020-07-15T12:35:00-04:00
5 PC2 1 Test Line 2020-07-15T12:40:00-04:00
to time fw bw
0 2020-07-15T12:35:00-04:00 1 2
1 2020-07-15T12:40:00-04:00 3 4
2 2020-07-15T12:45:00-04:00 5 6
3 2020-07-15T12:35:00-04:00 7 8
4 2020-07-15T12:40:00-04:00 9 10
5 2020-07-15T12:45:00-04:00 11 12
As you can see, I didn't figure out your time format. Also, I think you switched "Group" and "Device Type".
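For the timestamps, pandas can parse the ISO 8601 strings with their "-04:00" offset directly. A minimal sketch, assuming the payload from the question is saved locally as sensor.json (the filename is just an assumption):

import json

import pandas as pd

# Assumption: the JSON shown in the question is stored in sensor.json
with open("sensor.json") as f:
    json_data = json.load(f)

# ... build `result` with the nested loop above, then:
df = pd.DataFrame(result)

# pandas understands "2020-07-15T12:30:00-04:00" out of the box
df["From time"] = pd.to_datetime(df["From time"])
df["to time"] = pd.to_datetime(df["to time"])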

Related

Parse and modify JSON

I have a JSON with the following structure and data:
[ {
  "id" : 716612,
  "type" : "ad",
  "stats" : [ {
    "day" : "2020-06-01",
    "impressions" : 1956,
    "clicks" : 1,
    "reach" : 1782
  },
  {
    "day" : "2020-06-13",
    "spent" : "73.32",
    "reach" : 1059
  } ]
}, {
  "id" : 414290,
  "type" : "campaign",
  "stats" : [ {
    "day" : "2020-05-21",
    "effective_cost_per_click" : "31.200",
    "effective_cost_per_mille" : "108.337"
  },
  {
    "day" : "2020-05-17",
    "impressions" : 1,
    "reach" : 1,
    "ctr" : "0.000",
    "uniq_views_count" : 1
  } ]
} ]
I need to map the id and type from the top level onto the data inside stats, to get a result like this:
[ {
  "id" : 716612,
  "type" : "ad",
  "day" : "2020-06-01",
  "impressions" : 1956,
  "clicks" : 1,
  "reach" : 1782
},
{
  "id" : 716612,
  "type" : "ad",
  "day" : "2020-06-13",
  "spent" : "73.32",
  "reach" : 1059
},
...
I tried with:
def json = new JsonSlurper().parseText(text)
def result = json.collectMany{ a ->
    a["stats"].collectMany{ b ->
        b.collect{
            [id: a.id,
             type: a.type
            ]
        }
    }
}
But it returns only the id and type fields, without the stats. I thought I was looping through each stat and just adding the needed fields from above. I guess I don't understand the difference between collectMany and collect?
You were close 😁
You want to collect the stat plus the id and type, so you need:
def result = json.collectMany { a ->
    a.stats.collect { b ->
        [ id: a.id, type: a.type ] + b
    }
}
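For intuition: collect maps one element to one result (like a plain list comprehension), while collectMany maps and then flattens (like a nested comprehension). Since the main question on this page is about Python, here is a rough Python equivalent of the accepted Groovy answer, assuming the JSON text is held in a variable named text:

import json

records = json.loads(text)

# collectMany-style: one merged dict per stat, flattened across all records
result = [
    {"id": rec["id"], "type": rec["type"], **stat}
    for rec in records
    for stat in rec["stats"]
]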

Sort JSON document by values embedded in an array of objects

I have a document in the format below. The goal is to group the document by student name and sort it by rank in ascending order. Once that is done, iterate through the ranks (within a student) and, if each subsequent rank is greater than the previous one, the version field needs to be incremented. As part of a pipeline, student_name will be passed to me, so matching by student name should be fine instead of grouping.
NOTE: I tried it with Python and it works to some extent. A Python solution would also be great!
{
    "_id" : ObjectId("5d389c7907bf860f5cd11220"),
    "class" : "I",
    "students" : [
        {
            "student_name" : "AAA",
            "Version" : 2,
            "scores" : [
                {
                    "value" : "50",
                    "rank" : 2
                },
                {
                    "value" : "70",
                    "rank" : 1
                }
            ]
        },
        {
            "student_name" : "BBB",
            "Version" : 5,
            "scores" : [
                {
                    "value" : 80,
                    "rank" : 2
                },
                {
                    "value" : 100,
                    "rank" : 1
                },
                {
                    "value" : 100,
                    "rank" : 1
                }
            ]
        }
    ]
}
Here is the piece of code I tried for sorting:
def version(student_name):
    db.column.aggregate(
        [
            {"$unwind": "$students"},
            {"$unwind": "$students.scores"},
            {"$sort" : {"students.scores.rank" : 1}},
            {"$group" : {"students.student_name}
        ]
    )
    for i in range(0, (len(students.scores) - 1)):
        if students.scores[i].rank < students.scores[i+1].rank:
            tag.update_many(
                {"$inc" : {"students.Version": 1}}
            )
The expected output for student AAA should be
{
    "_id" : ObjectId("5d389c7907bf860f5cd11220"),
    "class" : "I",
    "students" : [
        {
            "student_name" : "AAA",
            "Version" : 3, # version incremented
            "scores" : [
                {
                    "value" : "70",
                    "rank" : 1
                },
                {
                    "value" : "50",
                    "rank" : 2
                }
            ]
        }
I was able to sort the document.
pipeline = [
    {"$unwind": "$properties"},
    {"$unwind": "$properties.values"},
    {"$sort" : {"$properties.values.rank" : -1}},
    {"$group": {"_id" : "$properties.property_name", "values" : {"$push" : "$properties.values"}}}
]

import pprint
pprint.pprint(list(db.column.aggregate(pipeline)))
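Since the question says a Python solution would also be welcome, here is a plain in-memory sketch (not an aggregation pipeline) of the sort-and-version step. It assumes the document has already been fetched into a dict, and it bumps Version only when the scores were not already in ascending rank order, which matches the expected output for student AAA:

def sort_and_version(doc, student_name):
    """Sort one student's scores by rank; bump Version if they were out of order."""
    for student in doc["students"]:
        if student["student_name"] != student_name:
            continue
        scores = student["scores"]
        out_of_order = any(
            scores[i]["rank"] > scores[i + 1]["rank"]
            for i in range(len(scores) - 1)
        )
        student["scores"] = sorted(scores, key=lambda s: s["rank"])
        if out_of_order:
            student["Version"] += 1
    return doc

The modified document can then be written back with a normal update.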

Compare two Collections in MongoDB and show the differences

I'm trying to compare two collections in MongoDB. I have Collection A and Collection B, and I only want to show the differences. How is this done? I thought it could be done with the aggregation framework, but I did not get the expected values. I just want to see which documents in Collection A are not the same as in Collection B.
Collection: A
{
    "_id" : ObjectId("x"),
    "p" : [
        {
            "t" : 1,
            "p" : 123
        },
        {
            "t" : 2,
            "p" : 123
        }
    ]
},
{
    "_id" : ObjectId("y"),
    "p" : [
        {
            "t" : 1,
            "p" : 234
        },
        {
            "t" : 2,
            "p" : 234
        }
    ]
}
Collection: B
{
    "_id" : ObjectId("x"),
    "p" : [
        {
            "t" : 1,
            "p" : 123
        },
        {
            "t" : 2,
            "p" : 538458 // OTHER VALUE HERE
        }
    ]
},
{
    "_id" : ObjectId("y"),
    "p" : [
        {
            "t" : 1,
            "p" : 234
        },
        {
            "t" : 2,
            "p" : 234
        }
    ]
}
You could export each collection using mongoexport; this creates a file with all the documents. Make sure you omit the _id field (documents may be identical but will have different ids):
mongoexport --db db_name --collection collection_name | sed '/"_id":/s/"_id":[^,]*,//' > file_name.json
Then you can compare the two files using diff.
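If you would rather stay in Python, a pymongo sketch along the same lines is shown below. It matches documents by _id (fine for the example above, where both collections use the same ids); the database and collection names are placeholders:

from pymongo import MongoClient

db = MongoClient()["db_name"]        # placeholder database name
coll_a, coll_b = db["A"], db["B"]    # placeholder collection names

for doc_a in coll_a.find():
    doc_b = coll_b.find_one({"_id": doc_a["_id"]})
    if doc_b != doc_a:
        print("differs or missing in B:", doc_a["_id"])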

How to merge multiple fields in a collection?

Example entry:
{ "_id" : "00-01#mail.ru", " pass" : 123654, "field2" : 235689, "field3" : "cccp123654", "field4" : "lhfrjy" }
Desired result:
{ "_id" : "00-01#mail.ru", " pass" : 123654, 235689, "cccp123654", "lhfrjy" }
I want to have two final fields (_id and pass).
I have attempted the following:
db.emails.aggregate([
    { "$project": {
        "pass": { "$setUnion": [ "$field2", "$field3" ] }
    }}
])
However, this results in the following error:
2018-01-22T03:01:26.074+0000 E QUERY [thread1] Error: command failed: {
"ok" : 0,
"errmsg" : "All operands of $setUnion must be arrays. One argument is of type: string",
"code" : 17043,
"codeName" : "Location17043"
} : aggregate failed :
_getErrorWithCode#src/mongo/shell/utils.js:25:13
doassert#src/mongo/shell/assert.js:16:14
assert.commandWorked#src/mongo/shell/assert.js:370:5
DBCollection.prototype.aggregate#src/mongo/shell/collection.js:1319:5
#(shell):1:1
Can someone assist?
We can convert the document with $objectToArray and take a $slice of the resulting array after the first element:
> db.io.aggregate(
    [
        {$addFields : {arr : {$objectToArray : "$$ROOT"}}},
        {$project : { pass : {$slice : ["$arr.v", 1, 20 ] }}}
    ]
).pretty()
result
{
    "_id" : "00-01#mail.ru",
    "pass" : [
        123654,
        235689,
        "cccp123654",
        "lhfrjy"
    ]
}
>
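The same pipeline can be run from Python with pymongo; a sketch assuming the collection is called io as in the shell session above (the database name is a placeholder):

from pymongo import MongoClient

coll = MongoClient()["db_name"]["io"]  # database name is a placeholder

pipeline = [
    {"$addFields": {"arr": {"$objectToArray": "$$ROOT"}}},
    {"$project": {"pass": {"$slice": ["$arr.v", 1, 20]}}},
]
for doc in coll.aggregate(pipeline):
    print(doc)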

What should be the Mongo query for this?

Below is a document from my collection of over 20,000,000 documents.
I need to find documents by a particular ZIP; out of these documents I need to select one record per postal address (ADDR, CITY, STATE, ZIP, APT) that has an age value of 18 or higher.
The results also need to be limited to a number entered by the end user.
{
"_id" : ObjectId("55e86e98f493590878bb45d7"),
"RecordID" : 84096380,
"FN" : "Michael",
"MI" : "",
"LN" : "Horn",
"NAME_PRE" : "MR",
"ADDR" : "160 Yankee Camp Rd",
"CITY" : "Telford",
"ST" : "TN",
"ZIP" : 37690,
"APT" : "",
"Z4" : 2200,
"DPC" : 605,
"CAR_RTE" : "R001",
"WALK_SEQ" : 228,
"LOT" : "0136A",
"FIPS_ST" : 47,
"FIPS_CTY" : 179,
"LATITUDE" : 36.292787,
"LONGITUDE" : -82.568171,
"ADDR_TYP" : 1,
"MSA" : 3660,
"CBSA" : 27740,
"ADDR_LINE" : 3,
"DMA_SUPPR" : "",
"GEO_MATCH" : 1,
"CENS_TRACT" : 61900,
"CENS_BLK_GRP" : 1,
"CENS_BLK" : 17,
"CENS_MED_HOME_VALUE" : 953,
"CENS_MED_HH_INCOME" : 304,
"CRA" : "",
"Z4_TYP" : "S",
"DSF_IND" : 1,
"DPD_IND" : "N",
"PHONE_FLAG" : "Y",
"PHONE" : NumberLong("4237730233"),
"TIME_ZN" : "E",
"GENDER" : "M",
"NEW_TO_BLD" : "",
"SOURCES" : 19,
"BASE_VER_DT" : 20101,
"COMP_ID" : NumberLong("3769001836"),
"IND_ID" : 1,
"INF_HH_RANK" : 1,
"HOME_OWNR_SRC" : "V",
"DOB_YR" : 1975,
"DOB_MON" : 7,
"DOB_DAY" : 10,
"EXACT_AGE" : 39,
"AGE" : 39,
"HH_INCOME" : "D"
}
If you are using Mongoose, you can chain the operations with the dot (.) operator. Since everything you need is a condition, here is an example:
Person.
    find({
        ZIP: "37690",
        ADDR : "",
        STATE : "", // and so on
        AGE: { $gt: 18 }
    }).
    limit(10).
    exec(callback);
more info - http://mongoosejs.com/docs/queries.html
You need to use an aggregation operation.
var pipeline = [
    {
        $match: {ZIP: 37690, AGE: {$gt: 18}}
    }, {
        $group: {
            _id: {ADDR: '$ADDR', CITY: '$CITY', STATE: '$STATE', ZIP: '$ZIP', APT: '$APT'},
            PHONE: {$first: '$PHONE'}
        }
    },
    {$limit: 10}
];
db.mycoll.aggregate(pipeline)
Enhance the above to project whatever fields you require in the results.
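The equivalent call from Python with pymongo, as a sketch (the collection name mycoll is taken from the snippet above; the database name is a placeholder):

from pymongo import MongoClient

coll = MongoClient()["db_name"]["mycoll"]  # database name is a placeholder

pipeline = [
    {"$match": {"ZIP": 37690, "AGE": {"$gt": 18}}},
    {"$group": {
        "_id": {"ADDR": "$ADDR", "CITY": "$CITY", "STATE": "$STATE",
                "ZIP": "$ZIP", "APT": "$APT"},
        "PHONE": {"$first": "$PHONE"},
    }},
    {"$limit": 10},
]
for doc in coll.aggregate(pipeline):
    print(doc)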
I think this query will solve your problem.
Person.find({
    ZIP: "37690",
    AGE: { $gt: 18 }
}).
    limit(50).
    exec(callback);
