JSON Extract to dataframe using python - python-3.x

I have a JSON file and the structure of the file is as below
[json file with the structure][1]
I am trying to get all the details into a dataframe or tabular form. Tried using json_normalize and could not get the actual result.
{
"body": [{
"_id": {
"s": 0,
"i": "5ea6c8ee24826b48cc560e1c"
},
"fdfdsfdsf": "V2_1_0",
"dsd": "INDIA-",
"sdsd": "df-as-3e-ds",
"dsd": 123,
"dsds": [{
"dsd": "s_10",
"dsds": [{
"dsdsd": "OFFICIAL",
"dssd": {
"dsds": {
"sdsd": "IND",
"dsads": 0.0
}
},
"sadsad": [{
"fdsd": "ABC",
"dds": {
"dsd": "INR",
"dfdsfd": -1825.717444
},
"dsss": [{
"id": "A:B",
"dsdsd": "A.B"
}
]
}, {
"name": "dssadsa",
"sadds": {
"sdsads": "INR",
"dsadsad": 180.831415
},
"xcs": "L:M",
"sds": "L.M"
}
]
}
]
}
]
}
]
}

This structure is far too nested to put directly into a dataframe. First, you'll need to use the ol' flatten_json function. This function isn't in a library (to my knowledge), but you see it around a lot. Save it somewhere.
def flatten_json(nested_json):
    """
    Flatten a nested json object into a single-level dict.

    Nested keys are joined with '_' and list elements contribute their
    index, e.g. {"a": [{"b": 1}]} -> {"a_0_b": 1}.

    Args:
        nested_json: A nested json object (dict, list, or scalar).
    Returns:
        A flat dict mapping joined key paths to scalar values.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance is the idiomatic type check and also accepts
        # dict/list subclasses (e.g. OrderedDict), unlike `type(x) is dict`.
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            # enumerate replaces the original's manual `i = 0; i += 1` counter.
            for i, item in enumerate(x):
                flatten(item, name + str(i) + '_')
        else:
            out[name[:-1]] = x  # [:-1] drops the trailing '_' separator

    flatten(nested_json)
    return out
Applying it to your data:
import json
import pandas as pd  # needed for json_normalize below; missing in the original

# The mode must be the string 'r' — the original `open(..., r)` raises a
# NameError because bare `r` is an undefined name.
with open('deeply_nested.json', 'r') as f:
    flattened_json = flatten_json(json.load(f))

df = pd.json_normalize(flattened_json)
df.columns
Index(['body_0__id_s', 'body_0__id_i', 'body_0_schemaVersion',
'body_0_snapUUID', 'body_0_jobUUID', 'body_0_riskSourceID',
'body_0_scenarioSets_0_scenario',
'body_0_scenarioSets_0_modelSet_0_modelPolicyLabel',
'body_0_scenarioSets_0_modelSet_0_valuation_pv_unit',
'body_0_scenarioSets_0_modelSet_0_valuation_pv_value',
'body_0_scenarioSets_0_modelSet_0_measures_0_name',
'body_0_scenarioSets_0_modelSet_0_measures_0_value_unit',
'body_0_scenarioSets_0_modelSet_0_measures_0_value_value',
'body_0_scenarioSets_0_modelSet_0_measures_0_riskFactors_0_id',
'body_0_scenarioSets_0_modelSet_0_measures_0_riskFactors_0_underlyingRef',
'body_0_scenarioSets_0_modelSet_0_measures_1_name',
'body_0_scenarioSets_0_modelSet_0_measures_1_value_unit',
'body_0_scenarioSets_0_modelSet_0_measures_1_value_value',
'body_0_scenarioSets_0_modelSet_0_measures_1_riskFactors',
'body_0_scenarioSets_0_modelSet_0_measures_1_underlyingRef'],
dtype='object')

Related

Remove empty Keys from JSON arrays using Groovy

I would like to remove the array SEO from the JSON when the keys "Description" and "Title" have no value.
json:
[
{
"SEO": [
{
"Description": "",
"Title": ""
}
],
"accesoires": [
"1167296"
],
"shortCode": "S-576",
"spareParts": [
"800236"
]
}]
I tried the code below but I'm not able to remove the array.
// CPI-style message handler: tries to strip the "SEO" array from each entry
// when all of its field values are empty.  This is the asker's non-working
// attempt — see the review note below for why it fails.
def Message processData(Message message) {
    def body = message.getBody(String);
    def json = new JsonSlurper().parseText(body)   // top-level JSON is a List of maps
    json.each{
        it.SEO.each{
            // findResults keeps only non-null closure results; an empty
            // result list is falsy, i.e. every value was blank.
            if(!(it.findResults{k, v -> v?.size() > 0 && v[0]?.length() > 0 ? v[0] : null })){
                // NOTE(review): `json` here is the parsed List, so
                // remove("SEO") tries to remove the *element* "SEO" from the
                // list (a no-op) rather than the key from the current map —
                // likely why the array is never removed.
                json.remove("SEO")
            } } }
    def out= JsonOutput.toJson(json)
    message.setBody(out)
    return message}
To remove the array "SEO" from the JSON when the keys "Description" and "Title" have no value, you can use the following Groovy code:
// Parse the sample payload, drop the "SEO" array wherever both of its
// fields are blank, then print the cleaned JSON.
def jsonString = '[{"SEO": [{"Description": "", "Title": ""}], "accesoires": ["1167296"], "shortCode": "S-576", "spareParts": ["800236"]}]'
def json = new JsonSlurper().parseText(jsonString)
json.each { entry ->
    def seo = entry.SEO[0]
    // Groovy truth: empty strings are falsy, so this matches "" as well as null.
    if (!seo.Description && !seo.Title) {
        entry.remove('SEO')
    }
}
println(JsonOutput.toJson(json))
This will first parse the JSON string into a list of maps using JsonSlurper. Then it iterates through each map in the list and checks if the "Description" and "Title" keys in the "SEO" array are empty. If they are, it removes the "SEO" array from the map using the remove() method. Finally, it prints the modified JSON using the JsonOutput.toJson() method.

Is this the best way to parse a Json output from Google Ads Stream

Is this the best way to parse a JSON output from a Google Ads stream? I am parsing the JSON with pandas and it is taking too much time.
The record count is around 700K.
[{
"results": [
{
"customer": {
"resourceName": "customers/12345678900",
"id": "12345678900",
"descriptiveName": "ABC"
},
"campaign": {
"resourceName": "customers/12345678900/campaigns/12345",
"name": "Search_Google_Generic",
"id": "12345"
},
"adGroup": {
"resourceName": "customers/12345678900/adGroups/789789",
"id": "789789",
"name": "adgroup_details"
},
"metrics": {
"clicks": "500",
"conversions": 200,
"costMicros": "90000000",
"allConversionsValue": 5000.6936,
"impressions": "50000"
},
"segments": {
"device": "DESKTOP",
"date": "2022-10-28"
}
}
],
"fieldMask": "segments.date,customer.id,customer.descriptiveName,campaign.id,campaign.name,adGroup.id,adGroup.name,segments.device,metrics.costMicros,metrics.impressions,metrics.clicks,metrics.conversions,metrics.allConversionsValue",
"requestId": "fdhfgdhfgjf"
}
]
This is the sample JSON. I am saving the stream in a JSON file, then reading it using pandas and trying to dump it into a CSV file.
I want to convert it to CSV format, Like
# Load the raw Google Ads stream dump, then flatten it one row at a time.
with open('Adgroups.json', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile)
# Empty frame with the target CSV column layout.
df_new = pd.DataFrame(columns= ['Date', 'Account_ID', 'Account', 'Campaign_ID','Campaign',
                                'Ad_Group_ID', 'Ad_Group','Device',
                                'Cost', 'Impressions', 'Clicks', 'Conversions', 'Conv_Value'])
for i in range(len(df['results'])):
    results = df['results'][i]  # one "results" list per response chunk
    for result in results:
        # Pick the needed fields out of each nested result record.
        new_row = pd.Series({ 'Date': result['segments']['date'],
                              'Account_ID': result['customer']['id'],
                              'Account': result['customer']['descriptiveName'],
                              'Campaign_ID': result['campaign']['id'],
                              'Campaign': result['campaign']['name'],
                              'Ad_Group_ID': result['adGroup']['id'],
                              'Ad_Group': result['adGroup']['name'],
                              'Device': result['segments']['device'],
                              'Cost': result['metrics']['costMicros'],
                              'Impressions': result['metrics']['impressions'],
                              'Clicks': result['metrics']['clicks'],
                              'Conversions': result['metrics']['conversions'],
                              'Conv_Value': result['metrics']['allConversionsValue']
                              })
        # NOTE(review): DataFrame.append copies the whole frame on every call
        # (quadratic overall) and has been removed from modern pandas — this
        # is the slow part that the accepted answer replaces.
        df_new = df_new.append(new_row, ignore_index = True)
df_new.to_csv('Adgroups.csv', encoding='utf-8', index=False)
Don't use df.append. It's very slow because it has to copy the dataframe over and over again. It was deprecated for this reason (and removed in pandas 2.0).
You can build the rows using list comprehension before constructing the data frame:
import json
import pandas as pd  # pd was used but never imported in the original snippet

# Parse the stream dump once with the stdlib parser.
with open("Adgroups.json") as fp:
    data = json.load(fp)

columns = [
    "Date",
    "Account_ID",
    "Account",
    "Campaign_ID",
    "Campaign",
    "Ad_Group_ID",
    "Ad_Group",
    "Device",
    "Cost",
    "Impressions",
    "Clicks",
    "Conversions",
    "Conv_Value",
]

# Build all rows up front; constructing the DataFrame once avoids the
# quadratic cost of appending to it row by row.
records = [
    (
        r["segments"]["date"],
        r["customer"]["id"],
        r["customer"]["descriptiveName"],
        r["campaign"]["id"],
        r["campaign"]["name"],
        r["adGroup"]["id"],
        r["adGroup"]["name"],
        r["segments"]["device"],
        r["metrics"]["costMicros"],
        r["metrics"]["impressions"],
        r["metrics"]["clicks"],
        r["metrics"]["conversions"],
        r["metrics"]["allConversionsValue"],
    )
    for d in data
    for r in d["results"]
]
df = pd.DataFrame(records, columns=columns)

Extract nested json map

Have JSON received from some REST API:
{
"advertiser_id": {
"8253":{
"name":"Signify",
"id":8253
},
"2920":{
"name":"Hyundai",
"id":2920
}
}
}
I want to extract the maps inside numbers like 8253 and 2920, but without hard-mapping these numbers — they can be different at any time. Anyway, these numbers just duplicate the id inside.
Expected output after transformation:
[
{
"name":"Signify",
"id":8253
},
{
"name":"Hyundai",
"id":2920
}
]
Tried with:
import groovy.json.*

// Payload omitted; paste the "advertiser_id" JSON between the quotes.
def json = '''
'''
def p = new JsonSlurper().parseText(json)
// NOTE(review): collectEntries builds a single Map, so each iteration's
// [id: ..., name: ...] pair overwrites the previous one — that is why
// only one "object" comes back (see the question text below).
def result = p["advertiser_id"].collectEntries{ k, v ->
    [
        id: v.id,
        name: v.name
    ]
}
But it returns only one "object":
{
"id": 8905,
"name": "Spotify"
}
Also achieved wrong result with next code:
// Second attempt: iterating `it.value` visits the *inner* maps, so k/v
// become field-name/field-value pairs ("name" -> "Signify") instead of
// id/name objects — hence the wrong result.
def a = p["advertiser_id"].collectMany {
    it.value.collect{ k, v ->
        [
            id: k,
            name: v
        ]
    }
}
you want to build a list - so you need collect instead of collectEntries
def p = new JsonSlurper().parseText(json)
// collect (not collectEntries) returns a List with one element per entry,
// which is exactly the expected output shape.
def result = p["advertiser_id"].collect { entryId, advertiser ->
    [id: advertiser.id, name: advertiser.name]
}
and you are not doing any transformation to nested objects - so, you could simplify the code to this:
def result = p.advertiser_id.collect{ k, v -> v }

nested dictionary in list to dataframe python

Have a json input from api:
{
"api_info": {
"status": "healthy"
},
"items": [
{
"timestamp": "time",
"stock_data": [
{
"ticker": "string",
"industry": "string",
"Description": "string"
}
],
"ISIN": "xxx",
"update_datetime": "time"
}
]
}
have initially run
apiRawData = requests.get(url).json()['items']
then ran the json_normalize method:
apiExtractedData = pd.json_normalize(apiRawData,'stock_data',errors='ignore')
Here is the initial output where the stock_data is still contained within a list.
stock_data ISIN update_datetime
0 [{'description': 'zzz', 'industry': 'C', 'ticker... xxx time
stock_data
ISIN
update_datetime
0
[{'description': 'zzz', 'industry': 'C', 'ticker...]
123
time
What i would like to achieve is a dataframe showing the headers and the corresponding rows:
description
industry
ticker
ISIN
update_datetime
0
'zzz'
'C'
xxx
123
time
Do direct me if there is already an existing question answered :) cheers.
I think you can simply convert your existing data frame into your expected one by using below code:
# Pull each field out of the single-element stock_data list.  Looping over
# the field names avoids repeating the same apply/lambda three times.
for field in ('description', 'industry', 'ticker'):
    # f=field binds the current name now, avoiding the late-binding
    # closure pitfall inside the loop.
    apiExtractedData[field] = apiExtractedData['stock_data'].apply(lambda x, f=field: x[0][f])
And then just delete your stock_data column:
apiExtractedData = apiExtractedData.drop(['stock_data'], axis = 1)

Iterate over N nested list and dictionaries

I have the following structure of JSON/Dict.
[
{
"childrens": [
{
"childrens": [
{
"name": "somenam1"
}
],
"name": "B999"
}
],
"name": "11111"
},
{
"childrens": [
{
"childrens": [
{
"name": "somename2"
},
{
"name": "somename3"
}
],
"name": "B5555"
},
{
"childrens": [
{
"name": "somename4"
}
],
"name": "B2222"
}
],
"name": "2222"
}
]
I want to iterate over all dictionaries and list inside root list and create single string for each dictionary inside root list.
Output will look like this (two lines):
11111|B999|somenam1
2222|B5555|somename2|somename3|B2222|somename4
Also, this is just an example — I can have N nested levels of childrens.
Looks like a good candidate for recursion:
def flatten(child):
    """Return this node's name followed by every descendant's name, depth-first."""
    if not child:
        return child  # empty/falsy input is passed straight back
    names = [child['name']]
    for sub in child.get('childrens', []):
        names.extend(flatten(sub))
    return names
In []:
for child in data:
print('|'.join(flatten(child)))
Out[]:
11111|B999|somenam1
2222|B5555|somename2|somename3|B2222|somename4
Sure you can just pass add a level arg and return that:
def flatten(child, level=0):
    """Return the depth of this node followed by the depths of all descendants."""
    if not child:
        return child  # empty/falsy input is passed straight back
    levels = [level]
    for sub in child.get('childrens', []):
        levels.extend(flatten(sub, level + 1))
    return levels
In []:
for child in data:
print('|'.join(str(level) for level in flatten(child)))
Out[]:
0|1|2
0|1|2|2|1|2
Here's a solution by recursion
data_json = '[{"childrens":[{"childrens":[{"name":"somenam1"}],"name":"B999"}],"name":"11111"},{"childrens":[{"childrens":[{"name":"somename2"},{"name":"somename3"}],"name":"B5555"},{"childrens":[{"name":"somename4"}],"name":"B2222"}],"name":"2222"}]'
data = json.loads(data_json)
def get_names(data_dict):
    """Join this node's name with all descendant names using '|', depth-first."""
    # Keep the explicit key check: a present-but-empty "childrens" list must
    # still produce a trailing separator, exactly like the original.
    if "childrens" not in data_dict:
        return data_dict["name"]
    joined = "|".join(get_names(child) for child in data_dict["childrens"])
    return data_dict["name"] + "|" + joined
def get_all_name(data):
    """Print one pipe-joined name string per top-level entry."""
    for entry in data:
        print(get_names(entry))
get_all_name(data)

Resources