How to define a BigQuery schema when building an Apache Beam data pipeline - python-3.x

I created a data pipeline with Apache Beam, but it cannot insert the data into BigQuery.
I use beam.ParDo to process the data and yield it row by row; the code is below.
import apache_beam as beam
import pandas as pd
from apache_beam.options.pipeline_options import PipelineOptions

project = 'project_name'
dataset = 'XXX'

class parser_data(beam.DoFn):
    def process(self, data):
        ZZ = [{"NN": d["NNN"], "descrip": d} for d in data["colZ"]]
        ret = pd.DataFrame(data['colD'])
        ret["colA"] = data["colA"]
        ret["colB"] = data["colB"]
        ret["colC"] = data["colC"]
        ret = pd.merge(ret, pd.DataFrame(ZZ), on=["NN"], how="left")
        ret = ret[["colA", "colB", "colC", "NN", "sample", "descrip"]]
        print(ret)
        ret_dict = ret.to_dict("records")
        print(ret_dict)
        for i in range(len(ret_dict)):
            yield ret_dict[i]
options = PipelineOptions(
    runner='DirectRunner',
    region='us-west1',
    project=project,
    job_name="test-tmp",
    streaming=False,
    setup_file='./setup.py',
    subnetwork="XXXXXXX",
    service_account_email="XXXXXX",
    temp_location='XXXXXX',
    staging_location="XXXXXX",
    use_public_ips=False
)
d = {
    'colA': '1',
    'colB': 'Strawberry',
    'colC': 2,
    'colD': [{"NN": "AA", "sample": 1}, {"NN": "AA", "sample": 2}, {"NN": "BB", "sample": 3}, {"NN": "CC", "sample": 4}, {"NN": "CC", "sample": 5}],
    'colZ': [{"NNN": "AA", "name": "123", "timeperiod": "152"}, {"NNN": "BB", "name": "1212513", "timeperiod": "1952"}, {"NNN": "CC", "name": "13", "timeperiod": "14152"}],
}
schema = {
    'fields': [
        {'name': 'colA', 'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'colB', 'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'colC', 'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'NN', 'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'sample', 'type': 'STRING', 'mode': 'REQUIRED'},
        {
            'name': 'descrip', 'type': 'RECORD', 'mode': 'NULLABLE',
            'fields': [
                {'name': 'NNN', 'type': 'STRING', 'mode': 'NULLABLE'},
                {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
                {'name': 'timeperiod', 'type': 'STRING', 'mode': 'NULLABLE'},
            ]
        },
    ]
}
with beam.Pipeline(options=options) as pipeline:
    data = (
        pipeline | 'get data' >> beam.Create([d])
    )
    ret_A = (
        data | "Process A data " >> beam.ParDo(parser_data())
             | "Insert data into BQ" >> beam.io.WriteToBigQuery(
                 f"{project}:{dataset}.TestJsonData",
                 schema=schema,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             )
    )
The error is below:
RuntimeError: BigQuery job beam_bq_job_LOAD_testtmp_LOAD_STEP_820_9672b886a985a9a36a9c3805cee3be5e_3f26019c07d746ef92c0893574156f5b failed. Error Result: <ErrorProto
location: 'gs://XXXXXXXX/dataflow_temp/bq_load/db11e8430c10470382be2565136d53fb/{project}.{dataset}.TestJsonData/39e0d645-8484-4033-a1c4-3e4a825d6fee'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details.'
reason: 'invalid'> [while running '[25]: Insert data into BQ/BigQueryBatchFileLoads/WaitForDestinationLoadJobs']
Also, the print calls do show the data, so I think the problem is in the BigQuery schema, but I cannot find it.
Does anyone have any idea?
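
For reference, one likely culprit is a type mismatch between the sample data and the declared schema: colC is the integer 2 and the sample values are integers, while the schema declares both fields as STRING/REQUIRED, which can trip up a BigQuery JSON load job. A minimal sketch of coercing each yielded row to the declared types (the helper name is hypothetical, not part of the original pipeline):

# Hypothetical helper: cast row values to the types the schema declares
# before they reach WriteToBigQuery.
def coerce_to_schema(row):
    row = dict(row)
    row["colC"] = str(row["colC"])      # schema declares colC as STRING
    row["sample"] = str(row["sample"])  # schema declares sample as STRING
    return row

row = {"colA": "1", "colB": "Strawberry", "colC": 2, "NN": "AA", "sample": 1,
       "descrip": {"NNN": "AA", "name": "123", "timeperiod": "152"}}
print(coerce_to_schema(row))

Alternatively, declaring colC and sample as INTEGER in the schema would match the data as-is.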

Related

How can XML data be stored in Google BigQuery using Python?

I want to save XML data into Google BigQuery. To do that, I first convert the XML data into a pandas DataFrame and then store it in BigQuery using to_gbq, but saving the data this way takes too much time.
While saving data into BigQuery, I want to append new data, remove a row from BigQuery if the incoming data no longer contains it, and update rows whose incoming data has changed.
code 1 (pandas to bigquery):
import pandas as pd
import xml.etree.ElementTree as et
from google.oauth2.service_account import Credentials

df_cols = ['Author', 'Title', 'Genre', 'Price', 'Date', 'Description']
root = et.parse('test.xml').getroot()
rows = []
for node in root:
    author = node.find("author").text if node is not None else None
    title = node.find("title").text if node is not None else None
    genre = node.find("genre").text if node is not None else None
    price = node.find("price").text if node is not None else None
    date = node.find("publish_date").text if node is not None else None
    description = node.find("description").text if node is not None else None
    rows.append({"Author": author, "Title": title, "Genre": genre, "Price": price, "Date": date, "Description": description})
df = pd.DataFrame(rows, columns=df_cols)

# Define target table in BQ
target_table = "table_name"
project_id = "project_id"
credential_file = "credentials.json"
credential = Credentials.from_service_account_file(credential_file)

# Save Pandas dataframe to BQ
df.to_gbq(target_table, project_id=project_id, if_exists='replace', progress_bar=True, credentials=credential)
The above code works, but it is too slow: it takes around one second per row to insert into BigQuery.
Secondly, I tried to insert data into BigQuery from JSON.
code (json to bigquery):
import pandas as pd
import numpy as np
from google.cloud import bigquery
import os, json
import xmltodict
import xml.etree.ElementTree as et

### Converts schema dictionary to BigQuery's expected format for job_config.schema
def format_schema(schema):
    formatted_schema = []
    for row in schema:
        formatted_schema.append(bigquery.SchemaField(row['author'], row['title'], row['genre'], row['price'], row['publish_date'], row['description']))
    return formatted_schema
df_cols = ['author', 'title', 'genre', 'price', 'publish_date', 'description']
xtree = et.parse('test.xml')
xroot = xtree.getroot()
rows = []
for node in xroot:
    res = []
    for el in df_cols:
        if node is not None and node.find(el) is not None:
            res.append(node.find(el).text)
        else:
            res.append(None)
    rows.append({df_cols[i]: res[i] for i, _ in enumerate(df_cols)})
print(rows)
out_df = pd.DataFrame(rows, columns=df_cols)

### Convert dataframe to JSON object
json_data = out_df.to_json(orient='records')
print(json_data)
json_object = json.dumps(json_data)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"credentials.json"
table_schema = (
    {'name': 'author', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'title', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'genre', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'price', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'publish_date', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'description', 'type': 'STRING', 'mode': 'NULLABLE'}
)
project_id = 'project-id'
dataset_id = 'id_name'
table_id = 'table_name'
client = bigquery.Client(project = project_id)
dataset = client.dataset(dataset_id)
table = dataset.table(table_id)
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
print(table_schema)
job_config.schema = format_schema(table_schema)
print(job_config.schema)
# Default is append
job = client.load_table_from_json(json_object, table, job_config = job_config).result()
destination_table = client.get_table('{}.{}.{}'.format(project_id, dataset_id, table_id))
print("Loaded {} rows.".format(destination_table.num_rows))
But I got an error like below:
formatted_schema.append(bigquery.SchemaField(row['author'], row['title'], row['genre'], row['price'], row['publish_date'], row['description']))
KeyError: 'author'
Can anyone suggest a way to store XML data in Google BigQuery quickly, while supporting operations like delete, append, and update?
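
For reference, the KeyError occurs because format_schema indexes each schema dict with column names ('author', 'title', ...) rather than the keys it actually has ('name', 'type', 'mode'). Separately, load_table_from_json expects an iterable of row dicts, not a JSON string (the json.dumps call double-encodes the data). A minimal sketch of both fixes, reusing the variables from the snippet above:

from google.cloud import bigquery

# Build SchemaField objects from the keys each schema dict actually contains.
def format_schema(schema):
    return [bigquery.SchemaField(row['name'], row['type'], mode=row['mode'])
            for row in schema]

# Pass row dicts directly instead of a JSON-encoded string.
rows = out_df.to_dict(orient='records')
job = client.load_table_from_json(rows, table, job_config=job_config).result()

As for the slowness of the first approach, a single load job from the DataFrame (client.load_table_from_dataframe(df, table)) is worth trying if to_gbq remains slow, since it avoids per-row overhead.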

Do DynamoDB update expressions allow condition expressions to target specific components of an update expressions?

I have an item where multiple attributes may need to be updated (but not always), and each of them requires a different set of conditions. In the below example, I would like to do the following:
Set a new JobTitle if it has changed and the LastUpdated value is more recent.
Append a project ID to the ProjectList list and a project entry to Projects if a new project ID is supplied.
Is it possible to run all of these updates within a single update_item expression, or is it necessary to use transactions?
Example item:
{
    'PK': 'PERSON#123456789',
    'SK': 'PERSON#123456789',
    'ID': '123456789',
    'Name': 'Bob Smith',
    'Birthday': '01/01/2000',
    'Street': '65 LEXINGTON AVE',
    'Zip': '12345',
    'City': 'NEW YORK',
    'State': 'NY',
    'JobTitle': 'Project Manager',
    'LastUpdated': '20210101',
    'Projects': [
        {'Name': 'Project A', 'ID': '987654321'},
        {'Name': 'Project B', 'ID': '756394733'},
    ],
    'ProjectList': ['987654321', '756394733']
}
Each update expression separately (via a Table resource):
# Update job title if changed
r = table.update_item(
    Key={
        "PK": data['PK'],
        "SK": data['SK']
    },
    UpdateExpression="SET #JobTitle = :JobTitle",
    ExpressionAttributeNames={
        "#JobTitle": "JobTitle",
        "#LastUpdated": "LastUpdated"
    },
    ExpressionAttributeValues={
        ":JobTitle": "Senior Project Manager",
        ":LastUpdated": '20210102'
    },
    ConditionExpression="#LastUpdated < :LastUpdated AND #JobTitle <> :JobTitle"
)
# Update projects if not in list
r = table.update_item(
    Key={
        "PK": data['PK'],
        "SK": data['SK']
    },
    UpdateExpression="""
        SET #Projects = list_append(if_not_exists(#Projects, :empty_list), :ProjectMap),
            #ProjectList = list_append(if_not_exists(#ProjectList, :empty_list), :ProjectID)
    """,
    ConditionExpression="not(contains(#ProjectList, :ProjectID))",
    ExpressionAttributeNames={
        "#Projects": "Projects",
        "#ProjectList": "ProjectList"
    },
    ExpressionAttributeValues={
        ":empty_list": [],
        ":ProjectMap": [{
            "Name": "Project C",
            "ID": "7463848373"
        }],
        ":ProjectID": ["7463848373"]
    }
)
What I want to do
# (1) Update job title if changed or (2) update projects if not in list
r = table.update_item(
    Key={
        "PK": data['PK'],
        "SK": data['SK']
    },
    UpdateExpression="""
        SET #JobTitle = :JobTitle,
            #Projects = list_append(if_not_exists(#Projects, :empty_list), :ProjectMap),
            #ProjectList = list_append(if_not_exists(#ProjectList, :empty_list), :ProjectID)
    """,
    ExpressionAttributeNames={
        "#JobTitle": "JobTitle",
        "#LastUpdated": "LastUpdated",
        "#Projects": "Projects",
        "#ProjectList": "ProjectList"
    },
    ExpressionAttributeValues={
        ":empty_list": [],
        ":JobTitle": "Senior Project Manager",
        ":LastUpdated": '20210102',
        ":ProjectMap": [{
            "Name": "Project C",
            "ID": "7463848373"
        }],
        ":ProjectID": ["7463848373"]
    },
    ConditionExpression="(#LastUpdated < :LastUpdated AND #JobTitle <> :JobTitle) OR not(contains(#ProjectList, :ProjectID))"
)
I'm skeptical that this is possible due to this line in the AWS documentation:
For these data manipulation operations, you can specify a condition expression to determine which items should be modified. If the condition expression evaluates to true, the operation succeeds; otherwise, the operation fails.
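
For what it's worth, that skepticism is well founded: a ConditionExpression gates the entire UpdateExpression, and TransactWriteItems does not help here either, since a transaction forbids two actions targeting the same item and cancels everything if any condition fails. A hedged sketch of keeping the two conditional updates separate while tolerating unmet conditions (the table name and helper are hypothetical, not from the original post):

import boto3
from botocore.exceptions import ClientError

table = boto3.resource("dynamodb").Table("MyTable")  # placeholder table name

def update_if(key, **kwargs):
    """Run one conditional update_item; treat a failed condition as a no-op."""
    try:
        table.update_item(Key=key, **kwargs)
        return True
    except ClientError as e:
        if e.response["Error"]["Code"] == "ConditionalCheckFailedException":
            return False  # condition not met; item left unchanged
        raise

Each attribute group then gets its own call (and its own condition), so one failing check does not block the other update.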

How to add new key in the existing dictionary and derive a nested dictionary from it in python?

I am trying to add a new key inside an existing dictionary to create a nested dictionary.
Below is the existing dictionary from which I need to build the nested one:
{'userId': 'thanks',
 'jobTitleName': 'Program Directory',
 'firstName': 'Tom',
 'lastName': 'Hanks',
 'preferredFullName': 'Tom Hanks',
 'employeeCode': 'E3',
 'region': 'CA',
 'phoneNumber': '+00408-2222222',
 'emailAddress': 'tomhanks@gmail.com',
 'Full Name': 'TomHanks'}
This is what I tried:
key1 = ['userId', 'jobTitleName', 'firstName', 'lastName', 'employeeCode']
key2 = ['Full Name', 'phoneNumber', 'region', 'emailAddress']
jsonValue = {
    {'userId': 'thanks',
     'jobTitleName': 'Program Directory',
     'firstName': 'Tom',
     'lastName': 'Hanks',
     'preferredFullName': 'Tom Hanks',
     'employeeCode': 'E3',
     'region': 'CA',
     'phoneNumber': '+00408-2222222',
     'emailAddress': 'tomhanks@gmail.com',
     'Full Name': 'TomHanks'}
}
empDetails = {}
for k in key1:
    empDetails[k] = jsonValue[k]
print("Key1", empDetails)
for k2 in key2:
    empDetails['otherDetails'][k2] = jsonValue[k2]
But it's not working.
Expected: I need to add a new key 'otherDetails' to derive a nested dictionary as follows:
{'userId': 'thanks',
 'jobTitleName': 'Program Directory',
 'firstName': 'Tom',
 'lastName': 'Hanks',
 'preferredFullName': 'Tom Hanks',
 'employeeCode': 'E3',
 'otherDetails': {
     'region': 'CA',
     'phoneNumber': '+00408-2222222',
     'emailAddress': 'tomhanks@gmail.com',
     'Full Name': 'TomHanks'
 }
}
I'd appreciate it if anyone could give the right solution.
Thanks!
There are a couple of problems in your code. First, in jsonValue you put a dict inside a dict without specifying a key, which is not valid. From context I assume you actually want an array here (since you most likely have an array of employee data; if I'm wrong, just comment).
Then you try to assign to empDetails['otherDetails'][k2]; however, you never initialize the dict at empDetails['otherDetails'], so that lookup raises a KeyError before anything can be assigned.
key1 = ['userId', 'jobTitleName', 'firstName', 'lastName', 'employeeCode']
key2 = ['Full Name', 'phoneNumber', 'region', 'emailAddress']
jsonValue = [{
    'userId': 'thanks',
    'jobTitleName': 'Program Directory',
    'firstName': 'Tom',
    'lastName': 'Hanks',
    'preferredFullName': 'Tom Hanks',
    'employeeCode': 'E3',
    'region': 'CA',
    'phoneNumber': '+00408-2222222',
    'emailAddress': 'tomhanks@gmail.com',
    'Full Name': 'TomHanks'
}]
for employee in jsonValue:
    empDetails = {'otherDetails': {}}
    for k in key1:
        empDetails[k] = employee[k]
    print("Key1", empDetails)
    for k2 in key2:
        empDetails['otherDetails'][k2] = employee[k2]
    print("Key1", empDetails)
Filter out which keys you want to keep, then filter out the keys you want to move to the inner dict, then insert the inner dict.
from pprint import pprint
d = {
    "userId": "thanks",
    "jobTitleName": "Program Directory",
    "firstName": "Tom",
    "lastName": "Hanks",
    "preferredFullName": "Tom Hanks",
    "employeeCode": "E3",
    "region": "CA",
    "phoneNumber": "+00408-2222222",
    "emailAddress": "tomhanks@gmail.com",
    "Full Name": "TomHanks",
}
# Outer keys you want to keep
keys_to_keep = {'userId','jobTitleName','firstName','lastName','preferredFullName', 'employeeCode'}
# Keys you want to move into inner dict
keys_to_move = {'Full Name','phoneNumber','region','emailAddress'}
# Create dict to insert into
new_dict = {k: d[k] for k in keys_to_keep}
# Create dict to insert into above dict
insert_dict = {k: d[k] for k in keys_to_move}
# Insert inner dict
new_dict['otherDetails'] = insert_dict
pprint(new_dict)
Output:
{'employeeCode': 'E3',
 'firstName': 'Tom',
 'jobTitleName': 'Program Directory',
 'lastName': 'Hanks',
 'otherDetails': {'Full Name': 'TomHanks',
                  'emailAddress': 'tomhanks@gmail.com',
                  'phoneNumber': '+00408-2222222',
                  'region': 'CA'},
 'preferredFullName': 'Tom Hanks',
 'userId': 'thanks'}

If condition with or in python

If I have a list containing 'Null', empty strings, and real values, how can I write a condition that handles both the empty and the real values? In my case, when I check '89' and 'Null', the message for the value is not printed; instead it shows "empty value can not processed". The value should be processed, and the Null should not.
payload = [{'id': 'Room1',
            'pressure': {'metadata': {}, 'type': 'Number', 'value': 'Null'},
            'temperature': {'metadata': {}, 'type': 'Number', 'value': '89'},
            'type': 'RoomTest'}]
attrs = ['temperature', 'pressure']
x = len(payload)
for i in range(x):
    for j in attrs:
        y = payload[i][j]['value']
        print(y)
        for item in y:
            print(item)
            if item is ["Null", ""]:
                print("empty value can not processed")
            if item is not ["Null", ""]:
                print("successfully processed for value")
I have also tried using any(), but with the same result.
Thanks in advance.
You can do this: iterate over the payload directly and test membership with in. (Note that is checks object identity, so it is always False against a fresh list literal, and for item in y iterates over the characters of the string, which is why each character was being tested separately.)
payload = [{'id': 'Room1',
            'pressure': {'metadata': {}, 'type': 'Number', 'value': 'Null'},
            'temperature': {'metadata': {}, 'type': 'Number', 'value': '89'},
            'type': 'RoomTest'}]
attrs = ['temperature', 'pressure']
for item in payload:
    for attr in attrs:
        value = item[attr]['value']
        if value in ["Null", ""]:
            print("empty value can not processed")
        else:
            print("successfully processed for value")

Nested Dictionary using python

I am trying to build a nested dictionary in Python. I am providing my input, expected output, and the code I tried.
This is my input:
input = [['10', 'PS_S1U_X2_LP', 'permit', 'origin', 'igp', 'RM_S1U_X2_LP'],
         ['20', '', 'permit', '', '', 'RM_S1U_X2_LP'],
         ['10', 'MPLS-LOOPBACK', 'permit', '', '', 'MPLS-LOOPBACK-RLFA'],
        ]
And my desired output is:
output = {
    "route_policy_list": [
        {
            "policy_terms": [],
            "route_policy_statement": [
                {
                    "entry": "10",
                    "prefix_list": "PS_S1U_X2_LP",
                    "action_statements": [
                        {
                            "action_value": "igp",
                            "action": "permit",
                            "action_statement": "origin"
                        }
                    ]
                },
                {
                    "entry": "20",
                    "prefix_list": "",
                    "action_statements": [
                        {
                            "action_value": "",
                            "action": "permit",
                            "action_statement": ""
                        }
                    ]
                }
            ],
            "name": "RM_S1U_X2_LP"
        },
        {
            "policy_terms": [],
            "route_policy_statement": [
                {
                    "entry": "10",
                    "prefix_list": "MPLS-LOOPBACK",
                    "action_statements": [
                        {
                            "action_value": "",
                            "action": "permit",
                            "action_statement": ""
                        }
                    ]
                }
            ],
            "name": "MPLS-LOOPBACK-RLFA"
        }
    ]
}
And I have tried this code:
from collections import defaultdict

res1 = defaultdict(list)
for fsm1 in input:
    name1 = fsm1.pop()
    action = fsm1[2]
    action_statement = fsm1[3]
    action_value = fsm1[4]
    item1 = dict(zip(['entry', 'prefix_list'], fsm1))
    res1['action'] = action
    res1['action_statement'] = action_statement
    res1['action_value'] = action_value
    res1[name].append(item1)
print(res1)
Please help me get the desired output shown above; I am new to coding and struggling to write this.
Here is the final code. I used the setdefault method to group the data first, then a simple for loop to shape the data the requested way.
# Input
input = [['10', 'PS_S1U_X2_LP', 'permit', 'origin', 'igp', 'RM_S1U_X2_LP'],
         ['20', '', 'permit', '', '', 'RM_S1U_X2_LP'],
         ['10', 'MPLS-LOOPBACK', 'permit', '', '', 'MPLS-LOOPBACK-RLFA'],
        ]

# Main code
d = {}
final = []
for i in input:
    d.setdefault(i[-1], []).append(i[:-1])   # group rows by their last field (the name)
for i, v in d.items():
    a = {}
    a["policy_terms"] = []
    a["route_policy_statement"] = [{"entry": j[0],
                                    "prefix_list": j[1],
                                    "action_statements": [{"action_value": j[4],
                                                           "action": j[2],
                                                           "action_statement": j[3]}]}
                                   for j in v]
    a["name"] = i
    final.append(a)
final_dict = {"route_policy_list": final}
print(final_dict)

# Output
# {'route_policy_list': [{'policy_terms': [], 'route_policy_statement': [{'entry': '10', 'prefix_list': 'PS_S1U_X2_LP', 'action_statements': [{'action_value': 'igp', 'action': 'permit', 'action_statement': 'origin'}]}, {'entry': '20', 'prefix_list': '', 'action_statements': [{'action_value': '', 'action': 'permit', 'action_statement': ''}]}], 'name': 'RM_S1U_X2_LP'}, {'policy_terms': [], 'route_policy_statement': [{'entry': '10', 'prefix_list': 'MPLS-LOOPBACK', 'action_statements': [{'action_value': '', 'action': 'permit', 'action_statement': ''}]}], 'name': 'MPLS-LOOPBACK-RLFA'}]}
I hope this helps!
Every sublist in input has its data in the same order, so I would create a list of keys such as
indices = ['entry', 'prefix_list', 'action', 'action_statement', 'action_value', 'name']
and then hard-code the values, since you want specific values in specific places.
dic_list = []
for lst in input:
    dic = {'policy_terms': [],
           'route_policy_statements': {
               indices[0]: lst[0],
               indices[1]: lst[1],
               'action_statements': {
                   indices[2]: lst[2],
                   indices[3]: lst[3],
                   indices[4]: lst[4]
               },
               indices[5]: lst[5]
           }
          }
    dic_list.append(dic)
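
For reference, a quick check of what this produces for the first input row (assuming the same input and indices as above):

print(dic_list[0])
# {'policy_terms': [], 'route_policy_statements': {'entry': '10',
#  'prefix_list': 'PS_S1U_X2_LP', 'action_statements': {'action': 'permit',
#  'action_statement': 'origin', 'action_value': 'igp'}, 'name': 'RM_S1U_X2_LP'}}

Note this keeps name inside route_policy_statements and does not group rows by name, so some reshaping is still needed to match the desired output exactly.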
