How to create a partitioned BigQuery table at runtime with Apache Beam using Python - python-3.x

I am trying to create a new partitioned BigQuery table at runtime with the following code, but I am not finding an option to pass the column name "_time" on which the new BQ table should be partitioned.
Can anyone please help me with it?
My code:
#------------Import Lib-----------------------#
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os, sys
import json
import argparse
import logging
from apache_beam.options.pipeline_options import SetupOptions
from datetime import datetime

#------------Set up BQ parameters-----------------------#
# Replace with Project Id
project = 'xxxx'

#------------Splitting of Records----------------------#
class Transaction_DB_UC2(beam.DoFn):
    def process(self, element):
        logging.info(element)
        result = json.loads(element)
        data_time = result.get('_time', 'null')
        data_dest = result.get('dest', 'null')
        data_DBID = result.get('DBID', 'null')
        data_SESSIONID = result.get('SESSIONID', 'null')
        data_USERHOST = result.get('USERHOST', 'null')
        data_raw = result.get('_raw', 'null')
        data_ACTION = result.get('ACTION', 'null')
        data_host = result.get('host', 'null')
        data_result = result.get('result', 'null')
        data_DBUSER = result.get('DBUSER', 'null')
        data_OS_USERNAME = result.get('OS_USERNAME', 'null')
        data_ACTION_NAME = result.get('ACTION', 'null').replace('100', 'LOGON').replace('101', 'LOGOFF')
        return [{"_time": data_time[:-8], "dest": data_dest, "DBID": data_DBID, "SESSIONID": data_SESSIONID,
                 "_raw": data_raw, "USERHOST": data_USERHOST, "ACTION": data_ACTION, "host": data_host,
                 "result": data_result, "DBUSER": data_DBUSER, "OS_USERNAME": data_OS_USERNAME,
                 "ACTION_NAME": data_ACTION_NAME}]

def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        help='Input file to process.')
    parser.add_argument(
        '--pro_id',
        dest='pro_id',
        type=str,
        default='ORACLE_SEC_DEFAULT',
        help='project id')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p1 = beam.Pipeline(options=pipeline_options)
    #data_f = sys.argv[1]
    logging.info('***********')
    logging.info(known_args.input)
    data_loading = (
        p1
        | 'Read from File' >> beam.io.ReadFromText(known_args.input, skip_header_lines=0)
    )
    project_id = "xxxxx"
    dataset_id = 'test123'
    table_schema_DB_UC2 = ('_time:DATETIME, dest:STRING, DBID:STRING, SESSIONID:STRING, _raw:STRING, USERHOST:STRING, ACTION:STRING, host:STRING, result:STRING, DBUSER:STRING, OS_USERNAME:STRING, ACTION_NAME:STRING')

    # Persist to BigQuery
    # WriteToBigQuery accepts the data as a list of JSON objects
    #---------------------Index = DB-UC2---------------------#
    result = (
        data_loading
        | 'Clean-DB-UC2' >> beam.ParDo(Transaction_DB_UC2())
        | 'Write-DB-UC2' >> beam.io.WriteToBigQuery(
            table=known_args.pro_id,
            dataset=dataset_id,
            project=project_id,
            schema=table_schema_DB_UC2,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    result = p1.run()
    result.wait_until_finish()

if __name__ == '__main__':
    #logging.getLogger().setLevel(logging.INFO)
    path_service_account = 'ml-fbf8cabcder.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path_service_account
    run()
I want to partition the table on the field "_time"; please suggest how this can be achieved.
Thanks.

I believe you can do that with additional_bq_parameters (note the limitations), using the timePartitioning parameter.
When creating a new BigQuery table, there are a number of extra parameters
that one may need to specify. For example, clustering, partitioning, data
encoding, etc. It is possible to provide these additional parameters by
passing a Python dictionary as additional_bq_parameters (Reference).
In your case, you could add the timePartitioning parameter to your WriteToBigQuery transform, with its required type and optional field keys (note that field must be a top-level TIMESTAMP or DATE field):
additional_bq_parameters={'timePartitioning': {
    'type': 'DAY',
    'field': '_time'
}}
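For instance, merged into the WriteToBigQuery transform from your pipeline, it would look roughly like the sketch below (reusing your own variables; untested on my side):
result = (
    data_loading
    | 'Clean-DB-UC2' >> beam.ParDo(Transaction_DB_UC2())
    | 'Write-DB-UC2' >> beam.io.WriteToBigQuery(
        table=known_args.pro_id,
        dataset=dataset_id,
        project=project_id,
        schema=table_schema_DB_UC2,
        # applied when WriteToBigQuery creates the table (CREATE_IF_NEEDED)
        additional_bq_parameters={'timePartitioning': {'type': 'DAY', 'field': '_time'}},
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))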
I didn't have the time to try it out yet. I'll try to reproduce tomorrow.
Let me know if it works for you.
EDIT
Finally got the chance to try the timePartitioning parameter to create a partitioned table and it worked.
Here is a simple pipeline code to test it.
#!/usr/bin/env python
import apache_beam as beam

PROJECT = 'YOUR_PROJECT'
BUCKET = 'YOUR_BUCKET'

def run():
    argv = [
        '--project={0}'.format(PROJECT),
        '--job_name=YOUR_JOB_NAME',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--region=us-central1',
        '--runner=DataflowRunner'
    ]

    p = beam.Pipeline(argv=argv)

    table_schema = {'fields': [
        {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': '_time', 'type': 'DATETIME', 'mode': 'NULLABLE'},
        {'name': 'query', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    additional_bq_parameters = {
        'timePartitioning': {'type': 'DAY', 'field': '_time'}}

    elements = (p | beam.Create([
        {'country': 'mexico', '_time': '2020-06-10 22:19:26', 'query': 'acapulco'},
        {'country': 'canada', '_time': '2020-12-11 15:42:32', 'query': 'influenza'},
    ]))

    elements | beam.io.WriteToBigQuery(
        table='YOUR_DATASET.YOUR_NEW_TABLE',
        schema=table_schema,
        additional_bq_parameters=additional_bq_parameters,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
    )

    p.run()

if __name__ == '__main__':
    run()

Related

How to call a loop under the main function in Python

I am working on a script which I have to modify in order to loop through multiple resources within a function.
Below are the items we need to loop through to get the data from; this comes from Config_local:
BASE_URL = "https://synergy.hpe.example.com/rest/"
RES_EXT = ['resource-alerts?count=500&start=1',
           'resource-alerts?count=500&start=501',
           'resource-alerts?count=500&start=1001',
           'resource-alerts?count=500&start=1501'
           ]
While I am looping through the above list inside def main() and calling get_resource_alerts_response() in the loop, the data coming out of the loop gets overwritten, so only the last iteration's data is returned.
Main Script:
import os
import shutil
import smtplib
from email.message import EmailMessage

import pandas as pd
pd.set_option('expand_frame_repr', True)
import requests

from Config_local import (
    BASE_URL,
    DST_DIR,
    OUTFILE,
    PASSWORD,
    SRC_DIR,
    TIMESTAMP_DST,
    USERNAME,
    SUBJECT,
    FROM,
    TO,
    EMAIL_TEMPLATE,
    SMTP_SERVER,
    RES_EXT,
)


class FileMoveFailure(Exception):
    pass


class SynergyRequestFailure(Exception):
    pass


class SessionIdRetrievalFailure(SynergyRequestFailure):
    pass


class ResourceAlertsRetrievalFailure(SynergyRequestFailure):
    pass


def move_csv_files():
    for csv_file in os.listdir(SRC_DIR):
        if csv_file.endswith(".csv") and os.path.isfile(os.path.join(SRC_DIR, csv_file)):
            try:
                shutil.move(
                    os.path.join(f"{SRC_DIR}/{csv_file}"),
                    f"{DST_DIR}/{csv_file}-{TIMESTAMP_DST}.log"
                )
            except OSError as os_error:
                raise FileMoveFailure(
                    f'Moving file {csv_file} has failed: {os_error}'
                )


def get_session_id(session):
    try:
        response = session.post(
            url=f"{BASE_URL}/login-sessions",
            headers={
                "accept": "application/json",
                "content-type": "application/json",
                "x-api-version": "120",
            },
            json={
                "userName": USERNAME,
                "password": PASSWORD
            },
            verify=False
        )
    except requests.exceptions.RequestException as req_exception:
        # you should also get this logged somewhere, or at least
        # printed depending on your use case
        raise SessionIdRetrievalFailure(
            f"Could not get session id: {req_exception}"
        )
    json_response = response.json()
    if not json_response.get("sessionID"):
        # always assume the worse and do sanity checks & validations
        # on fetched data
        raise KeyError("Could not fetch session id")
    return json_response["sessionID"]


#def get_all_text(session, session_id):
#    all_text = ''
#    for res in RES_EXT:
#        url = f"{BASE_URL}{res}"
#        newresult = get_resource_alerts_response(session, session_id, url)
#        all_text += newresult
#        print(f"{all_text}")
#    return str(all_text)
#

def get_resource_alerts_response(session, session_id, res):
    try:
        return session.get(
            url=f"{BASE_URL}{res}",
            headers={
                "accept": "application/json",
                "content-type": "text/csv",
                "x-api-version": "2",
                "auth": session_id,
            },
            verify=False,
            stream=True
        )
    except requests.exceptions.RequestException as req_exception:
        # you should also get this logged somewhere, or at least
        # printed depending on your use case
        raise ResourceAlertsRetrievalFailure(
            f"Could not fetch resource alerts: {req_exception}"
        )


def resource_alerts_to_df(resource_alerts_response):
    with open(OUTFILE, 'wb') as f:
        for chunk in resource_alerts_response.iter_content(chunk_size=1024*36):
            f.write(chunk)
    return pd.read_csv(OUTFILE)


def send_email(df):
    server = smtplib.SMTP(SMTP_SERVER)
    msg = EmailMessage()
    msg['Subject'], msg['From'], msg['To'] = SUBJECT, FROM, TO
    msg.set_content("Text version of your html template")
    msg.add_alternative(
        EMAIL_TEMPLATE.format(df.to_html(index=False)),
        subtype='html'
    )
    server.send_message(msg)


def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        print(resource_alerts_response)
    df = resource_alerts_to_df(resource_alerts_response)
    print(df)
    send_email(df)


if __name__ == '__main__':
    main()
Any help or hint will be much appreciated.
This is a copy of the code which I recall we had on SO before, but it is not quite what you want now. However, since the whole code body is okay and the idea of the for loop also looks good, you just need to tweak it to meet the requirement.
1- You need to create an empty DataFrame: Synergy_Data = pd.DataFrame()
2- Then you can append the data you receive in each iteration of the for loop, i.e. resource_alerts_response, which becomes df = resource_alerts_to_df(resource_alerts_response)
3- Lastly, you can append this df to the empty Synergy_Data and then, under your if __name__ == '__main__' block, send the e-mail. Also, don't forget to declare Synergy_Data as a global variable.
Synergy_Data = pd.DataFrame()

def main():
    global Synergy_Data
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        df = resource_alerts_to_df(resource_alerts_response)
        Synergy_Data = Synergy_Data.append(df)

if __name__ == '__main__':
    main()
    send_email(Synergy_Data)
Hope this will be helpful.
Only the last response is being used because this value is overwritten in resource_alerts_response on each iteration of the loop. You may consider acting on the data on each iteration or storing it for use later i.e. after the loop. I've included these options with modifications to the main() function below.
Option 1
Send an email for each resource alert response
def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        print(resource_alerts_response)
        # Indent the lines below so that these operations are executed in each loop iteration
        df = resource_alerts_to_df(resource_alerts_response)
        print(df)
        send_email(df)
Option 2
Merge all resource alert responses and send one email
def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    df_resource_alerts_responses = None
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        print(resource_alerts_response)
        df = resource_alerts_to_df(resource_alerts_response)
        if df_resource_alerts_responses is None:
            df_resource_alerts_responses = df
        else:
            df_resource_alerts_responses = df_resource_alerts_responses.append(df, ignore_index=True)
    print(df_resource_alerts_responses)
    if df_resource_alerts_responses is not None:
        send_email(df_resource_alerts_responses)
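As a side note, DataFrame.append was deprecated and removed in pandas 2.0, so on a recent pandas you would collect the per-response frames in a list and concatenate them once. A minimal sketch of that variant of Option 2, assuming the same helper functions as above:
def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    frames = []
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        # collect one DataFrame per resource extension
        frames.append(resource_alerts_to_df(resource_alerts_response))
    if frames:
        # merge all responses and send a single email
        send_email(pd.concat(frames, ignore_index=True))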

JSONDecodeError : Extra data: line 2 column 1

I'm trying to read a gzipped JSON file from an S3 bucket and write it to a DynamoDB table using AWS Lambda, and I chose Python with boto3. After I read the S3 data, I get this error when trying to run json.loads.
My code looks something like this:
import json
import gzip
import boto3
from io import BytesIO

s3 = boto3.resource('s3')
dynamodb = boto3.resource('dynamodb')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    json_file_name = event['Records'][0]['s3']['object']['key']
    json_object = s3.Object(bucket, json_file_name)
    n = json_object.get()['Body'].read()
    gzipfile = BytesIO(n)
    gzipfile = gzip.GzipFile(fileobj=gzipfile)
    content = gzipfile.read().decode('utf-8')
    jsonDict = json.loads(content)
    # Write items to dynamo db table
    table = dynamodb.Table('mahbis01-AccountService-LedgerSummary-Duplicate')
    table.put_item(Item=jsonDict)
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
When I print content, I see values like:
{
    "Item": {
        "SubsId": {
            "S": "255_0_908764"
        }
    }
}{
    "Item": {
        "SubsId": {
            "S": "255_0_908765"
        }
    }
}{
    "Item": {
        "SubsId": {
            "S": "255_0_908766"
        }
    }
}{
    "Item": {
        "SubsId": {
            "S": "255_0_908767"
        }
    }
}
How can I get rid of this error and write the data to DynamoDB?
Your content is not valid JSON: it is several JSON objects concatenated together. Assuming the content string always has this format, you can convert it to valid JSON using:
jsonDict = json.loads('['+content.replace('}{','},{')+']')
which will give you a valid list of dicts:
[{'Item': {'SubsId': {'S': '255_0_908764'}}}, {'Item': {'SubsId': {'S': '255_0_908765'}}}, {'Item': {'SubsId': {'S': '255_0_908766'}}}, {'Item': {'SubsId': {'S': '255_0_908767'}}}]
Then you can iterate over it and process it however you want, e.g.:
for item in jsonDict:
    print(item)
    # or upload to dynamodb
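If the string replacement ever feels brittle (it would break if '}{' appeared inside a value), here is a sketch of an alternative that walks the string with json.JSONDecoder.raw_decode, assuming the same back-to-back-objects format; the loop at the end is just illustrative:
import json

def iter_concatenated_json(content):
    """Yield each top-level object from a string of concatenated JSON objects."""
    decoder = json.JSONDecoder()
    idx = 0
    while idx < len(content):
        # skip any whitespace between objects
        while idx < len(content) and content[idx].isspace():
            idx += 1
        if idx >= len(content):
            break
        obj, idx = decoder.raw_decode(content, idx)
        yield obj

# e.g. inside lambda_handler:
# for item in iter_concatenated_json(content):
#     print(item)  # or write each object to DynamoDB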

Put Pandas Dataframe into influxDB

I would like to put this DataFrame into InfluxDB. Unfortunately I get the following error message: "ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
Is the trouble with these 2 lines?
tags = { 'tag1': df[['tag1']], 'tag2': df[['tag2']] }
client.write_points(df, 'test', tags = tags)
I do not understand why. Thank you for your help.
import argparse
import time

import pandas as pd
from influxdb import DataFrameClient  # influxdb-python client


def main(host='10.0.0.3', port=8086):
    """Instantiate the connection to the InfluxDB client."""
    user = ''
    password = ''
    dbname = 'ess_2'
    protocol = 'line'
    client = DataFrameClient(host, port, user, password, dbname)

    print("Create database: " + dbname)
    client.create_database(dbname)

    t = time.strftime('%Y-%m-%d %H:%M:%S')
    headers = ['time', 'tag1', 'tag2', 'field1', 'field2']
    data = [[t, 'MSFT', 'NYSE', 1.3, 2.5], [t, 'APPL', 'NYSE', 3.5, 4.24]]
    df = pd.DataFrame(data, columns=headers)
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time')

    tags = {'tag1': df[['tag1']], 'tag2': df[['tag2']]}  # DEBUG
    client.write_points(df, 'test', tags=tags)  # DEBUG

    print("Read DataFrame")
    client.query("select * from demo")


def parse_args():
    """Parse the args from main."""
    parser = argparse.ArgumentParser(
        description='example code to play with InfluxDB')
    parser.add_argument('--host', type=str, required=False,
                        default='10.0.0.3',
                        help='hostname of InfluxDB http API')
    parser.add_argument('--port', type=int, required=False, default=8086,
                        help='port of InfluxDB http API')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    main(host=args.host, port=args.port)
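Note: the likely cause is that tags is being given whole DataFrame slices (df[['tag1']]) where plain string values are expected, which is what triggers the ambiguous-truth-value check. With influxdb-python's DataFrameClient, per-row tags are normally taken from DataFrame columns via tag_columns. A rough sketch of that call, assuming the df built above (not verified against this exact setup):
# tags= applies one fixed value to every point; per-row tags come from
# the columns listed in tag_columns
client.write_points(
    df,
    'test',
    tag_columns=['tag1', 'tag2'],
    field_columns=['field1', 'field2'],
    protocol=protocol,
)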

python requests error check-mk API

I am trying to add dictionary data to our check-mk Web API with Python requests, but I keep getting an error about missing keys:
{"result": "Check_MK exception: Missing required key(s): aux_tags, tag_groups", "result_code": 1}
Here is my code:
import json
import requests

params_get = (
    ('action', 'get_hosttags'),
    ('_username', 'user'),
    ('_secret', 'secret'),
    ('request_format', 'json'),
)
params_set = (
    ('action', 'set_hosttags'),
    ('_username', 'user'),
    ('_secret', 'secret'),
    ('request_format', 'json'),
)
url = 'http://monitoringtest.local.domain/test/check_mk/webapi.py'
tags = ['tag1', 'tag2']

response_data = requests.get(url, params=params_get)
data = response_data.json()

new_data_tags = data['result']['tag_groups']
new_data = data['result']

# new_tags = []
for tag in tags:
    new_data['aux_tags'].append({'id': tag, 'title': 'tags in datacenter'})
    # new_tags.extend([{'aux_tags': [], 'id': tag, 'title': tag.upper() + ' Tag'}])

# all_tags = new_data_tags.extend([{'tags': new_tags, 'id': 'group1', 'title': 'tags in datacenter'}])

json.dump(data['result'], open("new_file", "w"))
response_data_new = requests.get(url, params=params_set, json=json.dumps(data['result']))
# response_data_new = requests.put(url, params=params_set)
# requests.post(url, params=params_set)
print(response_data_new.text)
# print(data['result'])
# json.dump(data['result'], open("new_file", "w"))
When I use curl, everything works well and I get a success message:
{"result": null, "result_code": 0}
Do you have any idea what causes the error?
Thanks
I found the mistake; it was just a lack of attention. The data variable contains two wrapper keys, result at the beginning and result_code at the end, which must not be sent along. I just had to modify the request as follows, sending the data with POST:
resp = requests.post(url, params=params_set, data={'request': json.dumps(data['result'])})
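In context, with the data fetched earlier in the script, the corrected ending looks roughly like this (a sketch using the same url, params and tags as above):
# fetch the current host tag config, modify it, then push back only the
# payload under 'result' (leaving out 'result_code') with set_hosttags via POST
response_data = requests.get(url, params=params_get)
data = response_data.json()

payload = data['result']
for tag in tags:
    payload['aux_tags'].append({'id': tag, 'title': 'tags in datacenter'})

resp = requests.post(url, params=params_set,
                     data={'request': json.dumps(payload)})
print(resp.text)  # expect {"result": null, "result_code": 0}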
Thanks @DeepSpace

Nexus 3 Query builder how to retrieve artefacts in descending order

Is there a way we can retrieve artefacts in date descending order?
I currently have the script below as an example:
import org.sonatype.nexus.repository.storage.Asset
import org.sonatype.nexus.repository.storage.Query
import org.sonatype.nexus.repository.storage.StorageFacet
import groovy.json.JsonOutput
import groovy.json.JsonSlurper

def request = new JsonSlurper().parseText(args)
assert request.groupId: 'groupId parameter is required'
assert request.repoName: 'repoName parameter is required'
assert request.startDate: 'startDate parameter is required, format: yyyy-mm-dd'

log.info("Gathering Asset list for repository: ${request.repoName} as of startDate: ${request.startDate}")

def repo = repository.repositoryManager.get(request.repoName)
StorageFacet storageFacet = repo.facet(StorageFacet)
def tx = storageFacet.txSupplier().get()
tx.begin()

Iterable<Asset> assets = tx.findAssets(Query.builder()
        .where('group = ').param(request.groupId)
        .and('last_updated > ').param(request.startDate)
        .build(), [repo])

def urls = assets.collect { "/repository/${repo.name}/${it.name()}" }
tx.commit()

def result = JsonOutput.toJson([
        assets  : urls,
        since   : request.startDate,
        repoName: request.repoName
])
return result
with:
Query.builder()
    .where('group = ').param(request.groupId)
    .and('last_updated > ').param(request.startDate)
    .build()
def urls = assets.collect { "/repository/${repo.name}/${it.name()}" }
Is there a way we can change the above script to retrieve artefacts in date-descending order?
You can simply add a suffix.
Query.builder()
    .where('group = ').param(request.groupId)
    .and('last_updated > ').param(request.startDate)
    .suffix('order by last_updated desc')
    .build()
def urls = assets.collect { "/repository/${repo.name}/${it.name()}" }
Nexus uses OrientDB behind the scenes. You can find query examples here:
https://orientdb.com/docs/2.0/orientdb.wiki/Tutorial-SQL.html
