AWS programmatic launch of a batch job - python-3.x

Hey, I have the following function to launch a batch job.
My batch job has two parameters to be passed in:
--source
--destination
def kickoff_transfer_batch(self, item):
    try:
        batch = boto3.client('batch')
        bucket, key = get_s3_bucket_and_key(item.source)
        jobName = 'transfer-' + key
        jobQueue = 'aws-strikeforce-on-demand-restore-prod'
        jobDefinition = 'aws-strikeforce-transfer-prod'
        source = '--source ' + item.source
        destination = '--destination ' + item.destination
        command = []
        command.append(source)
        command.append(destination)
        submit_job_response = batch.submit_job(
            jobName=jobName,
            jobQueue=jobQueue,
            jobDefinition=jobDefinition,
            containerOverrides={'command': command}
        )
        job_id = submit_job_response['jobId']
        print('Submitted job {} {} to the job queue {}'.format(jobName, job_id, jobQueue))
    except Exception as err:
        item.errored = True
        print("failed: " + item.source)
        print("error: " + str(err))
        stack_trace = traceback.format_exc()
        self._log_error_notes(item.source, err, stack_trace)
My job is launched from Batch, but the container fails to start because of how I am passing in --source and --destination.
Here is the error log:
main.py: error: unrecognized arguments: --source file_test.txt --destination file_test.txt
How can I fix my command list so the arguments are passed in properly?
When I launch the job at the command line I would just type:
--source file --dest file

The answer to this, for future reference:
def kickoff_transfer_batch(self, item):
    try:
        batch = boto3.client('batch')
        bucket, key = get_s3_bucket_and_key(item.source)
        jobName = 'transfer-' + key
        jobQueue = 'aws-strikeforce-on-demand-restore-prod'
        jobDefinition = 'aws-strikeforce-transfer-prod'
        # Each flag and value must become its own list element; note the space
        # before '--destination' so split() separates the arguments correctly.
        command = '--source ' + item.source + ' --destination ' + item.destination
        command = command.split()
        submit_job_response = batch.submit_job(
            jobName=jobName,
            jobQueue=jobQueue,
            jobDefinition=jobDefinition,
            containerOverrides={'command': command}
        )
        job_id = submit_job_response['jobId']
        print('Submitted job {} {} to the job queue {}'.format(jobName, job_id, jobQueue))
    except Exception as err:
        item.errored = True
        print("failed: " + item.source)
        print("error: " + str(err))
        stack_trace = traceback.format_exc()
        self._log_error_notes(item.source, err, stack_trace)
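For future readers: the split() approach only works because neither path contains spaces. A slightly safer variant (a sketch reusing the queue and job-definition names above; the helper and parameter names here are illustrative) builds the command list directly, one element per flag and per value:

import boto3

def submit_transfer_job(job_name, source, destination):
    """Submit the Batch job, passing each flag and each value as its own list element."""
    batch = boto3.client('batch')
    command = ['--source', source, '--destination', destination]
    response = batch.submit_job(
        jobName=job_name,
        jobQueue='aws-strikeforce-on-demand-restore-prod',
        jobDefinition='aws-strikeforce-transfer-prod',
        containerOverrides={'command': command},
    )
    return response['jobId']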

Related

My Python async script does not process requests quickly

I have developed a Python script that (in theory) converts WAV files to MP3 files. I'm trying to make it async (asyncio) to convert multiple files simultaneously and reduce the processing time,
but whether I convert 1 file or 10, the time spent is the same. I'm not very good at using async yet.
class SoundsProcessing:
    async def cconvert(self, sem, audioFileIndex, audioFile, aOutputFormat='mp3'):
        try:
            async with sem:
                inputfile = tempfile.NamedTemporaryFile()
                inputfile.write(audioFile)
                outputfile = tempfile.NamedTemporaryFile()
                AudioSegment.from_wav(inputfile.name).export(outputfile.name + '.' + aOutputFormat, format=aOutputFormat)
                inputfile.close()
                audio = await readAsyncFile(outputfile.name + '.' + aOutputFormat)
                self.audioFiles[audioFileIndex] = audio
                outputfile.close()
                logger.add('INFO', "Audio Files conversion: " + outputfile.name + " indexed " + str(audioFileIndex) + " is Done")
                return audio
        except Exception as e:
            logger.add('WARNING', "Audio Files conversion: " + str(e))
            return False

    async def audioConversion(self, aOutputAudioFormat='mp3'):
        tasks = []
        sem = asyncio.Semaphore(10)
        start_at = time.time()
        logger.add('INFO', "Audio Files conversion: Start session for " + str(len(self.audioFiles)) + " files/treatment")
        audioFileIndex = 0
        for audioFile in self.audioFiles:
            task = asyncio.ensure_future(self.cconvert(sem, audioFileIndex, audioFile, aOutputAudioFormat))
            tasks.append(task)
            audioFileIndex = audioFileIndex + 1
        responses = asyncio.gather(*tasks)
        await responses
        time_lapse = round(time.time() - start_at, 2)
        query_by_seconds = round(len(self.audioFiles) / time_lapse, 2)
        logger.add('INFO', "Audio Files conversion: End session on " + str(time_lapse) + " seconds (" + str(query_by_seconds) + "q/s)")

    def convert(self, aOutputAudioFormat='mp3'):
        self.results = {}
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        future = asyncio.ensure_future(self.audioConversion(aOutputAudioFormat))
        loop.run_until_complete(future)
You can rewrite the code correctly as "async" - and at first glance it is ok - but it won't be any faster: this is a CPU-bound task, that is, the process spends its time in the .from_wav call, and the asyncio loop is blocked until that call returns.
What you can try, if you have a multi-core machine, is to move the body of your cconvert method (the part inside the async with sem: block) into a synchronous function (forget the async file access), and run that in a ProcessPoolExecutor, using the loop.run_in_executor call and passing in an explicit concurrent.futures.ProcessPoolExecutor instance.
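A minimal sketch of that suggestion, assuming pydub is the conversion library (as in the question) and that input/output paths are already known; the function and variable names here are illustrative:

import asyncio
import concurrent.futures

from pydub import AudioSegment

def convert_one(in_path, out_path, out_format='mp3'):
    # Plain synchronous, CPU-bound work: this runs inside a worker process.
    AudioSegment.from_wav(in_path).export(out_path, format=out_format)
    return out_path

async def convert_all(pairs, out_format='mp3', workers=4):
    loop = asyncio.get_running_loop()
    with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as pool:
        tasks = [
            loop.run_in_executor(pool, convert_one, src, dst, out_format)
            for src, dst in pairs
        ]
        # gather keeps the event loop free while the worker processes convert in parallel
        return await asyncio.gather(*tasks)

# Example: asyncio.run(convert_all([('a.wav', 'a.mp3'), ('b.wav', 'b.mp3')]))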

"Error 'ResourceSummaryCollection' object is not subscriptable" OCI PythonSDK

I have a Python script that queries OCI compute instances with a defined tag and value of test, takes the results, and stops any instance with those tags. I am expecting the query to return the result as a JSON object according to this:
https://docs.oracle.com/en-us/iaas/Content/Search/Tasks/queryingresources.htm
However, whenever I take the result and assign it to a variable I get the following error:
"Error 'ResourceSummaryCollection' object is not subscriptable"
I noticed that the query response is not returned as a JSON object; it is returned as a list.
Update:
This is the error I am getting after updating my script: "Error 'list' object has no attribute 'identifier'". I was expecting the query to return a JSON object. When I try to convert the list into a JSON object via the json.dumps method, I get "object is not serializable". Bottom line: how do I get the OCID "Identifier" out of the query?
My function:
def do(signer):
    print("Searching for untagged instance", flush=True)
    # results = ""
    # message = ""
    # resp = ""
    try:
        search_client = oci.resource_search.ResourceSearchClient(config={}, signer=signer)
        print("Search client initialized", flush=True)
        PredefinedTag = "apps"
        key = "test"
        value = "test"
        structured_search = oci.resource_search.models.StructuredSearchDetails(
            query="query instance resources where (definedTags.namespace = '{}' && definedTags.key = '{}' && definedTags.value = '{}')".format(PredefinedTag, key, value),
            type='Structured',
            matching_context_type=oci.resource_search.models.SearchDetails.MATCHING_CONTEXT_TYPE_NONE)
        print("Step1", flush=True)
        # old
        results = search_client.search_resources(structured_search).data
        # print(results.items)
        # print(results.items.identifier)
        # print(results['items'][0]['identifier'])
        print("Step2", flush=True)
        instanceId = results(results.items.identifier)
        # instanceId = results['items'][0]['identifier']
        print("Step3", flush=True)
        resp = perform_action(signer, instanceId, 'STOP')
        print("Step4", flush=True)
    except oci.exceptions.ServiceError as e:
        print('RQS Search failed with Service Error: {0}'.format(e), flush=True)
        raise
    except oci.exceptions.RequestException as e:
        print('RQS Search failed w/ a Request exception. {0}'.format(e), flush=True)
        raise
    return resp
Was able to reference the response from the query:
def do(signer):
    print("Searching for untagged instance", flush=True)
    try:
        search_client = oci.resource_search.ResourceSearchClient(config={}, signer=signer)
        print("Search client initialized", flush=True)
        PredefinedTag = "apps"
        key = "test"
        value = "test"
        structured_search = oci.resource_search.models.StructuredSearchDetails(
            query="query instance resources where (definedTags.namespace = '{}' && definedTags.key = '{}' && definedTags.value = '{}')".format(PredefinedTag, key, value),
            type='Structured',
            matching_context_type=oci.resource_search.models.SearchDetails.MATCHING_CONTEXT_TYPE_NONE)
        print("Step1", flush=True)
        results = search_client.search_resources(structured_search)
        for result in results.data.items:
            print(result.identifier + " has availability tag checking status")
            print(result.identifier + " status is " + result.lifecycle_state)
            instanceId = result.identifier
            resp = perform_action(signer, instanceId, 'START')
    except oci.exceptions.ServiceError as e:
        print('RQS Search failed with Service Error: {0}'.format(e), flush=True)
        raise
    except oci.exceptions.RequestException as e:
        print('RQS Search failed w/ a Request exception. {0}'.format(e), flush=True)
        raise
    return resp
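If several instances can match the query, a small variation (a sketch using only the attributes shown above) collects the OCIDs first and then acts on each one:

results = search_client.search_resources(structured_search)
# Collect every matching OCID first, then act on each one
instance_ids = [item.identifier for item in results.data.items]
for instance_id in instance_ids:
    resp = perform_action(signer, instance_id, 'STOP')

If a plain JSON view of the response is ever needed, the SDK's oci.util.to_dict helper (check that your SDK version includes it) converts the model objects into dictionaries that json.dumps can serialize.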

Is there a way to keep track of all the bad records that are allowed while loading an ndjson file into BigQuery

I have a requirement where I need to keep track of all the bad records that were not fed into BigQuery after allowing max_bad_records, and I need them written to a file on storage for future reference. I'm using the BQ API for Python - is there a way to achieve this? I think if we allow max_bad_records we don't have the details of the failed loads in the BQ load job.
Thanks
Currently, there isn't a direct way of accessing and saving the bad records. However, you can access some job statistics, including the reason why each record was skipped, through the load job's _job_statistics() in the BigQuery Python client.
I have created an example, in order to demonstrate how the statistics will be shown. I have the following sample .csv file in a GCS bucket:
name,age
robert,25
felix,23
john,john
As you can see, the last row is a bad record, because I will import age as INT64 and there is a string in that row. In addition, I used the following code to upload it to BigQuery:
from google.cloud import bigquery

client = bigquery.Client()
table_ref = client.dataset('dataset').table('table_name')
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("age", "INT64"),
    ]
)
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
job_config.skip_leading_rows = 1
job_config.max_bad_records = 5
# job_config.autodetect = True
# The source format defaults to CSV, so the line below is optional.
job_config.source_format = bigquery.SourceFormat.CSV
uri = "gs://path/file.csv"

load_job = client.load_table_from_uri(
    uri, table_ref, job_config=job_config
)  # API request
print("Starting job {}".format(load_job.job_id))

load_job.result()  # Waits for table load to complete.
print("Job finished.")

destination_table = client.get_table(table_ref)
print("Loaded {} rows.".format(destination_table.num_rows))

# Below, all the statistics that might be useful in your case
job_state = load_job.state
job_id = load_job.job_id
error_result = load_job.error_result
job_statistics = load_job._job_statistics()
badRecords = job_statistics['badRecords']
outputRows = job_statistics['outputRows']
inputFiles = job_statistics['inputFiles']
inputFileBytes = job_statistics['inputFileBytes']
outputBytes = job_statistics['outputBytes']

print("***************************** ")
print(" job_state: " + str(job_state))
print(" non fatal errors: " + str(load_job.errors))
print(" error_result: " + str(error_result))
print(" job_id: " + str(job_id))
print(" badRecords: " + str(badRecords))
print(" outputRows: " + str(outputRows))
print(" inputFiles: " + str(inputFiles))
print(" inputFileBytes: " + str(inputFileBytes))
print(" outputBytes: " + str(outputBytes))
print(" ***************************** ")
print("------ load_job.errors ")
The output from the statistics:
*****************************
job_state: DONE
non fatal errors: [{u'reason': u'invalid', u'message': u"Error while reading data, error message: Could not parse 'john' as INT64 for field age (position 1) starting at location 23", u'location': u'gs://path/file.csv'}]
error_result: None
job_id: b2b63e39-a5fb-47df-b12b-41a835f5cf5a
badRecords: 1
outputRows: 2
inputFiles: 1
inputFileBytes: 33
outputBytes: 26
*****************************
As shown above, the errors field returns the non-fatal errors, which include the bad records; in other words, it retrieves the individual errors generated by the job, whereas error_result returns the error information for the job as a whole.
I believe these statistics might help you analyse your bad records. Lastly, you can output them to a file, using write(), such as:
with open("errors.txt", "x") as f:
    # load_job.errors is a list of dicts (or None), so convert it to a string before
    # writing; the with block closes the file automatically, so no explicit close() is needed.
    f.write(str(load_job.errors))
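Since the requirement is to keep the bad-record details on storage for future reference, the same errors list can also be serialized and uploaded to a GCS object. A minimal sketch (bucket name and object path are placeholders):

import json

from google.cloud import storage

def save_load_errors(load_job, bucket_name, object_path):
    """Serialize the load job's non-fatal errors and upload them to a GCS object."""
    payload = json.dumps(load_job.errors or [], indent=2, default=str)
    blob = storage.Client().bucket(bucket_name).blob(object_path)
    blob.upload_from_string(payload, content_type="application/json")

# Example: save_load_errors(load_job, "my-bucket", "bq_load_errors/file_csv_errors.json")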

Python subprocess Popen not outputting sqlplus error strings

When I run sqlplus from Python using subprocess, I get no output when there are SQL errors, or when update or insert statements return the number of rows updated or inserted. When I run select statements with no errors, I do get the output.
Here is my code:
This creates a string with newlines that is then written to the process via process.stdin.write():
def write_sql_string(process, args):
    sql_commands = ''
    sql_commands += "WHENEVER SQLERROR EXIT SQL.SQLCODE;\n"
    sql_line = '@' + args.sql_file_name
    if crs_debug:
        print('DEBUG: ' + 'sys.argv', ' '.join(sys.argv))
    if len(args.sql_args) > 0:
        len_argv = len(args.sql_args)
        sql_commands += "SET VERIFY OFF\n"
        for i in range(0, len_argv):
            sql_line += ' "' + args.sql_args[i] + '"'
    sql_commands += sql_line + "\n"
    # if prod_env:
    sql_commands += "exit;\n"
    if crs_debug:
        print('DEBUG: ' + 'sql_line: ' + sql_line)
    process.stdin.write(sql_commands)
This code executes the SQL commands
def execute_sql_file(username, dbpass, args):
    db_conn_str = username + '/' + dbpass + '@' + args.dbname
    # '-S' - Silent
    sqlplus_cmd = ['sqlplus', '-S', '-L', db_conn_str]
    if crs_debug:
        print('DEBUG: ' + ' '.join(sqlplus_cmd))
    process = subprocess.Popen(sqlplus_cmd,
                               stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, text=True)
    write_sql_string(process, args)
    stdout, stderr = process.communicate()
    # Get return code of sql query
    stdout_lines = stdout.split("\n")
    print('STDOUT')
    for line in stdout_lines:
        line = line.rstrip()
        print(line)
    stderr_lines = stderr.split("\n")
    print('STDERR')
    for line in stderr_lines:
        line = line.rstrip()
        print(line)
    sqlplus_rc = process.poll()
    # Check if sqlplus returned an error
    if sqlplus_rc != 0:
        print("FAILURE in " + script_name + " in connecting to Oracle exit code: " + str(sqlplus_rc))
        print(stderr)
        sys.exit(sqlplus_rc)
When I run my code for a SQL file that requires parameters and the parameters are missing, I get no output. If I run it with the parameters supplied, I get the correct output.
Here is an example SQL file sel_dual.sql:
SELECT 'THIS IS TEXT &1 &2' FROM dual;
As an example command line:
run_sql_file.py dbname sql_file [arg1]...[argn]
If I run the script with
run_sql_file.py dbname sel_dual.sql
I get no output, even though it should ask for a parameter and give other error output.
If I run the script with
run_sql_file.py dbname sel_dual.sql Seth F
I get the proper output:
'THISISTEXTSETHF'
----------------------------------------------------------------------------
THIS IS TEXT Seth F
The args referred to are the result of processing the command line with the argparse module:
parser = argparse.ArgumentParser(description='Run a SQL file with optional arguments using SQLPlus')
parser.add_argument('dbname', help='db (environment) name')
parser.add_argument('sql_file_name', help='sql file')
parser.add_argument('sql_args', nargs='*', help='arguments for sql file')
args = parser.parse_args()
Does anybody know what could be causing this? I've omitted the rest of the script since it basically gets command arguments and validates that the SQL file exists.
I am running sqlplus version Release 12.1.0.2.0 Production. I am running Python version 3.7.6. I am running on Linux (not sure what version). The kernel release is 4.1.12-124.28.5.el7uek.x86_64.
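One thing worth checking in a setup like this: the subprocess documentation recommends passing stdin data through communicate() rather than writing to process.stdin separately, which rules out buffering or deadlock effects hiding the error output. A minimal sketch of that pattern (connection string and script invocation are placeholders):

import subprocess

def run_sqlplus(conn_str, sql_text):
    """Run sqlplus, feeding the SQL through communicate() and capturing both streams."""
    process = subprocess.Popen(
        ['sqlplus', '-S', '-L', conn_str],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        stderr=subprocess.PIPE, text=True)
    stdout, stderr = process.communicate(input=sql_text)
    return process.returncode, stdout, stderr

# Example:
# rc, out, err = run_sqlplus(
#     'user/password@dbname',
#     'WHENEVER SQLERROR EXIT SQL.SQLCODE;\n@sel_dual.sql "Seth" "F"\nexit;\n')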

Boto3/Lambda - Join multiple outputs from a loop and send in one email using AWS SNS

New to Python/Boto3, this should be an easy one but still learning :)
I have a Lambda function which creates a number of snapshots and works fine:
def create_snapshot():
    volumes = ec2_client.describe_volumes(
        Filters=[
            {'N'...
    ...
    for volume in volumes...
        ....
        snap_name = 'Backup of ' + snap_desc
        ....
        snap = ec2_client.create_snapshot(
            VolumeId=vol_id,
            Description=snap_desc
        )
I then want to receive an email from AWS SNS to let me know which snapshots the function created, which I do using:
message = sns.publish(
    TopicArn=SNSARN,
    Subject=("Function executed"),
    Message=("%s created" % snap_name)
)
The issue is that this creates an email for each snapshot, instead of one email listing all the snapshots. Should I create another function that collects all the values produced by snap_desc, or can I send all the values for snap_desc from within this function? And most importantly, what's the best way of doing this?
Cheers!
Scott
####################### UPDATE (Thanks @omuthu) #######################
I set up a list outside the loop, appended to it inside the loop, and put the resulting string into the message. This produced the following being sent in one message:
The following snapshots have been created:
['vol-0e0b9a5dfb8379fc0 (Instance 1 - /dev/sda1)', 'vol-03aac6b65df64661e (Instance 4 - /dev/sda1)', 'vol-0fdde765dfg452631 (Instance 2 - /dev/sda1)', 'vol-0693a9568b11f625f (Instance 3 - /dev/sda1)', etc.
Okay got it sorted, finally!
def create_snapshot():
    volumes = ec2_client.describe_volumes(
        Filters=[
            {'N'...
    ...
    inst_list = []
    for volume in volumes...
        vol_id = volume['VolumeId']
        ....
        snap_desc = vol_id
        for name in volume['Tags']:
            tag_key = name['Key']
            tag_val = name['Value']
            if tag_key == 'Name':
                snap_desc = vol_id + ' (' + tag_val + ')'
        ....
        ....
        ....
        if backup_mod is False or (current_hour + 10) % backup_mod != 0:
            ...
            continue
        else:
            print("%s is scheduled this hour" % vol_id)
            for name in volume['Tags']:
                inst_tag_key = name['Key']
                inst_tag_val = name['Value']
                if inst_tag_key == 'Name':
                    inst_list.append(inst_tag_val)
            snap = ec2_client.create_snapshot(
                VolumeId=vol_id,
                Description=snap_desc,
            )
            print("%s created" % snap['SnapshotId'])
    msg = str("\n".join(inst_list))
    if len(inst_list) != 0:
        message = sns.publish(
            TopicArn=SNSARN,
            Subject=("Daily Lambda snapshot function complete"),
            Message=("The following snapshots have been created today:\n\n" + msg + "\n")
        )
        print("Response: {}".format(message))
