Lambda/boto3/python loop - python-3.x

This code acts as an early warning system for ADFS failures, and it works fine when run locally. The problem is that when I run it in Lambda, it loops non-stop.
In short:
lambda_handler() runs pagecheck()
pagecheck() produces the info needed then passes 2 lists (msgdet_list, error_list) and an int (error_count) to notification().
notification() collates and prints the output. The output is two key variables (notificationheader and notificationbody).
I've commented out the SNS piece, which would usually email the info, and am using print() instead to send the info to CloudWatch Logs until I can get the loop sorted. Logs:
CloudWatch logs
If I run this locally, it produces a clean single output. In Lambda, the function will loop until it times out. It's almost like every time the lists are updated, they're passed to the notification() module and it's run. I can limit the function time, but would rather fix the code!
Cheers,
tac
# This python/boto3/lambda script sends a request to an Office 365 landing page, parses return details to confirm a successful redirect to
# the organisation ADFS homepage, authenticates the homepage is correct, raises any errors, and sends a consolidated report to
# an AWS SNS topic.
# Run once to produce pageserver and htmlchar values for global variables.
# Import required modules
import boto3
import urllib.request
from urllib.request import Request, urlopen
from datetime import datetime
import time
import re
import sys
# Global variables to be set
url = "https://outlook.com/CONTOSSO.com"
adfslink = "https://sts.CONTOSSO.com/adfs/ls/?client-request-id="
# Input after first run
pageserver = "Microsoft-HTTPAPI/2.0 Microsoft-HTTPAPI/2.0"
htmlchar = 18600
# Input AWS SNS ARN
snsarn = 'arn:aws:sns:ap-southeast-2:XXXXXXXXXXXXX:Daily_Check_Notifications_CONTOSSO'
sns = boto3.client('sns')
def pagecheck():
    # Present the request to the webpage as if coming from a user in a browser
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    values = {'name' : 'user'}
    headers = { 'User-Agent' : user_agent }
    data = urllib.parse.urlencode(values)
    data = data.encode('ascii')
    # "Null" the Message Detail and Error lists
    msgdet_list = []
    error_list = []
    request = Request(url)
    req = urllib.request.Request(url, data, headers)
    response = urlopen(request)
    with urllib.request.urlopen(request) as response:
        # Get the URL. This gets the real URL.
        acturl = response.geturl()
        msgdet_list.append("\nThe Actual URL is:")
        msgdet_list.append(str(acturl))
        if adfslink not in acturl:
            error_list.append(str("Redirect Fail"))
        # Get the HTTP response code
        httpcode = response.code
        msgdet_list.append("\nThe HTTP code is: ")
        msgdet_list.append(str(httpcode))
        if httpcode//200 != 1:
            error_list.append(str("No HTTP 2XX Code"))
        # Get the Headers as a dictionary-like object
        headers = response.info()
        msgdet_list.append("\nThe Headers are:")
        msgdet_list.append(str(headers))
        if response.info() == "":
            error_list.append(str("Header Error"))
        # Get the date of request and compare to UTC (DD MMM YYYY HH MM)
        date = response.info()['date']
        msgdet_list.append("The Date is: ")
        msgdet_list.append(str(date))
        returndate = str(date.split( )[1:5])
        returndate = re.sub(r'[^\w\s]','',returndate)
        returndate = returndate[:-2]
        currentdate = datetime.utcnow()
        currentdate = currentdate.strftime("%d %b %Y %H%M")
        if returndate != currentdate:
            date_error = ("Date Error. Returned Date: ", returndate, "Expected Date: ", currentdate, "Times in UTC (DD MMM YYYY HH MM)")
            date_error = str(date_error)
            date_error = re.sub(r'[^\w\s]','',date_error)
            error_list.append(str(date_error))
        # Get the server
        headerserver = response.info()['server']
        msgdet_list.append("\nThe Server is: ")
        msgdet_list.append(str(headerserver))
        if pageserver not in headerserver:
            error_list.append(str("Server Error"))
        # Get all HTML data and confirm no major change to content size by character length (global var: htmlchar).
        html = response.read()
        htmllength = len(html)
        msgdet_list.append("\nHTML Length is: ")
        msgdet_list.append(str(htmllength))
        msgdet_list.append("\nThe Full HTML is: ")
        msgdet_list.append(str(html))
        msgdet_list.append("\n")
        if htmllength // htmlchar != 1:
            error_list.append(str("Page HTML Error - incorrect # of characters"))
        if adfslink not in str(acturl):
            error_list.append(str("ADFS Link Error"))
    error_list.append("\n")
    error_count = len(error_list)
    if error_count == 1:
        error_list.insert(0, 'No Errors Found.')
    elif error_count == 2:
        error_list.insert(0, 'Error Found:')
    else:
        error_list.insert(0, 'Multiple Errors Found:')
    # Pass completed results and data to the notification() module
    notification(msgdet_list, error_list, error_count)
# Use AWS SNS to create a notification email with the additional data generated
def notification(msgdet_list, error_list, errors):
    datacheck = str("\n".join(msgdet_list))
    errorcheck = str("\n".join(error_list))
    notificationbody = str(errorcheck + datacheck)
    if errors > 1:
        result = 'FAILED!'
    else:
        result = 'passed.'
    notificationheader = ('The daily ADFS check has been marked as ' + result + ' ' + str(errors) + ' ' + str(error_list))
    if result != 'passed.':
        # message = sns.publish(
        #     TopicArn = snsarn,
        #     Subject = notificationheader,
        #     Message = notificationbody
        # )
        # Output result to CloudWatch logstream
        print('Response: ' + notificationheader)
    else:
        print('passed')
    sys.exit()
# Trigger the Lambda handler
def lambda_handler(event, context):
    aws_account_ids = [context.invoked_function_arn.split(":")[4]]
    pagecheck()
    return "Successful"
    sys.exit()

Your CloudWatch logs contain the following error message:
Process exited before completing request
This is caused by invoking sys.exit() in your code. Locally your Python interpreter will just terminate when encountering such a sys.exit().
AWS Lambda, on the other hand, expects a Python function to just return, and treats sys.exit() as an error. As your function probably got invoked asynchronously, AWS Lambda retries the execution twice.
To solve your problem, you can replace the occurrences of sys.exit() with return or, even better, just remove the sys.exit() calls, as there are already implicit returns in the places where you use sys.exit().
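For illustration, a minimal sketch of how the tail end of the posted code could look with the sys.exit() calls removed (everything else unchanged): notification() simply returns to pagecheck(), and the handler's return value is all Lambda needs.

def notification(msgdet_list, error_list, errors):
    datacheck = "\n".join(msgdet_list)
    errorcheck = "\n".join(error_list)
    notificationbody = errorcheck + datacheck
    result = 'FAILED!' if errors > 1 else 'passed.'
    notificationheader = ('The daily ADFS check has been marked as ' + result + ' ' + str(errors) + ' ' + str(error_list))
    if result != 'passed.':
        print('Response: ' + notificationheader)
    else:
        print('passed')
    # no sys.exit() here; the function just falls off the end

def lambda_handler(event, context):
    pagecheck()
    return "Successful"  # no sys.exit() after the return; Lambda ends the invocation cleanly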

Related

Using ctrader-fix to download historical data from cTrader

I am using the Python package ctrader-fix (https://pypi.org/project/ctrader-fix/) to download historical price data from cTrader's API (https://help.ctrader.com/fix/).
The code does not make clear to me where exactly I declare the symbol for which I request historical data (e.g. 'NatGas', whose FIX SymbolID code number is 10055), nor where I specify the timeframe I am interested in (e.g. 'H' for hourly data) and the number of records I want to retrieve.
section of ctrader where the FIX SymbolID number of 'NatGas' is provided
The code that is provided is the following (I have filled in the values except for the username).
config = {
    'Host': '',
    'Port': 5201,
    'SSL': False,
    'Username': '****************',
    'Password': '3672075',
    'BeginString': 'FIX.4.4',
    'SenderCompID': 'demo.pepperstoneuk.3672025',
    'SenderSubID': 'QUOTE',
    'TargetCompID': 'cServer',
    'TargetSubID': 'QUOTE',
    'HeartBeat': '30'
}

client = Client(config["Host"], config["Port"], ssl = config["SSL"])

def send(request):
    diferred = client.send(request)
    diferred.addCallback(lambda _: print("\nSent: ", request.getMessage(client.getMessageSequenceNumber()).replace("\u0001", "|")))

def onMessageReceived(client, responseMessage): # Callback for receiving all messages
    print("\nReceived: ", responseMessage.getMessage().replace("\u0001", "|"))
    # We get the message type field value
    messageType = responseMessage.getFieldValue(35)
    # we send a security list request after we received logon message response
    if messageType == "A":
        securityListRequest = SecurityListRequest(config)
        securityListRequest.SecurityReqID = "A"
        securityListRequest.SecurityListRequestType = 0
        send(securityListRequest)
    # After receiving the security list we send a market order request by using the security list first symbol
    elif messageType == "y":
        # We use getFieldValue to get all symbol IDs, it will return a list in this case
        # because the symbol ID field is repetitive
        symboldIds = responseMessage.getFieldValue(55)
        if config["TargetSubID"] == "TRADE":
            newOrderSingle = NewOrderSingle(config)
            newOrderSingle.ClOrdID = "B"
            newOrderSingle.Symbol = symboldIds[1]
            newOrderSingle.Side = 1
            newOrderSingle.OrderQty = 1000
            newOrderSingle.OrdType = 1
            newOrderSingle.Designation = "From Jupyter"
            send(newOrderSingle)
        else:
            marketDataRequest = MarketDataRequest(config)
            marketDataRequest.MDReqID = "a"
            marketDataRequest.SubscriptionRequestType = 1
            marketDataRequest.MarketDepth = 1
            marketDataRequest.NoMDEntryTypes = 1
            marketDataRequest.MDEntryType = 0
            marketDataRequest.NoRelatedSym = 1
            marketDataRequest.Symbol = symboldIds[1]
            send(marketDataRequest)
    # after receiving the new order request response we stop the reactor
    # And we will be disconnected from API
    elif messageType == "8" or messageType == "j":
        print("We are done, stopping the reactor")
        reactor.stop()

def disconnected(client, reason): # Callback for client disconnection
    print("\nDisconnected, reason: ", reason)

def connected(client): # Callback for client connection
    print("Connected")
    logonRequest = LogonRequest(config)
    send(logonRequest)

# Setting client callbacks
client.setConnectedCallback(connected)
client.setDisconnectedCallback(disconnected)
client.setMessageReceivedCallback(onMessageReceived)
# Starting the client service
client.startService()
# Run Twisted reactor, we imported it earlier
reactor.run()
Can you explain the code to me and provide instructions on how to get, for example, hourly data for NatGas (1,000 observations)?
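Not from the package documentation, just an observation on the posted code: the only place a symbol is chosen is the Symbol field of the request objects, which the sample fills from the security list (symboldIds[1]). If the FIX SymbolID for 'NatGas' is 10055, one could, hypothetically, target it directly inside the messageType == "y" branch:

# Hypothetical variant of the market-data branch; '10055' is the asker's NatGas SymbolID
NATGAS_SYMBOL_ID = "10055"

marketDataRequest = MarketDataRequest(config)
marketDataRequest.MDReqID = "a"
marketDataRequest.SubscriptionRequestType = 1   # snapshot plus live updates, not history
marketDataRequest.MarketDepth = 1
marketDataRequest.NoMDEntryTypes = 1
marketDataRequest.MDEntryType = 0
marketDataRequest.NoRelatedSym = 1
marketDataRequest.Symbol = NATGAS_SYMBOL_ID     # select the instrument by its FIX SymbolID
send(marketDataRequest)

Note that none of the fields shown carry a timeframe or a record count; this request streams current quotes, so hourly bars (e.g. 1,000 of them) would likely have to be aggregated client-side or fetched from a different cTrader endpoint.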

How to get the list of followers from an Instagram account without getting banned?

I am trying to scrape all the followers of some particular Instagram accounts. I am using Python 3.8.3 and the latest version of Instaloader library. The code I have written is given below:
# Import the required libraries:
import instaloader
import time
from random import randint

# Start time:
start = time.time()

# Create an instance of instaloader:
loader = instaloader.Instaloader()

# Credentials & target account:
user_id = USERID
password = PASSWORD
target = TARGET # Account of which the list of followers need to be scraped;

# Login or load the session:
loader.login(user_id, password)

# Obtain the profile metadata of the target:
profile = instaloader.Profile.from_username(loader.context, target)

# Print the list of followers and save it in a text file:
try:
    # The list to store the collected user handles of the followers:
    followers_list = []
    # Variables used to apply pauses to slow down scraping:
    count = 0
    short_counter = 1
    short_pauser = randint(19, 24)
    long_counter = 1
    long_pauser = randint(4900, 5000)
    # Fetch the followers one by one:
    for follower in profile.get_followers():
        sleeper = randint(840, 1020)
        # Short pause for the process:
        if (short_counter % short_pauser == 0):
            short_counter = 0
            short_pauser = randint(19, 24)
            print('\nShort Pause.\n')
            time.sleep(1)
        # Long pause for the process:
        if (long_counter % long_pauser == 0):
            long_counter = 0
            long_pauser = randint(4900, 5000)
            print('\nLong pause.\n')
            time.sleep(sleeper)
        # Append the list and print the follower's user handle:
        followers_list.append(follower.username)
        print(count,'', followers_list[count])
        # Increment the counters accordingly:
        count = count + 1
        short_counter = short_counter + 1
        long_counter = long_counter + 1
    # Store the followers list in a txt file:
    txt_file = target + '.txt'
    with open(txt_file, 'a+') as f:
        for the_follower in followers_list:
            f.write(the_follower)
            f.write('\n')
except Exception as e:
    print(e)

# End time:
end = time.time()
total_time = end - start

# Print the time taken for execution:
print('Time taken for complete execution:', total_time,'s.')
I am getting the following error after scraping some data:
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
400 Bad Request
In fact, the error occurs when Instagram detects unusual activity, disables the account for a while, and prompts the user to change the password.
I have tried -
(1) Slowing down the process of scraping.
(2) Adding pauses in between in order to make the program more human-like.
Still, no progress.
How to bypass such restrictions and get the complete list of all the followers?
If getting the entire list is not possible, what is the best way to get a list of at least 20,000 followers (from multiple accounts) without getting banned, having the account disabled, or facing similar inconveniences?
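Not a way around Instagram's detection, but one pattern that at least avoids repeated logins and lost progress is to reuse a saved Instaloader session and append each handle to disk as it is fetched, so an interrupted run can skip what it already has. A rough sketch (file names and the resume logic are illustrative assumptions, not Instaloader requirements; the follower iterator itself still restarts from the beginning):

import instaloader

USER_ID = 'my_login'          # hypothetical credentials and target
PASSWORD = 'my_password'
TARGET = 'target_account'

loader = instaloader.Instaloader()
try:
    # Reuse a previously saved session instead of logging in on every run.
    loader.load_session_from_file(USER_ID)
except FileNotFoundError:
    loader.login(USER_ID, PASSWORD)
    loader.save_session_to_file()

# Handles collected in earlier runs, so they are not written twice.
out_file = TARGET + '.txt'
try:
    with open(out_file) as f:
        already_seen = set(line.strip() for line in f)
except FileNotFoundError:
    already_seen = set()

profile = instaloader.Profile.from_username(loader.context, TARGET)
with open(out_file, 'a') as f:
    for follower in profile.get_followers():
        if follower.username in already_seen:
            continue
        f.write(follower.username + '\n')
        f.flush()  # persist immediately so progress survives a 400 / forced logout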

Flurry CSV Download (API) Event Count Mismatch

I was testing a CSV download of my app events using the API.
I noticed that the CSV had different event counts for different calls for the same time period.
All data (for each download) was correct for my app and the requested time periods.
Does anyone know if they sample the data to create the file for download?
Edited to include sample call, code for extraction, and result for 2 calls for the same time period.
Call
str_init = '20191101'
str_end = '20191102'
# Call data extraction for Flurry from IOS app
get_csv_from_flurry(str_init, str_end, 'IOS')
Code for Extraction
from datetime import datetime
from dateutil import parser
import requests
import json
import time
from functions.ribon_path import ribon_root_path_join
from functions.ribon_s3_integration import ribon_upload_to_s3

"""
Make CSV extraction from flurry based on initial date (yyyy-mm-dd), end date (yyyy-mm-dd) and platform
Save Uncompressed CSV locally for processing
Save compressed file (parquet) to S3 for backup
"""
def get_csv_from_flurry(str_ini, str_end, str_platform):
    # Convert time period to datetime format
    dt_ini = parser.parse(str_ini)
    dt_end = parser.parse(str_end)

    def unix_time_millis(dt):
        # Convert date periods to unix millisecond epoch
        epoch = datetime.utcfromtimestamp(0)
        return (dt - epoch).total_seconds() * 1000.0

    epoch_ini = unix_time_millis(dt_ini)
    epoch_end = unix_time_millis(dt_end)
    #print(epoch_ini)
    #print(epoch_fim)

    if str_platform == 'IOS':
        Flurry_apiKey = 'XXX'
    else:
        Flurry_apiKey = 'YYY'

    # Build the parameters of the post request to the flurry API
    url = 'https://rawdata.flurry.com/pulse/v1/rawData'
    payload = {"data": {
        "type": "rawData",
        "attributes": {
            "startTime": epoch_ini,
            "endTime": epoch_end,
            "outputFormat": "CSV",
            "apiKey": Flurry_apiKey
            }
        }
    }
    headers = {"accept": "application/vnd.api+json",
               "authorization": "Bearer ZZZ",
               "cache-control": "no-cache",
               "content-type": "application/vnd.api+json"
               }
    #print(payload)

    # Make the request
    print('Make Request to Flurry')
    r = requests.post(url, data=json.dumps(payload), headers=headers)
    #print(r.content)

    # Test the return, get the status, download url and request id
    test = r.json()
    #print(test['data']['attributes']['s3URI'])
    #print(test['data']['id'])
    r_s3URI = test['data']['attributes']['s3URI']
    r_id = test['data']['id']

    # Check if the download link is ready
    url = 'https://rawdata.flurry.com/pulse/v1/rawData/' + r_id + '?fields[rawData]=requestStatus,s3URI'
    #print(url)
    payload = {}
    headers = {"accept": "application/vnd.api+json",
               "authorization": "Bearer ZZZ",
               "cache-control": "no-cache",
               "content-type": "application/vnd.api+json"
               }
    print('Request OK')

    # Check each minute if the download link is ready
    print('Start Polling to Check if the File is Ready for Download')
    while r_s3URI == None:
        time.sleep(60)
        # Make the request
        r = requests.get(url, data=json.dumps(payload), headers=headers)
        print(r.content)
        test = r.json()
        #print(test['data']['attributes']['s3URI'])
        r_s3URI = test['data']['attributes']['s3URI']

    # When the download is ready, get the file and save
    # Set local folder to save file
    flurry_filename = str_ini + '_' + str_end + '_' + str_platform + '.csv.gz'
    flurry_path_gz = ribon_root_path_join('data', 'Flurry_Download', flurry_filename)

    # Download the file
    print('Start Flurry Download')
    myfile = requests.get(r_s3URI)
    open(flurry_path_gz, 'wb').write(myfile.content)
On the link there is an image with the 2 files I got; they are not the same size and don't have the same number of records.
With help from Flurry Support, I found out the reason for the differences.
For API downloads of data older than 15 days, the API calls give the same numbers every time.
API calls for dates within the last 15 days usually get different results (newer calls return more records). The older the requested period, the smaller the difference, so I agree with support that this can be accounted for by late-arriving events.
Flurry is not real-time; it works by queuing data on the mobile device and uploading it to the server later.
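The 15-day settling window above is the asker's empirical observation, not a documented Flurry guarantee, but if you want to act on it, a small guard before comparing exports could look like this (the helper name and threshold are illustrative):

from datetime import datetime, timedelta
from dateutil import parser

def counts_are_stable(str_end, settle_days=15):
    # True if the requested end date is old enough that late-arriving events
    # should no longer change the export, per the observation above.
    dt_end = parser.parse(str_end)
    return dt_end < datetime.utcnow() - timedelta(days=settle_days)

# Example: warn before treating two downloads of a recent window as comparable.
if not counts_are_stable('20191102'):
    print('Warning: window is inside the settle period; event counts may still grow.')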

How can I return a string from a Google BigQuery row iterator object?

My task is to write a Python script that takes results from BigQuery and emails them out. I've written code that can successfully send an email, but I am having trouble including the results of the BigQuery query in the actual email. The query results are correct, but the object I am returning from the query (results) always comes back as NoneType.
For example, the email should look like this:
Hello,
You have the following issues that have been "open" for more than 7 days:
-List issues here from bigquery code
Thanks.
The code reads in contacts from a contacts.txt file, and it reads in the email message template from a message.txt file. I tried to make the bigquery object into a string, but it still results in an error.
from google.cloud import bigquery
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from string import Template

def query_emailtest():
    client = bigquery.Client(project=("analytics-merch-svcs-thd"))
    query_job = client.query("""
        select dept, project_name, reset, tier, project_status, IssueStatus, division, store_number, top_category,
        DATE_DIFF(CURRENT_DATE(), in_review, DAY) as days_in_review
        from `analytics-merch-svcs-thd.MPC.RESET_DETAILS`
        where in_review IS NOT NULL
        AND IssueStatus = "In Review"
        AND DATE_DIFF(CURRENT_DATE(), in_review, DAY) > 7
        AND ready_for_execution IS NULL
        AND project_status = "Active"
        AND program_name <> "Capital"
        AND program_name <> "SSI - Capital"
        LIMIT 50
    """)
    results = query_job.result() # Waits for job to complete.
    return results #THIS IS A NONETYPE

def get_queryresults(results): #created new method to put query results into a for loop and store it in a variable
    for i, row in enumerate(results, 1):
        bq_data = (i, '. ' + str(row.dept) + " " + row.project_name + ", Reset #: " + str(row.reset) + ", Store #: " + str(row.store_number) + ", " + row.IssueStatus + " for " + str(row.days_in_review) + " days")
        print(bq_data)

def get_contacts(filename):
    names = []
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            names.append(a_contact.split()[0])
            emails.append(a_contact.split()[1])
    return names, emails

def read_template(filename):
    with open(filename, 'r', encoding='utf-8') as template_file:
        template_file_content = template_file.read()
    return Template(template_file_content)

names, emails = get_contacts('mycontacts.txt') # read contacts
message_template = read_template('message.txt')
results = query_emailtest()
bq_results = get_queryresults(query_emailtest())

import smtplib

# set up the SMTP server
s = smtplib.SMTP(host='smtp-mail.outlook.com', port=587)
s.starttls()
s.login('email', 'password')

# For each contact, send the email:
for name, email in zip(names, emails):
    msg = MIMEMultipart()  # create a message
    # bq_data = get_queryresults(query_emailtest())
    # add in the actual person name to the message template
    message = message_template.substitute(PERSON_NAME=name.title())
    message = message_template.substitute(QUERY_RESULTS=bq_results) #SUBSTITUTE QUERY RESULTS IN MESSAGE TEMPLATE. This is where I am having trouble because the Row Iterator object results in Nonetype.
    # setup the parameters of the message
    msg['From'] = 'email'
    msg['To'] = 'email'
    msg['Subject'] = "This is TEST"
    # body = str(get_queryresults(query_emailtest())) #get query results from method to put into message body
    # add in the message body
    # body = MIMEText(body)
    #msg.attach(body)
    msg.attach(MIMEText(message, 'plain'))
    # query_emailtest()
    # get_queryresults(query_emailtest())
    # send the message via the server set up earlier.
    s.send_message(msg)
    del msg
Message template:
Dear ${PERSON_NAME},
Hope you are doing well. Please find the following alert for Issues that have been "In Review" for greater than 7 days.
${QUERY_RESULTS}
If you would like more information, please visit this link that contains a complete dashboard view of the alert.
ISE Services
The BQ result() function returns a generator, so I think you need to change your return to yield from.
I'm far from a python expert, but the following pared-down code worked for me.
from google.cloud import bigquery
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")

def query_emailtest():
    client = bigquery.Client(project=("my_project"))
    query_job = client.query("""
        select field1, field2 from `my_dataset.my_table` limit 5
    """)
    results = query_job.result()
    yield from results # NOTE THE CHANGE HERE

results = query_emailtest()
for row in results:
    print(row.field1, row.field2)
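Building on that, the ${QUERY_RESULTS} placeholder in the template needs a plain string rather than the iterator/generator itself. A minimal sketch of one way to build it, using the field names from the question's query (the helper name is illustrative):

def format_results(rows):
    # Turn BigQuery rows into the lines the email template expects.
    lines = []
    for i, row in enumerate(rows, 1):
        lines.append('{}. {} {}, Reset #: {}, Store #: {}, {} for {} days'.format(
            i, row.dept, row.project_name, row.reset,
            row.store_number, row.IssueStatus, row.days_in_review))
    return '\n'.join(lines) if lines else 'No open issues found.'

bq_results = format_results(query_emailtest())
message = message_template.substitute(PERSON_NAME=name.title(), QUERY_RESULTS=bq_results)

Filling both placeholders in a single substitute() call also avoids the KeyError that string.Template raises when a placeholder is left unfilled.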

BurpSuite API - Get Response from edited requests

I have a problem with the Burp Suite API: I can't find a proper function to print out the response for edited requests. I'm developing a new plugin for Burp Suite with Python. My script simply takes requests from the proxy, edits the headers, and sends them again.
from burp import IBurpExtender
from burp import IHttpListener
import re,urllib2

class BurpExtender(IBurpExtender, IHttpListener):
    def registerExtenderCallbacks(self, callbacks):
        self._callbacks = callbacks
        self._helpers = callbacks.getHelpers()
        callbacks.setExtensionName("Burp Plugin Python Demo")
        callbacks.registerHttpListener(self)
        return

    def processHttpMessage(self, toolFlag, messageIsRequest, currentRequest):
        # only process requests
        if messageIsRequest:
            requestInfo = self._helpers.analyzeRequest(currentRequest)
            #timestamp = datetime.now()
            #print "Intercepting message at:", timestamp.isoformat()
            headers = requestInfo.getHeaders()
            #print url
            if(requestInfo.getMethod() == "GET"):
                print "GET"
                print requestInfo.getUrl()
                response = urllib2.urlopen(requestInfo.getUrl())
                print response
            elif(requestInfo.getMethod() == "POST"):
                print "POST"
                print requestInfo.getUrl()
                #for header in headers:
                #print header
                bodyBytes = currentRequest.getRequest()[requestInfo.getBodyOffset():]
                bodyStr = self._helpers.bytesToString(bodyBytes)
                bodyStr = re.sub(r'=(\w+)','=<xss>',bodyStr)
                newMsgBody = bodyStr
                newMessage = self._helpers.buildHttpMessage(headers, newMsgBody)
                print "Sending modified message:"
                print "----------------------------------------------"
                print self._helpers.bytesToString(newMessage)
                print "----------------------------------------------\n\n"
                currentRequest.setRequest(newMessage)
        return
You need to print the response, but you don't do anything when messageIsRequest is false. When messageIsRequest is false, it means that the currentRequest is a response, and you can print it out just as you did for the request. I did it in Java like this:
def processHttpMessage(self, toolFlag, messageIsRequest, httpRequestResponse):
    if messageIsRequest:
        ....
    else:
        HTTPMessage = httpRequestResponse.getResponse()
        print HTTPMessage
There is even a method that lets you bind request and response together when using a proxy. It can be found in IInterceptedProxyMessage:
/**
 * This method retrieves a unique reference number for this
 * request/response.
 *
 * @return An identifier that is unique to a single request/response pair.
 * Extensions can use this to correlate details of requests and responses
 * and perform processing on the response message accordingly.
 */
int getMessageReference();
I don't think it is supported for HTTPListeners.
I am writing my extensions in Java and tried to translate to Python for this answer. I haven't tested this code, and some bugs might have been introduced in translation.
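For completeness, a rough Python (Jython) sketch of what the response branch could look like, using only helpers already present in the question plus analyzeResponse() from IExtensionHelpers (untested, same caveat as above):

def processHttpMessage(self, toolFlag, messageIsRequest, messageInfo):
    if messageIsRequest:
        # ... modify and re-issue the request as in the question ...
        return
    # messageInfo now carries the response to the (possibly edited) request
    responseBytes = messageInfo.getResponse()
    responseInfo = self._helpers.analyzeResponse(responseBytes)
    print "Status code:", responseInfo.getStatusCode()
    print self._helpers.bytesToString(responseBytes[responseInfo.getBodyOffset():])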
