django and asyncio - fetch data asynchronously from remote REST endpoint - python-3.x

I'm trying to rewrite a Django management command in an asynchronous way using asyncio and aiohttp. These are the files involved:
# rest_async.py
import aiohttp
from django.conf import settings
# get_signature_kwargs is a project helper defined elsewhere


async def t_search_coro(token, loop, **kwargs):
    """
    ws T Search Query:
    kwargs:
    - modification_start_date: (str) Format: YYYY-MM-DDTHH:MM:SS (e.g.: 2013-02-26T11:00:00)
    - modification_end_date: (str) Format: YYYY-MM-DDTHH:MM:SS (e.g.: 2013-02-26T11:00:00)
    - lo_type: (str) LO Type. Defaults to 'Event'
    - status: (str) T Status of the LO. Required
    - portal: portal. Default: settings.PORTAL
    - page_nr: PageNumber querystring parameter. Default: 1
    """
    path = '/services/api/TSearch'
    method = 'GET'
    modification_start_date = kwargs.pop('modification_start_date')
    modification_end_date = kwargs.pop('modification_end_date')
    lo_type = kwargs.pop('lo_type', 'Event')
    status = kwargs.pop('status')
    portal = kwargs.pop('portal', settings.PORTAL)
    page_nr = kwargs.pop('page_nr', 1)
    debugging = kwargs.pop('debugging', True)
    signature_kws = get_signature_kwargs(token, path, method)
    headers = signature_kws.get('headers')
    params = {
        'LOType': lo_type,
        'Status': status,
        'PageNumber': page_nr,
        'format': 'JSON'
    }
    if modification_start_date is not None:
        params['ModificationStartDate'] = modification_start_date
    if modification_end_date is not None:
        params['ModificationEndDate'] = modification_end_date
    service_end_point = 'https://{}.example.net{}'.format(portal, path)
    print("fetching data: {} - {}".format(modification_start_date, modification_end_date))
    async with aiohttp.ClientSession(loop=loop) as session:
        async with session.get(url=service_end_point, params=params, headers=headers) as resp:
            assert resp.status == 200
            return await resp.read()
# utils_async.py
import json
from datetime import timedelta
from django.utils.timezone import now
# rest_async and token are imported/defined elsewhere in the project


async def fetch_t_data_coro(
        loop, lo_type='Session', modification_start_date=now()-timedelta(hours=22), modification_end_date=now(),
        status='Completed', **kwargs):
    date_fmt = "%Y-%m-%dT%H:%M:%S"
    if (modification_end_date - modification_start_date).total_seconds() > timedelta(days=1).total_seconds():
        raise Exception("modification start/end datetime interval must be within 24 hrs."
                        "\nmod. start date: {}\nmod. end date: {}".format(
                            modification_start_date.strftime(date_fmt), modification_end_date.strftime(date_fmt)
                        ))
    debugging = kwargs.pop('debugging', False)
    page_nr = kwargs.get('page_nr', 1)
    modification_start_date = modification_start_date.strftime(date_fmt)
    modification_end_date = modification_end_date.strftime(date_fmt)
    rtn_data = []
    params = {
        'LOType': lo_type, 'Status': status, 'PageNumber': page_nr, 'format': 'JSON'
    }
    already_added = set()
    while True:
        data = await rest_async.t_search_coro(
            token, loop, modification_start_date=modification_start_date, modification_end_date=modification_end_date,
            lo_type=lo_type, status=status, page_nr=page_nr, debugging=debugging
        )
        data_dict = json.loads(data.decode('utf-8'))
        if 'data' not in data_dict:
            break
        total_pages = data_dict['data'][0]['T_Item']['TotalPages']
        t_raw_data = data_dict['data'][0]['T_Item']['T']
        for item in t_raw_data:
            _h = hash(json.dumps(item, sort_keys=True))
            if _h in already_added:
                continue
            already_added.add(_h)
            rtn_data.append(item)
        if page_nr >= total_pages:
            break
        page_nr += 1
    return rtn_data
# load_data_async.py (actual django management command)
import asyncio
from datetime import timedelta, datetime
import argparse
import logging
from django.core.management.base import BaseCommand
from django.utils.timezone import now
from myapp.utils_async import fetch_t_data_coro

RUNNING_INTERVAL_MINS = 60
logger = logging.getLogger('my_proj')
MAX_BACKDAYS = 160
BACKDAYS_HOURS = {3, 9, 15, 21}
DEFAULT_TIMEFRAME = 24
GO_BACK_DAYS = 30
GO_BACK_DAYS_TIMEFRAME = 24


class Command(BaseCommand):
    help = "fetch data asynchronously"

    def add_arguments(self, parser):
        parser.add_argument(
            '--timeframe', action='store', dest='timeframe', default=DEFAULT_TIMEFRAME, type=int,
            help='Timeframe hours to be used (defaults to 24, range: 1 to 24)'
        )
        parser.add_argument(
            '--backdays', action='store', dest='backdays', default=None, type=int,
            help='repeat the command execution (for the same timeframe) n days before the current day'
        )
        parser.add_argument('--start-date', type=valid_date_type)
        parser.add_argument('--end-date', type=valid_date_type)

    def handle(self, *args, **options):
        self.loop = asyncio.get_event_loop()
        self.loop.run_until_complete(self._handle(*args, **options))

    async def _handle(self, *args, **options):
        timeframe = options.get('timeframe')
        backdays = options.get('backdays', None)
        start_date = options.get('start_date')
        end_date = options.get('end_date')
        backdays = backdays + 1 if backdays is not None else 1
        if all([start_date is not None, end_date is not None]):
            days_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
        else:
            days_range = [now() - timedelta(days=x) for x in range(backdays)]
        for mod_end_datetime in days_range:
            mod_start_datetime = mod_end_datetime - timedelta(minutes=RUNNING_INTERVAL_MINS * timeframe)
            data = await fetch_t_data_coro(
                loop=self.loop, modification_start_date=mod_start_datetime, modification_end_date=mod_end_datetime
            )


def valid_date_type(arg_date_str):
    try:
        return datetime.strptime(arg_date_str, "%Y-%m-%d")
    except ValueError:
        msg = "Given Date ({0}) not valid! Expected format, YYYY-MM-DD!".format(arg_date_str)
        raise argparse.ArgumentTypeError(msg)
I then tried to run the cmd as:
python manage.py load_data_async --start-date 2018-04-20 --end-date 2018-06-6
The command runs without errors; however, it seems from the print statements that the coroutines are executed sequentially, in the same way as the original synchronous code:
# output
fetching data: 2018-04-19T00:00:00 - 2018-04-20T00:00:00
fetching data: 2018-04-19T00:00:00 - 2018-04-20T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-20T00:00:00 - 2018-04-21T00:00:00
fetching data: 2018-04-21T00:00:00 - 2018-04-22T00:00:00
fetching data: 2018-04-21T00:00:00 - 2018-04-22T00:00:00
fetching data: 2018-04-21T00:00:00 - 2018-04-22T00:00:00
fetching data: 2018-04-22T00:00:00 - 2018-04-23T00:00:00
fetching data: 2018-04-23T00:00:00 - 2018-04-24T00:00:00
fetching data: 2018-04-24T00:00:00 - 2018-04-25T00:00:00
fetching data: 2018-04-24T00:00:00 - 2018-04-25T00:00:00
fetching data: 2018-04-25T00:00:00 - 2018-04-26T00:00:00
fetching data: 2018-04-25T00:00:00 - 2018-04-26T00:00:00
fetching data: 2018-04-25T00:00:00 - 2018-04-26T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
fetching data: 2018-04-26T00:00:00 - 2018-04-27T00:00:00
...
...
fetching data: 2018-05-22T00:00:00 - 2018-05-23T00:00:00
fetching data: 2018-05-22T00:00:00 - 2018-05-23T00:00:00
fetching data: 2018-05-23T00:00:00 - 2018-05-24T00:00:00
fetching data: 2018-05-23T00:00:00 - 2018-05-24T00:00:00
fetching data: 2018-05-24T00:00:00 - 2018-05-25T00:00:00
fetching data: 2018-05-25T00:00:00 - 2018-05-26T00:00:00
fetching data: 2018-05-25T00:00:00 - 2018-05-26T00:00:00
fetching data: 2018-05-25T00:00:00 - 2018-05-26T00:00:00
fetching data: 2018-05-25T00:00:00 - 2018-05-26T00:00:00
fetching data: 2018-05-26T00:00:00 - 2018-05-27T00:00:00
fetching data: 2018-05-27T00:00:00 - 2018-05-28T00:00:00
fetching data: 2018-05-28T00:00:00 - 2018-05-29T00:00:00
fetching data: 2018-05-29T00:00:00 - 2018-05-30T00:00:00
fetching data: 2018-05-30T00:00:00 - 2018-05-31T00:00:00
fetching data: 2018-05-30T00:00:00 - 2018-05-31T00:00:00
fetching data: 2018-05-30T00:00:00 - 2018-05-31T00:00:00
fetching data: 2018-05-31T00:00:00 - 2018-06-01T00:00:00
fetching data: 2018-05-31T00:00:00 - 2018-06-01T00:00:00
fetching data: 2018-06-01T00:00:00 - 2018-06-02T00:00:00
fetching data: 2018-06-01T00:00:00 - 2018-06-02T00:00:00
fetching data: 2018-06-01T00:00:00 - 2018-06-02T00:00:00
fetching data: 2018-06-01T00:00:00 - 2018-06-02T00:00:00
fetching data: 2018-06-02T00:00:00 - 2018-06-03T00:00:00
fetching data: 2018-06-02T00:00:00 - 2018-06-03T00:00:00
fetching data: 2018-06-02T00:00:00 - 2018-06-03T00:00:00
fetching data: 2018-06-03T00:00:00 - 2018-06-04T00:00:00
fetching data: 2018-06-03T00:00:00 - 2018-06-04T00:00:00
fetching data: 2018-06-04T00:00:00 - 2018-06-05T00:00:00
fetching data: 2018-06-04T00:00:00 - 2018-06-05T00:00:00
fetching data: 2018-06-05T00:00:00 - 2018-06-06T00:00:00
fetching data: 2018-06-05T00:00:00 - 2018-06-06T00:00:00
fetching data: 2018-06-05T00:00:00 - 2018-06-06T00:00:00
fetching data: 2018-06-05T00:00:00 - 2018-06-06T00:00:00
Has anyone noticed something wrong, or is this the correct behavior?
I have no experience with asyncio, but I wasn't expecting sequential execution...
Python version: 3.6.3

The code seems to await the fetch_t_data_coro invocations one by one, which forces them to run in sequence.
To run them in parallel, you can use asyncio.gather:
coros = []
for mod_end_datetime in days_range:
    mod_start_datetime = mod_end_datetime - timedelta(minutes=RUNNING_INTERVAL_MINS * timeframe)
    coros.append(fetch_t_data_coro(
        loop=self.loop, modification_start_date=mod_start_datetime, modification_end_date=mod_end_datetime
    ))
data_list = await asyncio.gather(*coros)
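Note that asyncio.gather runs the coroutines concurrently and returns their results as a list in the same order in which the coroutines were passed, so each entry of data_list corresponds to the matching entry of days_range.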
Two unrelated notes:
The code instantiates aiohttp.ClientSession in each t_search_coro. This is an anti-pattern - you should create a single ClientSession at top-level and pass it down to individual coroutines (even ones running in parallel), so that they all share the same session instance.
Beginning with Python 3.5.3, asyncio.get_event_loop() will correctly pick up the running event loop when called from a coroutine. As a result, you don't need to send the loop object down the coroutine invocations, just call get_event_loop when you need it (which in your code you don't, since ClientSession also correctly infers the event loop on its own).
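To illustrate both notes, here is a minimal sketch of how _handle could create one shared session, assuming t_search_coro and fetch_t_data_coro are refactored to accept a session argument instead of loop (that refactor is not shown, the parameter name session is my choice, and aiohttp would also need to be imported in the management command):
async def _handle(self, *args, **options):
    # ... argument handling and days_range construction as before ...
    async with aiohttp.ClientSession() as session:
        coros = []
        for mod_end_datetime in days_range:
            mod_start_datetime = mod_end_datetime - timedelta(minutes=RUNNING_INTERVAL_MINS * timeframe)
            # the one shared ClientSession is passed to every coroutine
            coros.append(fetch_t_data_coro(
                session=session, modification_start_date=mod_start_datetime, modification_end_date=mod_end_datetime
            ))
        data_list = await asyncio.gather(*coros)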

Related

count successful and unsuccessful post requests for asynchronous post call/request

I need help implementing the logic to count the number of successful POST calls (status_code == 200) as well as failed calls (status_code != 200) when the calls are made asynchronously.
I am new to coroutines. I would appreciate it if someone could suggest a better way of making an asynchronous POST call that can be retried, polled for status, and that can also emit metrics for successful POST requests.
Following is my code:
asyncio.get_event_loop().run_in_executor(
    None,
    self.publish_actual,
    event_name,
    custom_payload,
    event_message_params,
)
which calls publish_actual:
def publish_actual(
    self,
    event_name: str,
    custom_payload={},
    event_message_params=[],
):
    """Submits a post request using the request library
    :param event_name: name of the event
    :type event_name: str
    :param key: key for a particular application
    :param custom_payload: custom_payload, defaults to {}
    :type custom_payload: dict, optional
    :param event_message_params: event_message_params, defaults to []
    :type event_message_params: list, optional
    """
    json_data = {}
    path = f"/some/path"
    self.request(path, "POST", json=json_data)
which calls the following request function:
def request(self, api_path, method="GET", **kwargs):
    try:
        self._validate_configuration()
        headers = {}
        api_endpoint = self.service_uri.to_url(api_path)
        logger.debug(api_endpoint)
        if "headers" in kwargs and kwargs["headers"]:
            headers.update(kwargs["headers"])
        headers = {"Content-Type": "application/json"}
        begin = datetime.now()

        def build_success_metrics(response, *args, **kwargs):
            tags = {
                "name": "success_metrics",
                "domain": api_endpoint,
                "status_code": 200,
            }
            build_metrics(tags)

        def check_for_errors(response, *args, **kwargs):
            response.raise_for_status()

        response = self.session.request(
            method=method,
            url=api_endpoint,
            headers=headers,
            timeout=self.timeout,
            hooks={"response": [build_success_metrics, check_for_errors]},
            **kwargs,
        )
        end = datetime.now()
        logger.debug(
            f"'{method}' request against endpoint '{api_endpoint}' took {round((end - begin).total_seconds() * 1000, 3)} ms"
        )
        logger.debug(f"response: {response}")
    except RequestException as e:
        tags = {
            "name": "error_metrics",
            "domain": api_endpoint,
            "exception_class": e.__class__.__name__,
        }
        build_metrics(tags)
        return f"Exception occurred: {e}"
Let me know if anything else is required from my end to explain what exactly I have done and what I am trying to achieve.
There is not much await and async in your example, so I've just addressed the counting part of your question in general terms with asyncio. asyncio.Queue is good for this because you can quite simply separate the counting from the calls that produce the results.
import asyncio
import aiohttp


class Count():
    def __init__(self, queue: asyncio.Queue):
        self.queue = queue
        self.good = 0
        self.bad = 0

    async def count(self):
        while True:
            result = await self.queue.get()
            if result == 'Exit':
                return
            if result == 200:
                self.good += 1
            else:
                self.bad += 1


async def request(q: asyncio.Queue):
    async with aiohttp.ClientSession() as session:
        for _ in range(5):  # just poll 5 times in this instance
            await asyncio.sleep(0.1)
            async with session.get(
                'https://httpbin.org/status/200%2C500', ssl=False
            ) as response:
                q.put_nowait(response.status)
    q.put_nowait('Exit')


async def main():
    q = asyncio.Queue()
    cnt = Count(q)
    tasks = [cnt.count(), request(q)]
    await asyncio.gather(*[asyncio.create_task(t) for t in tasks])
    print(cnt.good, cnt.bad)


if __name__ == "__main__":
    asyncio.run(main())
The output is random given the httpbin response; the two counts should add up to 5.
4 1
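The same pattern should carry over to the POST case from your question; a rough sketch, where the URL and payload are placeholders and retry handling is left out:
async def publish(q: asyncio.Queue, url: str, payload: dict):
    # placeholder URL/payload; each response status goes onto the shared queue
    # and is tallied by the Count consumer shown above
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=payload) as response:
            q.put_nowait(response.status)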

ERROR - Runtime.HandlerNotFound: Handler 'lambda_handler' missing on module 'lambda_function'

My apologies for the basic question. I am completely new to AWS as well as Python. I am trying out some sample code but I'm facing an error. I'm trying to read some data from a DynamoDB table, but I get this error in the AWS CloudWatch logs:
"Runtime.HandlerNotFound: Handler 'lambda_handler' missing on module 'lambda_function'".
And Postman reports the error:
"message": "Internal server error"
The code is:
import boto3


class userProfile:
    def __init__(self):
        dynamodb = boto3.resource('dynamodb')
        self.table = dynamodb.Table('user_data')

    def Create_table():
        pass

    def Read_table(self, event):
        response = self.table.get_item(
            Key = {
                'user_name' : event['user_name']
            }
        )
        if 'Item' in response:
            return response['Item']
        else:
            return {
                'statusCode': '404',
                'body': 'User Name ' + 'id ' + 'not found'
            }

    def Update_tabel():
        pass


def lambda_handler(event, context):
    if event:
        user_Object = userProfile()
        if event["tasktype"] == "read":
            read_result = user_Object.Read_table(event['data'])
            return read_result
    else:
        return {
            'statusCode': '404',
            'body': 'Not found'
        }

AWS API Gateway/Lambda/DynamoDB - .get_item() not finding item in table

I currently have POSTed items into the DynamoDB table (the date is a string):
[screenshot: DynamoDB table items]
When I try accessing this via a GET, I get a 404 Not Found (not a 502, so it appears the Lambda response is OK):
[screenshot: GET request returning 404]
This is the code in my lambda function:
def lambda_handler(event, context):
    logger.info(event)
    httpMethod = event['httpMethod']
    path = event['path']
    if httpMethod == getMethod and path == answersPath:
        response = buildResponse(200)
    elif httpMethod == getMethod and path == dailyAnswerPath:
        response = getAnswer(event['queryStringParameters']['day'])
    else:
        response = buildResponse(404, 'Not Found')
    return response


def getAnswer(day):
    try:
        response = table.get_item(
            Key = {
                'day': day
            }
        )
        if 'answer' in response:
            return buildResponse(200, response['answer'])
        else:
            return buildResponse(404, {'Message': 'Day %s not found' % day})
    except:
        logger.exception('getAnswer error day %s' % day)


def buildResponse(statusCode, body=None):
    response = {
        'statusCode': statusCode,
        'headers': {
            'Content-Type': 'application/json',
            'Access-Control-Allow-Origin': '*'
        }
    }
    if body is not None:
        response['body'] = json.dumps(body, cls=encodeddd)
    return response

Load a series of payload requests and perform pagination for each one of them

Hi, I'm trying to send a series of payload requests and, for each one of them, to perform pagination. To achieve this, for each "payload request" a variable "offset" should be incremented. Unfortunately, the code outputs the following error: "TypeError: 'bytes' object does not support item assignment".
# -*- coding: utf-8 -*-
import scrapy
import json


class KauflandBasicProductsSpider(scrapy.Spider):
    name = 'kaufland_basic_products'
    allowed_domains = ['www.shopme.io']
    custom_settings = {'ITEM_PIPELINES': {'groceries.pipelines.BasicProducts': 365}}
    categories = [
        "8d7a9abf-b90b-4c07-9e18-ed2283dfd71f",
        "24ddb04a-f9b9-44f8-b78d-00ef5cd79977",
        "3502a7bd-7459-4a51-91df-17375b15e03e"
    ]

    def start_requests(self):
        for category_id in self.categories:
            payload = {"category_id": category_id}
            yield scrapy.Request(
                url='www.shopme.io/v1/feed',
            )

    def parse(self, response):
        payload_var = response.request.body
        # offset_var = int(payload_var['offset'])
        resp = json.loads(response.body)
        # print(resp)
        products = resp.get('feed').get('items')[0].get('items')
        # # print(products)
        for product in products:
            yield {
                'product': product.get('name'),
                'price': product.get('price'),
                'price_promo': 'n/a',
                'weight': 'n/a',
                'weight_text': 'n/a',
                # 'brand': product.get('brand').get('name'),
                'country_of_origin': 'n/a',
                'source': product.get('vendor_name'),
                'link': product.get('image_url')
                # 'current_page': response.meta['current_page']
                # 'user-agent': response.get('User-Agent').decode('utf-8')
            }
        count_available_products = resp.get('feed').get('count')
        # increment_number = len(resp.get('feed').get('items')[0].get('items'))
        # if current_payload['offset'] <= count_available_products:
        #     current_payload['offset'] += increment_number
        if count_available_products >= 12:
            # offset_var += 12
            offset_var = response.meta['offset']
            offset_var += 12
            payload_var['offset'] = offset_var
            yield scrapy.Request(
                url="https://disco.deliveryhero.io/verticals/api/v1/feed",
                method="POST",
                body=json.dumps(payload_var),
                headers={
                    'Content-Type': 'application/json'
                },
                callback=self.parse
            )
The answer to your question is that when you retrieve the request body you have a bytes object, not a dictionary. That's why you get a TypeError exception.
payload_var = response.request.body # This is a bytes object
...
payload_var['offset'] = offset_var # This raises an exception
The solution is to use json.loads() to convert it back into a dict.
payload_var = json.loads(response.request.body)
There were other options for you to avoid this error; using the payload as a class variable (just like categories) is one of them.
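For reference, a minimal sketch of the parse method with the json.loads() fix applied (the offset increment of 12 and the feed URL come from your code; starting the offset at 0 when the key is absent is my assumption):
def parse(self, response):
    payload_var = json.loads(response.request.body)  # bytes -> dict, so item assignment works
    resp = json.loads(response.body)
    # ... yield the product items as before ...
    count_available_products = resp.get('feed').get('count')
    if count_available_products >= 12:
        payload_var['offset'] = payload_var.get('offset', 0) + 12
        yield scrapy.Request(
            url="https://disco.deliveryhero.io/verticals/api/v1/feed",
            method="POST",
            body=json.dumps(payload_var),
            headers={'Content-Type': 'application/json'},
            callback=self.parse
        )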

python requests error check-mk API

I am trying to add dictionary data to our check-mk Web API with Python requests, but I keep getting an error about missing keys:
{"result": "Check_MK exception: Missing required key(s): aux_tags, tag_groups", "result_code": 1}
Here is my code:
import json
import requests

params_get = (
    ('action', 'get_hosttags'),
    ('_username', 'user'),
    ('_secret', 'secret'),
    ('request_format', 'json'),
)
params_set = (
    ('action', 'set_hosttags'),
    ('_username', 'user'),
    ('_secret', 'secret'),
    ('request_format', 'json'),
)
url = 'http://monitoringtest.local.domain/test/check_mk/webapi.py'
tags = ['tag1', 'tag2']
response_data = requests.get(url, params=params_get)
data = response_data.json()
new_data_tags = data['result']['tag_groups']
new_data = data['result']
# new_tags = []
for tag in tags:
    new_data['aux_tags'].append({'id': tag, 'title': 'tags in datacenter'})
    # new_tags.extend([{'aux_tags': [], 'id': tag, 'title': tag.upper() + ' Tag'}])
# all_tags = new_data_tags.extend([{'tags': new_tags, 'id': 'group1', 'title': 'tags in datacenter'}])
json.dump(data['result'], open("new_file", "w"))
response_data_new = requests.get(url, params=params_set, json=json.dumps(data['result']))
# response_data_new = requests.put(url, params=params_set)
# requests.post(url, params=params_set)
print(response_data_new.text)
# print(data['result'])
# json.dump(data['result'], open("new_file", "w"))
When I use curl, everything works well and I get a success message:
{"result": null, "result_code": 0}
Do you have any idea what causes the error?
Thanks
I found the mistake; it was just a lack of attention. The data variable contains two extra keys that were being sent along as well, result at the beginning and result_code at the end, which need to be stripped. I just had to modify the request as follows, sending the data with POST:
resp = requests.post(url, params=params_set, data={'request': json.dumps(data['result'])})
Thanks #DeepSpace
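Put together, the corrected round trip looks roughly like this (a sketch reusing url, tags, params_get and params_set exactly as defined in the question):
response_data = requests.get(url, params=params_get)
data = response_data.json()
for tag in tags:
    data['result']['aux_tags'].append({'id': tag, 'title': 'tags in datacenter'})
# send back only the payload under 'result'; the surrounding 'result' and
# 'result_code' keys of the GET response must not be included
resp = requests.post(url, params=params_set, data={'request': json.dumps(data['result'])})
print(resp.text)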
