I need help implementing logic to count the number of successful asynchronous POST calls (status_code == 200) as well as failed calls (status_code != 200).
I am new to coroutines. I would appreciate it if someone could suggest a better way of making an asynchronous POST call that can be retried, polled for status, and that emits metrics for successful requests as well.
Following is my code:
asyncio.get_event_loop().run_in_executor(
None,
self.publish_actual,
event_name,
custom_payload,
event_message_params,
)
which calls publish_actual:
def publish_actual(
self,
event_name: str,
custom_payload={},
event_message_params=[],
):
"""Submits a post request using the request library
:param event_name: name of the event
:type event_name: str
:param key: key for a particular application
:param custom_payload: custom_payload, defaults to {}
:type custom_payload: dict, optional
:param event_message_params: event_message_params, defaults to []
:type event_message_params: list, optional
"""
json_data = {}
path = f"/some/path"
self.request(path, "POST", json=json_data)
which calls the following request function:
def request(self, api_path, method="GET", **kwargs):
try:
self._validate_configuration()
headers = {}
api_endpoint = self.service_uri.to_url(api_path)
logger.debug(api_endpoint)
if "headers" in kwargs and kwargs["headers"]:
headers.update(kwargs["headers"])
headers = {"Content-Type": "application/json"}
begin = datetime.now()
def build_success_metrics(response, *args, **kwargs):
tags = {
"name": "success_metrics",
"domain": api_endpoint,
"status_code": 200,
}
build_metrics(tags)
def check_for_errors(response, *args, **kwargs):
response.raise_for_status()
response = self.session.request(
method=method,
url=api_endpoint,
headers=headers,
timeout=self.timeout,
hooks={"response": [build_success_metrics, check_for_errors]},
**kwargs,
)
end = datetime.now()
logger.debug(
f"'{method}' request against endpoint '{api_endpoint}' took {round((end - begin).total_seconds() * 1000, 3)} ms"
)
logger.debug(f"response: {response}")
except RequestException as e:
tags = {
"name": "error_metrics",
"domain": api_endpoint,
"exception_class": e.__class__.__name__,
}
build_metrics(tags)
return f"Exception occured: {e}"
Let me know if anything else is required from my end to explain what exactly I have done and what I am trying to achieve.
There is not much await and async in your example, so I've just addressed the counting part of your question in general asyncio terms. asyncio.Queue is good for this because you can separate the counting from its cause quite simply.
import asyncio
import aiohttp
class Count():
def __init__(self, queue: asyncio.Queue):
self.queue = queue
self.good = 0
self.bad = 0
async def count(self):
while True:
result = await self.queue.get()
if result == 'Exit':
return
if result == 200:
self.good += 1
else:
self.bad += 1
async def request(q: asyncio.Queue):
async with aiohttp.ClientSession() as session:
        for _ in range(5):  # just poll 5 times in this instance
await asyncio.sleep(0.1)
async with session.get(
'https://httpbin.org/status/200%2C500', ssl=False
) as response:
q.put_nowait(response.status)
q.put_nowait('Exit')
async def main():
q = asyncio.Queue()
cnt = Count(q)
tasks = [cnt.count(), request(q)]
await asyncio.gather(*[asyncio.create_task(t) for t in tasks])
print(cnt.good, cnt.bad)
if __name__ == "__main__":
asyncio.run(main())
The output is random given the httpbin response, but the two counts should add up to 5.
4 1
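The question also asks for retries and metrics. Building on the same queue-based counter, a retried POST could look roughly like the sketch below; it is not part of the original answer, and the retry count, back-off, and the 599 sentinel for transport errors are assumptions.
import asyncio
import aiohttp

async def post_with_retry(q: asyncio.Queue, session: aiohttp.ClientSession,
                          url: str, payload: dict, retries: int = 3):
    # Report each attempt's outcome to the queue so Count tallies it as good or bad.
    for attempt in range(1, retries + 1):
        try:
            async with session.post(url, json=payload) as response:
                q.put_nowait(response.status)
                if response.status == 200:
                    return await response.text()
        except aiohttp.ClientError:
            q.put_nowait(599)  # count transport-level failures as bad
        await asyncio.sleep(2 ** attempt)  # simple exponential back-off between attempts
    return None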
I want to receive HTTP updates and WS updates in parallel.
For this I use threading, but I still only receive updates from the WS. It looks like def http_req_update is blocked by the endless stream of messages from def on_message.
Could someone please help me with it?
import json
import time
from websocket import WebSocketApp
import requests
import threading
class BithumbWebSocketApp(WebSocketApp):
def __init__(self, url, **kwargs):
super(BithumbWebSocketApp, self).__init__(url, **kwargs)
def _request_orderbookdepth(self, channel, event=None, payload=None, auth_required=True):
current_time = int(time.time())
data = {
"time": current_time,
"type": "orderbookdepth",
"symbols": ["BTC_KRW"],
}
data = json.dumps(data)
print('request1', data)
self.send(data)
def subscribe(self, channel, payload=None, auth_required=True):
self._request_orderbookdepth(channel, "subscribe", payload, auth_required)
def unsubscribe(self, channel, payload=None, auth_required=True):
self._request_orderbookdepth(channel, "unsubscribe", payload, auth_required)
def on_open(ws):
print('Connected')
ws.subscribe("wss://pubwss.bithumb.com/pub/ws", "BTC_KRW", False)
msg_lst = []
def on_message(ws, message):
print('message', message)
msg = json.loads(message.encode('utf-8'))
print('msg1: ', msg)
msg_lst.append({
"msg": msg,
"type": msg['type'],
"list": msg['content']['list'],
"datetime": msg['content']['datetime'],
})
lst_to_json = json.dumps(msg_lst)
def ws_update():
app = BithumbWebSocketApp("wss://pubwss.bithumb.com/pub/ws",
on_open=on_open,
on_message=on_message)
app.run_forever(ping_interval=5)
def http_req_update():
currency = 'BTC_KRW' # ALL
url = f"https://api.bithumb.com/public/orderbook/{currency}"
headers = {
"accept": "application/json",
"content-type": "application/json"
}
response = requests.get(url, headers=headers)
print('snapshot_response', response.text)
if __name__ == "__main__":
trd1 = threading.Thread(target=ws_update)
trd2 = threading.Thread(target=http_req_update)
trd1.start()
trd2.start()
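A likely reason only WS updates show up: http_req_update performs a single GET and then returns, so its thread exits almost immediately. A minimal sketch of keeping both threads producing updates is below; the function name, the 1-second interval, and printing the status code are assumptions, not part of the original code.
import threading
import time
import requests

def http_req_update_loop(interval=1.0):
    # Poll the REST orderbook endpoint repeatedly so this thread keeps
    # producing snapshots while the WebSocket thread runs forever.
    currency = 'BTC_KRW'
    url = f"https://api.bithumb.com/public/orderbook/{currency}"
    headers = {"accept": "application/json", "content-type": "application/json"}
    while True:
        response = requests.get(url, headers=headers)
        print('snapshot_response', response.status_code)
        time.sleep(interval)

# usage alongside the existing WS thread:
# trd1 = threading.Thread(target=ws_update)
# trd2 = threading.Thread(target=http_req_update_loop)
# trd1.start()
# trd2.start()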
I have an asyncio crawler that visits URLs and collects new URLs from HTML responses. I was inspired by this great tool: https://github.com/aio-libs/aiohttp/blob/master/examples/legacy/crawl.py
Here is a very simplified piece of the workflow, showing how it works:
import asyncio
import aiohttp
class Requester:
def __init__(self):
self.sem = asyncio.BoundedSemaphore(1)
async def fetch(self, url, client):
async with client.get(url) as response:
data = (await response.read()).decode('utf-8', 'replace')
print("URL:", url, " have code:", response.status)
return response, data
async def run(self, urls):
async with aiohttp.ClientSession() as client:
for url in urls:
await self.sem.acquire()
task = asyncio.create_task(self.fetch(url, client))
task.add_done_callback(lambda t: self.sem.release())
def http_crawl(self, _urls_list):
loop = asyncio.get_event_loop()
crawl_loop = asyncio.ensure_future(self.run(_urls_list))
loop.run_until_complete(crawl_loop)
r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
What I need now is to add a very slow BeautifulSoup-based function. That function must not block the main loop and should work as a background process. For instance, it will handle HTTP responses.
I read the Python docs about it and found this: https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor
I tried to add it to my code, but it does not work as it should (I use cpu_bound only for the demo):
import asyncio
import aiohttp
import concurrent.futures
def cpu_bound():
return sum(i * i for i in range(10 ** 7))
class Requester:
def __init__(self):
self.sem = asyncio.BoundedSemaphore(1)
async def fetch(self, url, client):
async with client.get(url) as response:
data = (await response.read()).decode('utf-8', 'replace')
print("URL:", url, " have code:", response.status)
####### Blocking operation #######
loop = asyncio.get_running_loop()
with concurrent.futures.ProcessPoolExecutor() as pool:
result = await loop.run_in_executor(pool, cpu_bound)
print('custom process pool', result)
#################################
return response, data
async def run(self, urls):
async with aiohttp.ClientSession() as client:
for url in urls:
await self.sem.acquire()
task = asyncio.create_task(self.fetch(url, client))
task.add_done_callback(lambda t: self.sem.release())
def http_crawl(self, _urls_list):
loop = asyncio.get_event_loop()
crawl_loop = asyncio.ensure_future(self.run(_urls_list))
loop.run_until_complete(crawl_loop)
r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
For now, it doesn't work as expected; it blocks the HTTP requests every time:
URL: https://www.google.com have code: 200
custom process pool 333333283333335000000
URL: https://images.google.com have code: 200
custom process pool 333333283333335000000
URL: https://maps.google.com have code: 200
custom process pool 333333283333335000000
URL: https://mail.google.com have code: 200
custom process pool 333333283333335000000
URL: https://news.google.com have code: 200
custom process pool 333333283333335000000
URL: https://video.google.com have code: 200
custom process pool 333333283333335000000
How do I correctly put the task in the background inside the main asyncio process?
Are there best practices for doing that in a simple way, or should I use Redis for task planning?
I believe that since you are setting your BoundedSemaphore to 1 it is only allowing one instance of your task to run at a time.
You can use the ratelimiter package to limit the number of concurrent requests in a certain amount of time.
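A minimal sketch of that idea, assuming a wider semaphore and a single long-lived ProcessPoolExecutor (the limit of 5 and the exact class layout are illustrative, not from the original code):
import asyncio
import concurrent.futures
import aiohttp

def cpu_bound():
    return sum(i * i for i in range(10 ** 7))

class Requester:
    def __init__(self):
        # allow several fetches in flight instead of one at a time
        self.sem = asyncio.BoundedSemaphore(5)
        # one long-lived pool; a per-request `with` block waits for its workers on exit
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url, client):
        async with self.sem:
            async with client.get(url) as response:
                data = (await response.read()).decode('utf-8', 'replace')
                print("URL:", url, " have code:", response.status)
        # hand the CPU-heavy part to the shared pool without blocking the event loop
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(self.pool, cpu_bound)
        print('custom process pool', result)
        return response, data

    async def run(self, urls):
        async with aiohttp.ClientSession() as client:
            await asyncio.gather(*(self.fetch(url, client) for url in urls))

    def http_crawl(self, _urls_list):
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.run(_urls_list))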
I would also like to share code that works for me. It uses two independent async queues, and one of them hands the high-CPU-consumption work off to a separate process pool:
import asyncio
import functools
import aiohttp
import concurrent.futures
def cpu_bound(num):
return sum(i * i for i in range(10 ** num))
class Requester:
def __init__(self):
self.threads = 3
self.threads2 = 10
self.pool = concurrent.futures.ProcessPoolExecutor()
async def fetch(self, url):
try:
timeout = aiohttp.ClientTimeout(total=10)
async with self.client.get(url, allow_redirects=False, verify_ssl=False, timeout=timeout) as response:
data = (await response.read()).decode('utf-8', 'replace')
print("URL:", url, " have code:", response.status)
resp_list = {'url': str(response.real_url), 'data': str(data), 'headers': dict(response.headers)}
return resp_list
except Exception as err:
print(err)
return {}
async def heavy_worker(self, a):
while True:
resp_list = await a.get()
if resp_list.keys():
####### Blocking operation #######
try:
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(self.pool, functools.partial(cpu_bound, num=5))
print('wappalazer', result)
except Exception as err:
print(err)
#################################
a.task_done()
else:
a.task_done()
async def fetch_worker(self, q, a):
while True:
url = await q.get()
resp_list = await self.fetch(url)
q.task_done()
await a.put(resp_list)
async def main(self, urls):
        # Create the queues we will use to store our "workload".
q = asyncio.Queue()
a = asyncio.Queue()
        # Create worker tasks to process the queue concurrently.
workers_fetch = [asyncio.create_task(self.fetch_worker(q, a)) for _ in range(self.threads)]
workers_heavy = [asyncio.create_task(self.heavy_worker(a)) for _ in range(self.threads2)]
for url in urls:
await q.put(url)
# wait for all tasks to be processed
await q.join()
await a.join()
# Cancel our worker tasks.
for worker in workers_fetch:
worker.cancel()
await asyncio.gather(*workers_fetch , return_exceptions=True)
for worker in workers_heavy:
worker.cancel()
await asyncio.gather(*workers_heavy , return_exceptions=True)
async def run(self, _urls_list):
async with aiohttp.ClientSession() as self.client:
task_for_first_run = asyncio.create_task(self.main(_urls_list))
await asyncio.sleep(1)
await task_for_first_run
print("All tasks completed")
def http_crawl(self, _urls_list):
asyncio.run(self.run(_urls_list))
r = Requester()
_url_list = ['http://aaaaaaaaaaaaaaaa.aaaaaaaaaaaaaaaaaaa.aa', 'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
'https://news.google.com','https://video.google.com','https://books.google.com', 'https://www.google.com',
'https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com',
'https://video.google.com','https://books.google.com', 'https://www.google.com','https://images.google.com',
'https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com',
'https://books.google.com', 'https://www.google.com','https://images.google.com','https://maps.google.com',
'https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com',
'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
'https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
I am trying to connect through a websocket and keep it alive to continuously retrieve messages from the server. I wrote ClientHelper and found SocketManager and ReconnectingWebsocket on the internet, and I have little idea of what is going wrong, as I never receive anything through the ClientHelper.process_user_message function.
Can someone point out the error to me, please?
import websockets as ws
import asyncio
import json
import logging
from random import random
from client import Client
class ClientHelper(Client):
def __init__(self, api_key, api_secret):
super().__init__(api_key, api_secret)
self.loop = asyncio.get_event_loop()
def _request(self, method, uri, signed, force_params=False, **kwargs):
kwargs = self._get_request_kwargs(method, signed, force_params, **kwargs)
response = getattr(self.session, method)(uri, **kwargs)
return self._handle_response(response, method)
def _request(self, method, path, signed=False, version=API_VERSION, **kwargs):
uri = self._create_api_uri(path, signed, version)
return self._request(method, uri, signed, **kwargs)
def get_listen_key(self):
res = self._request('post', 'userDataStream', signed=True, data={})
return res['listenKey']
async def start_websockets(self):
self.sm = SocketManager(self, self.loop)
await self.sm.start_socket(self.process_user_message)
async def process_user_message(self, msg):
self.msg = msg
print(msg)
    async def main(self):
await self.start_websockets()
while True:
await client.getInfo()
if 'data' in self.msg:
print(self.msg['data'])
def start(self):
self.loop.run_until_complete(self.main())
class SocketManager:
STREAM_URL = url
def __init__(self, client, loop, user_timeout=DEFAULT_USER_TIMEOUT):
self._client = client
self._loop = loop
self._conns = None
async def _start_user_socket(self, path, coro, prefix='ws/'):
if path in self._conns:
return False
self._conns[path] = ReconnectingWebsocket(self._loop, path, coro, prefix)
return path
async def start_user_socket(self, coro):
user_listen_key = await self._client.stream_get_listen_key() # manage to get the key from serveur
conn_key = await self._start_user_socket('user', user_listen_key, coro)
return conn_key
class ReconnectingWebsocket:
STREAM_URL = url
MAX_RECONNECTS = 5
MAX_RECONNECT_SECONDS = 60
MIN_RECONNECT_WAIT = 0.1
TIMEOUT = 10
def __init__(self, loop, path, coro, prefix='ws/'):
self._loop = loop
self._log = logging.getLogger(__name__)
self._path = path
self._coro = coro
self._prefix = prefix
self._reconnects = 0
self._conn = None
self._socket = None
self._connect()
def _connect(self):
self._conn = asyncio.ensure_future(self._run(), loop=self._loop)
async def _run(self):
keep_waiting = True
async with ws.connect(self.STREAM_URL) as socket:
self._socket = socket
self._reconnects = 0
try:
while keep_waiting:
try:
#evt = await self._coro(evt_obj)
evt = await asyncio.wait_for(self._socket.recv(), timeout=self.TIMEOUT)
except asyncio.TimeoutError:
#self._log.debug("no message in {} seconds".format(self.TIMEOUT))
print("no message in {} seconds".format(self.TIMEOUT))
await self.send_ping()
except asyncio.CancelledError:
#self._log.debug("cancelled error")
print("cancelled error")
await self.send_ping()
else:
try:
evt_obj = json.loads(evt)
except ValueError:
#self._log.debug('error parsing evt json:{}'.format(evt))
print('error parsing evt json:{}'.format(evt))
else:
await self._coro(evt_obj)
except ws.ConnectionClosed as e:
#self._log.debug('ws connection closed:{}'.format(e))
print('ws connection closed:{}'.format(e))
await self._reconnect()
except Exception as e:
#self._log.debug('ws exception:{}'.format(e))
print('ws exception:{}'.format(e))
await self._reconnect()
def _get_reconnect_wait(self, attempts: int) -> int:
expo = 2 ** attempts
return round(random() * min(self.MAX_RECONNECT_SECONDS, expo - 1) + 1)
async def _reconnect(self):
await self.cancel()
self._reconnects += 1
if self._reconnects < self.MAX_RECONNECTS:
reconnect_wait = self._get_reconnect_wait(self._reconnects)
await asyncio.sleep(reconnect_wait)
self._connect()
else:
self._log.error('Max reconnections {} reached:'.format(self.MAX_RECONNECTS))
async def send_ping(self):
if self._socket:
await self._socket.ping()
async def cancel(self):
self._conn.cancel()
self._socket = None
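For reference, here is a minimal, self-contained sketch of the pattern these classes implement: a reconnecting reader that forwards every parsed message to a coroutine callback. The URL and the callback are placeholders, not part of the original code.
import asyncio
import json
import websockets as ws

async def listen(url, coro, timeout=10):
    # Receive, parse, and hand each message to `coro`; reconnect on close.
    while True:
        try:
            async with ws.connect(url) as socket:
                while True:
                    try:
                        evt = await asyncio.wait_for(socket.recv(), timeout=timeout)
                    except asyncio.TimeoutError:
                        await socket.ping()
                        continue
                    await coro(json.loads(evt))
        except ws.ConnectionClosed:
            await asyncio.sleep(1)  # back off briefly, then reconnect

async def print_message(msg):
    print(msg)

# asyncio.run(listen("wss://example.invalid/stream", print_message))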
I would simply like to associate responses from asynchronous aiohttp HTTP requests with an identifier. I am using the following code to hit the API and extract the contactproperty object, which requires an external field (contactid) in order to call its API:
def get_contact_properties(self, office_name, api_key, ids, chunk_size=100, **params):
properties_pages = []
batch = 0
while True:
chunk_ids = [ids[i] for i in range(batch * chunk_size + 1, chunk_size * (1 + batch) + 1)]
urls = ["{}/{}".format(self.__get_base_url(), "contacts/{}/properties?api_key={}".format(contactid, api_key))
for contactid in chunk_ids]
responses_raw = self.get_responses(urls, self.get_office_token(office_name), chunk_size)
try:
responses_json = [json.loads(response_raw) for response_raw in responses_raw]
except Exception as e:
print(e)
valid_responses = self.__get_valid_contact_properties_responses(responses_json)
properties_pages.append(valid_responses)
if len(valid_responses) < chunk_size: # this is how we know there are no more pages with data
break
else:
batch = batch + 1
ids is a list of ids. The problem is that I do not know which response corresponds to which id, so I can't later link it to the contact entity using contactid. This is my fetch() function, so I was wondering how to edit it to return the contactid along with the output.
async def __fetch(self, url, params, session):
async with session.get(url, params=params) as response:
output = await response.read()
return (output)
async def __bound_fetch(self, sem, url, params, session):
# Getter function with semaphore.
async with sem:
output = await self.__fetch(url, params, session)
return output
You can return the url (or whatever key identifies your request) together with the output.
Regarding using the data, I think you should read the response directly as JSON, especially since aiohttp can do this for you automatically.
async def __fetch(self, url, params, session):
async with session.get(url, params=params) as response:
try:
data = await response.json()
except ValueError as exc:
print(exc)
return None
return data
async def __bound_fetch(self, sem, url, params, session):
    # Getter function with semaphore.
    async with sem:
        data = await self.__fetch(url, params, session)
        return {"url": url, "data": data}
You did not post the get_responses function but I'm guessing something like this should work:
responses = self.get_responses(urls, self.get_office_token(office_name), chunk_size)
Responses will be a list of {"url": url, "data": data} dicts (data can be None for invalid responses); with the code above, one invalid response will not affect the others.
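If get_responses itself needs sketching: assuming it lives on the same class as __bound_fetch, builds its own session and semaphore, and that the token is sent as a bearer header (all assumptions), it could look roughly like this:
import asyncio
import aiohttp

def get_responses(self, urls, token, chunk_size):
    return asyncio.run(self._gather_responses(urls, token, chunk_size))

async def _gather_responses(self, urls, token, chunk_size):
    sem = asyncio.Semaphore(chunk_size)  # cap concurrent requests at chunk_size
    headers = {"Authorization": "Bearer {}".format(token)}  # assumption: how the token is sent
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [self.__bound_fetch(sem, url, {}, session) for url in urls]
        # gather preserves input order, so responses line up with urls (and thus with ids)
        return await asyncio.gather(*tasks)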
I have 2 URLs and 60k+ requests. Basically, I need to post every request to both URLs, then compare their responses, but not wait for a response before posting the next request.
I've tried to do it with aiohttp and asyncio:
import asyncio
import time
import aiohttp
import os
from aiofile import AIOFile
testURL = ""
prodURL = ""
directoryWithRequests = ''
directoryToWrite = ''
headers = {'content-type': 'application/soap+xml'}
i = 1
async def fetch(session, url, reqeust):
global i
async with session.post(url=url, data=reqeust.encode('utf-8'), headers=headers) as response:
if response.status != 200:
async with AIOFile(directoryToWrite + str(i) + '.xml', 'w') as afp:
await afp.write(reqeust)
i += 1
return await response.text()
async def fetch_all(session, urls, request):
results = await asyncio.gather(*[asyncio.create_task(fetch(session, url, request)) for url in urls])
return results
async def asynchronousRequests(requestBody):
urls = [testURL, prodURL]
global i
with open(requestBody) as my_file:
body = my_file.read()
async with aiohttp.ClientSession() as session:
htmls = await fetch_all(session, urls, body)
# some conditions
async def asynchronous():
try:
start = time.time()
futures = [asynchronousRequests(directoryWithRequests + i) for i in os.listdir(directoryWithRequests)]
for future in asyncio.as_completed(futures):
result = await future
print("Process took: {:.2f} seconds".format(time.time() - start))
except Exception as e:
print(str(e))
if __name__ == '__main__':
try:
# AsyncronTest
ioloop = asyncio.ProactorEventLoop()
ioloop.run_until_complete(asynchronous())
ioloop.close()
if i == 1:
print('Regress is OK')
else:
print('Number of requests to check = {}'.format(i))
except Exception as e:
print(e)
I believe the code above works, but it creates N futures, where N equals the number of request files. This amounts to a sort of DDoS, because the server can't respond to that number of requests at the same time.
Found a suitable solution. Basically, it's just two async tasks:
tasks = [
postRequest(testURL, client, body),
postRequest(prodURL, client, body)
]
await asyncio.wait(tasks)
It doesn't match the performance of the code in the question with a comfortable number of requests, but at least it doesn't DDoS the server as much.
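If the 60k request files still need to go out concurrently without flooding the server, one option is to cap the number of files in flight with a semaphore. The following is a sketch under stated assumptions: postRequest is reconstructed from the question's fetch, the limit of 50 is arbitrary, and testURL, prodURL, and directoryWithRequests come from the question's globals.
import asyncio
import os
import aiohttp

headers = {'content-type': 'application/soap+xml'}

async def postRequest(url, client, body):
    async with client.post(url=url, data=body.encode('utf-8'), headers=headers) as response:
        return await response.text()

async def process_file(sem, client, path):
    async with sem:  # at most `limit` files are in flight at any moment
        with open(path) as f:
            body = f.read()
        test_resp, prod_resp = await asyncio.gather(
            postRequest(testURL, client, body),
            postRequest(prodURL, client, body),
        )
        # compare test_resp and prod_resp here

async def main(directory, limit=50):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as client:
        tasks = [process_file(sem, client, os.path.join(directory, name))
                 for name in os.listdir(directory)]
        await asyncio.gather(*tasks)

# asyncio.run(main(directoryWithRequests))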