Playwright and httpx coroutine conflict: how to solve it? - python-3.x

I created a BaseSpider class that needs to use playwright and httpx, but in the case of coroutines, they conflict.
code:
class BaseSpider():
    """Spider base class that holds a Playwright handle and issues httpx requests."""

    def __init__(self):
        # The synchronous Playwright driver is started eagerly, at construction time.
        self.play_wright = sync_playwright().start()

    async def req(self, client: AsyncClient, i):
        """GET the Baidu home page through ``client``.

        Prints the status code prefixed with the 1-based index ``i + 1`` and
        returns the response object.
        """
        reply = await client.get('https://www.baidu.com')
        print(f'{i + 1},status_code = {reply.status_code}')
        return reply

    async def main(self):
        """Open a short-lived ``httpx.AsyncClient``, fetch the Baidu home page and print the response."""
        async with httpx.AsyncClient() as http:
            page = await http.get('https://www.baidu.com')
            print(page)
if __name__ == '__main__':
    # The spider is constructed first (which starts Playwright); only then is
    # its coroutine handed to asyncio.run().
    asyncio.run(BaseSpider().main())
Run error: asyncio.run() cannot be called from a running event loop
If I comment out self.play_wright = sync_playwright().start(), everything works.
How to get Playwright and HTTPX to run together in coroutines?
Thank you for your help!

Related

Running coroutines in different thread with same event loop

I want to run a coroutine in a different thread and get the result that the coroutine returns.
class Main:
    """Holder for an optional result that ``io_task`` is meant to produce."""

    def __init__(self, result_from_io_task=None):
        self._io_task_result = result_from_io_task

    async def io_task(self):
        """Sleep for two seconds, then return a fixed message."""
        await asyncio.sleep(2)
        return "slept of 2s"

    def non_async_func(self):
        """Synchronous entry point that wants the outcome of ``io_task``.

        Returns ``None`` immediately when a result is already stored.
        """
        # This can't be made async.
        if self._io_task_result:
            return
        # Run io_task and get its result: the event loop is expected to be
        # running in the main thread, so the task is fired from here.
        pending = asyncio.create_task(self.io_task)
        # It cannot be awaited in a non-async function, and this method must
        # not return before io_task's outcome is known, so spin on done().
        while not pending.done():
            pass
I also tried the following, but it doesn't work:
def run_in_thread(coro, loop):
    """Submit ``coro`` to ``loop`` from a helper thread.

    Returns the list the helper thread appends the resulting future to. The
    list is returned right after the thread is started, without joining it,
    so it may still be empty when the caller receives it.
    """
    collected = []

    def submit():
        collected.append(asyncio.run_coroutine_threadsafe(coro, loop))

    worker = Thread(target=submit)
    worker.start()
    return collected
async def main():
    """Create a ``Main`` with no stored result and call its synchronous method."""
    v = Main(result_from_io_task=None).non_async_func()
How can I spawn a new thread and run the given coroutine using the event loop running in the main thread?
Unfortunately, my codebase depends on Python < 3.8, and asyncio.to_thread is not available in Python 3.7.
Based on the example of my answer, I'm introducing another implementation of the asynchronous decorator that does not use asyncio.to_thread() function but uses ThreadPoolExecutor instead.
import asyncio
import requests
import concurrent.futures
def asynchronous(func):
    """Decorate a blocking callable so it can be awaited without stalling the event loop.

    The returned coroutine function runs ``func(*args, **kwargs)`` in the
    running loop's default executor (a thread pool) and resolves to its
    return value; an exception raised by ``func`` is re-raised at the
    ``await`` site.

    Args:
        func: Any synchronous callable.

    Returns:
        A coroutine function with the same name and docstring as ``func``.
    """
    # Imported locally so the decorator does not rely on a file-level import.
    import functools

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        # run_in_executor only forwards positional arguments, hence the partial.
        call = functools.partial(func, *args, **kwargs)
        # Awaiting the executor future yields to the loop; calling
        # Future.result() here instead would block every other task.
        return await loop.run_in_executor(None, call)

    return wrapper
# The decorator must be applied for real (not left as a comment): callers
# pass request(...) to asyncio.create_task(), which requires a coroutine.
@asynchronous
def request(url):
    """Fetch ``url`` with a temporary ``requests`` session.

    Returns the decoded JSON body when the response is valid JSON, otherwise
    the raw response text.
    """
    with requests.Session() as session:
        response = session.get(url)
        try:
            return response.json()
        except requests.JSONDecodeError:
            # Not a JSON document: fall back to the plain body.
            return response.text
async def main():
    """Start the request in the background, then wait for it and print the result."""
    task = asyncio.create_task(request("https://google.com/"))
    print("waiting for response...")
    result = await task
    print(result)


if __name__ == "__main__":
    # asyncio.run() creates the loop, runs main() and always closes the loop,
    # including when main() raises.
    asyncio.run(main())

Python: manager.Queue() with asyncio. How to resolve deadlock?

I am trying to figure out how to have a websocket based server listen to incoming requests, place them in a queue for another process to do work, then place the results in another queue where the websocket based server can wait for said result and send the response back to the client.
This is just me trying to learn and gain more experience with both asyncio and sharing data between processes. I am using Python 3.9.2 64bit.
Right now I am stuck with a deadlock as commented in the "producer_handler" function in the server code. Here is the code I am playing with:
Server:
import asyncio
import logging
import time
from multiprocessing import Manager, Process
import websockets
logging.root.setLevel(0)
def server(recievequeue, sendqueue):
    """Serve websockets on localhost:8081, bridging each connection to two queues.

    Incoming messages are put on ``recievequeue``; items taken from
    ``sendqueue`` are sent back to the client. The function starts the
    server on the current event loop and then runs that loop forever.
    """

    async def consumer_handler(websocket, path):
        # Forward every received message to recievequeue until recv() fails.
        while True:
            logging.info('Waiting for request')
            try:
                request = await websocket.recv()
            except Exception as exception:
                logging.warning(f'consumer_handler Error: {exception}')
                return
            logging.info(f'Request: {request}')
            recievequeue.put(request)
            logging.info('Request placed in recievequeue')

    async def producer_handler(websocket, path):
        # Relay every item from sendqueue to the client until send() fails.
        while True:
            logging.info('Waiting for response')
            response = sendqueue.get()# Deadlock is here.
            try:
                await websocket.send(response)
            except Exception as exception:
                logging.warning(f'producer_handler Error: {exception}')
                return
            logging.info('Response sent')

    async def handler(websocket, path):
        # Run both directions concurrently; once either one finishes, cancel
        # every task (finished ones first, then the still-pending ones).
        consumer_task = asyncio.ensure_future(consumer_handler(websocket, path))
        producer_task = asyncio.ensure_future(producer_handler(websocket, path))
        done, pending = await asyncio.wait(
            [producer_task, consumer_task],
            return_when=asyncio.FIRST_COMPLETED,
        )
        for task in (*done, *pending):
            logging.info(f'Canceling: {task}')
            task.cancel()

    eventloop = asyncio.get_event_loop()
    eventloop.run_until_complete(websockets.serve(handler, 'localhost', 8081, ssl=None))
    eventloop.run_forever()
def message_handler(recievequeue, sendqueue):
    """Loop forever: take one request, wait three seconds, answer with the current timestamp."""
    while True:
        # Test harness only: read a message from recievequeue and place a
        # reply on sendqueue.
        request = recievequeue.get()
        logging.info(f'Request: {request}')
        time.sleep(3)
        data = str(time.time())
        logging.info(f'Work completed # {data}')
        sendqueue.put(data)
def main():
    """Start the server and message-handler processes, sharing two manager queues."""
    logging.info('Starting Application')
    manager = Manager()
    sendqueue, recievequeue = manager.Queue(), manager.Queue()
    queues = (recievequeue, sendqueue)

    server_process = Process(target=server, args=queues, name='Server')
    server_process.start()
    worker_process = Process(target=message_handler, args=queues, name='Message Handler')
    worker_process.start()

    # Only the server process is joined.
    server_process.join()


if __name__ == '__main__':
    main()
And the client:
import asyncio
import logging
import websockets
logging.root.setLevel(0)
URI = "wss://localhost:8081"
async def test():
    """Connect to ``URI`` and exchange messages until either direction stops.

    A consumer task logs every message received; a producer task sends
    ``'Hello World'`` every five seconds. When the first of the two ends
    (its websocket call raised), the other one is cancelled and the
    connection is closed.
    """

    async def consumer_handler(connection):
        # Log incoming messages until recv() fails.
        while True:
            try:
                request = await connection.recv()
            except Exception as exception:
                logging.warning(f'Error: {exception}')
                break
            logging.info(request)

    async def producer_handler(connection):
        # Send a greeting every five seconds until send() fails.
        while True:
            await asyncio.sleep(5)
            try:
                await connection.send('Hello World')
            except Exception as exception:
                logging.warning(f'Error: {exception}')
                break

    async with websockets.connect(URI, ssl=None) as connection:
        consumer_task = asyncio.ensure_future(consumer_handler(connection))
        producer_task = asyncio.ensure_future(producer_handler(connection))
        # Wait exactly once. Re-waiting on finished tasks in a `while True`
        # returns immediately each time and spins the event loop.
        _, pending = await asyncio.wait(
            [consumer_task, producer_task],
            return_when=asyncio.FIRST_COMPLETED,
        )
        for task in pending:
            task.cancel()
        # Let the cancelled task unwind before the connection is closed.
        await asyncio.gather(*pending, return_exceptions=True)
def main():
    """Run the websocket client, logging any exception that escapes it."""
    logging.info('Starting Application')
    try:
        # asyncio.run() owns the loop's lifetime. No run_forever() follows,
        # so the program exits instead of idling once test() returns.
        asyncio.run(test())
    except Exception as exception:
        logging.warning(f'Error: {exception}')


if __name__ == '__main__':
    main()
If I remove the queues the server and multiple client can talk back and forth with no issues. I just can't figure out how to get() and put() the requests and responses. Any help would be appreciated!
So after looking through other posts I noticed others talking about deadlocks and using run_in_executor. After some more testing I found replacing the line causing the deadlock with the following code resolved the issue:
# Hand the blocking queue read to the loop's default executor (first argument
# None) and await it, so other coroutines keep running while it waits.
response = await eventloop.run_in_executor(None, sendqueue.get)

Run slow background blocking task from asyncio loop

I have an asyncio crawler that visits URLs and collects new URLs from the HTML responses. I was inspired by this great tool: https://github.com/aio-libs/aiohttp/blob/master/examples/legacy/crawl.py
Here is a very simplified piece of workflow, how it works:
import asyncio
import aiohttp
class Requester:
    """Crawler that fetches URLs through aiohttp, gated by a semaphore of size one."""

    def __init__(self):
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        """GET ``url`` with ``client``; print the status and return ``(response, body_text)``."""
        async with client.get(url) as response:
            raw = await response.read()
            data = raw.decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            return response, data

    async def run(self, urls):
        """Spawn one fetch task per URL, acquiring the semaphore before each spawn."""
        def _release(_finished):
            # Done-callback: free the slot for the next URL.
            self.sem.release()

        async with aiohttp.ClientSession() as client:
            for url in urls:
                await self.sem.acquire()
                asyncio.create_task(self.fetch(url, client)).add_done_callback(_release)

    def http_crawl(self, _urls_list):
        """Synchronous entry point: drive ``run`` to completion on the current event loop."""
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.ensure_future(self.run(_urls_list)))
# Seed URLs for the demo crawl.
_url_list = [
    'https://www.google.com',
    'https://images.google.com',
    'https://maps.google.com',
    'https://mail.google.com',
    'https://news.google.com',
    'https://video.google.com',
    'https://books.google.com',
]
r = Requester()
r.http_crawl(_url_list)
What I need now is to add a very slow BeautifulSoup-based function. That function must not block the main loop and should run as a background process. For instance, I will use it to handle HTTP responses.
I read python docs about it and found that: https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor
I tried to add it to my code, but it does not work as should (I use cpu_bound only for demo):
import asyncio
import aiohttp
import concurrent.futures
def cpu_bound():
    """Return the sum of ``i * i`` for ``i`` in ``range(10 ** 7)`` (demo CPU load)."""
    total = 0
    for i in range(10 ** 7):
        total += i * i
    return total
class Requester:
    """Crawler that fetches URLs through aiohttp and runs ``cpu_bound`` in a process pool per response."""

    def __init__(self):
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        """GET ``url``, run the CPU-bound demo job, and return ``(response, body_text)``."""
        async with client.get(url) as response:
            raw = await response.read()
            data = raw.decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            ####### Blocking operation #######
            # A fresh process pool is created for this call and torn down
            # when the `with` block exits.
            running_loop = asyncio.get_running_loop()
            with concurrent.futures.ProcessPoolExecutor() as pool:
                result = await running_loop.run_in_executor(pool, cpu_bound)
                print('custom process pool', result)
            #################################
            return response, data

    async def run(self, urls):
        """Spawn one fetch task per URL, acquiring the semaphore before each spawn."""
        def _release(_finished):
            # Done-callback: free the slot for the next URL.
            self.sem.release()

        async with aiohttp.ClientSession() as client:
            for url in urls:
                await self.sem.acquire()
                asyncio.create_task(self.fetch(url, client)).add_done_callback(_release)

    def http_crawl(self, _urls_list):
        """Synchronous entry point: drive ``run`` to completion on the current event loop."""
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.ensure_future(self.run(_urls_list)))
if __name__ == '__main__':
    # Guard the entry point: process-pool workers started with the "spawn"
    # method re-import this module and must not start the crawl again.
    r = Requester()
    _url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
    r.http_crawl(_url_list)
For now, it doesn't work as expected, it blocks HTTP requests every time:
URL: https://www.google.com have code: 200
custom process pool 333333283333335000000
URL: https://images.google.com have code: 200
custom process pool 333333283333335000000
URL: https://maps.google.com have code: 200
custom process pool 333333283333335000000
URL: https://mail.google.com have code: 200
custom process pool 333333283333335000000
URL: https://news.google.com have code: 200
custom process pool 333333283333335000000
URL: https://video.google.com have code: 200
custom process pool 333333283333335000000
How to correctly put the task in the background inside the main asyncio process?
Are there best practices on how to do that in a simple way, or I should use Redis for task planning?
I believe that since you are setting your BoundedSemaphore to 1 it is only allowing one instance of your task to run at a time.
You can use the ratelimiter package to limit the number of concurrent requests in a certain amount of time.
I will also post code that works for me. It uses two independent async queues; the workers of one of them hand the CPU-heavy job to a separate process pool:
import asyncio
import functools
import aiohttp
import concurrent.futures
def cpu_bound(num):
    """Return the sum of ``i * i`` for ``i`` in ``range(10 ** num)``."""
    limit = 10 ** num
    return sum(map(lambda i: i * i, range(limit)))
class Requester:
    """Two-stage crawler built on two asyncio queues.

    Fetch workers download URLs with aiohttp and pass each result to the
    heavy workers, which run ``cpu_bound`` in a shared process pool so the
    event loop stays free for further requests.
    """

    def __init__(self):
        self.threads = 3    # number of concurrent fetch workers
        self.threads2 = 10  # number of concurrent heavy workers
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url):
        """Download ``url`` with the session stored in ``self.client``.

        Returns a dict with the keys ``'url'``, ``'data'`` and ``'headers'``,
        or an empty dict when anything goes wrong (the error is printed).
        """
        try:
            timeout = aiohttp.ClientTimeout(total=10)
            # ssl=False replaces the deprecated verify_ssl=False.
            async with self.client.get(url, allow_redirects=False, ssl=False, timeout=timeout) as response:
                data = (await response.read()).decode('utf-8', 'replace')
                print("URL:", url, " have code:", response.status)
                resp_list = {'url': str(response.real_url), 'data': str(data), 'headers': dict(response.headers)}
                return resp_list
        except Exception as err:
            print(err)
            return {}

    async def heavy_worker(self, a):
        """Consume fetch results from queue ``a``; run the CPU-bound job for every non-empty one."""
        while True:
            resp_list = await a.get()
            try:
                if resp_list:
                    ####### Blocking operation #######
                    loop = asyncio.get_running_loop()
                    result = await loop.run_in_executor(self.pool, functools.partial(cpu_bound, num=5))
                    print('wappalazer', result)
                    #################################
            except Exception as err:
                print(err)
            finally:
                # Exactly one task_done() per item, whatever happened above,
                # so a.join() can never be left waiting.
                a.task_done()

    async def fetch_worker(self, q, a):
        """Consume URLs from queue ``q``, fetch them and push the results to queue ``a``."""
        while True:
            url = await q.get()
            resp_list = await self.fetch(url)
            # Hand the result over before marking the URL done, so q.join()
            # cannot complete while a result is still missing from `a`.
            await a.put(resp_list)
            q.task_done()

    async def main(self, urls):
        """Process ``urls`` through both worker stages and return when every item is handled."""
        # Queues holding the workload of each stage.
        q = asyncio.Queue()
        a = asyncio.Queue()
        # Worker tasks that drain the queues concurrently.
        workers_fetch = [asyncio.create_task(self.fetch_worker(q, a)) for _ in range(self.threads)]
        workers_heavy = [asyncio.create_task(self.heavy_worker(a)) for _ in range(self.threads2)]
        for url in urls:
            await q.put(url)
        # Wait until every URL is fetched and every result is processed.
        await q.join()
        await a.join()
        # The workers loop forever; cancel them and wait for them to unwind.
        workers = workers_fetch + workers_heavy
        for worker in workers:
            worker.cancel()
        await asyncio.gather(*workers, return_exceptions=True)

    async def run(self, _urls_list):
        """Open the shared aiohttp session in ``self.client`` and run the whole pipeline once."""
        async with aiohttp.ClientSession() as self.client:
            await self.main(_urls_list)
        print("All tasks completed")

    def http_crawl(self, _urls_list):
        """Synchronous entry point: run the crawl in a fresh event loop."""
        asyncio.run(self.run(_urls_list))
if __name__ == '__main__':
    # Guard the entry point: process-pool workers started with the "spawn"
    # method re-import this module and must not start the crawl again.
    r = Requester()
    _google_urls = [
        'https://www.google.com',
        'https://images.google.com',
        'https://maps.google.com',
        'https://mail.google.com',
        'https://news.google.com',
        'https://video.google.com',
        'https://books.google.com',
    ]
    # One unresolvable host (exercises the error path), followed by the same
    # seven URLs repeated five times.
    _url_list = ['http://aaaaaaaaaaaaaaaa.aaaaaaaaaaaaaaaaaaa.aa'] + _google_urls * 5
    r.http_crawl(_url_list)

Why does asyncio.create_task not run the method?

Code example:
async def download_page(session, url):
    """Placeholder downloader: prints ``True`` and uses neither argument."""
    print(True)
async def downloader_init(session):
    """Endlessly take URLs from ``download_queue`` and spawn a ``download_page`` task for each."""
    while True:
        url = await download_queue.get()
        spawned = asyncio.create_task(download_page(session, url))
        print(spawned)
        print(f"url: {url}")
async def get_urls(url):
    """Keep putting URLs on ``download_queue``; return ``True`` once ``NoSuchElementException`` is raised."""
    while True:
        try:
            url = find_somewhere_url
            await download_queue.put(url)
        except NoSuchElementException:
            return True
async def main():
    """Start the URL producer and the downloader inside one aiohttp session."""
    async with aiohttp.ClientSession(headers=headers) as session:
        get_urls_task = asyncio.create_task(get_urls(url))
        downloader_init_task = asyncio.create_task(downloader_init(session))
        # The gather() result is not awaited here, so this block exits
        # right after the two tasks are scheduled.
        asyncio.gather(get_urls_task, downloader_init_task)


if __name__ == "__main__":
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
Output:
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
Why is the method download_page not executed?
The strange thing is that the script just ends its work, there are no errors anywhere.
downloader_init should work endlessly, but it does not.
In download_queue, method get_urls adds links as it finds them, after which it stops working.
downloader_init should immediately execute as soon as a new link appears in the queue, but it starts its work only when get_urls has completed its work.
Try this instead:
Note: Your problem wasn't with the task creation, it was because
there wasn't an await at the asyncio.gather part.
import asyncio
import aiohttp
async def download_page(session, url):
    """Dummy downloader: only prints the session and URL it was given."""
    print(f"session={session}, url={url}")
async def downloader_init(session):
    """Endlessly take URLs from ``download_queue`` and download each one in its own task.

    The spawned tasks are not awaited here, so downloads overlap with the
    queue reads.
    """
    # The event loop only keeps weak references to tasks. Hold a strong
    # reference until each task finishes so it cannot be garbage-collected
    # mid-run.
    background = set()
    while True:
        url = await download_queue.get()
        task = asyncio.create_task(download_page(session, url))
        background.add(task)
        task.add_done_callback(background.discard)
        print(f"task={task}, url={url}")
async def get_urls(url):
    """Keep putting URLs from ``find_somewhere_url()`` on ``download_queue`` until ``NoSuchElementException`` is raised."""
    while True:
        try:
            url = find_somewhere_url()
            await download_queue.put(url)
        except NoSuchElementException:
            return
async def main():
    """Run the URL producer and the downloader together inside one aiohttp session."""
    async with aiohttp.ClientSession(headers=headers) as session:
        producer = asyncio.create_task(get_urls(url))
        consumer = asyncio.create_task(downloader_init(session))
        # Awaiting gather() keeps the session open until both tasks finish.
        await asyncio.gather(producer, consumer)


if __name__ == "__main__":
    # asyncio.run() handles loop creation, shutdown and related cleanup.
    asyncio.run(main()) # This is new in Python 3.7

Awaiting a coroutine inside of a Class while event_loop is already running

I have an Issue with asyncio I can't really get me head around.
take this working example (with Python 3.6+ because of string interpolation)
import asyncio
import aiohttp
import async_timeout
import json
async def fetch(session, url):
    """GET ``url`` through ``session`` and return the body text, all inside a ``timeout(10)`` block."""
    async with async_timeout.timeout(10), session.get(url) as response:
        return await response.text()
async def get_bittrex_marketsummary(currency_pair):
    """Fetch the Bittrex market summary for ``currency_pair`` and return the parsed JSON."""
    url = f'https://bittrex.com/api/v1.1/public/getmarketsummary?market={currency_pair}'
    async with aiohttp.ClientSession() as session:
        return json.loads(await fetch(session, url))
class MyCryptoCurrency:
    """Currency pair whose last price is requested from the constructor."""

    def __init__(self):
        self.currency = "BTC-ETH"
        self.last_price = None
        # Only schedules the coroutine on the event loop; nothing waits for
        # it here, so last_price is still None when __init__ returns.
        asyncio.ensure_future(self.get_last_price())

    async def get_last_price(self):
        """Store the market summary for ``self.currency`` in ``self.last_price``."""
        summary = await get_bittrex_marketsummary(self.currency)
        self.last_price = summary
async def main():
    """Create the currency object and print its ``last_price`` straight away."""
    eth = MyCryptoCurrency()
    print(eth.last_price)


asyncio.get_event_loop().run_until_complete(main())
While this runs and doesn't throw any exceptions, it doesn't get the result from the API request, so it doesn't work :P
If I try to use, e.g., loop.run_until_complete(get_bittrex_marketsummary()), I get an "event loop is already running" error - which kind of makes sense.
Any hints how to solve this properly?
Thx in advance!
ok, after talking about this in #python channel on freenode I got the answer "don't do async I/O in __init__", so here is the working version:
import asyncio
import aiohttp
import async_timeout
import json
async def fetch(session, url):
    """Return the text body of a GET to ``url``, performed inside a ``timeout(10)`` block."""
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            body = await response.text()
            return body
async def get_bittrex_marketsummary(currency_pair):
    """Download and JSON-decode the Bittrex market summary for ``currency_pair``."""
    url = f'https://bittrex.com/api/v1.1/public/getmarketsummary?market={currency_pair}'
    async with aiohttp.ClientSession() as session:
        payload = await fetch(session, url)
        summary = json.loads(payload)
        return summary
class MyCryptoCurrency:
    """Currency pair whose last price is loaded on demand via ``get_last_price``."""

    def __init__(self):
        # No I/O here: the constructor only sets defaults.
        self.currency = "BTC-ETH"
        self.last_price = None

    async def get_last_price(self):
        """Store the market summary for ``self.currency`` in ``self.last_price``."""
        summary = await get_bittrex_marketsummary(self.currency)
        self.last_price = summary
async def main():
    """Create the currency object, await its price lookup, then print the result."""
    eth = MyCryptoCurrency()
    await eth.get_last_price()
    print(eth.last_price)


asyncio.get_event_loop().run_until_complete(main())

Resources