I created a BaseSpider class that needs to use Playwright and httpx, but when I use them together in coroutines they conflict.
code:
import asyncio

import httpx
from httpx import AsyncClient
from playwright.sync_api import sync_playwright


class BaseSpider:
    def __init__(self):
        self.play_wright = sync_playwright().start()

    async def req(self, client: AsyncClient, i):
        res = await client.get('https://www.baidu.com')
        print(f'{i + 1}, status_code = {res.status_code}')
        return res

    async def main(self):
        async with httpx.AsyncClient() as client:
            response = await client.get('https://www.baidu.com')
            print(response)


if __name__ == '__main__':
    base = BaseSpider()
    asyncio.run(base.main())
Run error: asyncio.run() cannot be called from a running event loop
If I comment out self.play_wright = sync_playwright().start(), everything works fine.
How can I get Playwright and HTTPX to run together in coroutines?
Thank you for your help!
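One way around the conflict, as a rough sketch: use Playwright's async API (playwright.async_api.async_playwright) instead of the sync one, so both libraries share the same event loop. The async create() factory below is my own workaround for not being able to await inside __init__, not something from the original code:

import asyncio

import httpx
from playwright.async_api import async_playwright


class BaseSpider:
    @classmethod
    async def create(cls):
        # __init__ cannot await, so build the instance in an async factory
        self = cls()
        self.play_wright = await async_playwright().start()
        return self

    async def main(self):
        async with httpx.AsyncClient() as client:
            response = await client.get('https://www.baidu.com')
            print(response)
        # stop Playwright when the spider is done
        await self.play_wright.stop()


async def run():
    base = await BaseSpider.create()
    await base.main()


if __name__ == '__main__':
    asyncio.run(run())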
I'm trying to use the aiobotocore library with context managers, but I'm having a hard time configuring my credentials.
I need to create a class that configures my AWS client so I can use the put, read, and delete functions from this library.
The following code is being used for this:
from contextlib import AsyncExitStack

from aiobotocore.session import AioSession

from credentials import aws_access_key_id, aws_secret_access_key


class AWSConnectionManager:
    def __init__(self, aws_acces_key_id, aws_secret_access_key):
        self.aws_acces_key_id = aws_acces_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self._exit_stack = AsyncExitStack()
        self._client = None
        print('__init__')

    async def __aenter__(self):
        session = AioSession
        self._client = await self._exit_stack.enter_async_context(session.create_client('s3'))
        print('__aenter__')

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._exit_stack.__aexit__(exc_type, exc_val, exc_tb)
        print('__aexit__')


res = AWSConnectionManager(aws_acces_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
But it never goes through the __aenter__ and __aexit__ methods.
With the code above I have the following output:
__init__
<__main__.AWSConnectionManager object at 0x7f03921ac640>
Does anyone know what might be wrong with my code?
First: you need to change session = AioSession to session = AioSession(), add a return to __aenter__, and pass your credentials to create_client:
async def __aenter__(self):
    session = AioSession()
    self._client = await self._exit_stack.enter_async_context(
        session.create_client(
            's3',
            aws_secret_access_key=self.aws_secret_access_key,
            aws_access_key_id=self.aws_access_key_id,  # __init__ must store it under this name (fix the aws_acces_key_id typo)
        )
    )
    return self
Second: you need to write/add proxy methods for put_object/get_object, or make _client public by renaming _client to client:
async def save_file(self, content, s3_filename: str):
    # self.bucket is assumed to be set on the instance elsewhere
    await self._client.put_object(Bucket=self.bucket, Body=content, Key=f'{s3_filename}')

async def load_file(self, name):
    obj = await self._client.get_object(Bucket=self.bucket, Key=f'{name}')
    return await obj['Body'].read()
Now you can use it like this:

async with AWSConnectionManager(aws_acces_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) as storage:
    await storage.load_file(name='test.txt')
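Putting the pieces together, a minimal end-to-end sketch of the corrected class. Keep in mind that __aenter__/__aexit__ only run when the object is used in an async with block, which the original snippet never does; the bucket parameter and the demo bucket name below are assumptions for illustration, not part of the original code:

from contextlib import AsyncExitStack

from aiobotocore.session import AioSession

from credentials import aws_access_key_id, aws_secret_access_key


class AWSConnectionManager:
    def __init__(self, aws_access_key_id, aws_secret_access_key, bucket):
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.bucket = bucket  # assumed: bucket name passed in for the proxy methods
        self._exit_stack = AsyncExitStack()
        self._client = None

    async def __aenter__(self):
        session = AioSession()
        self._client = await self._exit_stack.enter_async_context(
            session.create_client(
                's3',
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._exit_stack.__aexit__(exc_type, exc_val, exc_tb)

    async def save_file(self, content, s3_filename: str):
        await self._client.put_object(Bucket=self.bucket, Body=content, Key=s3_filename)

    async def load_file(self, name: str):
        obj = await self._client.get_object(Bucket=self.bucket, Key=name)
        return await obj['Body'].read()


async def demo():
    # __aenter__/__aexit__ are triggered by the async with statement
    async with AWSConnectionManager(aws_access_key_id, aws_secret_access_key, bucket='my-bucket') as storage:
        print(await storage.load_file('test.txt'))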
I'm having some trouble parsing two or more sessions at the same time with Telethon. I have tried this:
import asyncio

from telethon import TelegramClient, events

# api_id and api_hash are defined elsewhere


class NewSession:
    def __init__(self, session_name):
        self.client = TelegramClient(session_name, api_id, api_hash)
        self.session_name = session_name

    async def pool(self):
        print("working with:", self.session_name)

        @self.client.on(events.NewMessage(outgoing=True))
        async def main(event):
            message = event.message.to_dict()
            msg_text = message['message']
            print(msg_text)

        try:
            await self.client.start()
            await self.client.run_until_disconnected()
        finally:
            await self.client.disconnect()


async def main():
    user = NewSession("321")
    user2 = NewSession("123")
    await user.pool()
    await user2.pool()


if __name__ == '__main__':
    asyncio.run(main())
But only one of them is working. I need help :)
The problem is inside your main function. When you await a coroutine, execution does not continue to the next expression until that coroutine returns. So in your code the line await user2.pool() is only executed once the user.pool() coroutine returns a value, which is when the session '321' is disconnected.
You need to run the tasks concurrently; you can use the function asyncio.gather. Reworking your main:
async def main():
    user = NewSession("321")
    user2 = NewSession("123")
    await asyncio.gather(user.pool(), user2.pool())
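On Python 3.11+ the same concurrency can also be written with asyncio.TaskGroup, which additionally cancels the sibling task if one of them fails. This is only a sketch of an alternative; asyncio.gather above is enough:

async def main():
    user = NewSession("321")
    user2 = NewSession("123")
    # both sessions are started as tasks and awaited together
    async with asyncio.TaskGroup() as tg:  # Python 3.11+
        tg.create_task(user.pool())
        tg.create_task(user2.pool())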
How can I fix this? I have played with it a lot and for a long time, but nothing has come of it.
sql_db.py:
import asyncio

import asyncpg

LOG_PG = {"database": 'test_bot',
          "user": 'misha',
          "password": '1234',
          "host": 'localhost'}


class Database:
    SELECT_USER_LANG = "SELECT lang FROM lang_set WHERE user_id = $1 AND bot_token = $2"

    def __init__(self, loop: asyncio.AbstractEventLoop):
        self.pool = loop.run_until_complete(
            asyncpg.create_pool(**LOG_PG)
        )

    async def get_lang(self, user_id, token):
        search_d = [user_id, token]
        res = await self.pool.fetchval(self.SELECT_USER_LANG, *search_d)
        if res is None:
            return "ru"
        return res
I tried inserting this loop everywhere and running without it, with multiple combinations in the code itself, but nothing changed. I don't know how to describe the problem in more detail.
main.py:
import asyncio

from aiogram import Bot, Dispatcher
from aiogram.types import Message

from sql_db import Database

loop = asyncio.get_event_loop()

token = "TOKEN"

dp = Dispatcher()
bot = Bot(token=token, parse_mode="HTML")
db = Database(loop)


async def echo_msg(message: Message):
    user_id = message.from_user.id
    await message.send_copy(user_id)
    await db.get_lang(user_id, token)

dp.message.register(callback=echo_msg)

if __name__ == '__main__':
    dp.run_polling(bot, skip_updates=True)
error:
...
File "/home/mickey/Desktop/chat_admin/venv/lib/python3.8/site-packages/asyncpg/pool.py", line 867, in release
return await asyncio.shield(ch.release(timeout))
File "/home/mickey/Desktop/chat_admin/venv/lib/python3.8/site-packages/asyncpg/pool.py", line 224, in release
raise ex
File "/home/mickey/Desktop/chat_admin/venv/lib/python3.8/site-packages/asyncpg/pool.py", line 214, in release
await self._con.reset(timeout=budget)
File "/home/mickey/Desktop/chat_admin/venv/lib/python3.8/site-packages/asyncpg/connection.py", line 1367, in reset
await self.execute(reset_query, timeout=timeout)
File "/home/mickey/Desktop/chat_admin/venv/lib/python3.8/site-packages/asyncpg/connection.py", line 318, in execute
return await self._protocol.query(query, timeout)
File "asyncpg/protocol/protocol.pyx", line 323, in query
File "asyncpg/protocol/protocol.pyx", line 707, in asyncpg.protocol.protocol.BaseProtocol._check_state
asyncpg.exceptions._base.InterfaceError: cannot perform operation: another operation is in progress
It works when launched like this: the pool has to be created from inside aiogram's startup hook, so it is bound to the same event loop that run_polling uses (creating it beforehand on a separate loop appears to be what triggered the InterfaceError). I don't know how to put it into words exactly, but I was lucky enough to understand the problem.
...
data_ = {}


class Database:
    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool

    async def get_lang(self, user_id, token):
        search_d = [user_id, token]
        async with self.pool.acquire() as conn:
            res = await conn.fetchval(SELECT_USER_LANG, *search_d)
        if res is None:
            return "ru"
        return res


async def create_pool():
    pool = await asyncpg.create_pool(**LOG_PG)
    data_["db"] = Database(pool)


async def echo_msg(message: Message):
    user_id = message.from_user.id
    await message.send_copy(user_id)
    await data_["db"].get_lang(user_id, token)

dp.message.register(callback=echo_msg)

if __name__ == '__main__':
    dp.startup.register(create_pool)  # ANSWER
    dp.run_polling(bot, skip_updates=True)
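A variation on the same idea, as a sketch only: assuming your aiogram 3 version supports storing objects in the dispatcher's workflow data (dp["db"] = ...) and injecting them into handlers by parameter name, you can avoid the module-level data_ dict:

async def create_pool(dispatcher: Dispatcher):
    # the startup hook runs inside the same event loop as polling
    pool = await asyncpg.create_pool(**LOG_PG)
    dispatcher["db"] = Database(pool)


async def echo_msg(message: Message, db: Database):
    user_id = message.from_user.id
    await message.send_copy(user_id)
    await db.get_lang(user_id, token)

dp.message.register(callback=echo_msg)

if __name__ == '__main__':
    dp.startup.register(create_pool)
    dp.run_polling(bot, skip_updates=True)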
I have an asyncio crawler that visits URLs and collects new URLs from HTML responses. I was inspired by this great tool: https://github.com/aio-libs/aiohttp/blob/master/examples/legacy/crawl.py
Here is a very simplified piece of the workflow, showing how it works:
import asyncio

import aiohttp


class Requester:
    def __init__(self):
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            return response, data

    async def run(self, urls):
        async with aiohttp.ClientSession() as client:
            for url in urls:
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())

    def http_crawl(self, _urls_list):
        loop = asyncio.get_event_loop()
        crawl_loop = asyncio.ensure_future(self.run(_urls_list))
        loop.run_until_complete(crawl_loop)


r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
What I need now is to add a very slow BeautifulSoup-based function. That function must not block the main loop; it should work as a background process while, for instance, I keep handling HTTP responses.
I read the Python docs about this and found: https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor
I tried to add it to my code, but it does not work as it should (I use cpu_bound only for the demo):
import asyncio
import concurrent.futures

import aiohttp


def cpu_bound():
    return sum(i * i for i in range(10 ** 7))


class Requester:
    def __init__(self):
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)

            ####### Blocking operation #######
            loop = asyncio.get_running_loop()
            with concurrent.futures.ProcessPoolExecutor() as pool:
                result = await loop.run_in_executor(pool, cpu_bound)
                print('custom process pool', result)
            #################################

            return response, data

    async def run(self, urls):
        async with aiohttp.ClientSession() as client:
            for url in urls:
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())

    def http_crawl(self, _urls_list):
        loop = asyncio.get_event_loop()
        crawl_loop = asyncio.ensure_future(self.run(_urls_list))
        loop.run_until_complete(crawl_loop)


r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
For now it doesn't work as expected; it blocks the HTTP requests every time:
URL: https://www.google.com have code: 200
custom process pool 333333283333335000000
URL: https://images.google.com have code: 200
custom process pool 333333283333335000000
URL: https://maps.google.com have code: 200
custom process pool 333333283333335000000
URL: https://mail.google.com have code: 200
custom process pool 333333283333335000000
URL: https://news.google.com have code: 200
custom process pool 333333283333335000000
URL: https://video.google.com have code: 200
custom process pool 333333283333335000000
How do I correctly put the task in the background inside the main asyncio process?
Are there best practices for doing this in a simple way, or should I use Redis for task planning?
I believe that since you are setting your BoundedSemaphore to 1, only one instance of your task is allowed to run at a time.
You can use the ratelimiter package to limit the number of concurrent requests in a certain amount of time.
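To illustrate that point, here is a sketch of the two minimal changes to the original Requester that let fetches and the CPU-bound work overlap: raise the semaphore limit (5 here is an arbitrary choice) and create the process pool once instead of per request:

class Requester:
    def __init__(self):
        # allow several fetch tasks in flight instead of exactly one
        self.sem = asyncio.BoundedSemaphore(5)
        # one shared pool; a fresh ProcessPoolExecutor per request adds startup/shutdown waits
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url, client):
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
        loop = asyncio.get_running_loop()
        # the event loop keeps serving other fetch tasks while the worker process runs
        result = await loop.run_in_executor(self.pool, cpu_bound)
        print('custom process pool', result)
        return response, data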
I would also like to share code that works for me. It uses two independent async queues, and one of them hands the high-CPU work off to separate processes via a process pool:
import asyncio
import concurrent.futures
import functools

import aiohttp


def cpu_bound(num):
    return sum(i * i for i in range(10 ** num))


class Requester:
    def __init__(self):
        self.threads = 3
        self.threads2 = 10
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url):
        try:
            timeout = aiohttp.ClientTimeout(total=10)
            async with self.client.get(url, allow_redirects=False, verify_ssl=False, timeout=timeout) as response:
                data = (await response.read()).decode('utf-8', 'replace')
                print("URL:", url, " have code:", response.status)
                resp_list = {'url': str(response.real_url), 'data': str(data), 'headers': dict(response.headers)}
                return resp_list
        except Exception as err:
            print(err)
            return {}

    async def heavy_worker(self, a):
        while True:
            resp_list = await a.get()
            if resp_list.keys():
                ####### Blocking operation #######
                try:
                    loop = asyncio.get_event_loop()
                    result = await loop.run_in_executor(self.pool, functools.partial(cpu_bound, num=5))
                    print('wappalazer', result)
                except Exception as err:
                    print(err)
                #################################
                a.task_done()
            else:
                a.task_done()

    async def fetch_worker(self, q, a):
        while True:
            url = await q.get()
            resp_list = await self.fetch(url)
            q.task_done()
            await a.put(resp_list)

    async def main(self, urls):
        # Create the queues we will use to store our "workload".
        q = asyncio.Queue()
        a = asyncio.Queue()

        # Create worker tasks to process the queues concurrently.
        workers_fetch = [asyncio.create_task(self.fetch_worker(q, a)) for _ in range(self.threads)]
        workers_heavy = [asyncio.create_task(self.heavy_worker(a)) for _ in range(self.threads2)]

        for url in urls:
            await q.put(url)

        # wait for all tasks to be processed
        await q.join()
        await a.join()

        # Cancel our worker tasks.
        for worker in workers_fetch:
            worker.cancel()
        await asyncio.gather(*workers_fetch, return_exceptions=True)
        for worker in workers_heavy:
            worker.cancel()
        await asyncio.gather(*workers_heavy, return_exceptions=True)

    async def run(self, _urls_list):
        async with aiohttp.ClientSession() as self.client:
            task_for_first_run = asyncio.create_task(self.main(_urls_list))
            await asyncio.sleep(1)
            await task_for_first_run
            print("All tasks completed")

    def http_crawl(self, _urls_list):
        asyncio.run(self.run(_urls_list))


r = Requester()
_url_list = ['http://aaaaaaaaaaaaaaaa.aaaaaaaaaaaaaaaaaaa.aa', 'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
             'https://news.google.com','https://video.google.com','https://books.google.com', 'https://www.google.com',
             'https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com',
             'https://video.google.com','https://books.google.com', 'https://www.google.com','https://images.google.com',
             'https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com',
             'https://books.google.com', 'https://www.google.com','https://images.google.com','https://maps.google.com',
             'https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com',
             'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
             'https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)