I am trying to write a Python program that makes HTTP requests with asyncio and aiohttp and parses the data from the responses. The program takes as input a list of URLs that can reach more than 300-400 elements, and it should make the requests as fast as possible. Since asyncio runs on a single thread, I was thinking of splitting the list into sublists and starting a thread for each sublist, each running asyncio and aiohttp, but when I add the ThreadPoolExecutor I get coroutine and future errors. How can I use Thread and asyncio together? I leave the code below.
async def fetch(url, session):
    async with session.get(url, ssl=False) as response:
        # if response.status == 200:
        html_body = await response.json()
        url = url.split('/')[-2]
        file_name = url if url != 'services' else 'live'
        async with aiofiles.open(f'{output_dir}/{file_name}.json', 'w') as f:
            await f.write(json.dumps(html_body, indent=4))
        return html_body

async def fetch_with_sem(sem, session, url):
    async with sem:
        return await fetch(url, session)
async def run(urls, session, sem):
    tasks = [asyncio.create_task(fetch_with_sem(sem, session, url)) for url in urls]
    page_content = await asyncio.gather(*tasks, return_exceptions=True)
    return page_content
async def main(urls):
    number = len(urls) // 10 + 1
    sem = asyncio.Semaphore(50)
    loop = asyncio.get_event_loop()
    connector = aiohttp.TCPConnector(limit_per_host=30, limit=50, ttl_dns_cache=100)
    headers = {
        'user-agent': get_user_agent(),
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': "macOS",
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'x-eb-accept-language': 'it_IT',
        'x-eb-marketid': '5',
        'x-eb-platformid': '1',
    }
    async with ClientSession(loop=loop, connector=connector, headers=headers) as session:
        if isinstance(urls, list):
            with ThreadPoolExecutor(max_workers=25) as executor:
                page_content = [
                    executor.submit(
                        run, urls[number * i : number * (i + 1)], session, sem
                    ).result()
                    for i in range(10)
                ]
        else:
            tasks = [asyncio.create_task(fetch_with_sem(sem, session, urls))]
            page_content = await asyncio.gather(*tasks, return_exceptions=True)
    return page_content
The problem comes from executor.submit; I get this error:
RuntimeWarning: coroutine 'wait' was never awaited
page_content = [executor.submit(asyncio.wait(run), urls[number * i : number * (i + 1)], session, sem).result() for i in range(10)]
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
Traceback (most recent call last):
File "/Users/federikowsky/Desktop/Python/Scraping/SureBet/prova.py", line 123, in <module>
x = asyncio.run(main(link))
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
return future.result()
File "/Users/federikowsky/Desktop/Python/Scraping/SureBet/prova.py", line 110, in main
page_content = [executor.submit(asyncio.wait(run), urls[number * i : number * (i + 1)], session, sem).result() for i in range(10)]
File "/Users/federikowsky/Desktop/Python/Scraping/SureBet/prova.py", line 110, in <listcomp>
page_content = [executor.submit(asyncio.wait(run), urls[number * i : number * (i + 1)], session, sem).result() for i in range(10)]
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/concurrent/futures/_base.py", line 437, in result
return self.__get_result()
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
raise self._exception
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
TypeError: 'coroutine' object is not callable
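For reference, a minimal sketch of one way to combine threads and asyncio (fetch_all, worker, and run_in_threads are illustrative names, not from the code above): each thread drives its own event loop via asyncio.run and owns its own ClientSession, since event loops and aiohttp sessions must not be shared across threads. In practice, a single event loop with a semaphore usually handles 300-400 URLs just as fast.

import asyncio
from concurrent.futures import ThreadPoolExecutor

import aiohttp

async def fetch(session, url):
    # One GET request; returns the decoded JSON body.
    async with session.get(url, ssl=False) as response:
        return await response.json()

async def fetch_all(urls):
    # Each thread gets its own session: sessions are bound to one loop.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch(session, url) for url in urls),
            return_exceptions=True,
        )

def worker(urls):
    # asyncio.run creates and closes a fresh event loop in this thread.
    return asyncio.run(fetch_all(urls))

def run_in_threads(urls, n_threads=4):
    chunk = len(urls) // n_threads + 1
    sublists = [urls[i * chunk:(i + 1) * chunk] for i in range(n_threads)]
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        results = executor.map(worker, sublists)
    return [page for sub in results for page in sub]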
import paypalrestsdk
import httpx

class paypal:
    def __init__(self):
        self.secret_id = 'XXXX'
        self.client_id = 'XXXX'
        self.token = ''

    def getToken(self):
        headers = {
            'Accept': 'application/json',
            'Accept-Language': 'en_US',
        }
        data = {
            'grant_type': 'client_credentials'
        }
        response = httpx.post(
            url='https://api.sandbox.paypal.com/v1/oauth2/token',
            data=data, headers=headers,
            auth=(self.client_id, self.secret_id),
        )
        response_data = response.json()
        self.token = response_data['access_token']

    def getBalance(self):
        print(self.token)
        headers = {
            'Content-Type': 'application/json',
            'Authorization': 'Bearer ' + self.token,
            'Accept': 'application/x-www-form-urlencoded'
        }
        response = httpx.post(url='https://api.sandbox.paypal.com/v2/wallet/balance-accounts', headers=headers)
        print(response.status_code)
        response_data = response.json()
        print(response_data)
        available = response_data['total_available'][0]['value']
        print(response_data)

if __name__ == "__main__":
    s = paypal()
    s.getToken()
    s.getBalance()
I am getting a 404 code. Am I doing something wrong?
Traceback (most recent call last):
File "C:/Users/localhost/PycharmProjects/Telegram/paypal/Main.py", line 48, in <module>
s.getBalance()
File "C:/Users/localhost/PycharmProjects/Telegram/paypal/Main.py", line 37, in getBalance
response_data = response.json()
File "C:\Users\localhost\AppData\Local\Programs\Python\Python37\lib\site-packages\httpx\models.py", line 899, in json
return jsonlib.loads(self.text, **kwargs)
File "C:\Users\localhost\AppData\Local\Programs\Python\Python37\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\Users\localhost\AppData\Local\Programs\Python\Python37\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\localhost\AppData\Local\Programs\Python\Python37\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
404
https://developer.paypal.com/docs/limited-release/balance-accounts/v2/api/
I also tried with
"Authorization: Access-Token", but the response is the same. I read and searched the docs and didn't find anything, and the access token is fresh, so I don't understand; the access token I get is valid.
You are sending a POST request instead of a GET request. Use httpx.get instead of httpx.post.
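That is, keeping the headers and token logic from the question, getBalance only needs its request line changed (a minimal sketch):

response = httpx.get(url='https://api.sandbox.paypal.com/v2/wallet/balance-accounts', headers=headers)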
You can even use httpx_auth to keep your code focusing on what you want to retrieve:
import paypalrestsdk
import httpx
import httpx_auth

class paypal:
    def __init__(self):
        self.auth = httpx_auth.OAuth2ClientCredentials(
            token_url='https://api.sandbox.paypal.com/v1/oauth2/token',
            client_id='XXXX',
            client_secret='XXXX',
        )

    def getBalance(self):
        response = httpx.get(url='https://api.sandbox.paypal.com/v2/wallet/balance-accounts', auth=self.auth)
        print(response.status_code)
        response_data = response.json()
        print(response_data)
        available = response_data['total_available'][0]['value']
        print(response_data)

if __name__ == "__main__":
    s = paypal()
    s.getBalance()
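With httpx_auth, the token is fetched (and reused) by the auth object on demand, which is why the separate getToken step disappears.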
I'm downloading all the images from pexels.com for a keyword given by the user. The program gives me the following error:
Traceback (most recent call last):
File "./asyncioPexels.py", line 73, in <module>
asyncio.run(forming_all_pages(numberOfPages, mainurl))
File "/usr/lib/python3.7/asyncio/base_events.py", line 573, in run_until_complete
return future.result()
File "./asyncioPexels.py", line 50, in forming_all_pages
await download_all_pages(urls)
File "./asyncioPexels.py", line 38, in download_all_pages
async with aiohttp.ClientSession as session:
AttributeError: __aexit__
I think the problem now is that I'm using the function download_all_pages as a context manager! If this is the problem, how should I fix it? I have a general idea of how to make it work as a context manager, or is there an easier solution?
Here is my whole code:
async def download_single_image(subsession, imgurl):
    print(f'Downloading img {imgurl}')
    async with session.get(imgurl) as res:
        imgFile = open(os.path.join(str(keyword), os.path.basename(imgurl)), 'wb')
        for chunk in res.iter_content(100000):
            imgFile.write(chunk)
        imgFile.close()

async def download_all_images(imgurls):
    async with aiohttp.ClientSession as subsession:
        subtasks = []
        for imgurl in imgurls:
            subtask = asyncio.ensure_future(download_single_image(subsession, imgurl))
            subtasks.append(subtask)
        await asyncio.gather(*subtasks, return_exception=True)

async def download_single_page(session, url):
    print(f'Downloading page {url}...')
    imgurls = []
    async with session.get(url) as response:
        imgs = response.text.split('infiniteScrollingAppender.append')[1:]
        for img in imgs:
            soup = BeautifulSoup(img[2:-5].replace("\\'", "'").replace('\\"', '"'), 'html.parser')
            imgurls.append(soup.select('.photo-item__img')[0].get('srcset'))
    await download_all_images(imgurls)

async def download_all_pages(urls):
    async with aiohttp.ClientSession as session:
        tasks = []
        for url in urls:
            task = asyncio.ensure_future(download_single_page(session, url))
            tasks.append(task)
        await asyncio.gather(*tasks, return_exception=True)

async def forming_all_pages(numberOfPages, mainurl):
    urls = []
    for _ in range(1, numberOfPages + 1):
        page = mainurl + str(_)
        urls.append(page)
    await download_all_pages(urls)

if __name__ == "__main__":
    asyncio.run(forming_all_pages(numberOfPages, mainurl))
How can I solve this problem so the code runs?
In forming_all_pages you have
download_all_pages(urls)
But as the exception tells you
./asyncioPexels.py:50: RuntimeWarning: coroutine 'download_all_pages' was never awaited
Change this to
await download_all_pages(urls)
You also need to change download_single_page to use
await download_all_images(imgurls)
Finally, forming_all_pages needs to be awaitable. You need to change it to
async def forming_all_pages(numberOfPages, mainurl):
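One more detail worth flagging, as a hedged aside: the AttributeError: __aexit__ in the posted traceback is raised because aiohttp.ClientSession is entered as a class rather than an instance, and asyncio.gather's keyword is return_exceptions (plural). A sketch of download_all_pages with both fixed; the same two fixes apply to download_all_images:

async def download_all_pages(urls):
    # ClientSession() (an instance), not ClientSession (the class)
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(download_single_page(session, url)) for url in urls]
        # the keyword is return_exceptions, not return_exception
        await asyncio.gather(*tasks, return_exceptions=True)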
I am trying to connect to a .onion site using Python. I have Tor running on port 9050 and I am getting the following error:
Traceback (most recent call last):
File "/Users/jane/code/test/test.py", line 15, in main
res = await fetch(session, id)
File "/Users/jane/code/test/test.py", line 9, in fetch
async with session.get(url) as res:
File "/usr/local/lib/python3.7/site-packages/aiohttp/client.py", line 1005, in __aenter__
self._resp = await self._coro
File "/usr/local/lib/python3.7/site-packages/aiohttp/client.py", line 476, in _request
timeout=real_timeout
File "/usr/local/lib/python3.7/site-packages/aiohttp/connector.py", line 522, in connect
proto = await self._create_connection(req, traces, timeout)
File "/usr/local/lib/python3.7/site-packages/aiohttp/connector.py", line 854, in _create_connection
req, traces, timeout)
File "/usr/local/lib/python3.7/site-packages/aiohttp/connector.py", line 959, in _create_direct_connection
raise ClientConnectorError(req.connection_key, exc) from exc
aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host intelex7ny6coqno.onion:80 ssl:None [nodename nor servname provided, or not known]
The code:
import asyncio
import aiohttp
from aiohttp_socks import SocksConnector

async def fetch(session, id):
    print('Starting {}'.format(id))
    url = 'http://intelex7ny6coqno.onion/topic/{}'.format(id)
    async with session.get(url) as res:
        return res.text

async def main(id):
    connector = SocksConnector.from_url('socks5://localhost:9050')
    async with aiohttp.ClientSession(connector=connector) as session:
        res = await fetch(session, id)
        print(res)

if __name__ == '__main__':
    ids = ['10', '11', '12']
    loop = asyncio.get_event_loop()
    future = [asyncio.ensure_future(main(id)) for id in ids]
    loop.run_until_complete(asyncio.wait(future))
This code works fine:
import requests

session = requests.session()
session.proxies['http'] = 'socks5h://localhost:9050'
session.proxies['https'] = 'socks5h://localhost:9050'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

res = session.get(url, headers=headers)
print(res)
Why am I getting Cannot connect to host intelex7ny6coqno.onion:80 ssl:None [nodename nor servname provided, or not known]?
What am I missing here?
By default the connector appears to use the local DNS resolver to resolve hostnames asynchronously, and the local resolver cannot resolve .onion addresses. With requests, the socks5h scheme gives you DNS resolution over SOCKS (Tor), which is why that version works.
Adding rdns=True appears to work for .onion addresses:
connector = SocksConnector.from_url('socks5://localhost:9050', rdns=True)
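In context, main from the question would become something like this sketch, with the rest of the code unchanged:

async def main(id):
    # rdns=True defers hostname resolution to the SOCKS proxy (Tor),
    # which is the only resolver that can handle .onion names
    connector = SocksConnector.from_url('socks5://localhost:9050', rdns=True)
    async with aiohttp.ClientSession(connector=connector) as session:
        res = await fetch(session, id)
        print(res)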
I am running aiohttp as my server. When a request comes in, I try to spawn a process to handle it. But I get the below error:
Traceback (most recent call last):
  File "asyncppx.py", line 33, in <module>
    app.add_routes([web.get('/', asyncio.ensure_future(runMcows(n)))])
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_app.py", line 231, in add_routes
    self.router.add_routes(routes)
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_urldispatcher.py", line 966, in add_routes
    route_obj.register(self)
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_routedef.py", line 38, in register
    reg(self.path, self.handler, **self.kwargs)
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_urldispatcher.py", line 922, in add_get
    resource.add_route(hdrs.METH_HEAD, handler, **kwargs)
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_urldispatcher.py", line 269, in add_route
    expect_handler=expect_handler)
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_urldispatcher.py", line 682, in __init__
    resource=resource)
  File "/Users/i3ye/Programming/vsc/async/env/lib/python3.6/site-packages/aiohttp/web_urldispatcher.py", line 103, in __init__
    assert callable(handler), handler
AssertionError: <Task ...>
Task was destroyed but it is pending! task: <Task ...>
sys:1: RuntimeWarning: coroutine 'runMcows' was never awaited
The code is below, any suggestions?
from aiohttp import web
import aiohttp
import asyncio

loop = asyncio.get_event_loop()
#tasks = []
n = 0

def mcowA(n):
    print(n, " : A")
    return

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

def mcowB(n):
    print(n, " : B")
    return

async def runMcows(n):
    mcowA(n)
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://localhost:8081')
        mcowB(n)
    return html

try:
    app = web.Application()
    app.add_routes([web.get('/', asyncio.ensure_future(runMcows(n)))])
    loop.run_forever()
    web.run_app(app)
finally:
    loop.close()
If you look at the aiohttp server examples, your code should be like this in the main execution:
app = web.Application()
app.add_routes([web.get('/', runMcows)])
web.run_app(app)
In app.add_routes you need to pass the coroutine function runMcows itself, and a handler can only take one argument, the request itself:
async def runMcows(request):
    mcowA(n)
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://localhost:8081')
        mcowB(n)
    return web.Response(text=html)  # Change this response type based on what you need.
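Putting the pieces together, a minimal runnable sketch of the whole server under those changes (keeping n as the module-level value from the question; the localhost:8081 URL is also from the question):

from aiohttp import web
import aiohttp

n = 0

def mcowA(n):
    print(n, " : A")

def mcowB(n):
    print(n, " : B")

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def runMcows(request):
    mcowA(n)
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://localhost:8081')
        mcowB(n)
    return web.Response(text=html)

app = web.Application()
app.add_routes([web.get('/', runMcows)])
web.run_app(app)  # run_app manages the event loop; no run_forever()/close() needed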
I keep getting the error AttributeError: __aexit__ on the code below, but I don't really understand why it happens.
My Python version is: 3.6.4 (v3.6.4:d48eceb, Dec 19 2017, 06:04:45) [MSC v.1900 32 bit (Intel)]
import aiohttp
import asyncio
import tqdm

async def fetch_url(session_, url_, timeout_=10):
    with aiohttp.Timeout(timeout_):
        async with session_.get(url_) as response:
            text = await response.text()
            print("URL: {} - TEXT: {}".format(url_, len(text)))
            return text

async def parse_url(session, url, timeout=10):
    # get doc from url
    async with await fetch_url(session, url, timeout) as doc:
        print("DOC: {}".format(doc, len(doc)))
        return doc

async def parse_urls(session, urls, loop):
    tasks = [parse_url(session, url) for url in urls]
    responses = [await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks))]
    return responses

if __name__ == '__main__':
    tickers = ['CTXS', 'MSFT', 'AAPL', 'GPRO', 'G', 'INTC', 'SYNC', 'SYNA']
    urls = ["https://finance.yahoo.com/quote/{}".format(ticker) for ticker in tickers]
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession(loop=loop) as session:
        parsed_data = loop.run_until_complete(parse_urls(session, urls, loop))
    print(parsed_data)
Error callstack:
C:\Python\Python36\python.exe C:/Users/me/.PyCharmCE2017.3/config/scratches/scratch_4.py
0%| | 0/8 [00:00<?, ?it/s]
Traceback (most recent call last):
URL: https://finance.yahoo.com/quote/CTXS - TEXT: 462138
File "C:/Users/me/.PyCharmCE2017.3/config/scratches/scratch_4.py", line 34, in <module>
parsed_data = loop.run_until_complete(parse_urls(session, urls, loop))
File "C:\Python\Python36\lib\asyncio\base_events.py", line 467, in run_until_complete
return future.result()
File "C:/Users/me/.PyCharmCE2017.3/config/scratches/scratch_4.py", line 23, in parse_urls
responses = [await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total = len(tasks))]
File "C:/Users/me/.PyCharmCE2017.3/config/scratches/scratch_4.py", line 23, in <listcomp>
responses = [await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total = len(tasks))]
File "C:\Python\Python36\lib\asyncio\tasks.py", line 458, in _wait_for_one
return f.result() # May raise f.exception().
File "C:/Users/me/.PyCharmCE2017.3/config/scratches/scratch_4.py", line 16, in parse_url
async with await fetch_url(session, url, timeout) as doc:
AttributeError: __aexit__
Process finished with exit code 1
You are trying to use fetch_url as a context manager, but it isn't one. You can either make it one
class fetch_url:
    def __init__(self, session, url, timeout=10):
        self.session = session
        self.url = url
        self.timeout = timeout

    async def __aenter__(self):
        with aiohttp.Timeout(self.timeout):
            async with self.session.get(self.url) as response:
                text = await response.text()
                print("URL: {} - TEXT: {}".format(self.url, len(text)))
                return text

    async def __aexit__(self, exc_type, exc, tb):
        # clean up anything you need to clean up
        pass
or change your code to
async def parse_url(session, url, timeout=10):
    # get doc from url
    doc = await fetch_url(session, url, timeout)
    print("DOC: {}".format(doc, len(doc)))
    return doc
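With the class-based option, parse_url keeps its original async with shape, as a sketch:

async def parse_url(session, url, timeout=10):
    # fetch_url is now a real async context manager (it defines __aenter__/__aexit__)
    async with fetch_url(session, url, timeout) as doc:
        print("DOC: {}".format(doc, len(doc)))
        return doc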