I am trying to use request interception (setRequestInterception) to speed up page loading with Pyppeteer.
However, I am getting the warning:
RuntimeWarning: coroutine 'block_image' was never awaited
I can't figure out where I am missing an await. I've added awaits within the intercept function itself, following a template I found online, and I am testing the interception setup with Pyppeteer.
Thank you.
#utils.py
class MakeRequest():
    ua = User_Agent()

    async def _proxy_browser(self, url,
                             headless=False,
                             intercept_func=None,
                             proxy=True,
                             **kwargs):
        if proxy:
            args = [*proxy,
                    '--ignore-certificate-errors']
        else:
            args = ['--ignore-certificate-errors']
        for i in range(3):
            try:
                browser = await launch(headless=headless,
                                       args=args,
                                       defaultViewport=None)
                page = await browser.newPage()
                await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0')
                if intercept_func is not None:
                    await page.setRequestInterception(True)
                    page.on('request', intercept_func)
                await page.goto(url, {'waitUntil': 'load', 'timeout': 0})
                content = await page.content()
                return content
            except (pyppeteer.errors.PageError,
                    pyppeteer.errors.TimeoutError,
                    pyppeteer.errors.BrowserError,
                    pyppeteer.errors.NetworkError) as e:
                print('error', e)
                time.sleep(2)
                continue
            finally:
                await browser.close()
        return
scraper.py:
REQUESTER = MakeRequest()

async def block_image(request):
    if request.url.endswith('.png') or request.url.endswith('.jpg'):
        print(request.url)
        await request.abort()
    else:
        await request.continue_()

def get_request(url):
    for i in range(3):
        response = REQUESTER.proxy_browser_request(url=url,
                                                   headless=False,
                                                   intercept_func=block_image)
        if response:
            return response
        else:
            print(f'Attempt {i + 1}: {url} links not found')
            print('retrying...')
            time.sleep(3)
Your function block_image is a coroutine, but the callback passed to page.on is expected to be a normal function. Try writing a synchronous lambda function that wraps the coroutine in a Task (thus scheduling it on the current event loop):
if intercept_func is not None:
    await page.setRequestInterception(True)
    page.on('request', lambda request: asyncio.create_task(intercept_func(request)))
There's an example of this kind of code in the Pyppeteer documentation here. If you're using an older version of Python (<3.7), use asyncio.ensure_future instead of asyncio.create_task (as is done in the example in the docs).
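For completeness, here is a minimal, self-contained sketch of that pattern, assuming the goal from the question (aborting image requests); the URL and the handler body are illustrative, not the asker's exact code:

import asyncio
from pyppeteer import launch

async def block_image(request):
    # The handler stays a coroutine and awaits abort()/continue_() itself.
    if request.url.endswith(('.png', '.jpg')):
        await request.abort()
    else:
        await request.continue_()

async def main():
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.setRequestInterception(True)
    # The synchronous lambda schedules the coroutine as a Task,
    # so no coroutine is left "never awaited".
    page.on('request', lambda request: asyncio.create_task(block_image(request)))
    await page.goto('https://example.com', {'waitUntil': 'load'})
    print(await page.content())
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())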
I am using Python version 3.9.
I have code that repeats the same test using a while loop.
async def pars():
    site_ad = "my_link"
    msg = ''
    new_mass_for_mes = []
    try:
        launcher.defaultArgs().remove("--enable-automation")
        browser = await launch({"headless": False, "userDataDir": "./tool/pyppeteer", "args": ["--no-sandbox", '--disable-setuid-sandbox', "--start-maximized", "--disable-infobars", "--disable-dev-shm-usage"]})
        page = await browser.newPage()
        await stealth(page)
        await page.goto(site_ad, {'timeout': 15000})
        ...
        await page.waitFor(3000)
        await page.close()
        await browser.close()
    except Exception as e:
        try:
            try:
                await page.close()
            except:
                pass
            try:
                await browser.disconnect()
            except:
                pass
            await browser.close()
        except:
            pass
        logging.error("Error params {0} at {1}".format(site_ad, datetime.datetime.now()))
        logging.error(e)
The code works, but if an error occurs and the browser does not close, then the next run hangs at the moment of "browser = await launch".
The error can also occur with "headless": True.
So far, the only workaround I have come up with is to check for an existing "pyppeteer" browser process before opening the browser and to kill it if it exists; a rough sketch of that workaround follows.
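One way that workaround could look (a sketch, assuming psutil is installed and that the leftover Chromium process can be recognized by the pyppeteer userDataDir in its command line; both of these are assumptions, not part of the original code):

import psutil

def kill_stale_browser(marker="./tool/pyppeteer"):
    # Kill leftover Chromium processes from a previous run, identified here
    # (by assumption) by the userDataDir fragment in their command line.
    for proc in psutil.process_iter(["pid", "cmdline"]):
        cmdline = " ".join(proc.info["cmdline"] or [])
        if marker in cmdline:
            try:
                proc.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass

# Call kill_stale_browser() right before "browser = await launch" so a hung
# browser from a failed run cannot block the new launch.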
I want my script to:
1. Open say 3 tabs
2. Asynchronously fetch a url (same for each tab)
3. Save the response
4. Sleep for 4 seconds
5. Parse through the response with regex (I tried BeautifulSoup but it's too slow) and return a token
6. Loop through several times within the 3 tabs
My problem is with step 2. I have an example script, but it fetches the url synchronously. I would like to make it asynchronous.
import asyncio
from pyppeteer import launch

urls = ['https://www.example.com']

async def main():
    browser = await launch(headless=False)
    for url in urls:
        page1 = await browser.newPage()
        page2 = await browser.newPage()
        page3 = await browser.newPage()
        await page1.goto(url)
        await page2.goto(url)
        await page3.goto(url)
        title1 = await page1.title()
        title2 = await page2.title()
        title3 = await page3.title()
        print(title1)
        print(title2)
        print(title3)
        #await browser.close()

asyncio.get_event_loop().run_until_complete(main())
Also, as you can see, the code is not very concise. How do I go about making it asynchronous?
In case it would be easier to convert them, I also have other Pyppeteer scripts that don't quite fit my need:
import asyncio
from pyppeteer import launch

url = 'http://www.example.com'
browser = None

async def fetchUrl(url):
    # Define browser as a global variable to ensure that the browser window is only created once in the entire process
    global browser
    if browser is None:
        browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)
    #await asyncio.wait([page.waitForNavigation()])
    #str = await page.content()
    #print(str)

# Execute this function multiple times for testing
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
This script is asynchronous, but it runs one event-loop call at a time, so it is effectively synchronous.
# cat test.py
import asyncio
import time
from pyppeteer import launch

WEBSITE_LIST = [
    'http://envato.com',
    'http://amazon.co.uk',
    'http://example.com',
]

start = time.time()

async def fetch(url):
    browser = await launch(headless=False, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(f'{url}', {'waitUntil': 'load'})
    print(f'{url}')
    await asyncio.sleep(1)
    await page.close()
    #await browser.close()

async def run():
    tasks = []
    for url in WEBSITE_LIST:
        task = asyncio.ensure_future(fetch(url))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
This script is asynchronous, but it launches a separate browser for each URL, which ends up using too many resources.
This will open each URL in a separate tab:
import asyncio
import traceback

from pyppeteer import launch

URLS = [
    "http://envato.com",
    "http://amazon.co.uk",
    "http://example.com",
]

async def fetch(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(f"{url}", {"waitUntil": "load"})
    except Exception:
        traceback.print_exc()
    else:
        html = await page.content()
        return (url, html)
    finally:
        await page.close()

async def main():
    tasks = []
    browser = await launch(headless=True, args=["--no-sandbox"])
    for url in URLS:
        tasks.append(asyncio.create_task(fetch(browser, url)))
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        print(f"{url}: ({len(html)})")
    await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
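To cover the rest of the original requirements (the same URL fetched repeatedly in each tab, with a 4-second pause between rounds), one possible extension, sketched under the assumption of 3 tabs and 3 rounds each:

import asyncio
from pyppeteer import launch

URL = "https://www.example.com"   # same URL for every tab, as in the question
TABS = 3                          # assumed number of tabs
ROUNDS = 3                        # assumed number of repetitions per tab

async def tab_worker(browser, tab_id):
    page = await browser.newPage()
    try:
        for round_no in range(ROUNDS):
            await page.goto(URL, {"waitUntil": "load"})
            html = await page.content()        # save/parse the response here
            print(f"tab {tab_id}, round {round_no}: {len(html)} bytes")
            await asyncio.sleep(4)             # the 4-second pause from the question
    finally:
        await page.close()

async def main():
    browser = await launch(headless=True, args=["--no-sandbox"])
    try:
        await asyncio.gather(*(tab_worker(browser, i) for i in range(TABS)))
    finally:
        await browser.close()

asyncio.run(main())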
I am trying to understand aiohttp a little better. Can someone check why my code is not printing the response of the request and instead just prints the coroutine objects?
import asyncio

import aiohttp
import requests

async def get_event_1(session):
    url = "https://stackoverflow.com/"
    headers = {
        'content-Type': 'application/json'
    }
    response = await session.request('GET', url)
    return response.json()

async def get_event_2(session):
    url = "https://google.com"
    headers = {
        'content-Type': 'application/json'
    }
    response = await session.request('GET', url)
    return response.json()

async def main():
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            get_event_1(session),
            get_event_2(session)
        )

loop = asyncio.get_event_loop()
x = loop.run_until_complete(main())
loop.close()
print(x)
Output:
$ python async.py
[<coroutine object ClientResponse.json at 0x10567ae60>, <coroutine object ClientResponse.json at 0x10567aef0>]
sys:1: RuntimeWarning: coroutine 'ClientResponse.json' was never awaited
How do I print the responses instead?
The error message you received is informing you that a coroutine was never awaited.
You can see from the aiohttp documentation that response.json() is also a coroutine and therefore must be awaited: https://docs.aiohttp.org/en/stable/client_quickstart.html#json-response-content
return await response.json()
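Applied to one of the functions from the question, the fix looks like this (a sketch; note that stackoverflow.com actually serves HTML, so for non-JSON pages await response.text() is the call you want):

async def get_event_1(session):
    url = "https://stackoverflow.com/"
    response = await session.request('GET', url)
    # json() is itself a coroutine, so it must be awaited.
    # For a plain HTML page, use: return await response.text()
    return await response.json()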
Hello, good day Stack Overflow folks,
Issue: the script gets stuck and no data is scraped from a Russian retailer, in this case www.vseinstrumenti.ru.
Code:
import asyncio
from pyppeteer import launch

class PyppeteerRequests:
    def __init__(self):
        self.headers = {}

    def get_url(self, url):
        data = None
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            data = loop.run_until_complete(self.main(url))
            print(data)
        except Exception as e:
            print(str(e))
        return data

    async def main(self, url):
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto(url, options={'timeout': 1000000, 'waitUntil': ['load', 'networkidle2']}),
        loaded_html = await page.content()
        await page.waitForNavigation()
        print("closing context...")
        await asyncio.sleep(0.3)
        await page.close()
        await browser.close()
        print("closing browser...")
        await asyncio.sleep(0.3)
        return loaded_html

if __name__ == "__main__":
    requester = PyppeteerRequests()
    url = 'https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/'
    data = requester.get_url(url)
    print(data)
It just gets stuck and ends with ERROR: Navigation Timeout Exceeded: 1000000 ms exceeded.
What part of the code should I change? Is it scrape-able on your side? Kindly let me know how to improve my code using asyncio. Thanks!
I'm trying to handle an asynchronous HTTP request. I call the async_provider() function from another module and perform subsequent tasks with the resulting response.text().
It only works if all requests succeed. I can't handle exceptions for failed requests (whatever the reason for the exception). Thank you for your help.
Here is the relevant part of the code:
import asyncio

import aiohttp

# i call this function from another module
def async_provider():
    list_a, list_b = asyncio.run(main())
    return list_a, list_b

async def fetch(session, url):
    # session.post request cases
    if url == "http://...1":
        referer = "http://...referer"
        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) "
            "Version/12.1 Safari/605.1.15"
        )
        payload = {'key1': 'value1', 'key2': 'value2'}
        async with session.post(
            url, data=payload, headers={"Referer": referer, "User-Agent": user_agent}
        ) as response:
            if response.status != 200:
                response.raise_for_status()
            return await response.text()
    # session.get request cases
    else:
        async with session.get(url) as response:
            if response.status != 200:
                response.raise_for_status()
            return await response.text()

async def fetch_all(session, urls):
    results = await asyncio.gather(
        *[asyncio.create_task(fetch(session, url)) for url in urls]
    )
    return results

async def main():
    urls = ["http://...1", "http://...2", "http://...3"]
    async with aiohttp.ClientSession() as session:
        response_text_1, response_text_2, response_text_3 = await fetch_all(
            session, urls
        )
        # some task with response text
Any exception breaks all requests
Check "return_exceptions" flag on gather.
results = await asyncio.gather(
    *[asyncio.create_task(fetch(session, url)) for url in urls],
    return_exceptions=True
)
With return_exceptions=True, gather returns a list of results in which failed calls appear as exception objects instead of raising. You can check each entry with isinstance(result, Exception) and handle or re-raise it; if you keep references to the created tasks, you can also inspect them via Task.result() and Task.exception().
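For example, a self-contained sketch of checking the gathered results (the URLs here are hypothetical, and one is deliberately unreachable to trigger an exception):

import asyncio
import aiohttp

URLS = ["https://example.com", "http://nonexistent.invalid"]

async def fetch(session, url):
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[asyncio.create_task(fetch(session, url)) for url in URLS],
            return_exceptions=True,
        )
    for url, result in zip(URLS, results):
        if isinstance(result, Exception):
            # Failed request: handle or re-raise without losing the other results.
            print(f"{url} failed: {result!r}")
        else:
            print(f"{url} returned {len(result)} characters")

asyncio.run(main())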