How to fetch a URL asynchronously with pyppeteer (one browser, many tabs) - python-3.x

I want my script to:
1. Open, say, 3 tabs
2. Asynchronously fetch a URL (the same one for each tab)
3. Save the response
4. Sleep for 4 seconds
5. Parse the response with regex (I tried BeautifulSoup but it's too slow) and return a token
6. Loop through several times within the 3 tabs
My problem is with 2. I have an example script, but it fetches the URL synchronously. I would like to make it asynchronous.
import asyncio
from pyppeteer import launch

urls = ['https://www.example.com']

async def main():
    browser = await launch(headless=False)
    for url in urls:
        page1 = await browser.newPage()
        page2 = await browser.newPage()
        page3 = await browser.newPage()
        await page1.goto(url)
        await page2.goto(url)
        await page3.goto(url)
        title1 = await page1.title()
        title2 = await page2.title()
        title3 = await page3.title()
        print(title1)
        print(title2)
        print(title3)
    #await browser.close()

asyncio.get_event_loop().run_until_complete(main())
Also, as you can see, the code is not very concise. How do I go about making it asynchronous?
If it helps, I have other pyppeteer scripts that don't quite fit my need, in case it would be easier to convert one of those.
import asyncio
from pyppeteer import launch

url = 'http://www.example.com'
browser = None

async def fetchUrl(url):
    # Define browser as a global variable to ensure that the browser window is only created once in the entire process
    global browser
    if browser is None:
        browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)
    #await asyncio.wait([page.waitForNavigation()])
    #str = await page.content()
    #print(str)

# Execute this function multiple times for testing
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
The script is asynchronous, but it runs one event loop call at a time, so it is effectively synchronous.
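For reference, one way to make those two calls run concurrently is to gather them in a single run_until_complete. Note that the lazy global-browser check above can then race (both coroutines may see browser is None before either finishes launching), so in this sketch the browser is launched once up front; this is an illustrative sketch, not a full solution:

import asyncio
from pyppeteer import launch

async def fetch_url(browser, url):
    # Reuse the already-launched browser; each call gets its own tab.
    page = await browser.newPage()
    await page.goto(url)
    print(await page.title())
    await page.close()

async def run_both(url):
    browser = await launch(headless=False)
    # Both fetches share one browser and run concurrently.
    await asyncio.gather(fetch_url(browser, url), fetch_url(browser, url))
    await browser.close()

asyncio.get_event_loop().run_until_complete(run_both('http://www.example.com'))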
# cat test.py
import asyncio
import time
from pyppeteer import launch

WEBSITE_LIST = [
    'http://envato.com',
    'http://amazon.co.uk',
    'http://example.com',
]

start = time.time()

async def fetch(url):
    browser = await launch(headless=False, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(f'{url}', {'waitUntil': 'load'})
    print(f'{url}')
    await asyncio.sleep(1)
    await page.close()
    #await browser.close()

async def run():
    tasks = []
    for url in WEBSITE_LIST:
        task = asyncio.ensure_future(fetch(url))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
This script is asynchronous, but it launches a separate browser for each URL, which ends up using too many resources.

This will open each URL in a separate tab:
import asyncio
import traceback
from pyppeteer import launch

URLS = [
    "http://envato.com",
    "http://amazon.co.uk",
    "http://example.com",
]

async def fetch(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(f"{url}", {"waitUntil": "load"})
    except Exception:
        traceback.print_exc()
    else:
        html = await page.content()
        return (url, html)
    finally:
        await page.close()

async def main():
    tasks = []
    browser = await launch(headless=True, args=["--no-sandbox"])
    for url in URLS:
        tasks.append(asyncio.create_task(fetch(browser, url)))
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        print(f"{url}: ({len(html)})")
    await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
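If you also need the asker's full workflow (three persistent tabs reused in a loop, a pause, and a regex token extraction), the same one-browser pattern can be extended as sketched below. The token pattern and the number of loop iterations are placeholders, not values taken from the question:

import asyncio
import re
from pyppeteer import launch

URL = "https://www.example.com"
TOKEN_PATTERN = re.compile(r'name="token" value="([^"]+)"')  # hypothetical pattern

async def fetch_token(page, url):
    # Reuse an already-open tab: navigate, grab the HTML, sleep, then parse.
    await page.goto(url, {"waitUntil": "load"})
    html = await page.content()
    await asyncio.sleep(4)
    match = TOKEN_PATTERN.search(html)
    return match.group(1) if match else None

async def main():
    browser = await launch(headless=True)
    pages = [await browser.newPage() for _ in range(3)]  # three tabs, one browser
    for _ in range(5):  # loop "several times"; 5 is a placeholder
        tokens = await asyncio.gather(*(fetch_token(p, URL) for p in pages))
        print(tokens)
    await browser.close()

asyncio.run(main())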

Related

Pyppeteer will stop working at the moment of opening the browser

I am using Python version 3.9.
I have code that repeats the same tests using a while loop.
import asyncio
import datetime
import logging

from pyppeteer import launch, launcher
from pyppeteer_stealth import stealth

async def pars():
    site_ad = "my_link"
    msg = ''
    new_mass_for_mes = []
    try:
        launcher.defaultArgs().remove("--enable-automation")
        browser = await launch({"headless": False, "userDataDir": "./tool/pyppeteer",
                                "args": ["--no-sandbox", '--disable-setuid-sandbox', "--start-maximized",
                                         "--disable-infobars", "--disable-dev-shm-usage"]})
        page = await browser.newPage()
        await stealth(page)
        await page.goto(site_ad, {'timeout': 15000})
        ...
        await page.waitFor(3000)
        await page.close()
        await browser.close()
    except Exception as e:
        try:
            try:
                await page.close()
            except:
                pass
            try:
                await browser.disconnect()
            except:
                pass
            await browser.close()
        except:
            pass
        logging.error("Error params {0} at {1}".format(site_ad, datetime.datetime.now()))
        logging.error(e)
The code works, but if an error occurs and the browser does not get closed, then on the next run the program gets stuck at "browser = await launch".
The error can also occur with "headless": True.
So far, the only workaround I have come up with is to check for an existing "pyppeteer" process before opening the browser and kill it if it exists.
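There is no accepted fix recorded here, but one possible way to guarantee the browser is torn down whatever fails in the loop body is a try/finally around each launch. This is a sketch under the assumption that a still-running Chromium holding the userDataDir profile lock is what blocks the next launch:

import asyncio
from pyppeteer import launch

async def pars_once(site_ad):
    browser = await launch({"headless": False, "userDataDir": "./tool/pyppeteer",
                            "args": ["--no-sandbox"]})
    try:
        page = await browser.newPage()
        await page.goto(site_ad, {"timeout": 15000})
        # ... scraping logic from the question goes here ...
    finally:
        # Runs on success and on any exception, so the profile lock is released
        # before the next iteration calls launch() again.
        await browser.close()

async def main():
    while True:
        try:
            await pars_once("my_link")
        except Exception as e:
            print("iteration failed:", e)
        await asyncio.sleep(3)

asyncio.run(main())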

Python Pyppeteer unable to scrape RU retailers

Hello, good day Stack Overflow folks.
Issue: the script gets stuck and the data is never scraped from a Russian retailer, which in this case is www.vseinstrumenti.ru.
Code:
import asyncio
from pyppeteer import launch

class PyppeteerRequests:
    def __init__(self):
        self.headers = {}

    def get_url(self, url):
        data = None
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            data = loop.run_until_complete(self.main(url))
            print(data)
        except Exception as e:
            print(str(e))
        return data

    async def main(self, url):
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto(url, options={'timeout': 1000000, 'waitUntil': ['load', 'networkidle2']}),
        loaded_html = await page.content()
        await page.waitForNavigation()
        print("closing context...")
        await asyncio.sleep(0.3)
        await page.close()
        await browser.close()
        print("closing browser...")
        await asyncio.sleep(0.3)
        return loaded_html

if __name__ == "__main__":
    requester = PyppeteerRequests()
    url = 'https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/'
    data = requester.get_url(url)
    print(data)
It just gets stuck and ends with ERROR: Navigation Timeout Exceeded: 1000000 ms exceeded.
What part of the code should I change? Is it scrapeable on your side? Kindly let me know how to improve my code using asyncio. Thanks!
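No answer is recorded here, but two details stand out and may be worth trying: waitForNavigation() is awaited after the page has already finished loading, so it waits for a navigation that may never happen, and waiting for networkidle2 with a 1,000,000 ms timeout can hang on sites that keep connections open or block headless browsers. A hedged sketch that drops both (the retailer may still block automated traffic):

import asyncio
from pyppeteer import launch

async def get_html(url):
    browser = await launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        # A desktop user agent; headless Chromium's default UA is an easy bot signal.
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36')
        # Wait only for 'load' with a sane timeout instead of networkidle2 + 1000000 ms.
        await page.goto(url, {'timeout': 60000, 'waitUntil': 'load'})
        return await page.content()
    finally:
        await browser.close()

if __name__ == '__main__':
    url = 'https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/'
    html = asyncio.get_event_loop().run_until_complete(get_html(url))
    print(len(html) if html else None)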

Why does asyncio.create_task not run the method?

Code example:
async def download_page(session, url):
    print(True)

async def downloader_init(session):
    while True:
        url = await download_queue.get()
        task = asyncio.create_task(download_page(session, url))
        print(task)
        print(f"url: {url}")

async def get_urls(url):
    while True:
        try:
            url = find_somewhere_url
            await download_queue.put(url)
        except NoSuchElementException:
            break
    return True

async def main():
    async with aiohttp.ClientSession(headers=headers) as session:
        get_urls_task = asyncio.create_task(get_urls(url))
        downloader_init_task = asyncio.create_task(downloader_init(session))
        asyncio.gather(get_urls_task, downloader_init_task)

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(main())
Output:
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
Why is the method download_page not executed?
The strange thing is that the script simply ends its work; there are no errors anywhere.
downloader_init should run endlessly, but it does not.
get_urls adds links to download_queue as it finds them, after which it stops working.
downloader_init should run as soon as a new link appears in the queue, but it only starts once get_urls has finished its work.
Note: your problem wasn't with the task creation; it was that there was no await on the asyncio.gather call.
Try this instead:
import asyncio
import aiohttp

async def download_page(session, url):
    # Dummy function.
    print(f"session={session}, url={url}")

async def downloader_init(session):
    while True:
        url = await download_queue.get()
        task = asyncio.create_task(download_page(session, url))
        print(f"task={task}, url={url}")

async def get_urls(url):
    while True:
        try:
            url = find_somewhere_url()
            await download_queue.put(url)
        except NoSuchElementException:
            break

async def main():
    async with aiohttp.ClientSession(headers=headers) as session:
        get_urls_task = asyncio.create_task(get_urls(url))
        downloader_init_task = asyncio.create_task(downloader_init(session))
        # Use await here to make it finish the tasks.
        await asyncio.gather(get_urls_task, downloader_init_task)

if __name__ == "__main__":
    # Use this as it deals with the loop creation, shutdown,
    # and other stuff for you.
    asyncio.run(main())  # This is new in Python 3.7

aiohttp and asyncio: how to get responses from an HTTP request and a websocket concurrently?

I am trying to receive data from two endpoints at the same time. But if the websocket stops sending messages, I won't receive the data from the request to "https://www.blabla.com". What is the best way to solve this problem?
import asyncio
import aiohttp

URL = 'wss://www.some_web_socket.io'

async def get_some_data(session):
    url = "https://www.blabla.com"
    async with session.get(url) as response:
        data = await response.text()
        return data

async def ws_handler(url):
    async with aiohttp.ClientSession() as session:
        async with session.ws_connect(url) as ws:
            msg = await ws.receive()
            while True:
                some_data_from_get_request = await get_some_data(session)
                msg_from_websocket = await ws.receive()
                if msg.type == aiohttp.WSMsgType.TEXT:
                    print(stream_data)
                    print(some_data_from_get_request)

def _main():
    asyncio.run(ws_handler(URL))

if __name__ == "__main__":
    _main()
This code serializes the return values of HTTP and websocket communication:
while True:
    some_data_from_get_request = await get_some_data(session)
    msg_from_websocket = await ws.receive()
To be able to detect either of the two coroutines returning, you can use asyncio.wait(..., return_when=asyncio.FIRST_COMPLETED):
http_fut = asyncio.ensure_future(get_some_data(session))
ws_fut = asyncio.ensure_future(ws.receive())
pending = {http_fut, ws_fut}
while pending:
    _done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
    if http_fut.done():
        some_data_from_get_request = http_fut.result()
        ...
    if ws_fut.done():
        msg_from_websocket = ws_fut.result()
        ...
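The snippet above handles one completion of each future. If both streams need to be serviced continuously, one possible extension is to re-create each future as soon as it completes; a self-contained sketch with placeholder URLs:

import asyncio
import aiohttp

WS_URL = "wss://example.com/stream"    # placeholder
HTTP_URL = "https://example.com/data"  # placeholder

async def get_some_data(session):
    async with session.get(HTTP_URL) as response:
        return await response.text()

async def handler():
    async with aiohttp.ClientSession() as session:
        async with session.ws_connect(WS_URL) as ws:
            http_fut = asyncio.ensure_future(get_some_data(session))
            ws_fut = asyncio.ensure_future(ws.receive())
            while True:
                done, _pending = await asyncio.wait(
                    {http_fut, ws_fut}, return_when=asyncio.FIRST_COMPLETED)
                if http_fut in done:
                    print("http:", http_fut.result()[:80])
                    http_fut = asyncio.ensure_future(get_some_data(session))  # re-arm
                if ws_fut in done:
                    msg = ws_fut.result()
                    if msg.type == aiohttp.WSMsgType.TEXT:
                        print("ws:", msg.data[:80])
                    ws_fut = asyncio.ensure_future(ws.receive())  # re-arm

if __name__ == "__main__":
    asyncio.run(handler())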

Fetching multiple urls with aiohttp in python

In a previous question, a user suggested the following approach for fetching multiple urls (API calls) with aiohttp:
import asyncio
import aiohttp

url_list = ['https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530396000&before=1530436000',
            'https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530436000&before=1530476000']

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.json()['data']

async def fetch_all(session, urls, loop):
    results = await asyncio.gather(*[loop.create_task(fetch(session, url)) for url in urls], return_exceptions=True)
    return results

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    with aiohttp.ClientSession(loop=loop) as session:
        htmls = loop.run_until_complete(fetch_all(session, urls, loop))
        print(htmls)
However, this only returns attribute errors:
[AttributeError('__aexit__',), AttributeError('__aexit__',)]
(returned because I enabled return_exceptions; otherwise it would just break). I really hope there is somebody here who can help with this; it is still kind of hard to find resources for asyncio etc. The returned data is in JSON format. In the end I would like to put all the JSON dicts in a list.
Working example (note that the ClientSession is now entered with async with inside the coroutine, and response.json() is awaited before anything is done with the result):
import asyncio
import aiohttp
import ssl

url_list = ['https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530396000&before=1530436000',
            'https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530436000&before=1530476000']

async def fetch(session, url):
    async with session.get(url, ssl=ssl.SSLContext()) as response:
        return await response.json()

async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        results = await asyncio.gather(*[fetch(session, url) for url in urls], return_exceptions=True)
        return results

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    htmls = loop.run_until_complete(fetch_all(urls, loop))
    print(htmls)
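Because return_exceptions=True mixes successful payloads and exceptions in one list, a small follow-up (continuing from the variables in the example above) can separate them:

# Separate successful JSON payloads from exceptions returned by gather().
payloads, failures = [], []
for url, result in zip(url_list, htmls):
    if isinstance(result, Exception):
        failures.append((url, result))
    else:
        payloads.append(result)
print(f"{len(payloads)} ok, {len(failures)} failed")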
