Pyppeteer stops working at the moment of opening the browser - python-3.x

I am using Python 3.9.
I have code that repeats the same tests using a while loop.
import asyncio
import datetime
import logging

from pyppeteer import launch, launcher
from pyppeteer_stealth import stealth

async def pars():
    site_ad = "my_link"
    msg = ''
    new_mass_for_mes = []
    try:
        launcher.defaultArgs().remove("--enable-automation")
        browser = await launch({"headless": False,
                                "userDataDir": "./tool/pyppeteer",
                                "args": ["--no-sandbox",
                                         "--disable-setuid-sandbox",
                                         "--start-maximized",
                                         "--disable-infobars",
                                         "--disable-dev-shm-usage"]})
        page = await browser.newPage()
        await stealth(page)
        await page.goto(site_ad, {'timeout': 15000})
        ...
        await page.waitFor(3000)
        await page.close()
        await browser.close()
    except Exception as e:
        try:
            try:
                await page.close()
            except:
                pass
            try:
                await browser.disconnect()
            except:
                pass
            await browser.close()
        except:
            pass
        logging.error("Error params {0} at {1}".format(site_ad, datetime.datetime.now()))
        logging.error(e)
The code works, but if an error occurs and the browser does not close, then on the next iteration the program hangs at "browser = await launch".
The error can also occur when "headless": True.
So far, the only workaround I have come up with is to check for an existing "pyppeteer" browser process before opening the browser and kill it if one exists; a sketch of that idea is below.
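A minimal sketch of that workaround, assuming psutil is available and that the leftover Chromium process can be recognized by the userDataDir path in its command line (the helper name and the matching logic are illustrative, not part of the original code):

import psutil

def kill_stale_browser(profile_dir="./tool/pyppeteer"):
    # Terminate any leftover Chromium started with our profile directory
    # so the next launch() call does not hang on a locked profile.
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            name = (proc.info["name"] or "").lower()
            cmdline = " ".join(proc.info["cmdline"] or [])
            if "chrome" in name and profile_dir in cmdline:
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

Calling kill_stale_browser() at the top of pars(), before launch(), would then clear any browser left over from a failed previous iteration.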

Related

Pyppeteer RequestSetIntercept function : coroutine was never awaited

I am trying to use the RequestSetIntercept function to speed up webpage loading with Pyppeteer.
However, I am getting the warning:
RuntimeWarning: coroutine 'block_image' was never awaited
I can't figure out where I am missing an await. I've added awaits within the intercept function itself, following a template I found online. I am testing out the setRequestInterception function with Pyppeteer.
Thank you.
#utils.py
import time

import pyppeteer
from pyppeteer import launch

class MakeRequest():
    ua = User_Agent()

    async def _proxy_browser(self, url,
                             headless=False,
                             intercept_func=None,
                             proxy=True,
                             **kwargs):
        if proxy:
            args = [proxy,  # proxy launch flag, e.g. a '--proxy-server=...' string
                    '--ignore-certificate-errors']
        else:
            args = ['--ignore-certificate-errors']
        for i in range(3):
            try:
                browser = await launch(headless=headless,
                                       args=args,
                                       defaultViewport=None)
                page = await browser.newPage()
                await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0')
                if intercept_func is not None:
                    await page.setRequestInterception(True)
                    page.on('request', intercept_func)
                await page.goto(url, {'waitUntil': 'load', 'timeout': 0})
                content = await page.content()
                return content
            except (pyppeteer.errors.PageError,
                    pyppeteer.errors.TimeoutError,
                    pyppeteer.errors.BrowserError,
                    pyppeteer.errors.NetworkError) as e:
                print('error', e)
                time.sleep(2)
                continue
            finally:
                await browser.close()
        return
scraper.py:
REQUESTER = MakeRequest()

async def block_image(request):
    if request.url.endswith('.png') or request.url.endswith('.jpg'):
        print(request.url)
        await request.abort()
    else:
        await request.continue_()

def get_request(url):
    for i in range(3):
        response = REQUESTER.proxy_browser_request(url=url,
                                                   headless=False,
                                                   intercept_func=block_image)
        if response:
            return response
        else:
            print(f'Attempt {i + 1}: {url} links not found')
            print('retrying...')
            time.sleep(3)
Your function block_image is a coroutine, but the callback passed to page.on is expected to be a normal function. Try writing a synchronous lambda function that wraps the coroutine in a Task (thus scheduling it on the current event loop):
if intercept_func is not None:
    await page.setRequestInterception(True)
    page.on('request', lambda request: asyncio.create_task(intercept_func(request)))
There's an example of this kind of code in the Pyppeteer documentation. If you're using an older version of Python (<3.7), use asyncio.ensure_future instead of asyncio.create_task (as is done in the example in the docs).
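For reference, a minimal sketch that hides the version check behind one helper (the as_callback name is illustrative, not from the docs):

import asyncio
import sys

def as_callback(coro_func):
    # page.on() expects a plain synchronous callback, so wrap the coroutine
    # function in a lambda that schedules it on the running event loop.
    if sys.version_info >= (3, 7):
        return lambda request: asyncio.create_task(coro_func(request))
    return lambda request: asyncio.ensure_future(coro_func(request))

# usage: page.on('request', as_callback(block_image))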

How to fetch a url asynchronously with pyppeteer (One browser, many tabs)

I want my script to:
1. Open, say, 3 tabs
2. Asynchronously fetch a url (the same one for each tab)
3. Save the response
4. Sleep for 4 seconds
5. Parse the response with regex (I tried BeautifulSoup but it's too slow) and return a token
6. Loop through this several times within the 3 tabs
My problem is with step 2. I have an example script, but it fetches the url synchronously. I would like to make it asynchronous.
import asyncio
from pyppeteer import launch

urls = ['https://www.example.com']

async def main():
    browser = await launch(headless=False)
    for url in urls:
        page1 = await browser.newPage()
        page2 = await browser.newPage()
        page3 = await browser.newPage()
        await page1.goto(url)
        await page2.goto(url)
        await page3.goto(url)
        title1 = await page1.title()
        title2 = await page2.title()
        title3 = await page3.title()
        print(title1)
        print(title2)
        print(title3)
        #await browser.close()

asyncio.get_event_loop().run_until_complete(main())
Also, as you can see, the code is not very concise. How do I go about making it asynchronous?
Also, if it helps, I have other pyppeteer scripts which don't quite fit my needs, in case it would be easier to convert one of those:
import asyncio
from pyppeteer import launch

url = 'http://www.example.com'
browser = None

async def fetchUrl(url):
    # Define browser as a global variable to ensure that the browser window is only created once in the entire process
    global browser
    if browser is None:
        browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)
    #await asyncio.wait([page.waitForNavigation()])
    #str = await page.content()
    #print(str)

# Execute this function multiple times for testing
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
This script is asynchronous, but it runs the event loop to completion for one URL at a time, so it is effectively synchronous.
# cat test.py
import asyncio
import time
from pyppeteer import launch

WEBSITE_LIST = [
    'http://envato.com',
    'http://amazon.co.uk',
    'http://example.com',
]

start = time.time()

async def fetch(url):
    browser = await launch(headless=False, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(f'{url}', {'waitUntil': 'load'})
    print(f'{url}')
    await asyncio.sleep(1)
    await page.close()
    #await browser.close()

async def run():
    tasks = []
    for url in WEBSITE_LIST:
        task = asyncio.ensure_future(fetch(url))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
This script is asynchronous, but it launches a separate browser for each URL, which ends up using too many resources.
This will open each URL in a separate tab:
import asyncio
import traceback

from pyppeteer import launch

URLS = [
    "http://envato.com",
    "http://amazon.co.uk",
    "http://example.com",
]

async def fetch(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(f"{url}", {"waitUntil": "load"})
    except Exception:
        traceback.print_exc()
    else:
        html = await page.content()
        return (url, html)
    finally:
        await page.close()

async def main():
    tasks = []
    browser = await launch(headless=True, args=["--no-sandbox"])
    for url in URLS:
        tasks.append(asyncio.create_task(fetch(browser, url)))
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        print(f"{url}: ({len(html)})")
    await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
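If you also want to cap how many tabs are open at once (the question mentions 3), one common approach, sketched here as an assumption rather than as part of the answer above, is to gate fetch() behind an asyncio.Semaphore:

import asyncio

async def fetch_limited(browser, url, sem):
    # Acquire the semaphore before opening a new tab, so at most
    # the semaphore's initial value of tabs are open at the same time.
    async with sem:
        return await fetch(browser, url)

# inside main():
#   sem = asyncio.Semaphore(3)
#   tasks = [asyncio.create_task(fetch_limited(browser, url, sem)) for url in URLS]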

Discord.py - Asyncio Timeout not working properly

Expected behavior, and my code for it:
My bot is supposed to send a message, and then check whether the user who sent the command reacted to that message with :arrow_left:, :arrow_right:, or :wastebasket:; if they did, it changes the "page" of the message. That part works fine, but I also want the message to time out after 7 seconds of inactivity.
embed = await self.get_member_list(ctx, member, page)
msg = await ctx.send(embed=embed)
reactions = ["⬅️", "➡️", "🗑️"]
for x in reactions:
    await msg.add_reaction(x)
    await asyncio.sleep(.35)

def check(payload):
    return str(payload.emoji) in reactions

done = False
page = 1
while not done:
    try:
        pending_tasks = [self.bot.wait_for('raw_reaction_add', timeout=7.0, check=check),
                         self.bot.wait_for('raw_reaction_remove', timeout=7.0, check=check)]
        done_tasks, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in done_tasks:
            payload = await task
        user = await commands.MemberConverter().convert(ctx, str(payload.user_id))
    except asyncio.TimeoutError:
        done = True
        return
    else:
        if user == ctx.author:
            if str(payload.emoji) == "🗑️":
                return await msg.delete(delay=0)
            if str(payload.emoji) == "⬅️":
                if page == 1:
                    page = total_pages
                else:
                    page -= 1
            if str(payload.emoji) == "➡️":
                if page == total_pages:
                    page = 1
                else:
                    page += 1
            await msg.edit(embed=await self.get_member_list(ctx, member, page))
However, I am facing the problem "Task exception was never retrieved" after running the code above.
Actual results:
Task exception was never retrieved
future: <Task finished name='Task-58' coro=<wait_for() done, defined at C:\Users\AppData\Local\Programs\Python\Python38\lib\asyncio\tasks.py:434> exception=TimeoutError()>
Traceback (most recent call last):
File "C:\Users\AppData\Local\Programs\Python\Python38\lib\asyncio\tasks.py", line 501, in wait_for
raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError
The problem:
Whenever 7 seconds of inactivity pass and the message is supposed to time out, I get this error. I have tried to fix it with asyncio.gather(), but I am unfamiliar with asyncio and unsure how to use it properly.
done_tasks, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
await asyncio.gather('raw_reaction_add', 'raw_reaction_remove', return_exceptions=True)
I have tried:
- Checking for typos
- Running except Exception and except asyncio.exceptions.TimeoutError instead of except asyncio.TimeoutError
- Contemplating my sanity
- Reading the asyncio documentation, specifically on asyncio.wait()
- Making sure that my bot has all the permissions and intents it needs in Discord
- Using self.bot.wait_for() with raw_reaction_add and raw_reaction_remove inside a tuple instead of asyncio.wait()
I ended up getting rid of the error message by doing this:
First I moved the timeout from bot.wait_for() to asyncio.wait()
pending_tasks = [self.bot.wait_for('raw_reaction_add', check=check),
                 self.bot.wait_for('raw_reaction_remove', check=check)]
done_tasks, pending_tasks = await asyncio.wait(pending_tasks, timeout=7.0, return_when=asyncio.FIRST_COMPLETED)
Then I got the payload from a pop() on the completed set instead of awaiting the task:
payload = done_tasks.pop().result()
Now, after 7 seconds of inactivity, it tries to pop from an empty set and raises a KeyError, which I can catch with except KeyError (see the combined sketch below).
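Put together, the body of the loop then looks roughly like this (a sketch assembled from the snippets above; cancelling the still-pending wait_for task is an extra step I've assumed, to keep stray pending tasks from piling up):

pending_tasks = [self.bot.wait_for('raw_reaction_add', check=check),
                 self.bot.wait_for('raw_reaction_remove', check=check)]
done_tasks, pending_tasks = await asyncio.wait(pending_tasks,
                                               timeout=7.0,
                                               return_when=asyncio.FIRST_COMPLETED)
# Cancel whichever wait_for did not fire so it does not linger.
for task in pending_tasks:
    task.cancel()
try:
    payload = done_tasks.pop().result()
except KeyError:
    # Nothing completed within 7 seconds: treat it as the timeout case.
    return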

Python Pyppeteer Unable to Scrape RU Retailers

Hello, good day Stack Overflow folks.
Issue: the script gets stuck and the data is never scraped from a Russian retailer, in this case www.vseinstrumenti.ru.
Code:
import asyncio
from pyppeteer import launch

class PyppeteerRequests:
    def __init__(self):
        self.headers = {}

    def get_url(self, url):
        data = None
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            data = loop.run_until_complete(self.main(url))
            print(data)
        except Exception as e:
            print(str(e))
        return data

    async def main(self, url):
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto(url, options={'timeout': 1000000, 'waitUntil': ['load', 'networkidle2']})
        loaded_html = await page.content()
        await page.waitForNavigation()
        print("closing context...")
        await asyncio.sleep(0.3)
        await page.close()
        await browser.close()
        print("closing browser...")
        await asyncio.sleep(0.3)
        return loaded_html

if __name__ == "__main__":
    requester = PyppeteerRequests()
    url = 'https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/'
    data = requester.get_url(url)
    print(data)
It just gets stuck and fails with ERROR: Navigation Timeout Exceeded: 1000000 ms exceeded.
What part of the code should I change? Is it scrapeable on your side? Kindly let me know how to improve my code using asyncio. Thanks!

Why does asyncio.create_task not run the method?

Code example:
async def download_page(session, url):
    print(True)

async def downloader_init(session):
    while True:
        url = await download_queue.get()
        task = asyncio.create_task(download_page(session, url))
        print(task)
        print(f"url: {url}")

async def get_urls(url):
    while True:
        try:
            url = find_somewhere_url
            await download_queue.put(url)
        except NoSuchElementException:
            break
    return True

async def main():
    async with aiohttp.ClientSession(headers=headers) as session:
        get_urls_task = asyncio.create_task(get_urls(url))
        downloader_init_task = asyncio.create_task(downloader_init(session))
        asyncio.gather(get_urls_task, downloader_init_task)

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(main())
Output:
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
<Task pending coro=<download_page() running at main.py:69>>
url: https://someurl.com/example
Why is the method download_page not executed?
The strange thing is that the script simply ends, with no errors anywhere.
downloader_init should run endlessly, but it does not.
get_urls adds links to download_queue as it finds them, after which it stops working.
downloader_init should execute as soon as a new link appears in the queue, but it only starts working once get_urls has completed.
Try this instead:
Note: your problem wasn't with the task creation; it was that there was no await on the asyncio.gather call.
import asyncio
import aiohttp

async def download_page(session, url):
    # Dummy function.
    print(f"session={session}, url={url}")

async def downloader_init(session):
    while True:
        url = await download_queue.get()
        task = asyncio.create_task(download_page(session, url))
        print(f"task={task}, url={url}")

async def get_urls(url):
    while True:
        try:
            url = find_somewhere_url()
            await download_queue.put(url)
        except NoSuchElementException:
            break

async def main():
    async with aiohttp.ClientSession(headers=headers) as session:
        get_urls_task = asyncio.create_task(get_urls(url))
        downloader_init_task = asyncio.create_task(downloader_init(session))
        # Use await here to make it finish the tasks.
        await asyncio.gather(get_urls_task, downloader_init_task)

if __name__ == "__main__":
    # Use this as it deals with the loop creation, shutdown,
    # and other stuff for you.
    asyncio.run(main())  # This is new in Python 3.7
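A stripped-down, standalone illustration of why that await matters (not part of the original answer): without it, main() returns immediately and asyncio.run() shuts the loop down before the gathered tasks have finished.

import asyncio

async def worker(n):
    await asyncio.sleep(0.1)
    print(f"worker {n} done")  # never printed if main() returns early

async def main():
    tasks = [asyncio.create_task(worker(i)) for i in range(3)]
    # Without this await, main() exits right away and the event loop
    # closes before the workers get a chance to run to completion.
    await asyncio.gather(*tasks)

asyncio.run(main())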
