Is there a way to optimize page scraping using pyppeteer?
The code below sometimes takes more than 20 seconds to scrape a page.
browser = await launch(
    executablePath=path,
    handleSIGINT=False,
    handleSIGTERM=False,
    handleSIGHUP=False,
    args=[
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--single-process',
        '--no-zygote',
        '--no-sandbox',
        '--disable-web-security',
        '--disable-notifications'
    ],
    userDataDir='/tmp/',
    headless=True)
page = await browser.newPage()
res = await load_page(page, url, headers, timeout)
I am using pyppeteer==0.2.6
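For reference, the request-interception approach discussed further down in this thread can also be applied here: keep one browser alive and skip heavy resources. A rough sketch against pyppeteer 0.2.6 (the blocked resource types, the timeout, and the example URL are assumptions, not part of the original code):

import asyncio
from pyppeteer import launch

BLOCKED = {'image', 'font', 'stylesheet'}  # resource types assumed safe to skip

async def block_heavy(request):
    # Abort requests for heavy resources, let everything else through.
    if request.resourceType in BLOCKED:
        await request.abort()
    else:
        await request.continue_()

async def fetch(browser, url):
    page = await browser.newPage()
    await page.setRequestInterception(True)
    # page.on expects a plain callable, so schedule the coroutine as a task.
    page.on('request', lambda req: asyncio.ensure_future(block_heavy(req)))
    await page.goto(url, {'waitUntil': 'domcontentloaded', 'timeout': 20000})
    content = await page.content()
    await page.close()
    return content

async def main():
    # Launch once and reuse the browser across pages instead of relaunching it.
    browser = await launch(headless=True, args=['--no-sandbox'])
    html = await fetch(browser, 'https://example.com')
    print(len(html))
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())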
I am using Python version 3.9.
I have code that repeats the same tests using a while loop.
async def pars():
    site_ad = "my_link"
    msg = ''
    new_mass_for_mes = []
    try:
        launcher.defaultArgs().remove("--enable-automation")
        browser = await launch({"headless": False, "userDataDir": "./tool/pyppeteer", "args": ["--no-sandbox", '--disable-setuid-sandbox', "--start-maximized", "--disable-infobars", "--disable-dev-shm-usage"]})
        page = await browser.newPage()
        await stealth(page)
        await page.goto(site_ad, {'timeout': 15000})
        ...
        await page.waitFor(3000)
        await page.close()
        await browser.close()
    except Exception as e:
        try:
            try:
                await page.close()
            except:
                pass
            try:
                await browser.disconnect()
            except:
                pass
            await browser.close()
        except:
            pass
        logging.error("Error params {0} at {1}".format(site_ad, datetime.datetime.now()))
        logging.error(e)
The code works, but if an error occurs and the browser does not close, then on the next run the program hangs at "browser = await launch".
The error can also occur with "headless": True.
So far, the only workaround I have come up with is checking for an existing pyppeteer browser process before launching and killing it if it exists.
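A rough sketch of that check using psutil (an assumed extra dependency, not part of the original script); the path match relies on pyppeteer's bundled Chromium living in a directory whose path contains "pyppeteer", which is its default download location:

import psutil  # assumed dependency, not in the original code

def kill_stale_chromium():
    # Kill leftover Chromium processes started from pyppeteer's local revision.
    for proc in psutil.process_iter(['name', 'exe']):
        try:
            exe = proc.info.get('exe') or ''
            if 'pyppeteer' in exe.lower():
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

Calling this right before launch() should avoid the hang described above.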
Currently I open a new browser session using the code below, but it always starts in incognito mode. Can I start a new Chromium session that is not incognito?
from behave import *
from playwright.sync_api import sync_playwright
import time

class session_driver:
    driver = None

    def open_browser(self, url):
        playW_sync_instace = sync_playwright().start()
        global browser
        browser = playW_sync_instace.chromium.launch(headless=False)
        browser.new_context(record_video_dir="videos/",
                            record_video_size={"width": 640, "height": 480})
        self.driver = browser.new_page()
        self.driver.goto(url)
Playwright's regular browser contexts are incognito-style; to get a normal session that keeps its profile between runs, launch a persistent context backed by a user data directory:
from playwright.sync_api import sync_playwright
import os

user_dir = '/tmp/playwright'
if not os.path.exists(user_dir):
    os.makedirs(user_dir)

with sync_playwright() as p:
    browser = p.chromium.launch_persistent_context(user_dir, headless=False)
    page = browser.new_page()
    page.goto('https://playwright.dev/python', wait_until='domcontentloaded')
This is what I have done in a TypeScript code base. It reuses the existing logged-in session, so it does not ask for a fresh user login.
const userDataDir = 'C:/Users/yuv****dir/AppData/Local/Temp/tjwmm3m0.hmt';
context = await chromium.launchPersistentContext(userDataDir, {
    headless: false,
    args: []
});
I hope the Python code above does the same thing as this.
I am trying to use request interception (setRequestInterception) to speed up page loading with Pyppeteer.
However, I am getting the warning:
RuntimeWarning: coroutine 'block_image' was never awaited
I can't figure out where I am missing an await.
I've added awaits within the intercept function itself, following a template I found online. I am testing out request interception with Pyppeteer.
Thank you.
# utils.py
import time
import pyppeteer
from pyppeteer import launch

class MakeRequest():
    ua = User_Agent()

    async def _proxy_browser(self, url,
                             headless=False,
                             intercept_func=None,
                             proxy=True,
                             **kwargs):
        if proxy:
            args = [*proxy*,  # placeholder from the original post for the proxy-server argument
                    '--ignore-certificate-errors']
        else:
            args = ['--ignore-certificate-errors']
        for i in range(3):
            try:
                browser = await launch(headless=headless,
                                       args=args,
                                       defaultViewport=None)
                page = await browser.newPage()
                await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0')
                if intercept_func is not None:
                    await page.setRequestInterception(True)
                    page.on('request', intercept_func)
                await page.goto(url, {'waitUntil': 'load', 'timeout': 0})
                content = await page.content()
                return content
            except (pyppeteer.errors.PageError,
                    pyppeteer.errors.TimeoutError,
                    pyppeteer.errors.BrowserError,
                    pyppeteer.errors.NetworkError) as e:
                print('error', e)
                time.sleep(2)
                continue
            finally:
                await browser.close()
        return
scraper.py:
REQUESTER = MakeRequest()

async def block_image(request):
    if request.url.endswith('.png') or request.url.endswith('.jpg'):
        print(request.url)
        await request.abort()
    else:
        await request.continue_()

def get_request(url):
    for i in range(3):
        response = REQUESTER.proxy_browser_request(url=url,
                                                   headless=False,
                                                   intercept_func=block_image)
        if response:
            return response
        else:
            print(f'Attempt {i + 1}: {url} links not found')
            print('retrying...')
            time.sleep(3)
Your function block_image is a coroutine, but the callback passed to page.on is expected to be a normal function. Try writing a synchronous lambda function that wraps the coroutine in a Task (thus scheduling it on the current event loop):
if intercept_func is not None:
    await page.setRequestInterception(True)
    page.on('request', lambda request: asyncio.create_task(intercept_func(request)))
There's an example of this kind of code in the Pyppeteer documentation. If you're using an older version of Python (<3.7), use asyncio.ensure_future instead of asyncio.create_task (as is done in the example in the docs).
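On Python versions before 3.7 the wrapper is the same, just with ensure_future:

page.on('request', lambda request: asyncio.ensure_future(intercept_func(request)))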
I want my script to:
1. Open, say, 3 tabs
2. Asynchronously fetch a URL (the same one in each tab)
3. Save the response
4. Sleep for 4 seconds
5. Parse the response with regex (I tried BeautifulSoup but it's too slow) and return a token
6. Loop through several times within the 3 tabs
My problem is with 2. I have an example script, but it fetches the URL synchronously. I would like to make it asynchronous.
import asyncio
from pyppeteer import launch

urls = ['https://www.example.com']

async def main():
    browser = await launch(headless=False)
    for url in urls:
        page1 = await browser.newPage()
        page2 = await browser.newPage()
        page3 = await browser.newPage()
        await page1.goto(url)
        await page2.goto(url)
        await page3.goto(url)
        title1 = await page1.title()
        title2 = await page2.title()
        title3 = await page3.title()
        print(title1)
        print(title2)
        print(title3)
    # await browser.close()

asyncio.get_event_loop().run_until_complete(main())
Also, as you can see, the code is not so concise. How do I go about making it asynchronous?
Also, if it helps, I have other pyppeteer scripts that don't quite fit my needs, in case it would be easier to convert one of those instead.
import asyncio
from pyppeteer import launch

url = 'http://www.example.com'
browser = None

async def fetchUrl(url):
    # Define browser as a global variable to ensure that the browser window is only created once in the entire process
    global browser
    if browser is None:
        browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)
    # await asyncio.wait([page.waitForNavigation()])
    # str = await page.content()
    # print(str)

# Execute this function multiple times for testing
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
The script is asynchronous, but it runs one coroutine to completion at a time, so it is effectively synchronous.
# cat test.py
import asyncio
import time
from pyppeteer import launch

WEBSITE_LIST = [
    'http://envato.com',
    'http://amazon.co.uk',
    'http://example.com',
]

start = time.time()

async def fetch(url):
    browser = await launch(headless=False, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(f'{url}', {'waitUntil': 'load'})
    print(f'{url}')
    await asyncio.sleep(1)
    await page.close()
    # await browser.close()

async def run():
    tasks = []
    for url in WEBSITE_LIST:
        task = asyncio.ensure_future(fetch(url))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
The script is asynchronous, but it launches a separate browser for each URL, which uses too many resources.
This will open each URL in a separate tab:
import asyncio
import traceback
from pyppeteer import launch

URLS = [
    "http://envato.com",
    "http://amazon.co.uk",
    "http://example.com",
]

async def fetch(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(f"{url}", {"waitUntil": "load"})
    except Exception:
        traceback.print_exc()
    else:
        html = await page.content()
        return (url, html)
    finally:
        await page.close()

async def main():
    tasks = []
    browser = await launch(headless=True, args=["--no-sandbox"])
    for url in URLS:
        tasks.append(asyncio.create_task(fetch(browser, url)))
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        print(f"{url}: ({len(html)})")
    await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
I need help adding parameters to API commands, for APIs such as Urban Dictionary (for searching definitions) and OpenWeatherMap (to get the weather for a certain location). I understand that a lot of these come with sample code that builds a querystring and then calls ("GET", url, headers=headers, params=querystring), but I don't understand how to allow something such as $urban yo-yo.
@commands.command(
    name="urban",
    description="Allows the user to get a definition from Urban Dictionary",
    aliases=['urbandict']
)
async def urban(self, ctx):
    url = "https://mashape-community-urban-dictionary.p.rapidapi.com/define"
    headers = {
        'x-rapidapi-key': self.bot.quote_api_key,
        'x-rapidapi-host': "mashape-community-urban-dictionary.p.rapidapi.com"
    }
    async with ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            r = await response.json()
            # print(r)
            embed = discord.Embed(title="Term:", description=f"{r['']}")
            embed.add_field(name="Definition:", value=f"||{r['']}||")
            embed.set_author(name=ctx.author.display_name, icon_url=ctx.message.author.avatar_url)
            await ctx.send(embed=embed)
Looking at the Urban Dictionary API, querystring must be a dictionary that has a "term" key.
Then, to add parameters to commands, you simply have to add a parameter to your function and discord.py will parse the command automatically. If you want everything after $urban in a single parameter, you have to add a * before the term parameter.
It would look like this:
@commands.command()
async def urban(self, ctx, *, term):
    url = "https://mashape-community-urban-dictionary.p.rapidapi.com/define"
    querystring = {"term": term}
    headers = {
        'x-rapidapi-key': self.bot.quote_api_key,
        'x-rapidapi-host': "mashape-community-urban-dictionary.p.rapidapi.com"
    }
    async with ClientSession() as session:
        async with session.get(url, headers=headers, params=querystring) as response:
            r = await response.json()
            embed = discord.Embed(title="Term:", description=f"{r['']}")
            embed.add_field(name="Definition:", value=f"||{r['']}||")
            embed.set_author(name=ctx.author.display_name, icon_url=ctx.message.author.avatar_url)
            await ctx.send(embed=embed)
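With the * in the signature, typing $urban yo-yo passes everything after the command name into term, so querystring becomes {"term": "yo-yo"}.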
For the OpenWeatherMap API, you need to pass the arguments in the URL. For instance, if you want Paris' 5-day forecast, you would use this URL:
http://api.openweathermap.org/data/2.5/forecast?q=Paris&units=metric&APPID=your_token
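The same query can also be built with aiohttp's params argument instead of hand-writing the URL. A minimal sketch (the command name, the weather_api_key attribute, and the exact response keys used in the reply are assumptions; the endpoint and parameters come from the URL above):

@commands.command(name="weather")
async def weather(self, ctx, *, city):
    url = "http://api.openweathermap.org/data/2.5/forecast"
    # Same query parameters as the example URL above; the api-key attribute name is assumed.
    params = {"q": city, "units": "metric", "APPID": self.bot.weather_api_key}
    async with ClientSession() as session:
        async with session.get(url, params=params) as response:
            r = await response.json()
    # The forecast payload normally contains a "city" block and a "list" of entries with main/temp.
    await ctx.send(f"{r['city']['name']}: {r['list'][0]['main']['temp']}°C")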
For more information, you can look at the documentation:
Discord.py's command arguments handler
Urban Dictionary API
Open Weather Map API