Python Pyppeteer unable to scrape RU retailers - python-3.x

Hello, good day Stack Overflow folks,
Issue: the script hangs and never scrapes any data from a Russian retailer, in this case www.vseinstrumenti.ru.
Code:
import asyncio
from pyppeteer import launch

class PyppeteerRequests:
    def __init__(self):
        self.headers = {}

    def get_url(self, url):
        data = None
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            data = loop.run_until_complete(self.main(url))
            print(data)
        except Exception as e:
            print(str(e))
        return data

    async def main(self, url):
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto(url, options={'timeout': 1000000, 'waitUntil': ['load', 'networkidle2']})
        loaded_html = await page.content()
        await page.waitForNavigation()
        print("closing context...")
        await asyncio.sleep(0.3)
        await page.close()
        await browser.close()
        print("closing browser...")
        await asyncio.sleep(0.3)
        return loaded_html

if __name__ == "__main__":
    requester = PyppeteerRequests()
    url = 'https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/'
    data = requester.get_url(url)
    print(data)
It just hangs and eventually fails with ERROR: Navigation Timeout Exceeded: 1000000 ms exceeded.
What part of the code should I change? Is it scrape-able on your side? Kindly let me know how to improve my code using asyncio. Thanks!
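One direction that is often suggested for this kind of hang (a sketch only, not verified against this particular site): 'networkidle2' may never fire on pages that keep connections open, and the waitForNavigation() call placed after page.content() waits for a navigation that never happens. The sketch below waits for 'domcontentloaded' plus a concrete selector instead, and sets a realistic User-Agent in case the default headless one is being blocked; the 'h1' selector and the User-Agent string are placeholders.

import asyncio
from pyppeteer import launch

async def fetch_html(url):
    browser = await launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        await page.setUserAgent(
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36'
        )
        # Wait only for the DOM, then for an element that proves the page rendered.
        await page.goto(url, {'timeout': 60000, 'waitUntil': 'domcontentloaded'})
        await page.waitForSelector('h1', {'timeout': 30000})
        return await page.content()
    finally:
        await browser.close()

if __name__ == '__main__':
    html = asyncio.get_event_loop().run_until_complete(
        fetch_html('https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/')
    )
    print(len(html) if html else None)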

Related

How to use events (Chrome-Developer-Tools) using Selenium with Python as a thread?

My current code looks like this:
# Note: the driver variable is already initialized (Chrome)
import trio  # async library that Selenium uses
import requests
from PIL import Image
from io import BytesIO

def show_image(url):
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.show()
    except Exception as e:
        print(e)

def intercept(event, devtools):
    return devtools.fetch.fail_request(request_id=event.request_id,
                                       error_reason=devtools.network.ErrorReason.CONNECTION_REFUSED)

async def at_event(listener, connection, intercept):
    session, devtools = connection.session, connection.devtools
    async for event in listener:
        print({"frame_id": event.frame_id, "url": event.request.url})
        # show_image(event.request.url)
        await trio.sleep(0.0001)
        try:
            await session.execute(intercept(event=event, devtools=devtools))
            # await session.execute(devtools.fetch.fulfill_request(request_id=event.request_id, response_code=200))
            # await session.execute(devtools.fetch.continue_request(request_id=event.request_id))
        except Exception as e:
            print(e)

async def async_interceptor(pattern, main_func, intercept):
    async with driver.bidi_connection() as connection:
        session, devtools = connection.session, connection.devtools
        pattern = devtools.fetch.RequestPattern.from_json(pattern)
        await session.execute(devtools.fetch.enable(patterns=[pattern]))
        listener = session.listen(devtools.fetch.RequestPaused)
        async with trio.open_nursery() as nursery:
            nursery.start_soon(at_event, listener, connection, intercept)
            nursery.start_soon(main_func)

async def interceptor():
    await async_interceptor({"resourceType": "Image"}, main, intercept)

async def main():
    driver.get("https://www.youtube.com/")

trio.run(interceptor)
What I want to change:
I want the interceptor to be non-blocking, like a thread, rather than living inside an async function.
At the moment the listener doesn't always get triggered, possibly because of something in the async code.
In my example code, I want to intercept and fail all requests for images.
See https://github.com/kaliiiiiiiiii/selenium_interceptor
The threaded variant below runs the listener in its own thread, so driver.get() is not blocked:
import threading
import trio
from PIL import Image
from io import BytesIO
import requests

def show_image(url):
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.show()
    except Exception as e:
        print(e)

def listener_helper(listener, at_event):
    async def async_listener_helper(listener, at_event):
        async with driver.bidi_connection() as connection:
            session, devtools = connection.session, connection.devtools
            my_listener = await listener(connection=connection)
            async for event in my_listener:
                print({"frame_id": event.frame_id, "url": event.request.url})
                try:
                    await session.execute(at_event(event=event, connection=connection))
                except Exception as e:
                    print(e)
    trio.run(async_listener_helper, listener, at_event)

def threaded_listener(listener, at_event):
    thread = threading.Thread(target=listener_helper, kwargs={"listener": listener, "at_event": at_event})
    thread.start()
    return thread

async def request_listener(connection):
    session, devtools = connection.session, connection.devtools
    pattern = devtools.fetch.RequestPattern.from_json({"resourceType": "Image"})
    await session.execute(devtools.fetch.enable(patterns=[pattern]))
    return session.listen(devtools.fetch.RequestPaused)

def at_event(event, connection):
    session, devtools = connection.session, connection.devtools
    # show_image(event.request.url)
    return devtools.fetch.fail_request(request_id=event.request_id,
                                       error_reason=devtools.network.ErrorReason.CONNECTION_REFUSED)

thread = threaded_listener(request_listener, at_event)
driver.get("https://www.youtube.com/")

Pyppeteer stops working at the moment the browser opens

I am using Python 3.9.
I have code that repeats the same tests in a while loop.
import datetime
import logging

from pyppeteer import launch, launcher
from pyppeteer_stealth import stealth  # stealth() is assumed to come from the pyppeteer_stealth package

async def pars():
    site_ad = "my_link"
    msg = ''
    new_mass_for_mes = []
    try:
        launcher.defaultArgs().remove("--enable-automation")
        browser = await launch({"headless": False,
                                "userDataDir": "./tool/pyppeteer",
                                "args": ["--no-sandbox", "--disable-setuid-sandbox",
                                         "--start-maximized", "--disable-infobars",
                                         "--disable-dev-shm-usage"]})
        page = await browser.newPage()
        await stealth(page)
        await page.goto(site_ad, {'timeout': 15000})
        ...
        await page.waitFor(3000)
        await page.close()
        await browser.close()
    except Exception as e:
        try:
            try:
                await page.close()
            except:
                pass
            try:
                await browser.disconnect()
            except:
                pass
            await browser.close()
        except:
            pass
        logging.error("Error params {0} at {1}".format(site_ad, datetime.datetime.now()))
        logging.error(e)
The code works, but if an error occurs and the browser does not close, then on the next iteration the program gets stuck at "browser = await launch".
The error can also occur with "headless": True.
So far the only workaround I have come up with is to check for a leftover pyppeteer browser process before opening the browser, and kill it if it exists.
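A minimal sketch of that workaround, assuming psutil is available (matching on the userDataDir in the command line is just one way to spot the stale process):

import psutil

def kill_stale_browsers(profile_dir="./tool/pyppeteer"):
    # Kill any leftover Chromium processes that were started with our userDataDir.
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            name = (proc.info["name"] or "").lower()
            cmdline = " ".join(proc.info["cmdline"] or [])
            if "chrom" in name and profile_dir in cmdline:
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

# Call kill_stale_browsers() right before "browser = await launch(...)".

The other half of the fix is wrapping the launch/close sequence in try/finally so browser.close() always runs, even when goto() raises.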

Import asynchronous methods from another file

scrap.py
from playwright.async_api import async_playwright

class Scraping:
    async def open_browser(self):
        url = "https://www.myurl.com"
        async with async_playwright() as p:
            browser = await p.firefox.launch()
            page = await browser.new_page()
            return await page.goto(url, timeout=0)

    async def search(self, page, num: str):
        await page.fill('input#search', num)
        await page.locator("div[class='srch-btn']").click()
core.py
import asyncio

from scrap import *

@myroute("/route1")
def main(self):
    a = Scraping()
    brow = a.open_browser()
    self.asn = asyncio.run(brow)
    query.action('/route2')

@myroute("/route2")
def main(self, num):
    a = Scraping()
    b = a.search(self.asn, num)
How can I run the open_browser() function in '/route1', then get its page in '/route2' and use the search() method on it?
I've already tried this in my code, but it doesn't work.
Thanks!
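One way to make this work (a sketch under assumptions, since the routing framework is not shown): asyncio.run() creates and then closes a fresh event loop, and the async with async_playwright() block closes the browser when open_browser() returns, so nothing survives into '/route2'. Keeping one event loop alive in a background thread and submitting coroutines to it lets the browser and page persist across routes. The run_async helper, the start()-based Playwright setup, and the "12345" search value below are illustrative, not the asker's code.

import asyncio
import threading

from playwright.async_api import async_playwright

# One long-lived loop in a daemon thread; all Playwright objects stay bound to it.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

def run_async(coro):
    # Submit a coroutine to the long-lived loop and block until it finishes.
    return asyncio.run_coroutine_threadsafe(coro, loop).result()

class Scraping:
    async def open_browser(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.firefox.launch()
        self.page = await self.browser.new_page()
        await self.page.goto("https://www.myurl.com", timeout=0)
        return self.page

    async def search(self, page, num: str):
        await page.fill('input#search', num)
        await page.locator("div[class='srch-btn']").click()

scraper = Scraping()

# /route1 handler body
page = run_async(scraper.open_browser())

# /route2 handler body
run_async(scraper.search(page, "12345"))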

How to fetch a url asynchronously with pyppeteer (one browser, many tabs)

I want my script to:
1. Open, say, 3 tabs
2. Asynchronously fetch a URL (the same one for each tab)
3. Save the response
4. Sleep for 4 seconds
5. Parse the response with regex (I tried BeautifulSoup but it's too slow) and return a token
6. Loop through this several times within the 3 tabs
My problem is with step 2. I have an example script, but it fetches the URL synchronously. I would like to make it asynchronous.
import asyncio
from pyppeteer import launch

urls = ['https://www.example.com']

async def main():
    browser = await launch(headless=False)
    for url in urls:
        page1 = await browser.newPage()
        page2 = await browser.newPage()
        page3 = await browser.newPage()
        await page1.goto(url)
        await page2.goto(url)
        await page3.goto(url)
        title1 = await page1.title()
        title2 = await page2.title()
        title3 = await page3.title()
        print(title1)
        print(title2)
        print(title3)
    # await browser.close()

asyncio.get_event_loop().run_until_complete(main())
Also, as you can see, the code is not very concise. How do I go about making it asynchronous?
If it helps, I have other pyppeteer scripts that don't fit my needs, in case it would be easier to convert one of those:
import asyncio
from pyppeteer import launch

url = 'http://www.example.com'
browser = None

async def fetchUrl(url):
    # Define browser as a global variable to ensure that the browser window is only created once in the entire process
    global browser
    if browser is None:
        browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)
    # await asyncio.wait([page.waitForNavigation()])
    # str = await page.content()
    # print(str)

# Execute this function multiple times for testing
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
This script is asynchronous, but it runs one event loop to completion at a time, so it is effectively synchronous.
# cat test.py
import asyncio
import time
from pyppeteer import launch

WEBSITE_LIST = [
    'http://envato.com',
    'http://amazon.co.uk',
    'http://example.com',
]

start = time.time()

async def fetch(url):
    browser = await launch(headless=False, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(f'{url}', {'waitUntil': 'load'})
    print(f'{url}')
    await asyncio.sleep(1)
    await page.close()
    # await browser.close()

async def run():
    tasks = []
    for url in WEBSITE_LIST:
        task = asyncio.ensure_future(fetch(url))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
This script is asynchronous, but it launches a separate browser for each URL, which ends up using too many resources.
This will open each URL in a separate tab:
import asyncio
import traceback

from pyppeteer import launch

URLS = [
    "http://envato.com",
    "http://amazon.co.uk",
    "http://example.com",
]

async def fetch(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(f"{url}", {"waitUntil": "load"})
    except Exception:
        traceback.print_exc()
    else:
        html = await page.content()
        return (url, html)
    finally:
        await page.close()

async def main():
    tasks = []
    browser = await launch(headless=True, args=["--no-sandbox"])
    for url in URLS:
        tasks.append(asyncio.create_task(fetch(browser, url)))
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        print(f"{url}: ({len(html)})")
    await browser.close()

if __name__ == "__main__":
    main = asyncio.run(main())
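For the original "exactly three tabs, looped many times" requirement, a common variation (a sketch only, with a placeholder URL list) is to keep a fixed pool of pages and feed them from an asyncio.Queue, so the same three tabs are reused for every fetch:

import asyncio
from pyppeteer import launch

URLS = ["http://example.com"] * 9  # placeholder work list
NUM_TABS = 3

async def worker(page, queue, results):
    # Each tab repeatedly pulls a URL from the queue until the queue is drained.
    while True:
        url = await queue.get()
        try:
            await page.goto(url, {"waitUntil": "load"})
            results.append((url, await page.content()))
        except Exception as exc:
            results.append((url, exc))
        finally:
            queue.task_done()

async def main():
    browser = await launch(headless=True, args=["--no-sandbox"])
    queue, results = asyncio.Queue(), []
    for url in URLS:
        queue.put_nowait(url)
    pages = [await browser.newPage() for _ in range(NUM_TABS)]
    workers = [asyncio.create_task(worker(p, queue, results)) for p in pages]
    await queue.join()  # wait until every queued URL has been processed
    for w in workers:
        w.cancel()
    await browser.close()
    return results

if __name__ == "__main__":
    for url, html in asyncio.run(main()):
        print(url, len(html) if isinstance(html, str) else html)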

Fetching multiple urls with aiohttp in python

In a previous question, a user suggested the following approach for fetching multiple urls (API calls) with aiohttp:
import asyncio
import aiohttp

url_list = ['https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530396000&before=1530436000',
            'https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530436000&before=1530476000']

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.json()['data']

async def fetch_all(session, urls, loop):
    results = await asyncio.gather(*[loop.create_task(fetch(session, url)) for url in urls],
                                   return_exceptions=True)
    return results

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    with aiohttp.ClientSession(loop=loop) as session:
        htmls = loop.run_until_complete(fetch_all(session, urls, loop))
        print(htmls)
However, this only returns AttributeErrors:
[AttributeError('__aexit__',), AttributeError('__aexit__',)]
(they show up as values because I passed return_exceptions=True; otherwise it would just break). I really hope somebody here can help with this; it is still quite hard to find resources for asyncio and friends. The returned data is in JSON format. In the end I would like to put all of the JSON dicts in a list.
Working example (the ClientSession is now entered with async with inside a coroutine, and response.json() is awaited on its own rather than indexed directly):
import asyncio
import aiohttp
import ssl

url_list = ['https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530396000&before=1530436000',
            'https://api.pushshift.io/reddit/search/comment/?q=Nestle&size=30&after=1530436000&before=1530476000']

async def fetch(session, url):
    async with session.get(url, ssl=ssl.SSLContext()) as response:
        return await response.json()

async def fetch_all(urls, loop):
    async with aiohttp.ClientSession(loop=loop) as session:
        results = await asyncio.gather(*[fetch(session, url) for url in urls],
                                       return_exceptions=True)
        return results

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    htmls = loop.run_until_complete(fetch_all(urls, loop))
    print(htmls)
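For the last part of the question (putting all of the JSON dicts in a list), a short sketch on top of the htmls result above, assuming each successful response is a pushshift-style dict with a 'data' key as in the original fetch:

comments = []
for result in htmls:
    if isinstance(result, Exception):  # return_exceptions=True leaves failures in-line
        print(f"request failed: {result!r}")
        continue
    comments.extend(result.get("data", []))
print(len(comments), "comments collected")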
