Playwright Python: Get only selected attribute URLs

import asyncio
from playwright.async_api import Playwright, async_playwright, expect

# Get image URLs
# Output: img_urls.csv (instances of {"property_id": str, "img_urls": []})
async def run(playwright):
    image_urls = []  # Will contain instances of {"property_id": "value", "img_url": [img_urls]}
    browser = await playwright.chromium.launch(headless=False)
    context = await browser.new_context()
    # Open new page
    page = await context.new_page()
    # Go to https://www.zoopla.co.uk/for-sale/details/49240624/
    await page.goto("https://www.zoopla.co.uk/for-sale/details/49240624/")
    # Click button:has-text("Accept all cookies")
    await page.frame_locator("[aria-label=\"Privacy Manager window\\.\"]").locator("button:has-text(\"Accept all cookies\")").click()
    # Click next image
    for i in range(5):
        await page.locator("[data-testid=\"arrow_right\"]").click()
    # Fetch img urls
    imgs = await page.query_selector_all("img")
    for img in imgs:
        src = await img.get_attribute("src")
        print(src)
    # ---------------------
    await context.close()
    await browser.close()

async def main() -> None:
    async with async_playwright() as playwright:
        await run(playwright)

asyncio.run(main())
The above would return
https://lid.zoocdn.com/u/2400/1800/26d9845a91c7fe21834b531a292533dcf16f6754.jpg
https://lid.zoocdn.com/u/2400/1800/2267900ffd5e795f568bf1305a5eab0b95e59e5f.jpg
https://lid.zoocdn.com/u/2400/1800/75d0a22274ed94c1b33db52f5c5cc1022905df02.jpg
https://lid.zoocdn.com/u/2400/1800/d459c1d0ff8e7a1b52f667a48e593f72f91ea368.jpg
https://lid.zoocdn.com/u/2400/1800/0e21775e213c064fd83916b27e795536c816edb0.jpg
https://maps.googleapis.com/maps/api/staticmap?size=792x398&format=jpg&scale=2&center=51.535651,-0.006482&maptype=roadmap&zoom=15&channel=Lex-LDP&client=gme-zooplapropertygroup&sensor=false&markers=scale:2%7Cicon:https://r.zoocdn.com/assets/map-static-pin-purple-76.png%7C51.535651,-0.006482&signature=EZukT7ugiBKGYFT9F9phLleIXBs=
https://lid.zoocdn.com/u/2400/1800/55e734ae277ee03b2d46f55298acd762727bd727.gif
https://r.zoocdn.com/_next/static/images/natwest-dd532b27dc13112df4f05058c26a990a.svg
https://st.zoocdn.com/zoopla_static_agent_logo_(584439).png
Every time you click the right arrow to move on to the next image, a different number of URLs is printed to the console. How do you get only the URLs that end with ".jpg", or simply contain ".jpg"? (I could check for ".jpg" with Python, but I want to know if there's an official way to achieve this with Playwright Python.)

Look for src values that end with .png:

# Fetch img urls
imgs = await page.query_selector_all("img")
for img in imgs:
    src = await img.get_attribute("src")
    if src and src.endswith('.png'):  # guard against img tags without a src
        print(src)
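
If you'd rather have Playwright do the filtering itself, a CSS attribute selector inside a locator should also work; a minimal sketch, assuming the same page object as above and a Playwright version recent enough to have locator.all() ($= matches a suffix, *= a substring):

# Only <img> elements whose src ends with ".jpg"
jpg_imgs = await page.locator('img[src$=".jpg"]').all()
for img in jpg_imgs:
    print(await img.get_attribute("src"))
# Or match any src that merely contains ".jpg"
jpg_like = await page.locator('img[src*=".jpg"]').all()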

Related

How to get all the data from a certain website with Scrapy?

I'm currently learning scraping with Scrapy, and I want to scrape data from this: https://www.espn.com/nba/stats/player or https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2
If you go to the link you will see a Show More button at the bottom of the data. I'm confused because if I scrape it right now I only get 50 rows, which is not what I want, so I inspected the Show More button, but it's only an a tag with href="#".
[UPDATE] Using Scrapy + Playwright
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageMethod('wait_for_selector', '//a[@class="AnchorLink loadMore__link"]'),
                PageMethod('click', '//a[@class="AnchorLink loadMore__link"]'),
            ],
        ),
        callback=self.parse,
    )
async def parse(self, response):
    page = response.meta['playwright_page']
    button = response.meta['playwright_page_coroutines'][0]
    if button:
        await button.click()
    resp = response.body
    player_list = sel.xpath(
        "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath(
        "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    await page.wait_for_selector(player_list)
    sel = Selector(text=resp)
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
When only using Scrapy:
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        callback=self.parse,
    )

def parse(self, response):
    sel = Selector(text=response.body)
    player_list = sel.xpath("//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath("//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
Am I doing it wrong here? Also, if you click Show More you can see the API it calls; I could scrape from that API, but for now I want to get the data from the HTML itself with XPath.
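
For what it's worth, one way to load every row before parsing is to keep clicking the button from the Playwright page object itself. A minimal sketch, assuming scrapy-playwright with playwright_include_page=True as in the question (newer scrapy-playwright releases name the meta key playwright_page_methods instead of playwright_page_coroutines):

async def parse(self, response):
    page = response.meta["playwright_page"]
    # Keep clicking "Show More" until the link disappears
    while await page.locator("a.AnchorLink.loadMore__link").count() > 0:
        await page.click("a.AnchorLink.loadMore__link")
        await page.wait_for_timeout(1000)  # give the new rows time to render
    html = await page.content()
    await page.close()
    sel = Selector(text=html)
    # ...run the XPath extraction from above against sel...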

TypeError: Bot.edit_message_media() got multiple values for argument 'media'

Something went wrong and Bot.edit_message_media() got multiple values for argument 'media'.
The message photo (with inline keyboard and caption) should be edited. The inline keyboard and caption should also be edited, and that part is working.
from aiogram import types
from keyboards.inline.choise_buttons import choice, shop_keyboard
from keyboards.inline.callback_datas import get_callback
from aiogram.types import CallbackQuery, InputMediaPhoto
from loader import dp, bot

@dp.message_handler()
async def bot_reply(message: types.Message):
    photo_url = 'https://pbs.twimg.com/profile_images/1187438234382548994/SesNEhbs_400x400.jpg'
    await bot.send_photo(photo=photo_url,
                         chat_id=message.chat.id,
                         caption='Главное меню',  # "Main menu"
                         reply_markup=choice,
                         )

@dp.callback_query_handler(get_callback.filter(category_name='shop'))
async def open_shop(call: CallbackQuery):
    await call.answer(cache_time=60)
    await bot.edit_message_media(call.message.chat.id,
                                 call.message.message_id,
                                 media=InputMediaPhoto('https://fikiwiki.com/uploads/posts/2022-02/1645044676_1-fikiwiki-com-p-kartinki-lami-1.jpg'))
    await bot.edit_message_caption(call.message.chat.id,
                                   call.message.message_id,
                                   caption='Выберите категорию'  # "Choose a category"
                                   )
    await bot.edit_message_reply_markup(call.message.chat.id,
                                        call.message.message_id,
                                        reply_markup=shop_keyboard
                                        )

@dp.callback_query_handler(get_callback.filter(category_name='shop'))
async def open_shop(call: CallbackQuery):
    await call.answer(cache_time=60)
    await bot.edit_message_reply_markup(call.message.chat.id,
                                        call.message.message_id,
                                        reply_markup=shop_keyboard
                                        )
    # await call.message.delete()
    # await call.message.answer_photo(photo=photo_url2,
    #                                 reply_markup=shop_keyboard)

# @dp.callback_query_handler(get_callback.filter(category_name='royale'))
# async def open_shop(call: CallbackQuery):
#     photo_url2 = 'https://i.ytimg.com/vi/P2TifLd5OIQ/maxresdefault.jpg'
#     await call.answer(cache_time=60)
#     await call.message.delete()
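
A likely cause: in aiogram, Bot.edit_message_media() takes media as its first positional parameter, so call.message.chat.id passed positionally lands in media, and the explicit media= keyword then collides. A minimal sketch of the fix, assuming the aiogram 2.x signature, is to pass everything by keyword:

await bot.edit_message_media(
    media=InputMediaPhoto('https://fikiwiki.com/uploads/posts/2022-02/1645044676_1-fikiwiki-com-p-kartinki-lami-1.jpg'),
    chat_id=call.message.chat.id,
    message_id=call.message.message_id,
)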

Import asynchronous methods from another file

scrap.py

class Scraping:
    async def open_browser():
        url = "https://www.myurl.com"
        async with async_playwright() as p:
            browser = await p.firefox.launch()
            page = await browser.new_page()
            return await page.goto(url, timeout=0)

    async def search(self, page, num: str):
        await page.fill('input#search', num)
        await page.locator("div[class='srch-btn']").click()

core.py

from scrap import *

@myroute("/route1")
def main(self):
    a = Scraping()
    brow = a.open_browser()
    self.asn = asyncio.run(brow)
    query.action('/route2')

@myroute("/route2")
def main(self, num):
    a = Scraping()
    b = a.search(self.asn, num)
How can I run the open_browser() function in '/route1', then get its page in '/route2' and work with the search() method?
I've already tried this in my code, but it doesn't work.
Thanks!
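
One hedged way to make this work (myroute and query come from the question and are assumed to be the poster's routing framework): start Playwright manually so the browser survives the method call, return the live page instead of the goto response, and reuse a single event loop across routes, since asyncio.run() creates and closes a fresh loop on every call:

# scrap.py (sketch)
import asyncio
from playwright.async_api import async_playwright

class Scraping:
    async def open_browser(self, url: str = "https://www.myurl.com"):
        # Start Playwright without a context manager so it is not
        # torn down when this method returns
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.firefox.launch()
        page = await self.browser.new_page()
        await page.goto(url, timeout=0)
        return page  # hand the live page object back to the caller

    async def search(self, page, num: str):
        await page.fill('input#search', num)
        await page.locator("div[class='srch-btn']").click()

    async def close(self):
        await self.browser.close()
        await self.playwright.stop()

# core.py (sketch)
loop = asyncio.new_event_loop()  # one shared loop for every route
scraper = Scraping()

@myroute("/route1")
def main(self):
    self.asn = loop.run_until_complete(scraper.open_browser())
    query.action('/route2')

@myroute("/route2")
def main(self, num):
    loop.run_until_complete(scraper.search(self.asn, num))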

How to fetch a URL asynchronously with pyppeteer (one browser, many tabs)

I want my script to:
1. Open, say, 3 tabs
2. Asynchronously fetch a URL (the same one for each tab)
3. Save the response
4. Sleep for 4 seconds
5. Parse the response with regex (I tried BeautifulSoup but it's too slow) and return a token
6. Loop through the above several times within the 3 tabs
My problem is with step 2. I have an example script, but it fetches the URL synchronously. I would like to make it asynchronous.
import asyncio
from pyppeteer import launch

urls = ['https://www.example.com']

async def main():
    browser = await launch(headless=False)
    for url in urls:
        page1 = await browser.newPage()
        page2 = await browser.newPage()
        page3 = await browser.newPage()
        await page1.goto(url)
        await page2.goto(url)
        await page3.goto(url)
        title1 = await page1.title()
        title2 = await page2.title()
        title3 = await page3.title()
        print(title1)
        print(title2)
        print(title3)
    # await browser.close()

asyncio.get_event_loop().run_until_complete(main())
Also, as you can see, the code is not very concise. How do I go about making it asynchronous?
And if it helps, I have other pyppeteer scripts that don't fit my needs, in case it would be easier to convert one of those:
import asyncio
from pyppeteer import launch

url = 'http://www.example.com'
browser = None

async def fetchUrl(url):
    # Define browser as a global variable to ensure that the browser
    # window is only created once in the entire process
    global browser
    if browser is None:
        browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)
    # await asyncio.wait([page.waitForNavigation()])
    # str = await page.content()
    # print(str)

# Execute this function multiple times for testing
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
asyncio.get_event_loop().run_until_complete(fetchUrl(url))
This script is asynchronous, but it pushes one call through the event loop at a time, so it is effectively synchronous.
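A minimal sketch of running both fetches concurrently in a single loop (fetchUrl_safe is a hypothetical variant of fetchUrl above that guards the shared browser with a lock, since two concurrent coroutines could otherwise both see browser is None and launch twice):

_browser_lock = asyncio.Lock()

async def fetchUrl_safe(url):
    global browser
    async with _browser_lock:  # only one coroutine launches the browser
        if browser is None:
            browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto(url)

async def main():
    await asyncio.gather(fetchUrl_safe(url), fetchUrl_safe(url))

asyncio.get_event_loop().run_until_complete(main())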
# cat test.py
import asyncio
import time
from pyppeteer import launch

WEBSITE_LIST = [
    'http://envato.com',
    'http://amazon.co.uk',
    'http://example.com',
]

start = time.time()

async def fetch(url):
    browser = await launch(headless=False, args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto(f'{url}', {'waitUntil': 'load'})
    print(f'{url}')
    await asyncio.sleep(1)
    await page.close()
    # await browser.close()

async def run():
    tasks = []
    for url in WEBSITE_LIST:
        task = asyncio.ensure_future(fetch(url))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
This script is asynchronous, but it launches a separate browser for each URL, which ends up consuming too many resources.
This will open each URL in a separate tab:
import asyncio
import traceback
from pyppeteer import launch

URLS = [
    "http://envato.com",
    "http://amazon.co.uk",
    "http://example.com",
]

async def fetch(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(f"{url}", {"waitUntil": "load"})
    except Exception:
        traceback.print_exc()
    else:
        html = await page.content()
        return (url, html)
    finally:
        await page.close()

async def main():
    tasks = []
    browser = await launch(headless=True, args=["--no-sandbox"])
    for url in URLS:
        tasks.append(asyncio.create_task(fetch(browser, url)))
    for coro in asyncio.as_completed(tasks):
        url, html = await coro
        print(f"{url}: ({len(html)})")
    await browser.close()

if __name__ == "__main__":
    asyncio.run(main())
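
And to match the original goal exactly (the same URL in each of 3 tabs, repeated several times), a hedged variation of the same pattern; the URL, the 5-round count, and the 4-second pause are placeholders taken from the question:

import asyncio
from pyppeteer import launch

URL = "https://www.example.com"
TABS = 3

async def fetch_in_tab(browser, url):
    page = await browser.newPage()
    try:
        await page.goto(url, {"waitUntil": "load"})
        return await page.content()
    finally:
        await page.close()

async def main():
    browser = await launch(headless=True, args=["--no-sandbox"])
    try:
        for round_no in range(5):  # loop through several times
            htmls = await asyncio.gather(
                *(fetch_in_tab(browser, URL) for _ in range(TABS))
            )
            print(round_no, [len(h) for h in htmls])
            await asyncio.sleep(4)  # the requested 4-second pause
    finally:
        await browser.close()

asyncio.run(main())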

Python Pyppeteer Unable to Scrape RU Retailers

Hello, good day Stack Overflow folks,
Issue: the script gets stuck and the data is never scraped from a Russian retailer, in this case www.vseinstrumenti.ru.
code:
import asyncio
from pyppeteer import launch

class PyppeteerRequests:
    def __init__(self):
        self.headers = {}

    def get_url(self, url):
        data = None
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            data = loop.run_until_complete(self.main(url))
            print(data)
        except Exception as e:
            print(str(e))
        return data

    async def main(self, url):
        browser = await launch(headless=True)
        page = await browser.newPage()
        await page.goto(url, options={'timeout': 1000000, 'waitUntil': ['load', 'networkidle2']})
        loaded_html = await page.content()
        await page.waitForNavigation()
        print("closing context...")
        await asyncio.sleep(0.3)
        await page.close()
        await browser.close()
        print("closing browser...")
        await asyncio.sleep(0.3)
        return loaded_html

if __name__ == "__main__":
    requester = PyppeteerRequests()
    url = 'https://www.vseinstrumenti.ru/ruchnoy-instrument/sadoviy-instrument-i-inventar/topory/fiskars/x11-s-1015640/'
    data = requester.get_url(url)
    print(data)
It just gets stuck and fails with ERROR: Navigation Timeout Exceeded: 1000000 ms exceeded.
What part of the code should I change? Is it scrapeable on your side? Kindly let me know how to improve my code using asyncio. Thanks!
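
A guess at the culprit rather than a verified fix: page.waitForNavigation() is awaited after the page has already finished loading, so it waits for a navigation that never happens and times out. A minimal sketch of main() without it (the shorter timeout is an assumption):

async def main(self, url):
    browser = await launch(headless=True)
    page = await browser.newPage()
    # goto already waits for 'load' and 'networkidle2'; a separate
    # waitForNavigation() is only needed after an action that triggers one
    await page.goto(url, options={'timeout': 60000,
                                  'waitUntil': ['load', 'networkidle2']})
    loaded_html = await page.content()
    await page.close()
    await browser.close()
    return loaded_html

If the page still hangs after that, the site may also be blocking headless browsers, which is worth checking separately.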
