How to select only divs with specific children span with xpath python - python-3.x

I am currently trying to scrape information from a particular e-commerce site, and I only want to get product information (product name, price, color and sizes) for products whose prices have been slashed.
I am currently using XPath.
This is my Python scraping code:
from lxml import html
import requests
class CategoryCrawler(object):
    """Fetch a single category page and print the product names found on it.

    ``starting_url`` is the page that ``crawl`` downloads. ``items`` is an
    (initially empty) set; nothing in this class adds to it yet.
    """

    def __init__(self, starting_url):
        self.starting_url = starting_url
        self.items = set()

    def __str__(self):
        # __str__ must return a str; returning a tuple makes str(obj) raise
        # TypeError, so format the items into a single string instead.
        return 'All Items: {}'.format(self.items)

    def crawl(self):
        """Process the starting URL."""
        self.get_item_from_link(self.starting_url)
        return

    def get_item_from_link(self, link):
        """Download ``link``, parse the HTML and print the product names."""
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        # XPath attribute tests are written with '@' (e.g. @class); '#' is
        # not valid XPath and makes lxml raise an XPath evaluation error.
        names = tree.xpath('//span[@class="name"][@dir="ltr"]/text()')
        print(names)
Note this is not the original URL
crawler = CategoryCrawler('https://www.myfavoriteecommercesite.com/')
crawler.crawl()
When the program is Run ... These are the HTML Content Gotten from the E-commerce Site
Div of Products With Price Slash
<div class="products-info">
<h2 class="title"><span class="brand ">Apple </span> <span class="name" dir="ltr">IPhone X 5.8-Inch HD (3GB,64GB ROM) IOS 11, 12MP + 7MP 4G Smartphone - Silver</span></h2>
<div class="price-container clearfix">
<span class="sale-flag-percent">-22%</span>
<span class="price-box ri">
<span class="price ">
<span data-currency-iso="NGN">₦</span>
<span dir="ltr" data-price="388990">388,990</span>
</span>
<span class="price -old ">
<span data-currency-iso="NGN">₦</span>
<span dir="ltr" data-price="500000">500,000</span>
</span>
</span>
</div>
</div>
Div of Products with No Price Slash
<div class="products-info">
<h2 class="title"><span class="brand ">Apple </span> <span class="name" dir="ltr">IPhone X 5.8-Inch HD (3GB,64GB ROM) IOS 11, 12MP + 7MP 4G Smartphone - Silver</span></h2>
<div class="price-container clearfix">
<span class="price-box ri">
<span class="price ">
<span data-currency-iso="NGN">₦</span>
<span dir="ltr" data-price="388990">388,990</span>
</span>
</span>
</div>
</div>
Now this is my exact Question
I want to know how to select only the parent divs, i.e.
<div class="price-container clearfix">, that also contain any of these child span classes:
<span class="price -old "> or
<span class="sale-flag-percent">
Thank you all.

One solution would be to get all <div class="price-container clearfix"> elements and iterate over them, checking whether your keywords exist in the string of the whole element.
But a better solution would be to use conditionals with XPath:
from lxml import html
htmlst = 'your html'
tree = html.fromstring(htmlst)
# Keep only the price-container divs that have a descendant <span> whose class
# attribute is exactly "price -old " or "sale-flag-percent". Attribute tests
# in XPath use '@'; '#class' is not valid XPath.
divs = tree.xpath('//div[@class="price-container clearfix" and .//span[@class = "price -old " or @class = "sale-flag-percent"] ]')
print(divs)
This gets all divs where class="price-container clearfix" and then checks whether each one contains a span with one of the searched classes.

Related

How to get the text of all the elements in a given html in Python3?

How to extract all the text of elements from the following html:
from bs4 import BeautifulSoup
html3 = """
<div class="tab-cell l1">
<span class="cyan-90">***</span>
<h2 class="white-80">
<a class="k-link" href="#" title="Jump">Jump</a>
</h2>
<h3 class="black-70">
<span>Red</span>
<span class="black-50">lock</span>
</h3>
<div class="l-block">
<a class="lang-menu" href="#">A</a>
<a class="lang-menu" href="#">B</a>
<a class="lang-menu" href="#">C</a>
</div>
<div class="black-50">
<div class="p-bold">Period</div>
<div class="tab--cell">$</div><div class="white-90">Method</div>
<div class="tab--cell">$</div><div class="tab--cell">Type</div>
</div>
</div>
"""
soup = BeautifulSoup(html3, "lxml")
if soup.find('div', attrs={'class': 'tab-cell l1'}):
div_descendants = soup.div.descendants
for des in div_descendants:
if des.name is not None:
print(des.name)
if des.find(class_='k-link'):
print(des.a.string)
if des.find(class_='black-70'):
print('span')
print(des.span.text)
I'm getting text of only first link, after that I'm unable to get anything.
I would like to crawl line by line and get whatever I want, if anyone have any idea please let me know.
Your own if-conditions prevent you from getting everything. You only print in two cases, based on a class_=... condition - you do not print in all other cases:
# html3 = see above
from bs4 import BeautifulSoup
import lxml
soup = BeautifulSoup(html3, "lxml")
# Only walk the tree when the expected outer <div> is present.
if soup.find('div', attrs={'class': 'tab-cell l1'}):
    div_descendants = soup.div.descendants
    for des in div_descendants:
        # Skip descendants that have no tag name (e.g. plain text nodes).
        if des.name is not None:
            print(des.name)
            found = False
            if des.find(class_='k-link'):
                print(des.a.string)
                found = True
            if des.find(class_='black-70'):
                print('span')
                print(des.span.text)
                found = True
            # find all others that are not already reported:
            if not found:
                print(f"Other {des.name}: {des.string}")
Output:
span
Other span: ***
h2
Jump
a
Other a: Jump
h3
Other h3: None
span
Other span: Red
span
Other span: lock
div
Other div: None
a
Other a: A
a
Other a: B
a
Other a: C
div
Other div: None
div
Other div: Period
div
Other div: $
div
Other div: Method
div
Other div: $
div
Other div: Type
Sorted out the issue like this:
from bs4 import BeautifulSoup
import lxml
html3 = """
<div class="tab-cell l1">
<span class="cyan-90">***</span>
<h2 class="white-80">
<a class="k-link" href="#" title="Jump">Jump</a>
</h2>
<h3 class="black-70">
<span>Red</span>
<span class="black-50">lock</span>
</h3>
<div class="l-block">
<a class="lang-menu" href="#">A</a>
<a class="lang-menu" href="#">B</a>
<a class="lang-menu" href="#">C</a>
</div>
<div class="black-50">
<div class="p-bold">Period</div>
<div class="tab--cell">$</div><div class="white-90">Method</div>
<div class="tab--cell">$</div><div class="tab--cell">Type</div>
</div>
</div>
"""
soup = BeautifulSoup(html3, "lxml")
# Only walk the tree when the expected outer <div> is present.
if soup.find('div', attrs={'class': 'tab-cell l1'}):
    div_descendants = soup.div.descendants
    for des in div_descendants:
        # Report only descendants that have a tag name and whose .string
        # is not None.
        if des.name is not None and des.string is not None:
            print(f"{des.name}: {des.string}")

Need to get a specific class exists in HTML body

I am trying to check if class = "special-price" exists in below code.
Here is html code :
<div class="product-shop">
<div class="f-fix">
<h2 class="product-name newname"> Xiaomi Mi Band 2 Strap (Black with White Border) </h2>
<!--product price-->
<div class="text-center ">
<div class="price-box">
<p class="old-price"> <span class="price-label">Regular Price:</span >
<span class = "price" id = "old-price-8846" > ৳200 </span>
</p >
<p class = "special-price" >
<span class = "price-label"> Special Price </span>
<span class="price" itemprop="price" content="149" id="product-price-8846"> ৳149 </span>
</p>
</div>
</div >
</div>
I am using Scrapy with python. After checking if the class found I need to collect text of class="price".
Did you try something like:
if response.css('.special-price'):
price = response.css('.price::text').get() # or do whatever you need
or for short:
price = response.css('.special-price .price::text').get()
it will give you None in case there is no element with special-price class.

Scraping multiple similar lines with python

Using a simple request I'm trying to get from this html page some information stored in "alt". The problem is that, within each instance, the information is separated in multiple lines that start with "img", and when I try to access it, I can only read the first instance of "img" and not the rest, but I'm not sure how to do it. Here's the HTML text:
<div class="archetype-tile-description-wrapper">
<div class="archetype-tile-description">
<h2>
<span class="deck-price-online">
Golgari Midrange
</span>
<span class="deck-price-paper">
Golgari Midrange
</span>
</h2>
<div class="manacost-container">
<span class="manacost">
<img alt="b" class="common-manaCost-manaSymbol sprite-mana_symbols_b" src="//assets1.mtggoldfish.com/assets/s-d69cbc552cfe8de4931deb191dd349a881ff4448ed3251571e0bacd0257519b1.gif" />
<img alt="g" class="common-manaCost-manaSymbol sprite-mana_symbols_g" src="//assets1.mtggoldfish.com/assets/s-d69cbc552cfe8de4931deb191dd349a881ff4448ed3251571e0bacd0257519b1.gif" />
</span>
</div>
<ul>
<li>Jadelight Ranger</li>
<li>Merfolk Branchwalker</li>
<li>Vraska's Contempt</li>
</ul>
</div>
</div>
Having said that, what I'm looking to get from this is both "b" and "g" and store them in a single variable.
You can probably grab those <img> elements with the class "common-manaCost-manaSymbol" like this:
imgs = soup.find_all("img",{"class":"common-manaCost-manaSymbol"})
and then you can iterate over each <img> and grab the alt property of it.
alts = []
for i in imgs:
alts.append(i['alt'])
or with a list comprehension
alts = [i['alt'] for i in imgs]

How to get data form website using BeautifulSoup Python?

I have a problem to get the data from some page. This is part of my code:
for result in results:
street = result.find('p', attrs={'class':'size16'}).text
records.append((street))
print (street)
Website:
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">City</span>
<p class="small mb20"> </p>
<p class="size16">street 98<br>phone. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>open</strong></div>
<div class="mb20 size16">Mon.-Fr. 07.30-15.30</div>
<div class="mb15 ">
Result of my code:
ul. Bema 2phone. (32) 745 72 66-69 Wroclaw None
ul. 1 Maja 22/Vphone. 537-943-969 Olawa <p class="small mb20 colorgreen">Placowka partnerska</p>
I would like to separate or delete the text after a "br" tag. I need only 'street'
<p class="size16">street 98<br>phone. 22 721-56-70</p>
Can You help me?
Use previous_sibling like this:
from bs4 import BeautifulSoup
html = """
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">Bronisze</span>
<p class="small mb20"> </p>
<p class="size16">Poznańska 98<br>tel. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>Godziny otwarcia</strong></div>
<div class="mb20 size16">Pn.-Pt. 07.30-15.30</div>
<div class="mb15 ">
"""
result=BeautifulSoup(html, "lxml")
br = result.find('br')
print (br.previous_sibling)
Or if you want to narrow it down a bit:
street = result.find('p', attrs={'class':'size16'}).find('br').previous_sibling
print (street)
Outputs (in both cases)
Poznańska 98
From the documentation https://www.crummy.com/software/BeautifulSoup/bs4/doc/
.next_sibling and .previous_sibling
You can use .next_sibling and .previous_sibling to navigate between page elements that are on the same level of the parse tree:
from bs4 import BeautifulSoup
html = """
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">Bronisze</span>
<p class="small mb20"> </p>
<p class="size16">Poznańska 98<br>tel. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>Godziny otwarcia</strong></div>
<div class="mb20 size16">Pn.-Pt. 07.30-15.30</div>
<div class="mb15 ">
"""
soup = BeautifulSoup(html, "lxml")
# Drill down div.media-body -> div.mb15 -> p.size16, then print every sibling
# that precedes the <br> inside that paragraph.
for html_tag_div in soup.find_all('div', class_="media-body pt5 pb10"):
    for html_tag_div_1 in html_tag_div.find_all('div', class_="mb15"):
        for html_tag_2 in html_tag_div_1.find_all("p", class_="size16"):
            for html_tag_3 in html_tag_2.find("br").previous_siblings:
                print(html_tag_3.get_text())

How to create list of web elements?

I am trying to make a list of web elements, but it cannot seem to find the elements on the web page, although it worked 3 days ago and I cannot find any changes in the web page.
this is the html code :
<li id="wlg_41410" class="leagueWindow " dataid="41410">
<h5 style="cursor: pointer; cursor: hand;" onclick="TodaysEventsLeagueWindow.minimizeRestoreClick(41410)">Europa League</h5>
<div class="bet_type select" id="_bet_types"></div>
<div class="bet_type lastscore ">
<h6>1X2 FT </h6>
<div class="types_bg">
<!--[if IE]> <div id="IEroot"> <![endif]-->
<div class="first_buttons_line">
</div>
<!--[if IE]> </div> <![endif]-->
<div class="time"> 23/11 | 18:00 </div>
<div class="bets ml">
</div>
<div class="time"> 23/11 | 20:00 </div>
<div class="bets ml">
</div>
<div class="time"> 23/11 | 20:00 </div>
<div class="bets ml">
</div>
<div class="time"> 23/11 | 20:00 </div>
<div class="bets ml">
</div>
<div class="time"> 23/11 | 20:00 </div>
<div class="bets ml">
</div>
<div class="clr"></div>
</div>
</div> <span class="x" onclick="TodaysEventsLeagueWindow.closeLeagueWindow(41410)"></span>
</li>
I am trying to make a list from the <div class="bets ml"></div> elements,
but I keep getting the exception selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document, as if Selenium can't find the web element.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
import time
# NOTE: `driver` is not created in this excerpt; it must already be a
# WebDriver instance.
driver.get("https://www.luckia.es/apuestas")
# Wait for the "sbtechBC" iframe and switch into it before locating anything.
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it("sbtechBC"))
eventos_de_hoy = driver.find_element_by_id("today_event_btn")
eventos_de_hoy.click()
ligi_len = len(WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "leagueWindow "))))
print(ligi_len)
for index in range(ligi_len):
    # Re-locate the league windows on every iteration and pick one by index.
    item = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "leagueWindow ")))[index]
    driver.execute_script("arguments[0].scrollIntoView(true);", item)
    nume_liga = item.find_element_by_tag_name("h5").text
    time.sleep(3)
    print('try', nume_liga)
    # XPath attribute tests use '@class' ('#class' is not valid XPath).
    # NOTE(review): an XPath starting with '//' is evaluated against the whole
    # document, not only `item`; use ".//*[@class='bets ml']" to restrict the
    # search to this league window.
    meci = item.find_elements_by_xpath("//*[@class='bets ml']")
    print("there are", len(meci), "in one liga")
the reason for the index is that the iframe refreshes every 25 sec.
i also tried meci = item.find_elements_by_css_selector('.bets.ml') and meci = item.find_elements_by_class_name('ml')
Why should i be able to extract the <h5></h5> element and not the other elements?
From your code block, its pretty clear you have just managed to cover up the real issue through time.sleep(3) as follows :
nume_liga = item.find_element_by_tag_name("h5").text
time.sleep(3)
print('try', nume_liga)
It is not clear why time.sleep(3) was added before simply invoking print() on a text, but that is where the main issue got covered up. As the list was already created, you are still able to print('try', nume_liga).
But next, when you do meci = item.find_elements_by_xpath("//*[@class='bets ml']"), you face a StaleElementReferenceException because the HTML DOM has changed.
A closer look at the <h5> tag reveals that it has an onclick event:
<h5 style="cursor: pointer; cursor: hand;" onclick="TodaysEventsLeagueWindow.minimizeRestoreClick(41410)">Europa League</h5>
A wild guess: while invoking .text on the <h5> tag, the HTML DOM changes.
Solution:
A possible solution with your current code block may be to use get_attribute("innerHTML") instead of .text. So your line of code will be:
nume_liga = item.find_element_by_tag_name("h5").get_attribute("innerHTML")

Resources