Scrape text between <span> tags - python-3.x

I am only new to Python and I am having trouble getting the text between the tags; here is the HTML of the full table.
<div id="menu">
<h4 style="display:none">Horse Photo</h4>
<ul style="margin-top:5px;border-radius:6px">
<li style="padding:0">
<img src="/images/unknown_horse.png" style="width:298px;margin-bottom:-3px;border-radius:5px;">
</li>
</ul>
<h4>Horse Profile</h4>
<ul>
<li>Age<span>3yo</span></li>
<li>Foaled<span>17/11/2014</span></li>
<li>Country<span>New Zealand</span></li>
<li>Location<span>Kembla Grange</span></li>
<li>Sex<span>Filly</span></li>
<li>Colour<span>Grey</span></li>
<li>Sire<span>Mastercraftsman</span></li>
<li>Dam<span>In Essence</span></li>
<li>Trainer
<span>
R & L Price
</span>
</li>
<li>Earnings<span>$19,795</span></li>
</ul>
<h4>Owners</h4>
<ul>
<li style="font:normal 12px 'Tahoma">Bell View Park Stud (Mgr: A P Mackrell)</li>
</ul>
</div>

For parsing HTML use beautifulsoup package. That way you can select elements of your html document with ease. To print all text within <span> tags, you can use this example:
data = """
<div id="menu">
<h4 style="display:none">Horse Photo</h4>
<ul style="margin-top:5px;border-radius:6px">
<li style="padding:0">
<img src="/images/unknown_horse.png" style="width:298px;margin-bottom:-3px;border-radius:5px;">
</li>
</ul>
<h4>Horse Profile</h4>
<ul>
<li>Age<span>3yo</span></li>
<li>Foaled<span>17/11/2014</span></li>
<li>Country<span>New Zealand</span></li>
<li>Location<span>Kembla Grange</span></li>
<li>Sex<span>Filly</span></li>
<li>Colour<span>Grey</span></li>
<li>Sire<span>Mastercraftsman</span></li>
<li>Dam<span>In Essence</span></li>
<li>Trainer
<span>
R & L Price
</span>
</li>
<li>Earnings<span>$19,795</span></li>
</ul>
<h4>Owners</h4>
<ul>
<li style="font:normal 12px 'Tahoma">Bell View Park Stud (Mgr: A P Mackrell)</li>
</ul>
</div>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')
for li in soup.select('span'):
if li.text.strip() == '':
continue
print(li.text)
Will print:
3yo
17/11/2014
New Zealand
Kembla Grange
Filly
Grey
Mastercraftsman
In Essence
R & L Price
$19,795

There are plenty of options to work with HTML/XML. I prefer parsel package. You can install it to your environment with the following command:
$ pip install parsel
After that you can use it like this:
from parsel import Selector

# Wrap the raw HTML so it can be queried with CSS/XPath expressions.
selector = Selector(html)
# The '::text' pseudo-element extracts the text nodes sitting directly
# inside each <li>.
selector.css('ul li::text').extract()
# ['Age',
# 'Foaled',
# 'Country',
# 'Location',
# 'Sex',
# 'Colour',
# 'Sire',
# 'Dam',
# 'Trainer',
# 'Earnings',
# 'Bell View Park Stud (Mgr: A P Mackrell)']
More detailed description can be found here.

Related

How to get the text of all the elements in a given html in Python3?

How to extract all the text of elements from the following html:
from bs4 import BeautifulSoup

html3 = """
<div class="tab-cell l1">
<span class="cyan-90">***</span>
<h2 class="white-80">
<a class="k-link" href="#" title="Jump">Jump</a>
</h2>
<h3 class="black-70">
<span>Red</span>
<span class="black-50">lock</span>
</h3>
<div class="l-block">
<a class="lang-menu" href="#">A</a>
<a class="lang-menu" href="#">B</a>
<a class="lang-menu" href="#">C</a>
</div>
<div class="black-50">
<div class="p-bold">Period</div>
<div class="tab--cell">$</div><div class="white-90">Method</div>
<div class="tab--cell">$</div><div class="tab--cell">Type</div>
</div>
</div>
"""
soup = BeautifulSoup(html3, "lxml")
# Walk every descendant of the first <div>; 'des.name' is None for bare
# text nodes, so those are skipped.
if soup.find('div', attrs={'class': 'tab-cell l1'}):
    div_descendants = soup.div.descendants
    for des in div_descendants:
        if des.name is not None:
            print(des.name)
            # Only these two class-based branches ever print text, which
            # is why most elements produce no output (see answer below).
            if des.find(class_='k-link'):
                print(des.a.string)
            if des.find(class_='black-70'):
                print('span')
                print(des.span.text)
I'm getting the text of only the first link; after that I'm unable to get anything.
I would like to crawl line by line and get whatever I want, if anyone have any idea please let me know.
Your own if-conditions hinder you to get all things. You only print in two cases based on a class_=... condition - you do not print in all conditions:
# html3 = see above
from bs4 import BeautifulSoup
import lxml

soup = BeautifulSoup(html3, "lxml")
if soup.find('div', attrs={'class': 'tab-cell l1'}):
    div_descendants = soup.div.descendants
    for des in div_descendants:
        if des.name is not None:
            print(des.name)
            # Track whether one of the class-specific branches already
            # reported this element.
            found = False
            if des.find(class_='k-link'):
                print(des.a.string)
                found = True
            if des.find(class_='black-70'):
                print('span')
                print(des.span.text)
                found = True
            # find all others that are not already reported:
            if not found:
                print(f"Other {des.name}: {des.string}")
Output:
span
Other span: ***
h2
Jump
a
Other a: Jump
h3
Other h3: None
span
Other span: Red
span
Other span: lock
div
Other div: None
a
Other a: A
a
Other a: B
a
Other a: C
div
Other div: None
div
Other div: Period
div
Other div: $
div
Other div: Method
div
Other div: $
div
Other div: Type
Sorted out the issue like this:
from bs4 import BeautifulSoup
import lxml

html3 = """
<div class="tab-cell l1">
<span class="cyan-90">***</span>
<h2 class="white-80">
<a class="k-link" href="#" title="Jump">Jump</a>
</h2>
<h3 class="black-70">
<span>Red</span>
<span class="black-50">lock</span>
</h3>
<div class="l-block">
<a class="lang-menu" href="#">A</a>
<a class="lang-menu" href="#">B</a>
<a class="lang-menu" href="#">C</a>
</div>
<div class="black-50">
<div class="p-bold">Period</div>
<div class="tab--cell">$</div><div class="white-90">Method</div>
<div class="tab--cell">$</div><div class="tab--cell">Type</div>
</div>
</div>
"""
soup = BeautifulSoup(html3, "lxml")
if soup.find('div', attrs={'class': 'tab-cell l1'}):
    div_descendants = soup.div.descendants
    for des in div_descendants:
        # .string is None for elements with more than one child, so this
        # prints only tags that directly wrap a single text node.
        if des.name is not None and des.string is not None:
            print(f"{des.name}: {des.string}")

How to iterate through a list of Beautful soup tag elements and get a particular text if found else an empty string?

Case1:
<li style="padding:5px;border-bottom:1px solid #ccc">
<div itemscope="" itemtype="http://schema.org/LocalBusiness">
<h5 itemprop="name">
Derattizzazione Disinfestazione Punteruolo Rosso - Quark Srl
</h5>
<div itemprop="address" itemscope="" itemtype="http://schema.org/PostalAddress">
<span itemprop="streetAddress">
Via S. Pellico, 198/L
</span>
<br/>
<span itemprop="postalCode">
63039,
<span itemprop="addressLocality">
San Benedetto del Tronto
</span>
(AP)
</span>
<br/>
</div>
<span itemprop="telephone">
tel: 800 99 83 01
</span>
<br/>
<span>
sito:quarksrl.it
</span>
<br/>
<span>
parole chiave:
<strong>
derattizzazione,consulenza ambientale,disinfestazione ratti,allontanamento piccioni,punteruolo rosso
</strong>
</span>
</div>
</li>
Case2:
<li style="padding:5px;border-bottom:1px solid #ccc">
<div itemscope="" itemtype="http://schema.org/LocalBusiness">
<h5 itemprop="name">
V&b Home Comfort
</h5>
<div itemprop="address" itemscope="" itemtype="http://schema.org/PostalAddress">
<span itemprop="streetAddress">
via delle Torri, 5
</span>
<br/>
<span itemprop="postalCode">
63100,
<span itemprop="addressLocality">
Ascoli Piceno
</span>
(AP)
</span>
<br/>
</div>
<span>
sito:vebhomecomfort.it
</span>
<br/>
</div>
</li>
in case 1 the text 'parole chiave:' is present so I want to fetch the data which is thereafter and in case 2 element is not present so I want None or 'Empty Text' there.
or is there any way to do the same in scrapy?
I really appreciate your efforts in taking out time thanks!
If txt is the string from case 1 + case 2, then you can use this script to extract the elements:
from bs4 import BeautifulSoup

soup = BeautifulSoup(txt, 'html.parser')
# Each <li> is one business listing; fields that may be missing
# (telephone, website, keywords) fall back to '-' or [].
for li in soup.select('li'):
    name = li.select_one('h5').get_text(strip=True, separator=' ')
    address = li.select_one('[itemprop="streetAddress"]').get_text(strip=True, separator=' ')
    postal_code = li.select_one('[itemprop="postalCode"]').get_text(strip=True, separator=' ')
    address_locality = li.select_one('[itemprop="addressLocality"]').get_text(strip=True, separator=' ')
    telephone = li.select_one('[itemprop="telephone"]')
    telephone = telephone.get_text(strip=True, separator=' ') if telephone else '-'
    # The website / keyword spans carry no itemprop, so match them by
    # their leading text instead.
    web = li.find(lambda t: t.name=='span' and t.get_text(strip=True).startswith('sito:'))
    web = web.get_text(strip=True, separator=' ').replace('sito:', '') if web else '-'
    keywords = li.find(lambda t: t.name=='span' and t.get_text(strip=True).startswith('parole chiave:'))
    keywords = keywords.get_text(strip=True, separator=' ').replace('parole chiave:', '').split(',') if keywords else []
    print(name)
    print(address)
    print(postal_code)
    print(address_locality)
    print(telephone)
    print(web)
    print(keywords)
    print('-' * 80)
Prints:
Derattizzazione Disinfestazione Punteruolo Rosso - Quark Srl
Via S. Pellico, 198/L
63039, San Benedetto del Tronto (AP)
San Benedetto del Tronto
tel: 800 99 83 01
quarksrl.it
[' derattizzazione', 'consulenza ambientale', 'disinfestazione ratti', 'allontanamento piccioni', 'punteruolo rosso']
--------------------------------------------------------------------------------
V&b Home Comfort
via delle Torri, 5
63100, Ascoli Piceno (AP)
Ascoli Piceno
-
vebhomecomfort.it
[]
--------------------------------------------------------------------------------

How to fix missing ul tags in html list snippet with Python and Beautiful Soup

If I have a snippet of html like this:
<p><br><p>
<li>stuff</li>
<li>stuff</li>
Is there a way to clean this and add the missing ul/ol tags using beautiful soup, or another python library?
I tried soup.prettify() but it left as is.
It doesn't seem like there's a built-in method which wraps groups of li elements into an ul. However, you can simply loop over the li elements, identify the first element of each li group and wrap it in ul tags. The next elements in the group are appended to the previously created ul:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html.parser")
# Wrap each consecutive run of top-level <li> elements in its own <ul>.
ulgroup = 0
uls = []
for li in soup.findAll('li'):
    previous_element = li.findPrevious()
    # if <li> already wrapped in <ul>, do nothing
    if previous_element and previous_element.name == 'ul':
        continue
    # if <li> is the first element of a <li> group, wrap it in a new <ul>
    if not previous_element or previous_element.name != 'li':
        ulgroup += 1
        ul = soup.new_tag("ul")
        li.wrap(ul)
        uls.append(ul)
    # append rest of <li> group to previously created <ul>
    elif ulgroup > 0:
        uls[ulgroup-1].append(li)
print(soup.prettify())
For example, the following input:
html = '''
<p><br><p>
<li>stuff1</li>
<li>stuff2</li>
<div></div>
<li>stuff3</li>
<li>stuff4</li>
<li>stuff5</li>
'''
outputs:
<p>
<br/>
<p>
<ul>
<li>
stuff1
</li>
<li>
stuff2
</li>
</ul>
<div>
</div>
<ul>
<li>
stuff3
</li>
<li>
stuff4
</li>
<li>
stuff5
</li>
</ul>
</p>
</p>
Demo: https://repl.it/#glhr/55619920-fixing-uls
First, you have to decide which parser you are going to use. Different parsers treat malformed html differently.
The following BeautifulSoup methods will help you accomplish what you require
new_tag() - create a new ul tag
append() - To append the newly created ul tag somewhere in the soup tree.
extract() - To extract the li tags one by one (which we can append to the ul tag)
decompose() - To remove any unwanted tags from the tree. Which may be formed as a result of the parser's interpretation of the malformed html.
My Solution
Let's create a soup object using html5lib parser and see what we get
from bs4 import BeautifulSoup

html="""
<p><br><p>
<li>stuff</li>
<li>stuff</li>
"""
# html5lib repairs the malformed markup while building the tree;
# printing the soup shows exactly how the parser normalised it.
parsed = BeautifulSoup(html, 'html5lib')
print(parsed)
Outputs:
<html><head></head><body><p><br/></p><p>
</p><li>stuff</li>
<li>stuff</li>
</body></html>
The next step may vary according to what you want to accomplish. I want to remove the second empty p. Add a new ul tag and get all the li tags inside it.
from bs4 import BeautifulSoup

html="""
<p><br><p>
<li>stuff</li>
<li>stuff</li>
"""
soup=BeautifulSoup(html,'html5lib')
# html5lib closes the stray <p>, leaving a second, empty <p>; drop it.
second_p=soup.find_all('p')[1]
second_p.decompose()
# Create one <ul> and move every <li> into it.
ul_tag=soup.new_tag('ul')
soup.find('body').append(ul_tag)
for li_tag in soup.find_all('li'):
    ul_tag.append(li_tag.extract())
print(soup.prettify())
Outputs:
<html>
<head>
</head>
<body>
<p>
<br/>
</p>
<ul>
<li>
stuff
</li>
<li>
stuff
</li>
</ul>
</body>
</html>

How to select only divs with specific children span with xpath python

I am currently trying to scrape information from a particular e-commerce site, and I only want to get product information like product name, price, color and sizes of only products whose prices have been slashed.
I am currently using XPath.
this is my python scraping code
from lxml import html
import requests


class CategoryCrawler(object):
    """Fetch a category page and print the product names found on it."""

    def __init__(self, starting_url):
        self.starting_url = starting_url
        self.items = set()

    def __str__(self):
        # __str__ must return a str; returning a tuple raises TypeError.
        return 'All Items: {}'.format(self.items)

    def crawl(self):
        self.get_item_from_link(self.starting_url)
        return

    def get_item_from_link(self, link):
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        # XPath selects attributes with '@' ('#' is not valid XPath and
        # makes lxml raise XPathEvalError).
        names = tree.xpath('//span[@class="name"][@dir="ltr"]/text()')
        print(names)
Note this is not the original URL
# Build the crawler for the category page and run it once.
crawler = CategoryCrawler('https://www.myfavoriteecommercesite.com/')
crawler.crawl()
When the program is Run ... These are the HTML Content Gotten from the E-commerce Site
Div of Products With Price Slash
div class="products-info">
<h2 class="title"><span class="brand ">Apple </span> <span class="name" dir="ltr">IPhone X 5.8-Inch HD (3GB,64GB ROM) IOS 11, 12MP + 7MP 4G Smartphone - Silver</span></h2>
<div class="price-container clearfix">
<span class="sale-flag-percent">-22%</span>
<span class="price-box ri">
<span class="price ">
<span data-currency-iso="NGN">₦</span>
<span dir="ltr" data-price="388990">388,990</span>
</span>
<span class="price -old ">
<span data-currency-iso="NGN">₦</span>
<span dir="ltr" data-price="500000">500,000</span>
</span>
</span>
</div>
</div>
Div of Products with No Price Slash
div class="products-info">
<h2 class="title"><span class="brand ">Apple </span> <span class="name" dir="ltr">IPhone X 5.8-Inch HD (3GB,64GB ROM) IOS 11, 12MP + 7MP 4G Smartphone - Silver</span></h2>
<div class="price-container clearfix">
<span class="price-box ri">
<span class="price ">
<span data-currency-iso="NGN">₦</span>
<span dir="ltr" data-price="388990">388,990</span>
</span>
</span>
</div>
</div>
Now this is my exact Question
I want to know how to select only the parent divs, i.e.
div class="price-container clearfix"> that also contains any of these children span classes
span class="price -old "> or
span class="sale-flag-percent">
Thank you all
One solution would be to get all <div class="price-container clearfix"> elements and iterate, checking against the string of the whole element that your keywords exist.
But a better solution would be to use conditionals with xpath:
from lxml import html

htmlst = 'your html'
tree=html.fromstring(htmlst)
# Attribute tests use '@' in XPath ('#' is invalid syntax). The predicate
# keeps only price containers that also contain an old-price or
# sale-flag <span> somewhere beneath them.
divs = tree.xpath('//div[@class="price-container clearfix" and .//span[@class = "price -old " or @class = "sale-flag-percent"] ]')
print(divs)
This get all divs where class="price-container clearfix" and then check if contains span with the searched classes.

How to get data form website using BeautifulSoup Python?

I have a problem to get the data from some page. This is part of my code:
# Each result is one listing; pull the <p class="size16"> that holds the
# street and phone number.
for result in results:
    street = result.find('p', attrs={'class':'size16'}).text
    # NOTE(review): (street) is just a parenthesised string, not a
    # 1-tuple — a trailing comma would be needed for that. Confirm which
    # was intended.
    records.append((street))
    print (street)
Website:
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">City</span>
<p class="small mb20"> </p>
<p class="size16">street 98<br>phone. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>open</strong></div>
<div class="mb20 size16">Mon.-Fr. 07.30-15.30</div>
<div class="mb15 ">
Result of my code:
ul. Bema 2phone. (32) 745 72 66-69 Wroclaw None
ul. 1 Maja 22/Vphone. 537-943-969 Olawa <p class="small mb20 colorgreen">Placowka partnerska</p>
I would like to separate or delete the text after a "br" tag. I need only 'street'
<p class="size16">street 98<br>phone. 22 721-56-70</p>
Can You help me?
Use previous_sibling like this:
from bs4 import BeautifulSoup

html = """
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">Bronisze</span>
<p class="small mb20"> </p>
<p class="size16">Poznańska 98<br>tel. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>Godziny otwarcia</strong></div>
<div class="mb20 size16">Pn.-Pt. 07.30-15.30</div>
<div class="mb15 ">
"""
# Street and phone share one <p>, separated by a <br>; the node directly
# before the first <br> is therefore the street text.
result = BeautifulSoup(html, "lxml")
line_break = result.find('br')
print(line_break.previous_sibling)
Or if you want to narrow it down a bit:
# Narrow to the <p class="size16"> first, then take the node just before
# its <br> — the street — ignoring the phone number after the <br>.
street = result.find('p', attrs={'class':'size16'}).find('br').previous_sibling
print (street)
Outputs (in both cases)
Poznańska 98
From the documentation https://www.crummy.com/software/BeautifulSoup/bs4/doc/
.next_sibling and .previous_sibling
You can use .next_sibling and .previous_sibling to navigate between page elements that are on the same level of the parse tree:
from bs4 import BeautifulSoup

html = """
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">Bronisze</span>
<p class="small mb20"> </p>
<p class="size16">Poznańska 98<br>tel. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>Godziny otwarcia</strong></div>
<div class="mb20 size16">Pn.-Pt. 07.30-15.30</div>
<div class="mb15 ">
"""
soup=BeautifulSoup(html, "lxml")
# Descend div -> div.mb15 -> p.size16, then walk backwards from the <br>
# so only the text before it (the street) is printed.
for html_tag_div in soup.find_all('div', class_ = "media-body pt5 pb10"):
    for html_tag_div_1 in html_tag_div.find_all('div', class_ = "mb15"):
        for html_tag_2 in html_tag_div_1.find_all("p", class_ = "size16"):
            for html_tag_3 in html_tag_2.find("br").previous_siblings:
                print(html_tag_3.get_text())

Resources