Extracting Data from HTML Span using Beautiful Soup - python-3.x

I want to extract "1.02 Crores" and "7864" from the HTML code below and save them in different columns of a CSV file.
Code:
<div class="featuresvap _graybox clearfix"><h3><span><i class="icon-inr"></i>1.02 Crores</span><small> # <i class="icon-inr"></i><b>7864/sq.ft</b> as per carpet area</small></h3>

I'm not sure about the actual data, but this is something I threw together quickly. If you need to fetch the page from a website, add import requests, set url = 'yourwebpagehere' and page = requests.get(url), and change the soup line to soup = BeautifulSoup(page.text, 'lxml'). The html variable is then unnecessary and can be removed.
from bs4 import BeautifulSoup
import csv

# Sample markup; replace with the downloaded page text when scraping a site.
html = '<div class="featuresvap _graybox clearfix"><h3><span><i class="icon-inr"></i>1.02 Crores</span><small> # <i class="icon-inr"></i><b>7864/sq.ft</b> as per carpet area</small></h3>'
soup = BeautifulSoup(html, 'lxml')

# The first <span> holds the price; the first <b> holds the per-sq.ft rate.
findSpan = soup.find('span')
findB = soup.find('b')

# Reduce both tags to plain text and drop the '/sq.ft' unit suffix.
price = findSpan.text
rate = findB.text.replace('/sq.ft', '')
print([price, rate])

# newline='' lets the csv module control line endings (avoids blank rows
# between records on Windows).
with open('NAMEYOURFILE.csv', 'w+', newline='') as writer:
    csv_writer = csv.writer(writer)
    csv_writer.writerow(["First Column Name", "Second Column Name"])
    # Write the extracted text, not the Tag objects: csv would otherwise
    # stringify the tags and store raw HTML markup in the cells.
    csv_writer.writerow([price, rate])

The code below is self-explanatory (see the comments):
from bs4 import BeautifulSoup

# data for first column
firstCol = []
# data for second column
secondCol = []
# listURL is expected to be defined beforehand: the pages to scrape.
for url in listURL:
    html = '.....'  # downloaded html
    soup = BeautifulSoup(html, 'html.parser')
    # 'select_one' select using CSS selectors, return only first element
    fCol = soup.select_one('.featuresvap h3 span')
    # remove: <i class="icon-inr"></i>
    # (must be looked up on fCol, the span selected above)
    icon = fCol.find("i")
    if icon is not None:
        icon.extract()
    sCol = soup.select_one('.featuresvap h3 b')
    firstCol.append(fCol.text)
    secondCol.append(sCol.text.replace('/sq.ft', ''))

# One line per column list: all first-column values, then all second-column
# values, each joined with commas.
with open('results.csv', 'w') as fl:
    csvContent = ','.join(firstCol) + '\n' + ','.join(secondCol)
    fl.write(csvContent)
''' sample results
1.02 Crores | 2.34 Crores
7864 | 2475
'''
print('finish')

Related

How can I get BeautifulSoup info in the same row?

I am currently web scraping and would like to get the specifications on the same row. When I currently print it column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row like this
text text text text text
so i can later chop it up into different columns in Excel later.
Is there maybe a transposing command I could use or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv

# The file must stay open while data() runs, so the function definition and
# the driver loop both live inside the with-block.
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        """Scrape one listing page and write a CSV row per detail block.

        page_number is appended to the listing URL; every ad found on that
        page is then fetched individually.
        """
        URL = 'https://www.url.com/' + str(
            page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            # The <a> element's id attribute is used as the ad's URL path.
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/'+str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                # specs.text is written as-is, including its line breaks.
                skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})

    for x in range(1, 2):
        data(x)
print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.
Your specs.text is a string that contains \n newlines. You can split it on whitespace and then join it back with a single space, i.e. ' '.join(specs.text.split()).
import requests
from bs4 import BeautifulSoup
import csv

# The file must stay open while data() runs, so the function definition and
# the driver loop both live inside the with-block.
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        """Scrape one listing page and write a single-line CSV row per block.

        page_number is appended to the listing URL; every ad found on that
        page is then fetched individually.
        """
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            # The <a> element's id attribute is used as the ad's URL path.
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/'+str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                # split() with no argument breaks on any whitespace run
                # (including '\n'); joining with ' ' yields one flat line.
                address = ' '.join(address.text.split())
                specs = ' '.join(specs.text.split()) #<-- changed here
                skriver.writerow({'column1': address, 'column2': specs})

    for x in range(1, 2):
        data(x)
print('Ferdig, du kan åpne oslo.csv')

How to print item in a list without a for loop

I'm trying to just get the price off a website and found that "class="udYkAW2UrhZln2Iv62EYb" " gave me the price in one line. but when I try to print it out I keep getting
<span class="udYkAW2UrhZln2Iv62EYb">$0.312423</span>
and not just the price itself. I fixed this by using a for loop to get me item, but is there a way to just display the price with a print function without a for loop?
Please and thank you.
Here's the code
from bs4 import BeautifulSoup as bs
import requests
url = 'https://robinhood.com/crypto/DOGE'
r = requests.get(url)
#make to soup
soup = bs(r.content, 'lxml')
#where the price of the search was found "span class='udYkAW2UrhZln2Iv62EYb'"
#Using find() because this is the first instance of this class
price_class = soup.find('span', {'class' : 'udYkAW2UrhZln2Iv62EYb'})
print(price_class)
type(price_class)
#output: <span class="udYkAW2UrhZln2Iv62EYb">$0.312423</span>
#output: bs4.element.Tag
# Iterating over the Tag yields its children, here the single text node.
for i in price_class:
    print(i)
#output: $0.312423
Use .text or .get_text():
# Same fetch-and-parse setup as in the question; only the final print differs.
from bs4 import BeautifulSoup as bs
import requests
url = "https://robinhood.com/crypto/DOGE"
r = requests.get(url)
soup = bs(r.content, "lxml")
# find() returns the first matching <span> as a Tag object.
price = soup.find("span", {"class": "udYkAW2UrhZln2Iv62EYb"})
# .text gives the tag's text content without the surrounding markup.
print(price.text) # <--- use .text
Prints:
$0.315917

How to extract both content and markup in a class?

I'm trying to extract the content marked by <div class="sense"> in abc. With ''.join(map(str, soup.select_one('.sense').contents)), I only get the content between markers, i.e xyz. To fulfill my job, I also need the full <div class="sense">xyz</div>
from bs4 import BeautifulSoup
abc = """abcdd<div class="sense">xyz</div>"""
soup = BeautifulSoup(abc, 'html.parser')
# .contents holds only the children of the selected <div>, so joining them
# produces the inner content without the enclosing <div> tag itself.
content1 = ''.join(map(str, soup.select_one('.sense').contents))
print(content1)
and the result is xyz. Could you please elaborate on how to achieve my goal?
Try:
from bs4 import BeautifulSoup
abc = """abcdd<div class="sense">xyz</div>"""
soup = BeautifulSoup(abc, 'html.parser')
# Locate the first <div> whose class is "sense".
div = soup.find('div', attrs={'class': 'sense'})
# Printing the Tag itself renders the whole element, markup included.
print(div)
prints:
<div class="sense">xyz</div>

Getting Empty content Beautiful Soup Python

I am not very familiar with Beautiful Soup, and for the life of me I can't seem to retrieve the table in this HTML. I parsed the HTML page using Beautiful Soup and came up empty. Any help will be appreciated. Thanks!
url='https://definitivehc.maps.arcgis.com/home/item.html?id=1044bb19da8d4dbfb6a96eb1b4ebf629&view=list&showFilters=false#data'
response = requests.get(url, timeout=10)
bs4 = BeautifulSoup(response.content, 'lxml')
# Only the first <table> in the downloaded markup is inspected.
table_body=bs4.find('table')
rows = table_body.find_all('tr')
for row in rows:
    # Header cells only: collect the stripped text of every <th> in the row.
    cols=row.find_all('th')
    cols=[x.text.strip() for x in cols]
    print(cols)
So i could generate the header for the table, but could not retrive the data from the table itself. Here is the html:
<table class="dgrid-row-table" role="presentation">
<tr>
<td class="dgrid-cell dgrid-cell-padding dgrid-column-0 field-HOSPITAL_NAME"
role="gridcell"><div>**Phoenix VA Health Care System (AKA Carl T Hayden VA
Medical Center)**</div>
</td>
:
:
<td....................</td>
<td....................</td>
<td....................</td>
<td....................</td>
...and there are several other TDs. Am trying to capture all the values from the table. Here is my attempt so far:
url='https://definitivehc.maps.arcgis.com/home/item.html?id=1044bb19da8d4dbfb6a96eb1b4ebf629&view=list&showFilters=false#data'
browser = webdriver.Chrome(r"C:\Users\lab\chromedriver")
browser.get(url)
# Fixed wait before reading the page source from the browser.
time.sleep(15)
html = browser.page_source
soup = Soup(html, "html")
table_body=soup.find('table', {'class': 'dgrid-row-table', 'role': 'presentation'})
rows = table_body.find_all('tr')
for row in rows:
    # Data cells: collect the stripped text of every <td> in the row.
    cols=row.find_all('td')
    cols=[x.text.strip() for x in cols]
    print(cols)
The column generates nothing when i run it. Thanks.
Using selenium:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
url = "https://definitivehc.maps.arcgis.com/home/item.html?id=1044bb19da8d4dbfb6a96eb1b4ebf629&view=list&showFilters=false#data"
# Drive a real Chrome instance so the source is read from the browser
# rather than from a plain HTTP response.
browser = webdriver.Chrome('/usr/bin/chromedriver')
browser.get(url)
# Fixed 15-second wait before grabbing the source
# (presumably to let the page finish rendering - adjust as needed).
time.sleep(15)
html = browser.page_source
soup = BeautifulSoup(html, "html")
# Report how many <table> elements are present, then show the header table.
print(len(soup.find_all("table")))
print(soup.find("table", {"id": "dgrid_0-header"}))
# Close the window and shut the driver down.
browser.close()
browser.quit()

Beautifullsoup get text in tag

I am trying to get data from Yellow Pages, but I need only the numbered plumbers. I can't get the number text in h2 class='n'. I can get the class="business-name" text, but I need only the numbered plumbers, not the ones from advertisements. What is my mistake? Thank you very much.
This html :
<div class="info">
<h2 class="n">1. <a class="business-name" href="/austin-tx/mip/johnny-rooter-11404675?lid=171372530" rel="" data-impressed="1"><span>Johnny Rooter</span></a></h2>
</div>
And this is my python code:
import requests
from bs4 import BeautifulSoup as bs
url = "https://www.yellowpages.com/austin-tx/plumbers"
req = requests.get(url)
data = req.content
soup = bs(data, "lxml")
links = soup.findAll("div", {"class": "info"})
for link in links:
    # Walk the direct children of each div.info block.
    for content in link.contents:
        # Any child for which the lookup fails is silently skipped.
        try:
            print(content.find("h2", {"class": "n"}).text)
        except:
            pass
You need a different class selector to limit to that section
import requests
from bs4 import BeautifulSoup as bs
url = "https://www.yellowpages.com/austin-tx/plumbers"
req = requests.get(url)
data = req.content
soup = bs(data, "lxml")
# '.organic h2' selects every <h2> under an element with class "organic";
# '\xa0' (non-breaking space) is stripped from each heading's text.
links = [item.text.replace('\xa0','') for item in soup.select('.organic h2')]
print(links)
.organic is a single class selector, from a compound class, for a parent element which restricts to all the numbered plumbers. Observe how the highlighting starts after the ads:
Output:

Resources