Scraping incidents from a table on a website - python-3.x

I am trying to extract a table into pandas from a website that is automatically updated on a regular basis. I tried:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
website = 'http://www.dallasfirerescue.com/active_incidents.html'
req = Request(website)
abc = urlopen(req)
raw = abc.read().decode("utf-8")
page = raw.replace('<!-->', '')
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table")
print (table)
It gives me None

Your link didn't work for me, but here is a great example of how to download data from an HTML table into Python.
# import libraries
import requests
from bs4 import BeautifulSoup
# query the website and return the html to the variable 'page'
page = requests.get("https://www.aucklandairport.co.nz/flights").text
soup = BeautifulSoup(page, "html.parser")
tbody = soup.find('tbody')
rows = tbody.findAll('tr', {'class': 'flight-toggle'})  # find tr whose class == flight-toggle
for tr in rows:
    cols = tr.findAll('td', class_=lambda x: x != 'logo')  # find td whose class != logo (exclude the first td)
    dv0 = cols[0].find('div').findAll('div')  # flight, carrier, origin under second td
    flight, carrier, origin = [c.text.strip() for c in dv0]
    dv1 = cols[1].find('div').findAll('div')  # date, scheduled under third td
    date, scheduled = [c.text.strip() for c in dv1]
    dv2 = cols[2].find('div').findAll('div')  # estimated, status under fourth td
    estimated, status = [c.text.strip() for c in dv2[1:]]  # exclude the first div
    print(flight, carrier, origin, date, scheduled, estimated, status)
See the links below for more info.
http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
https://pythonprogramminglanguage.com/web-scraping-with-pandas-and-beautifulsoup/
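Since the original question wanted the table in pandas, it's worth noting that for tables present in the static HTML, pandas.read_html is often the shortest route. A minimal sketch (this won't help with the Dallas page, whose table is built by JavaScript):
import pandas as pd
# read_html returns a list of DataFrames, one per <table> in the page;
# it requires lxml or html5lib to be installed and only sees static HTML
tables = pd.read_html("https://www.aucklandairport.co.nz/flights")
if tables:
    print(tables[0].head())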

The content of that page is generated dynamically, so you can't grab it by making a plain HTTP request; you need a browser simulator instead. Here is how you can achieve that. I used selenium in this case:
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get('http://www.dallasfirerescue.com/active_incidents.html')
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find(class_="CSVTable")
for tr in table.find_all("tr"):
    data = [item.text.strip() for item in tr.find_all("td")]
    print(data)
driver.quit()
When you execute the above script, the data from the table on that webpage will be printed row by row.
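Since the question asked for the table in pandas, here is a sketch of collecting the same rows into a DataFrame instead of printing them, assuming the CSVTable class and row layout from the answer above:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('http://www.dallasfirerescue.com/active_incidents.html')
soup = BeautifulSoup(driver.page_source, "lxml")
driver.quit()
# collect each row's cell texts; drop rows with no <td> (e.g. header rows)
rows = [[td.text.strip() for td in tr.find_all("td")]
        for tr in soup.find(class_="CSVTable").find_all("tr")]
df = pd.DataFrame([r for r in rows if r])
print(df)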

Related

How Can I Assign A Variable To All Of The Items In A List?

I'm following a guide that prints the first item from an HTML document that contains the dollar sign.
It seems to work, outputting a price to the terminal that is actually present on the webpage. However, I don't want just that single listing; I want to print all of the listings to the terminal.
I'm almost positive you could do this with a for loop, but I don't know how to set it up correctly. Here's the code I have so far, with a comment on line 14 and the code I'm asking about on line 15.
from bs4 import BeautifulSoup
import requests
import os
os.system("clear")
url = 'https://www.newegg.com/p/pl?d=RTX+3080'
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text="$")
#Print all prices instead of just the specified number?
parent = prices[0].parent
strong = parent.find("strong")
print(strong.string)
You could try the following:
from bs4 import BeautifulSoup
import requests
import os
os.system("clear")
url = 'https://www.newegg.com/p/pl?d=RTX+3080'
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text="$")
for price in prices:
    parent = price.parent
    strong = parent.find("strong")
    print(strong.string)
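One caveat: not every "$" text node necessarily has a <strong> tag next to it; when parent.find("strong") returns None, the print raises an AttributeError. A slightly more defensive version of the same loop:
for price in prices:
    strong = price.parent.find("strong")
    # skip matches without a <strong> element holding the amount
    if strong is not None:
        print(strong.string)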

can't scrape a value with BeautifulSoup in python

I am creating a website where I display the current wind. When I go to https://www.windguru.cz/station/219 (and inspect the element at max:{wind}) I can see this:
<span class="wgs_wind_max_value">12</span>
The 12 is the value I need, but when I try to scrape it with bs4 and requests, this appears as the output:
<span class="wgs_wind_max_value"></span>
As you can see, there is no '12' value.
Can someone help me with that?
from bs4 import BeautifulSoup
import requests
page = requests.get('https://www.windguru.cz/3323')
soup = BeautifulSoup(page.content, "lxml")
table = soup.find_all("span", {"class": "wgs_wind_max_value"})
print(table)
Use the same API the page itself uses to get the JSON that populates those values. Notice the querystring construction passed to the API.
import requests
headers = {'Referer' : 'https://www.windguru.cz/station/219'}
r = requests.get('https://www.windguru.cz/int/iapi.php?q=station_data_current&id_station=219&date_format=Y-m-d%20H%3Ai%3As%20T&_mha=f4d18b6c', headers = headers).json()
print(r)
print(r['wind_max'])
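The same request can be written with a params dict, which makes the querystring pieces easier to read and adjust; the values below are taken verbatim from the URL above:
import requests
headers = {'Referer': 'https://www.windguru.cz/station/219'}
params = {
    'q': 'station_data_current',
    'id_station': 219,
    'date_format': 'Y-m-d H:i:s T',  # requests URL-encodes this for us
    '_mha': 'f4d18b6c',
}
r = requests.get('https://www.windguru.cz/int/iapi.php', params=params, headers=headers).json()
print(r['wind_max'])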

Scraping wikipedia infobox geography vcard

I have been trying to scrape the data from the Website section of the various cities' vcard tables on Wikipedia, but somehow I get the results for the Coordinates section, which is located at the beginning of the table.
I have tried specifying "Website" while selecting the specific tags in the table.
def getAdditionalInfo(url):
    try:
        # PageContent is a helper defined elsewhere in the question's code
        city_page = PageContent('https://en.wikipedia.org' + url)
        table = city_page.find('table', {'class': 'infobox geography vcard'})
        additional_details = []
        read_content = False
        for tr in table.find_all('tr'):
            if tr.get('class') == ['mergedtoprow'] and not read_content:
                link = tr.find('th')
                if link and (link.get_text().strip() == 'Website'):
                    read_content = True
            elif ((tr.get('class') == ['mergedbottomrow']) or tr.get('class') == ['mergedrow'] and read_content):
                additional_details.append(tr.find('td').get_text().strip('\n'))
        return additional_details
    except Exception as error:
        print('Error occurred: {}'.format(error))
        return []
I want to append this data as a new column showing the website link for each city's official page, which I would be getting from this function.
With bs4 4.7.1 you can use :contains to target the table header Website, then get the href of the a tag in the adjacent td (newer soupsieve versions spell this selector :-soup-contains). Clearly there are other cases where this pattern could match, so perhaps some other form of validation is required on the input values.
You could add an additional class selector for the vcard if you wish: result = soup.select_one('.vcard th:contains(Website) + td > [href]')
import requests
from bs4 import BeautifulSoup as bs
cities = ['Paris', 'Frankfurt', 'London']
base = 'https://en.wikipedia.org/wiki/'
with requests.Session() as s:
    for city in cities:
        r = s.get(base + city)
        soup = bs(r.content, 'lxml')
        result = soup.select_one('th:contains(Website) + td > [href]')
        if result is None:
            print(city, 'selector failed to find url')
        else:
            print(city, result['href'])
If I understand the problem correctly, you want to extract the official URL of each city from Wikipedia:
import requests
from bs4 import BeautifulSoup
def getAdditionalInfo(url):
    soup = BeautifulSoup(requests.get('https://en.wikipedia.org' + url).text, 'lxml')
    for th in soup.select('.vcard th'):
        if not th.text.lower() == 'website':
            continue
        yield th.parent.select_one('td').text
cities = ['/wiki/Paris', '/wiki/London', '/wiki/Madrid']
for city in cities:
    for info in getAdditionalInfo(city):
        print(f'{city}: {info}')
This prints:
/wiki/Paris: www.paris.fr
/wiki/London: london.gov.uk
/wiki/Madrid: www.madrid.es

Beautiful Soup Error: Trying to retrieve data from web page returns empty array

I am trying to download a list of voting intention opinion polls from this web page using Beautiful Soup. However, the code I wrote returns an empty array or nothing. The code I used is below:
The page code is like this:
<div class="ST-c2-dv1 ST-ch ST-PS" style="width:33px"></div>
<div class="ST-c2-dv2">41.8</div>
That's what I tried:
import requests
from bs4 import BeautifulSoup
request = requests.get(quote_page)  # quote_page holds the poll page's URL (link omitted in the question)
page = request.content  # extract page content
soup = BeautifulSoup(page, "html.parser")
# extract all the divs
for each_div in soup.findAll('div', {'class': 'ST-c2-dv2'}):
    print(each_div)
At this point, it prints nothing.
I've tried also this:
tutti_a = soup.find_all("html_element", class_="ST-c2-dv2")
and also:
tutti_a = soup.find_all("div", class_="ST-c2-dv2")
But I get an empty array [] or nothing at all
I think you can use the following URL:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
r = requests.get('https://www.marktest.com/wap/a/sf/v~[73D5799E1B0E]/name~Dossier_5fSondagensLegislativas_5f2011.HighCharts.Sondagens.xml.aspx')
soup = bs(r.content, 'lxml')
results = []
for record in soup.select('p'):
    results.append([item.text for item in record.select('b')])
df = pd.DataFrame(results)
print(df)
Columns 5, 6, 7, 8, 9, and 10 correspond to PS, PSD, CDS, CDU, Bloco, and Outros/Brancos/Nulos.
You can drop unwanted columns, add appropriate headers, etc.
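For example, a sketch of that cleanup, assuming the column positions quoted above hold (verify them against the actual DataFrame before relying on this):
# keep only the party columns and give them readable names
party_cols = {5: 'PS', 6: 'PSD', 7: 'CDS', 8: 'CDU', 9: 'Bloco', 10: 'Outros/Brancos/Nulos'}
df = df[list(party_cols)].rename(columns=party_cols)
print(df.head())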

Web Scraping reviews - Flipkart

I am trying to extract the entire review of a product (the remaining half of each review is displayed only after clicking READ MORE), but I am still not able to do so: the code does not display the entire content of a review, which only gets displayed after clicking the READ MORE option. Below is the code, which clicks the READ MORE option and also gets data from the website.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# note: `data` is parsed from the plain requests response, so the click
# performed below through selenium never affects what gets printed
response = requests.get("https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page=2&pid=MOBF85V7A6PXETAX")
data = BeautifulSoup(response.content, 'lxml')
chromepath = r"C:\Users\Mohammed\Downloads\chromedriver.exe"
driver = webdriver.Chrome(chromepath)
driver.get("https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page=2&pid=MOBF85V7A6PXETAX")
d = driver.find_element_by_class_name("_1EPkIx")
d.click()
title = data.find_all("p", {"class": "_2xg6Ul"})
text1 = data.find_all("div", {"class": "qwjRop"})
name = data.find_all("p", {"class": "_3LYOAd _3sxSiS"})
for t2, t, t1 in zip(title, text1, name):
    print(t2.text, '\n', t.text, '\n', t1.text)
To get the full reviews, it is necessary to click on those READ MORE buttons to unwrap the rest. As you have already used selenium in combination with BeautifulSoup, I've modified the script to follow that logic. The script will first click on those READ MORE buttons; once that is done, it will parse all the titles and reviews from there. You can now get the titles and reviews from multiple pages (up to 4 pages).
import time
from bs4 import BeautifulSoup
from selenium import webdriver
link = "https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page={}&pid=MOBF85V7A6PXETAX"
driver = webdriver.Chrome()  # if necessary, define the chrome path explicitly
for page_num in range(1, 5):
    driver.get(link.format(page_num))
    [item.click() for item in driver.find_elements_by_class_name("_1EPkIx")]
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for items in soup.select("._3DCdKt"):
        title = items.select_one("p._2xg6Ul").text
        review = ' '.join(items.select_one(".qwjRop div:nth-of-type(2)").text.split())
        print(f'{title}\n{review}\n')
driver.quit()
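A robustness note: the fixed time.sleep(1) is just a guess at how long the expanded reviews take to render. A sketch of the same waiting step using selenium's explicit waits (the class name is the same assumption as in the script above and may have changed on the live site):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# wait up to 10 seconds for the review containers to be present,
# instead of sleeping for a fixed interval
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "._3DCdKt")))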
