I'm trying to grab the last table (titled "Registro de los casos") on this Wikipedia page with this Python 3.7 code:
import requests
from bs4 import BeautifulSoup, NavigableString, Tag

def webcrawler():
    url = "https://es.wikipedia.org/wiki/Pandemia_de_enfermedad_por_coronavirus_de_2020_en_Argentina"#Cronolog%C3%ADa"
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    tables = soup.findAll("table", class_='wikitable')[0]
    #print(tables)
    for table in tables:
        if isinstance(table, NavigableString):
            continue
        ths = table.find_all('th')
        headings = [th.text.strip() for th in ths]
        print(headings)

webcrawler()
But it only finds the first table, and not the last. What am I doing wrong?
You set tables to the first item returned by soup.findAll("table", class_='wikitable')[0]. If you take out the [0], all tables with that class are written to the tables variable.
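For example, a minimal sketch of grabbing the last wikitable on the page and printing its headers (assuming the page layout hasn't changed) could look like this:

import requests
from bs4 import BeautifulSoup

url = "https://es.wikipedia.org/wiki/Pandemia_de_enfermedad_por_coronavirus_de_2020_en_Argentina"
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

tables = soup.find_all("table", class_="wikitable")  # all wikitables, not just the first
last_table = tables[-1]                              # the last one on the page
headings = [th.text.strip() for th in last_table.find_all("th")]
print(headings)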
So for a project, I'm working on creating an API to interface with my school's course-finder, and I'm struggling to grab the data from an HTML table without using Selenium. I was able to pull the HTML data initially using Selenium, but my instructor says he would prefer I use the BeautifulSoup4 and MechanicalSoup libraries. I got as far as submitting a search and grabbing the HTML table the data is stored in, but I'm not sure how to iterate through that table as I did with my Selenium code below.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options

Chrome_Options = Options()
Chrome_Options.add_argument("--headless")  # allows the program to run without opening a Chrome window

driver = webdriver.Chrome()
driver.get("https://winnet.wartburg.edu/coursefinder/")  # sets the Selenium driver

select = Select(driver.find_element_by_id("ctl00_ContentPlaceHolder1_FormView1_DropDownList_Term"))
term_options = select.options
#for index in range(0, len(term_options) - 1):
#    select.select_by_index(index)

lst = []

DeptSelect = Select(driver.find_element_by_id("ctl00_ContentPlaceHolder1_FormView1_DropDownList_Department"))
DeptSelect.select_by_visible_text("History")  # finds the desired department

search = driver.find_element_by_name("ctl00$ContentPlaceHolder1$FormView1$Button_FindNow")
search.click()  # sends the query

table_id = driver.find_element_by_id("ctl00_ContentPlaceHolder1_GridView1")
rows = table_id.find_elements_by_tag_name("tr")
for row in rows:  # creates a list of lists containing our data
    col_lst = []
    col = row.find_elements_by_tag_name("td")
    for data in col:
        lst.append(data.text)

def chunk(l, n):  # generator that partitions our list neatly
    print("chunking...")
    for i in range(0, len(l), n):
        yield l[i:i + n]

n = 16  # each list contains 16 items regardless of contents or search
uberlist = list(chunk(lst, n))  # call chunk fn to partition the list

with open('class_data.txt', 'w') as handler:  # output of the scraped data
    print("writing file...")
    for listitem in uberlist:
        handler.write('%s\n' % listitem)

driver.close()  # ends and closes Selenium control over the browser
This is my soup code, and I'm wondering how I can take the data from the HTML in a similar way to what I did above with Selenium.
import mechanicalsoup
import requests
from lxml import html
from lxml import etree
import pandas as pd

def text(elt):
    return elt.text_content().replace(u'\xa0', u' ')

#This will use MechanicalSoup to grab the form, submit it and find the data table
browser = mechanicalsoup.StatefulBrowser()
winnet = "http://winnet.wartburg.edu/coursefinder/"
browser.open(winnet)

Searchform = browser.select_form()
Searchform.choose_submit('ctl00$ContentPlaceHolder1$FormView1$Button_FindNow')
response1 = browser.submit_selected()  # this progresses to the second form

dataURL = browser.get_url()  # get URL of the second form w/ data
dataURL2 = 'https://winnet.wartburg.edu/coursefinder/Results.aspx'
pageContent = requests.get(dataURL2)
tree = html.fromstring(pageContent.content)

dataTable = tree.xpath('//*[@id="ctl00_ContentPlaceHolder1_GridView1"]')

rows = []  # initialize a collection of rows
for row in dataTable[0].xpath(".//tr")[1:]:  # add new rows to the collection
    rows.append([cell.text_content().strip() for cell in row.xpath(".//td")])

df = pd.DataFrame(rows)  # load the collection into a dataframe
print(df)

#XPath to the table
#//*[@id="ctl00_ContentPlaceHolder1_GridView1"]
#//*[@id="ctl00_ContentPlaceHolder1_GridView1"]/tbody
Turns out I was passing the wrong thing when using MechanicalSoup. I passed the new page's contents to a variable called table and used .find('table') on the page to retrieve the table HTML rather than the full page's HTML. From there I just used table.get_text().split('\n') to make essentially a giant list of all of the rows.
I also dabbled with setting form filters, which worked as well.
import mechanicalsoup
from bs4 import BeautifulSoup

#Sets the StatefulBrowser object to winnet, then grabs the form
browser = mechanicalsoup.StatefulBrowser()
winnet = "http://winnet.wartburg.edu/coursefinder/"
browser.open(winnet)
Searchform = browser.select_form()

#Selects the submit button and has the filter options listed.
Searchform.choose_submit('ctl00$ContentPlaceHolder1$FormView1$Button_FindNow')
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$TextBox_keyword', "")  # Keyword searches by class title. Inputting a string will search by that string, ignoring any stored nonsense in the page.
#ACxxx course codes have 3 spaces after them, THIS IS REQUIRED. Except the 'All' value for not searching by a department does not.
Searchform.set("ctl00$ContentPlaceHolder1$FormView1$DropDownList_Department", 'All')  # The department list takes the course codes as inputs and displays the full names
Searchform.set("ctl00$ContentPlaceHolder1$FormView1$DropDownList_Term", "2020 Winter Term")  # The term dropdown takes a string value. The string is exactly the term date.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_MeetingTime', 'all')  # Takes the weekly class time as a string. Need to retrieve the list of options from the page.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_EssentialEd', 'none')  # Takes a small string signalling the EE req, or 'all' or 'none'. 'none' doesn't select an option and 'all' selects all courses w/ an EE
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_CulturalDiversity', 'none')  # Cultural Diversity; takes none, C, D or all
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_WritingIntensive', 'none')  # options are none or WI
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_PassFail', 'none')  # Pass/Fail takes 'none' or 'PF'
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$CheckBox_OpenCourses', False)  # Checkbox, it's True or False
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_Instructor', '0')  # 0 is for none selected, otherwise it is a string of numbers (instructor ID?)

#Submits the page, grabs results, and then launches a browser for test purposes.
browser.submit_selected()  # Submits the form. Retrieves the results.
table = browser.get_current_page().find('table')  # Finds the result table
print(type(table))
rows = table.get_text().split('\n')  # List of all class rows split by \n.
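If you want to keep the individual cells separate instead of splitting the whole page text on newlines, here is a minimal sketch (assuming the result table still carries the GridView1 id used in the Selenium code above) that iterates the tr/td elements directly:

import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()
browser.open("http://winnet.wartburg.edu/coursefinder/")
form = browser.select_form()
form.choose_submit('ctl00$ContentPlaceHolder1$FormView1$Button_FindNow')
browser.submit_selected()

# the id is taken from the Selenium code above; adjust if the page changes
table = browser.get_current_page().find('table', id='ctl00_ContentPlaceHolder1_GridView1')
rows = []
for tr in table.find_all('tr')[1:]:  # skip the header row
    rows.append([td.get_text(strip=True) for td in tr.find_all('td')])
print(rows[:3])  # first few courses, each as a list of cell values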
I have written the code below attempting to practice web-scraping with Python, Pandas, etc. In general I have four steps I am trying to follow to achieve my desired output:
Get a list of names to append to a base url
Create a list of player specific urls
Use the player urls to scrape tables
Add the player name to the scraped table to keep track of which player belongs to which stats, i.e. in each row of the table add a column with the name of the player whose page was used to scrape the table
I was able to get #1 and #2 working. The components of #3 seem to work, but I believe I have something wrong with my try/except, because if I run just the line of code that scrapes a specific playerUrl, the tables DataFrame populates as expected. The first player scraped has no data, so I believe my error catching is failing.
For #4 I really haven't been able to find a solution. How do I add the name to the list as it is iterating in the for loop?
Any help is appreciated.
import requests
import pandas as pd
from bs4 import BeautifulSoup

### get the player data to create player specific urls
res = requests.get("https://www.mlssoccer.com/players?page=0")
soup = BeautifulSoup(res.content, 'html.parser')
data = soup.find('div', class_='item-list')
names = []
for player in data:
    name = data.find_all('div', class_='name')
    for obj in name:
        names.append(obj.find('a').text.lower().lstrip().rstrip().replace(' ', '-'))

### create a list of player specific urls
url = 'https://www.mlssoccer.com/players/'
playerUrl = []
x = 0
for name in (names):
    playerList = names
    newUrl = url + str(playerList[x])
    print("Gathering url..." + newUrl)
    playerUrl.append(newUrl)
    x += 1

### now take the list of urls and gather stats tables
tbls = []
i = 0
for url in (playerUrl):
    try:  ### added the try, except, pass because some players have no stats table
        tables = pd.read_html(playerUrl[i], header=0)[2]
        tbls.append(tables)
        i += 1
    except Exception:
        continue
There is a lot of redundancy in your script. You can clean it up as follows. I've used select() instead of find_all() to cut down on the verbosity. To get rid of that IndexError, you can make use of the continue keyword as I've shown below:
import requests
import pandas as pd
from bs4 import BeautifulSoup

base_url = "https://www.mlssoccer.com/players?page=0"
url = 'https://www.mlssoccer.com/players/'

res = requests.get(base_url)
soup = BeautifulSoup(res.text, 'lxml')
names = []
for player in soup.select('.item-list .name a'):
    names.append(player.get_text(strip=True).replace(" ", "-"))

playerUrl = {}
for name in names:
    playerUrl[name] = f'{url}{name}'

tbls = []
for url in playerUrl.values():
    if len(pd.read_html(url)) <= 2: continue
    tables = pd.read_html(url, header=0)[2]
    tbls.append(tables)

print(tbls)
You can do a couple of things to improve your code and get steps #3 and #4 done.
(i) When using the for name in names loop, there is no need to explicitly use the indexing, just use the variable name.
(ii) You can save the player's name and its corresponding URL as a dict, where the name is the key. Then in steps 3/4 you can use that name.
(iii) Construct a DataFrame for each parsed HTML table and just append the player's name to it. Save this data frame individually.
(iv) Finally concatenate these data frames to form a single one.
Here is your code modified with above suggested changes:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### get the player data to create player specific urls
res = requests.get("https://www.mlssoccer.com/players?page=0")
soup = BeautifulSoup(res.content, 'html.parser')
data = soup.find('div', class_='item-list')
names = []
for player in data:
    name = data.find_all('div', class_='name')
    for obj in name:
        names.append(obj.find('a').text.lower().lstrip().rstrip().replace(' ', '-'))

### create a dict of player specific urls keyed by player name
url = 'https://www.mlssoccer.com/players/'
playerUrl = {}
for name in names:
    newUrl = url + str(name)
    print("Gathering url..." + newUrl)
    playerUrl[name] = newUrl

### now take the urls and gather stats tables
tbls = []
for name, url in playerUrl.items():
    try:
        tables = pd.read_html(url, header=0)[2]
        df = pd.DataFrame(tables)
        df['Player'] = name
        tbls.append(df)
    except Exception as e:
        print(e)
        continue

result = pd.concat(tbls)
print(result.head())
I'm using BS4 to parse this webpage:
You'll notice there are two separate tables on the page. Here's the relevant snippet of my code, which successfully returns the data I want from the first table but does not find anything in the second table:
# import packages
import urllib3
import certifi
from bs4 import BeautifulSoup
import pandas as pd

#settings
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where())

gamelog_offense = []

#scrape the data and write the .csv files
url = "https://www.sports-reference.com/cfb/schools/florida/2018/gamelog/"
response = http.request('GET', url)
soup = BeautifulSoup(response.data, features="html.parser")

cnt = 0
for row in soup.findAll('tr'):
    try:
        col = row.findAll('td')
        Pass_cmp = col[4].get_text()
        Pass_att = col[5].get_text()
        gamelog_offense.append([Pass_cmp, Pass_att])
        cnt += 1
    except:
        pass

print("Finished writing with " + str(cnt) + " records")
Finished writing with 13 records
I've verified that the data from the SECOND table is contained within the soup (I can see it!). After lots of troubleshooting, I've discovered that the entire second table is completely contained within one big comment (why?). I've managed to extract this comment into a single comment object using the code below, but can't figure out what to do with it after that to extract the data I want. Ideally, I'd like to parse the comment in the same way I'm successfully parsing the first table. I've tried the ideas from similar Stack Overflow questions (selenium, phantomjs)...no luck.
import bs4

defense = soup.find(id="all_defense")
for item in defense.children:
    if isinstance(item, bs4.element.Comment):
        big_comment = item
print(big_comment)
<div class="table_outer_container">
<div class="overthrow table_container" id="div_defense">
...and so on....
Posting an answer here in case others find it helpful. Many thanks to @TomasCarvalho for directing me to a solution. I was able to pass the big comment as HTML into a second soup instance using the following code, and then just used the original parsing code on the new soup instance. (Note: the try/except is there because some of the teams have no gamelog, and you can't call .children on a NoneType.)
try:
    defense = soup.find(id="all_defense")
    for item in defense.children:
        if isinstance(item, bs4.element.Comment):
            html = item
    Dsoup = BeautifulSoup(html, features="html.parser")
except:
    html = ''
    Dsoup = BeautifulSoup(html, features="html.parser")
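As a shorter alternative sketch (assuming the same soup object as above), the comment's text can also be handed straight to pandas.read_html, since pandas will parse any table it finds in the string:

import bs4
import pandas as pd

defense = soup.find(id="all_defense")
# grab the first comment child, which holds the hidden table's HTML
comment = next(c for c in defense.children if isinstance(c, bs4.element.Comment))
df_defense = pd.read_html(str(comment))[0]  # first table inside the comment
print(df_defense.head())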
I am trying to extract a table into pandas from a website that is automatically updated on a regular basis. I tried:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
website = 'http://www.dallasfirerescue.com/active_incidents.html'
req = Request(website)
abc = urlopen(req)
raw = abc.read().decode("utf-8")
page = raw.replace('<!-->', '')
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table")
print (table)
It gives me None
Your link didn't work for me, but here is a great example of how to download data from an HTML table into Python.
# import libraries
import requests
from bs4 import BeautifulSoup

# query the website and return the html to the variable 'page'
page = requests.get("https://www.aucklandairport.co.nz/flights").text
soup = BeautifulSoup(page, "html.parser")

tbody = soup.find('tbody')
rows = tbody.findAll('tr', {'class': 'flight-toggle'})  # find tr whose class = flight-toggle
for tr in rows:
    cols = tr.findAll('td', class_=lambda x: x != 'logo')  # find td whose class != logo (exclude the first td)
    dv0 = cols[0].find('div').findAll('div')  # flight, carrier, origin under the second td
    flight, carrier, origin = [c.text.strip() for c in dv0]
    dv1 = cols[1].find('div').findAll('div')  # date, scheduled under the third td
    date, scheduled = [c.text.strip() for c in dv1]
    dv2 = cols[2].find('div').findAll('div')  # estimated, status under the fourth td
    estimated, status = [c.text.strip() for c in dv2[1:]]  # exclude the first div
    print(flight, carrier, origin, date, scheduled, estimated, status)
See the links below for more info.
http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
https://pythonprogramminglanguage.com/web-scraping-with-pandas-and-beautifulsoup/
The content of that page is generated dynamically, so you can't grab it with a plain HTTP request. You need to use a browser simulator instead. Here is how you can achieve that; I used Selenium in this case:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('http://www.dallasfirerescue.com/active_incidents.html')
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find(class_="CSVTable")
for tr in table.find_all("tr"):
    data = [item.text.strip() for item in tr.find_all("td")]
    print(data)
driver.quit()
When you execute the above script, the data from that webpage's table will be at your disposal.
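Since the original goal was to extract the table into pandas, a minimal sketch along the same lines (assuming the same CSVTable markup) could collect the rows into a DataFrame instead of printing them:

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
driver.get('http://www.dallasfirerescue.com/active_incidents.html')
soup = BeautifulSoup(driver.page_source, "lxml")
driver.quit()

table = soup.find(class_="CSVTable")
rows = [[td.text.strip() for td in tr.find_all("td")] for tr in table.find_all("tr")]
rows = [r for r in rows if r]  # drop rows that have no td cells
df = pd.DataFrame(rows)
print(df.head())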
I wish to parse the results table from a local sport event (the page basically just contains a table), but when I try with the script below I just get the "menu", not the actual result list. What am I missing?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
site = "https://rittresultater.no/nb/sb_tid/923?pv2=11027&pv1=U"
html = urlopen(site)
soup = BeautifulSoup(html, "lxml") #BeautifulSoup(urlopen(html, "lxml"))
table = soup.select("table")
df = pd.read_html(str(table))[0]
print(df)
This is happening because there are two <table>s on that page. You can either query on the class name of the table you want (in this case .table-condensed) using the class_ parameter of the find() function, or you can just grab the second table in the list of all tables using the find_all() function.
Solution 1:
table = soup.find('table', class_='table-condensed')
print(table)
Solution 2:
tables = soup.find_all('table')
print(tables[1])
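Either way, the selected table can then be handed to pd.read_html just like in the original script; a minimal sketch using Solution 1:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

site = "https://rittresultater.no/nb/sb_tid/923?pv2=11027&pv1=U"
soup = BeautifulSoup(urlopen(site), "lxml")
table = soup.find('table', class_='table-condensed')  # the results table, not the menu
df = pd.read_html(str(table))[0]
print(df.head())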