Issue webscraping a linked header with Beautiful Soup - python-3.x

I am running into an issue pulling in the human-readable header name from this table in an HTML document. I can pull in the id, but my trouble comes when trying to pull in the header text that sits between the '>' and '<' of each link. I am not sure what I need to do in this instance. Below is my code. It all runs except for the last for loop.
# Import libraries
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import numpy as np

# Pull the HTML link into a local file or buffer
# and then parse with the BeautifulSoup library
# ------------------------------------------------
url = 'https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html'
r = requests.get(url)
#print('Status: ' + str(r.status_code))
#print(requests.status_codes._codes[200])
soup = BeautifulSoup(r.content, "html")
table = soup.find(id='data')
#print(table)

# Convert the data into a list of dictionaries
# or some other structure you can convert into
# pandas Data Frame
# ------------------------------------------------
trs = table.find_all('tr')
#print(trs)
header_row = trs[0]
#print(header_row)

# Collect the id attribute of every header cell.
names = []
for column in header_row.find_all('th'):
    names.append(column.attrs['id'])
#print(names)

# Attempt to collect the human-readable header from each link in the header.
db_names = []
for column in header_row.find_all('a'):
    db_names.append(column.attrs['data-vo-id'])  # ISSUE ARISES HERE!!!
print(db_names)

Let pandas read_html do the work for you, and simply specify the table id to find:
from pandas import read_html as rh

# read_html returns a list of DataFrames, one per matching <table>;
# the attrs filter narrows the match to the table whose id is "data".
matching_tables = rh(
    'https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html',
    attrs={'id': 'data'},
)
table = matching_tables[0]

Hey you can try something like this :
# `r` is the requests response obtained in the question's code.
soup = BeautifulSoup(r.content, "html")
table = soup.findAll('table', {'id': 'data'})
trs = table[0].find_all('tr')
#print(trs)

names = []
for row in trs[:1]:
    # Only the first row holds the headers. Look up the actual cells
    # (<th>, or <td> as a fallback) rather than iterating the row's raw
    # children, which also contain whitespace text nodes between tags.
    header_cells = row.find_all(['th', 'td'])
    header_row = [cell.get_text(strip=True) for cell in header_cells]
    names.extend(header_row)

Related

Iterate all pages and crawler table's elements save as dataframe in Python

I need to loop all the entries of all the pages from this link, then click the menu check in the red part (please see the image below) to enter the detail of each entry:
The objective is to crawl the information from the detail pages, such as the one in the image below, and save the left part as column names and the right part as rows:
The code I used:
import json

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the listing page and let pandas parse its grid table.
url = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=425000'
content = requests.get(url).text
soup = BeautifulSoup(content, 'lxml')
table = soup.find('table', {'class': 'gridview'})

frames = pd.read_html(str(table))
df = frames[0]
print(df.head(5))
Out:
序号 工程名称 ... 发证日期 详细信息
0 NaN 假日万恒社区卫生服务站装饰装修工程 ... 2020-07-07 查看
The code for entering the detailed pages:
url = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=308891&t=toDetail&GCBM=202006202001'
content = requests.get(url).text
soup = BeautifulSoup(content, 'lxml')
# All rows of the detail table.
table = soup.find("table", attrs={"class":"detailview"}).findAll("tr")
for elements in table:
    # Only the cells with class "label" are selected here, so only the
    # field names are printed, not the values next to them.
    inner_elements = elements.findAll("td", attrs={"class":"label"})
    for text_for_elements in inner_elements:
        print(text_for_elements.text)
Out:
工程名称:
施工许可证号:
所在区县:
建设单位:
工程规模(平方米):
发证日期:
建设地址:
施工单位:
监理单位:
设计单位:
行政相对人代码:
法定代表人姓名:
许可机关:
As you can see, I only get column name, no entries have been successfully extracted.
In order to loop all pages, I think we need to use post requests, but I don't know how to get headers.
Thanks for your help in advance.
This script will go for all pages and gets the data into a DataFrame and saves them to data.csv.
(!!! Warning !!! there are 2405 pages total, so it takes a long time to get them all):
import requests
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup

url = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=425000'
payload = {'currentPage': 1, 'pageSize':15}


def scrape_page(url):
    """Fetch one detail page and return its fields as a dict.

    Every ``td.label`` cell supplies a key (its stripped text with ':'
    removed); the value is the stripped text of the next ``<td>`` that
    follows that cell in the document.
    """
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    return {td.get_text(strip=True).replace(':', ''): td.find_next('td').get_text(strip=True) for td in soup.select('td.label')}


all_data = []
current_page = 1
while True:
    print('Page {}...'.format(current_page))
    # The listing is paged through POST form data.
    payload['currentPage'] = current_page
    soup = BeautifulSoup(requests.post(url, data=payload).content, 'html.parser')

    # Follow every "查看" (view) link on this listing page.
    # NOTE(review): newer soupsieve releases prefer ':-soup-contains' over
    # ':contains' - confirm against the installed version.
    for a in soup.select('a:contains("查看")'):
        u = 'http://bjjs.zjw.beijing.gov.cn' + a['href']
        d = scrape_page(u)
        all_data.append(d)
        pprint(d)

    # Stop when there is no "下一页" (next page) link carrying an onclick.
    page_next = soup.select_one('a:contains("下一页")[onclick]')
    if not page_next:
        break
    current_page += 1

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
Prints the data to screen and saves data.csv (screenshot from LibreOffice):

Beautiful Soup parse Table return only the last row

I want to parse an HTML Table with BeautifulSoup. My problem is that I get only the values for the last row.
feature_list.append(features) add a dict for every row. At the end every dict in feature_list has the same values, but as you can see print(features['Code']) delivers the "Code" for every line.
Can anyone help me out?
from bs4 import BeautifulSoup
from bs4.element import Tag
import numpy as np
import pandas as pd
import requests


def read_item_list_view(url):
    """Download a product page and return one feature dict per table row.

    The generic product information (title, image, grouped fields and
    optional notes) is collected first and handed to the table reader as
    the base of each row's features.
    """
    html_doc = requests.get(url, timeout=5).content
    soup = BeautifulSoup(html_doc, 'html.parser')

    #parse generic features and item information
    generic_features = {}
    generic_features['Titel'] = soup.select_one('.field-name-title').string
    generic_features['Image'] = soup.select_one('.field-name-field-immagine-prodotto').a.img['src']
    generic_features_list = soup.select_one('.field-group-div')
    for feature in generic_features_list.children:
        # The label text loses its last two characters before use as key.
        generic_features[feature.div.string[:-2]] = feature.a.string
    if soup.select_one('.field-name-field-note-prodotto').contents:
        notes = []
        for string in soup.select_one('.field-name-field-note-prodotto').stripped_strings:
            notes.append(string)
        generic_features['Notes'] = notes

    #find feature table
    table = soup.select_one('.field-name-product-dimensions-press-fitting').find('table')

    #read html table with simple header
    def read_table_with_simple_header(table, generic_features):
        """Read <thead> column names, then collect features per <tbody> row."""
        header = []
        feature_list = []
        # NOTE: this binds the same dict object, it does not copy it.
        features = generic_features
        table_header = table.thead
        for tr in table_header.contents:
            if isinstance(tr, Tag):
                for col in tr.children:
                    if isinstance(col, Tag):
                        header.append(col.text.strip())
        table_content = table.tbody
        for tr in table_content.contents:
            if isinstance(tr, Tag):
                cur_col = 0
                for col in tr.children:
                    if isinstance(col, Tag):
                        features[header[cur_col]] = col.text.strip()
                        cur_col += 1
                print(features['Code'])
                # The very same dict is appended for every row; this is
                # the behaviour the question is asking about.
                feature_list.append(features)
        return feature_list

    return read_table_with_simple_header(table, generic_features)


print(read_item_list_view('https://www.vitillo.eu/de/press-fittings/metric-female-24deg-cone-90deg-elbow-l-type.html'))
I think the problem is that dictionaries are a mutable type: when you change the 'features' dictionary on each loop iteration, you are updating the same 'features' object each time, which also affects the previously appended 'features' (i.e. they are all the same object; you are just appending a new reference to it each time).
What you want to do is append a new dictionary object each time.
Change
feature_list.append(features)
to
feature_list.append(dict(features))
and I think this will solve your problem.

Loading scraped data into list

I was able to successfully scrape some text from a website and I'm now trying to load the text into a list so I can later convert it to a Pandas DataFrame.
The site supplied the data in an scsv (semicolon-separated values) format, so it was quick to grab.
The following is my code:
import requests
from bs4 import BeautifulSoup

#Specify the url:url
url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"

# Packages the request, send the request and catch the response: r
r = requests.get(url)

#Extract the response:html_doc
html_doc = r.text
soup = BeautifulSoup(html_doc,"html.parser")

#Find the tags associated with the data you need, in this case
# it's the "pre" tags
for data in soup.find_all("pre"):
    print(data.text)
Sample Output
Week;Year;GID;Name;Pos;Team;h/a;Oppt;DK points;DK salary
1;2017;1254;Smith, Alex;QB;kan;a;nwe;34.02;5400 1;2017;1344;Bradford,
Sam;QB;min;h;nor;28.54;5900
Use the open function to write the data to a CSV file:
import csv

import requests
from bs4 import BeautifulSoup

url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"
r = requests.get(url)
html_doc = r.content
soup = BeautifulSoup(html_doc,"html.parser")

# The <pre> block holds semicolon-separated rows, one per line.
# csv.writer is used instead of replacing ';' with ',' because names such
# as "Smith, Alex" contain commas and must be quoted to stay in one column.
# newline="" lets the csv module control line endings; the with-block
# closes the file even if writing fails.
with open("data.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    for data in soup.find("pre").text.splitlines():
        if data.strip():  # skip blank lines around the data
            writer.writerow(data.split(';'))
Here's one thing you can do, although it's possible that someone who knows pandas better than I can suggest something better.
You have r.text. Put that into a convenient text file, let me call it temp.csv. Now you can use pandas read_csv method to get these data into a dataframe.
>>> df = pandas.read_csv('temp.csv', sep=';')
Addendum:
Suppose results were like this.
>>> results = [['a', 'b', 'c'], [1,2,3], [4,5,6]]
Then you could put them in a dataframe in this way.
>>> df = pandas.DataFrame(results[1:], columns=results[0])
>>> df
a b c
0 1 2 3
1 4 5 6
If you want to convert your existing code's output into a list, the split method might do the job; you can then use pandas to convert it into a DataFrame.
import requests
from bs4 import BeautifulSoup

#Specify the url:url
url = "http://rotoguru1.com/cgi-bin/fyday.pl?week=1&year=2017&game=dk&scsv=1"

# Packages the request, send the request and catch the response: r
r = requests.get(url)

#Extract the response:html_doc
html_doc = r.text
soup = BeautifulSoup(html_doc,"html.parser")

#Find the tags associated with the data you need, in this case
# it's the "pre" tags
for data in soup.find_all("pre"):
    # Split into lines first, then into fields. Splitting the whole text on
    # ";" alone would fuse the last field of one row with the first field
    # of the next, because rows are separated by newlines, not ";".
    rows = [line.split(";") for line in data.text.strip().splitlines()]
    print(rows)

Parsing through HTML to extract data from table rows with beautiful soup

I'm using BeautifulSoup to extract stock information from the NASDAQ website. I want to retrieve information specifically from the table rows on the HTML page but I am always getting an error (line 12).
#import html-parser
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.nasdaq.com/symbol/amzn' #AMZN is just an example
response = get(url)
#Create parse tree (BeautifulSoup Object)
soup = BeautifulSoup(response.text, 'html.parser')
data = soup.find_all(class_= 'column span-1-of-2') # find_all returns a list-like ResultSet, not a single tag
table = data.find(class_= 'table-row') #This is where the error occurs: .find is called on the ResultSet itself
print(table)
You can do something like this to get the data from table rows.
import re

import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.nasdaq.com/")
print(r)
soup = BeautifulSoup(r.content, 'html.parser')

# The index data lives in a <script> embedded inside the index table.
index_table = soup.find('table',{'id':'indexTable', 'class':'floatL marginB5px'})
data = index_table.script.text

# One match per storeIndexInfo(...) call found in the script text.
matches = re.findall(r'nasdaqHomeIndexChart.storeIndexInfo(.*);\r\n', data)

# For each call, pull out the quoted text.
table_rows = []
for row in matches:
    table_rows.append(re.findall(r'\".*\"', row))
print(table_rows)
table_rows is list of lists containing table data.

Accessing commented HTML Lines with BeautifulSoup

I am attempting to webscrape stats from this specific webpage: https://www.sports-reference.com/cfb/schools/louisville/2016/gamelog/
However, the table for the 'Defensive Game Log' appears to be commented out when I look at the HTML source (it starts with <!-- and ends with -->).
Because of this, when attempting to use BeautifulSoup4 the following code only grabs the offensive data that is not commented out while the defensive data is commented out.
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
import re

accessurl = 'https://www.sports-reference.com/cfb/schools/oklahoma-state/2016/gamelog/'
req = Request(accessurl)
link = urlopen(req)
soup = BeautifulSoup(link.read(), "lxml")

# Take the first <th>/<tr> element found and walk the rows beneath it.
tables = soup.find_all(['th', 'tr'])
my_table = tables[0]
rows = my_table.findChildren(['tr'])
for row in rows:
    cells = row.findChildren('td')
    for cell in cells:
        value = cell.string
        print(value)
I am curious if there are any solutions to be able to add all of the defensive values into a list the same way the offensive data is stored be it inside or outside of BeautifulSoup4. Thanks!
Note that I added the following to the solution given below (derived from here):
# `defensive_log` is the <table> tag located by the answer's code.
data = []
table = defensive_log
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele]) # Get rid of empty values
Comment object will give you what you want:
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup, Comment

accessurl = 'https://www.sports-reference.com/cfb/schools/oklahoma-state/2016/gamelog/'
req = Request(accessurl)
link = urlopen(req)
soup = BeautifulSoup(link, "lxml")

# Collect every HTML comment node in the document.
comments=soup.find_all(string=lambda text:isinstance(text,Comment))
for comment in comments:
    # Re-parse the comment's text as HTML so its markup becomes searchable.
    comment=BeautifulSoup(str(comment), 'lxml')
    defensive_log = comment.find('table') #search as ordinary tag
    if defensive_log:
        # Stop at the first comment that contains a table.
        break

Resources