Parsing through HTML to extract data from table rows with beautiful soup - python-3.x

I'm using BeautifulSoup to extract stock information from the NASDAQ website. I want to retrieve information specifically from the table rows on the HTML page, but I always get an error (line 12, marked in the code below).
#import html-parser
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.nasdaq.com/symbol/amzn' #AMZN is just an example
response = get(url)
#Create parse tree (BeautifulSoup Object)
soup = BeautifulSoup(response.text, 'html.parser')
data = soup.find_all(class_= 'column span-1-of-2')
table = data.find(class_= 'table-row') #This is where the error occurs
print(table)
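The error on the marked line happens because find_all returns a ResultSet (essentially a list of tags), which has no find method of its own. A minimal sketch of one way around it, not part of the original question, using find to grab a single tag first:
# a sketch: find_all returns a list-like ResultSet, so either iterate it
# or use find() to get one tag before searching inside it
column = soup.find(class_='column span-1-of-2')
if column is not None:
    table = column.find(class_='table-row')
    print(table)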

You can do something like this to get the data from table rows.
import requests
from bs4 import BeautifulSoup
import re
r = requests.get("https://www.nasdaq.com/")
print(r)
soup = BeautifulSoup(r.content, 'html.parser')
data = soup.find('table',{'id':'indexTable', 'class':'floatL marginB5px'}).script.text
matches = re.findall(r'nasdaqHomeIndexChart.storeIndexInfo(.*);\r\n', data)
table_rows = [re.findall(r'\".*\"', row) for row in matches]
print(table_rows)
table_rows is a list of lists containing the table data.
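Note that the greedy pattern \".*\" grabs everything from the first quote to the last, so each inner list ends up holding one long string. If you want the individual quoted values split out, a non-greedy capturing group does that (a small tweak, not part of the original answer):
table_rows = [re.findall(r'"(.*?)"', row) for row in matches]  # one entry per quoted field
print(table_rows)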

Related

Issue webscraping a linked header with Beautiful Soup

I am running into an issue pulling in the human-readable header name from this table in an HTML document. I can pull in the id, but my trouble comes when trying to pull in the correct header between the '>... I am not sure what I need to do in this instance. Below is my code; it all runs except for the last for loop.
# Import libraries
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import numpy as np
# Pull the HTML link into a local file or buffer
# and then parse with the BeautifulSoup library
# ------------------------------------------------
url = 'https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html'
r = requests.get(url)
#print('Status: ' + str(r.status_code))
#print(requests.status_codes._codes[200])
soup = BeautifulSoup(r.content, "html")
table = soup.find(id='data')
#print(table)
# Convert the data into a list of dictionaries
# or some other structure you can convert into
# pandas Data Frame
# ------------------------------------------------
trs = table.find_all('tr')
#print(trs)
header_row = trs[0]
#print(header_row)
names = []
for column in header_row.find_all('th'):
    names.append(column.attrs['id'])
#print(names)
db_names = []
for column in header_row.find_all('a'):
    db_names.append(column.attrs['data-vo-id']) # ISSUE ARISES HERE!!!
print(db_names)
Let pandas read_html do the work for you, and simply specify the table id to find:
from pandas import read_html as rh
table = rh('https://web.dsa.missouri.edu/static/mirror_sites/factfinder.census.gov/bkmk/table/1.0/en/GEP/2014/00A4/0100000US.html', attrs = {'id': 'data'})[0]
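read_html returns a DataFrame whose columns already carry the human-readable header text, so (as a quick check, assuming the request above succeeded) you can list them directly:
print(table.columns.tolist())  # the human-readable header names
print(table.head())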
Hey, you can try something like this:
soup = BeautifulSoup(r.content, "html")
table = soup.find_all('table', {'id': 'data'})
trs = table[0].find_all('tr')
#print(trs)
names = []
for row in trs[:1]:                               # the first row holds the headers
    cells = row.find_all(['th', 'td'])            # header cells are <th> in this table
    data_row_txt_list = [cell.text.strip() for cell in cells]
    header_row = data_row_txt_list
    for column in header_row:
        names.append(column)
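If what you are actually after is the data-vo-id attribute from the header links (the line that raised the error in the question), a safer pattern is .get(), which returns None instead of raising a KeyError when an <a> tag lacks the attribute. A sketch, assuming header_row is the first <tr> as in the question:
db_names = [a.get('data-vo-id') for a in header_row.find_all('a') if a.get('data-vo-id')]
print(db_names)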

Iterate all pages and crawl the table's elements, save as a dataframe in Python

I need to loop through all the entries on all the pages from this link, then click the check link in the part marked in red (please see the image below) to enter the detail page of each entry:
The objective is to crawl the info from those pages, such as in the image below, and save the left part as column names and the right part as rows:
The code I used:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
url = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=425000'
content = requests.get(url).text
soup = BeautifulSoup(content, 'lxml')
table = soup.find('table', {'class': 'gridview'})
df = pd.read_html(str(table))[0]
print(df.head(5))
Out:
序号 工程名称 ... 发证日期 详细信息
0 NaN 假日万恒社区卫生服务站装饰装修工程 ... 2020-07-07 查看
The code for entering the detailed pages:
url = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=308891&t=toDetail&GCBM=202006202001'
content = requests.get(url).text
soup = BeautifulSoup(content, 'lxml')
table = soup.find("table", attrs={"class":"detailview"}).findAll("tr")
for elements in table:
    inner_elements = elements.findAll("td", attrs={"class":"label"})
    for text_for_elements in inner_elements:
        print(text_for_elements.text)
Out:
工程名称:
施工许可证号:
所在区县:
建设单位:
工程规模(平方米):
发证日期:
建设地址:
施工单位:
监理单位:
设计单位:
行政相对人代码:
法定代表人姓名:
许可机关:
As you can see, I only get the column names; no entries have been successfully extracted.
In order to loop over all pages, I think we need to use POST requests, but I don't know how to get the headers.
Thanks for your help in advance.
This script will go through all pages, get the data into a DataFrame, and save it to data.csv.
(!!! Warning !!! there are 2405 pages total, so it takes a long time to get them all):
import requests
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup
url = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=425000'
payload = {'currentPage': 1, 'pageSize':15}
def scrape_page(url):
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    return {td.get_text(strip=True).replace(':', ''): td.find_next('td').get_text(strip=True) for td in soup.select('td.label')}

all_data = []
current_page = 1

while True:
    print('Page {}...'.format(current_page))

    payload['currentPage'] = current_page
    soup = BeautifulSoup(requests.post(url, data=payload).content, 'html.parser')

    for a in soup.select('a:contains("查看")'):
        u = 'http://bjjs.zjw.beijing.gov.cn' + a['href']
        d = scrape_page(u)
        all_data.append(d)
        pprint(d)

    page_next = soup.select_one('a:contains("下一页")[onclick]')
    if not page_next:
        break

    current_page += 1

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
It prints the data to the screen and saves data.csv.
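A quick way to sanity-check the output afterwards (a sketch; it assumes data.csv was written by the script above):
import pandas as pd
df = pd.read_csv('data.csv', index_col=0)
print(df.shape)   # number of scraped rows and columns
print(df.head())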

How do I extract data from the table using python?

I am trying to get the data from the table shown on this website 'https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/#'
I have tried the code below, but I am only able to get the data from the 4 main diamonds appearing on the webpage and none of the data from the actual table.
import requests
from bs4 import BeautifulSoup
url = "https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/"
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
one_a_tag = soup.findAll('span', class_='price')
print(one_a_tag)
Hey, you can try this code below:
import requests
import bs4 as bs
url = "https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/"
response = requests.get(url)
soup = bs.BeautifulSoup(response.text, 'lxml')
price = soup.find(class_='price')
print(price.text)
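If you want every price element on the page rather than just the first, find_all returns all of them (a sketch based on the same selector; note that values loaded dynamically by the table may still not appear in the raw HTML):
prices = soup.find_all(class_='price')
for p in prices:
    print(p.text.strip())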

BeautifulSoup python: Get the text with no tags and get the adjacent links

I am trying to extract the movie titles and links for it from this site
from bs4 import BeautifulSoup
from requests import get
link = "https://tamilrockerrs.ch"
r = get(link).content
#r = open('json.html','rb').read()
b = BeautifulSoup(r,'html5lib')
a = b.findAll('p')[1]
But the problem is that there is no tag around the titles. I can't extract the titles, and even if I could, how would I bind the links and titles together?
Thanks in advance.
You can find the title and link this way.
from bs4 import BeautifulSoup
import requests
url= "http://tamilrockerrs.ch"
response= requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
data = soup.find_all('div', {"class":"title"})
for film in data:
    print("Title:", film.find('a').text) # get the title here
    print("Link:", film.find('a').get("href")) # get the link here

Accessing commented HTML lines with BeautifulSoup

I am attempting to webscrape stats from this specific webpage: https://www.sports-reference.com/cfb/schools/louisville/2016/gamelog/
However, the table for the 'Defensive Game Log' appears to be commented out when I look at the HTML source (it starts with <!-- and ends with -->).
Because of this, the following BeautifulSoup4 code only grabs the offensive data, which is not commented out, and misses the commented-out defensive data.
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
import re
accessurl = 'https://www.sports-reference.com/cfb/schools/oklahoma-state/2016/gamelog/'
req = Request(accessurl)
link = urlopen(req)
soup = BeautifulSoup(link.read(), "lxml")
tables = soup.find_all(['th', 'tr'])
my_table = tables[0]
rows = my_table.findChildren(['tr'])
for row in rows:
    cells = row.findChildren('td')
    for cell in cells:
        value = cell.string
        print(value)
I am curious whether there is a way to get all of the defensive values into a list the same way the offensive data is stored, whether inside or outside of BeautifulSoup4. Thanks!
Note that I added onto the solution given below, derived from here:
data = []
table = defensive_log
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele]) # Get rid of empty values
The Comment object will give you what you want:
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup, Comment
accessurl = 'https://www.sports-reference.com/cfb/schools/oklahoma-state/2016/gamelog/'
req = Request(accessurl)
link = urlopen(req)
soup = BeautifulSoup(link, "lxml")
comments=soup.find_all(string=lambda text:isinstance(text,Comment))
for comment in comments:
    comment = BeautifulSoup(str(comment), 'lxml')
    defensive_log = comment.find('table') # search as an ordinary tag
    if defensive_log:
        break
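Once defensive_log has been found, it can be fed straight into the row-extraction loop shown earlier; a combined sketch (assuming the table has a <tbody>, as in the snippet above):
data = []
for row in defensive_log.find('tbody').find_all('tr'):
    cols = [td.text.strip() for td in row.find_all('td')]
    data.append([c for c in cols if c]) # Get rid of empty values
print(data[:3])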