How to scrape a table from a website when the BS4 selection won't find it? - python-3.x

I'm using the code below to scrape a table element from the URL (www.sfda.gov.sa/en/cosmetics-list), but it comes back empty:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.sfda.gov.sa/en/cosmetics-list"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.find('table', attrs={'class': 'table table-striped display'})
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

df = pd.DataFrame(res, columns=["ProductName", "Category", "Country", "Company"])
print(df)
Running the above code, I am not getting any data.

Data is loaded via XHR so you should use this to get your information:
url = 'https://www.sfda.gov.sa/GetCosmetics.php?page=1'
pd.DataFrame(requests.get(url).json()['results'])
Example
Loop over the number of pages in range() and collect all the data.
import requests
import pandas as pd

data = []
for i in range(1, 5):
    url = f'https://www.sfda.gov.sa/GetCosmetics.php?page={i}'
    data.extend(requests.get(url).json()['results'])

pd.DataFrame(data)
Output
Each record in results has the following columns (Arabic and English fields are paired):
id, cosmatics_Id, productNotificationsId, productNumber, status, productArName, productEnName, brandName, catArabic, catEnglish, counrtyAr, counrtyEn, manufactureType, packageVolume, unitAr, unitEn, barcode, manufacturearabicname, manufactureenglishname, listedNameAr, listedNameEn, imageUrl, batchNumber, country_of_manufacturing_English, country_of_manufacturing_Arabic, productCreationDate, productexpireddate, subCategory1, subCategoryAR, storageCircumstances, protectionInstructions, usageInstructions, notes, mainCommercialRecordNumber, manufacturingLicenseNumber
For example, the returned rows include "Litsea cubeba oil" (brand MOKSHA LIFE STYLE, Skin products, India, 250 Milliliter (ml), manufacturer Moksha lifestyle products, created 2020-09-28, expires 2025-10-05) and "Judy protein & Silk hair therapy system" (brand Judy, Hair and scalp products, United States, 1000 Milliliter (ml), manufacturer natural cosmetic labs USA Inc., created 2016-12-25, expires 2027-01-01).
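The range(1, 5) above only grabs the first four pages; the endpoint also reports the total number of pages (the pageCount key, which the answer below relies on), so the loop can be driven from the response instead of a hard-coded range. A minimal sketch, assuming that key is present in the JSON:

import requests
import pandas as pd

base = 'https://www.sfda.gov.sa/GetCosmetics.php'

# First page: collect its results and read the reported page count.
first = requests.get(base, params={'page': 1}).json()
data = first['results']

# Remaining pages, driven by the pageCount reported by the endpoint.
for i in range(2, int(first['pageCount']) + 1):
    data.extend(requests.get(base, params={'page': i}).json()['results'])

df = pd.DataFrame(data)
print(df.shape)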

You can use concurrent.futures to scrape the pages concurrently and, when all pages are complete, concatenate the results into a single dataframe:
import concurrent.futures
import json
import os

import pandas as pd
import requests


class Scrape:
    def __init__(self):
        self.root_url = "https://www.sfda.gov.sa/GetCosmetics.php?"
        self.pages = self.get_page_count()
        self.processors = os.cpu_count()

    def get_page_count(self) -> int:
        return self.get_data(url=self.root_url).get("pageCount")

    @staticmethod
    def get_data(url: str) -> dict:
        with requests.Session() as request:
            response = request.get(url, timeout=30)
            if response.status_code != 200:
                print(response.raise_for_status())
            return json.loads(response.text)

    def process_pages(self) -> pd.DataFrame:
        page_range = list(range(1, self.pages + 1))
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)

    def parse_data(self, page: int) -> pd.DataFrame:
        url = f"{self.root_url}page={page}"
        data = self.get_data(url=url)
        return (pd
                .json_normalize(data=data, record_path="results")
                )[["productEnName", "catEnglish", "counrtyEn", "brandName"]].rename(
            columns={"productEnName": "ProductName", "catEnglish": "Category",
                     "counrtyEn": "Country", "brandName": "Company"}
        )


if __name__ == "__main__":
    final_df = Scrape().process_pages()
    print(final_df)
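Since the work here is network I/O rather than CPU, a ThreadPoolExecutor is usually enough and avoids pickling work across processes. A hypothetical drop-in alternative to process_pages, to be added to the Scrape class above:

    def process_pages_threaded(self) -> pd.DataFrame:
        # Same fan-out as process_pages, but with threads: the requests are
        # I/O-bound, so threads overlap the waiting without multiprocessing.
        page_range = list(range(1, self.pages + 1))
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.processors) as executor:
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)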

Related

Python code to loop through a list of postcodes and get the GP practices for those postcodes by scraping the Yellow Pages (Australia)

The code below gives me the following error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements
on the df.columns = ["GP Practice Name"] line.
I tried
import pandas as pd
import requests
from bs4 import BeautifulSoup

postal_codes = ["2000", "2010", "2020", "2030", "2040"]
places_by_postal_code = {}

def get_places(postal_code):
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    places = soup.find_all("div", {"class": "listing-content"})
    return [place.find("h2").text for place in places]

for postal_code in postal_codes:
    places = get_places(postal_code)
    places_by_postal_code[postal_code] = places

df = pd.DataFrame.from_dict(places_by_postal_code, orient='index')
df.columns = ["GP Practice Name"]
df = pd.DataFrame(places_by_postal_code.values(), index=places_by_postal_code.keys(), columns=["GP Practice Name"])
print(df)
and was expecting a list of GPs for the postcodes specified in the postal_codes variable.
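The ValueError itself means the dataframe was built with zero columns: every postcode came back with an empty list (the listing-content divs are most likely rendered by JavaScript, or the request is being blocked), so from_dict(..., orient='index') produces a frame with no columns and assigning one column name fails. Below is a sketch that builds long-format records instead, so the shape stays valid however many results come back; it keeps the question's URL and selectors and does not address whether the site actually serves that markup to plain requests:

import pandas as pd
import requests
from bs4 import BeautifulSoup

postal_codes = ["2000", "2010", "2020", "2030", "2040"]

def get_places(postal_code):
    # Same URL and selectors as in the question; returns [] when nothing matches.
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    return [place.find("h2").text for place in soup.find_all("div", {"class": "listing-content"})]

# One row per (postcode, practice): the column count is fixed even when a
# postcode yields no results, so there is no length-mismatch error.
records = [
    {"Postcode": pc, "GP Practice Name": name}
    for pc in postal_codes
    for name in get_places(pc)
]
df = pd.DataFrame(records, columns=["Postcode", "GP Practice Name"])
print(df)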

Loop pages and save detailed contents as dataframe in Python

Say I need to crawl the detailed contents from this link:
The objective is to extract the contents of the elements from the link and append all the entries to a dataframe.
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse

url = 'http://www.jscq.com.cn/dsf/zc/cjgg/202101/t20210126_30144.html'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
text = soup.find_all(text=True)

output = ''
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script'
]

for t in text:
    if t.parent.name not in blacklist:
        output += '{} '.format(t)

print(output)
Out:
南京市玄武区锁金村10-30号房屋公开招租成交公告-成交公告-江苏产权市场
body{font-size:100%!important;}
.main_body{position:relative;width:1000px;margin:0 auto;background-color:#fff;}
.main_content_p img{max-width:90%;display:block;margin:0 auto;}
.m_con_r_h{padding-left: 20px;width: 958px;height: 54px;line-height: 55px;font-size: 12px;color: #979797;}
.m_con_r_h a{color: #979797;}
.main_content_p{min-height:200px;width:90%;margin:0 auto;line-height: 30px;text-indent:0;}
.main_content_p table{margin:0 auto!important;width:900px!important;}
.main_content_h1{border:none;width:93%;margin:0 auto;}
.tit_h{font-size:22px;font-family:'微软雅黑';color:#000;line-height:30px;margin-bottom:10px;padding-bottom:20px;text-align:center;}
.doc_time{font-size:12px;color:#555050;height:28px;line-height:28px;text-align:center;background:#F2F7FD;border-top:1px solid #dadada;}
.doc_time span{padding:0 5px;}
.up_dw{width:100%;border-top:1px solid #ccc;padding-top:10px;padding-bottom:10px;margin-top:30px;clear:both;}
.pager{width:50%;float:left;padding-left:0;text-align:center;}
.bshare-custom{position:absolute;top:20px;right:40px;}
.pager{width:90%;padding-left: 50px;float:inherit;text-align: inherit;}
页头部分开始
页头部分结束
START body
南京市玄武区锁金村10-30号房屋公开招租成交公告
组织机构:江苏省产权交易所
发布时间:2021-01-26
项目编号
17FCZZ20200125
转让/出租标的名称
南京市玄武区锁金村10-30号房屋公开招租
转让方/出租方名称
南京邮电大学资产经营有限责任公司
转让标的评估价/年租金评估价(元)
64800.00
转让底价/年租金底价(元)
97200.00
受让方/承租方名称
马尕西木
成交价/成交年租金(元)
97200.00
成交日期
2021年01月15日
附件:
END body
页头部分开始
页头部分结束
But how could I loop over all the pages, extract their contents, and append them to the following dataframe? Thanks.
Update for appending the dfs as a single dataframe:
updated_df = pd.DataFrame()

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        # print(f"Fetching data for {key}...")
        dfs = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4",
        )
        # https://stackoverflow.com/questions/39710903/pd-read-html-imports-a-list-rather-than-a-dataframe
        for df in dfs:
            df = df.T.iloc[1:, :].copy()
            updated_df = updated_df.append(df)

print(updated_df)

cols = ['项目编号', '转让/出租标的名称', '转让方/出租方名称', '转让标的评估价/年租金评估价(元)',
        '转让底价/年租金底价(元)', '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期']
updated_df.columns = cols
updated_df.to_excel('./data.xlsx', index=False)
Here's how I would do this:
1. build all main urls
2. visit every main page
3. get the follow urls
4. visit each follow url
5. grab the table from the follow url
6. parse the table with pandas
7. add the table to a dictionary of pandas dataframes
8. process the tables (not included -> implement your logic)
Repeat steps 2 - 7 to continue scraping the data.
The code:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session) -> iter:
    for url in urls[:1]:  # remove [:1] to scrape all the pages
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

dataframe_collection = {}

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        print(f"Fetching data for {key}...")
        df = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4",
        )
        dataframe_collection[key] = df

# process the dataframe_collection here

# print the dictionary of dataframes (optional and can be removed)
for key in dataframe_collection.keys():
    print("\n" + "=" * 40)
    print(key)
    print("-" * 40)
    print(dataframe_collection[key])
Output:
Fetching data for t20210311_30347...
Fetching data for t20210311_30346...
Fetching data for t20210305_30338...
Fetching data for t20210305_30337...
Fetching data for t20210303_30323...
Fetching data for t20210225_30306...
Fetching data for t20210225_30305...
Fetching data for t20210225_30304...
Fetching data for t20210225_30303...
Fetching data for t20210209_30231...
and then ...
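For step 8, one way to flatten dataframe_collection into a single frame is to transpose each page's two-column key/value table so the field names become columns, then concatenate — roughly what the update in the question does, but with pd.concat instead of the deprecated DataFrame.append. A sketch that continues from the collection built above, assuming each page parses to one key/value table:

import pandas as pd

frames = []
for key, dfs in dataframe_collection.items():
    table = dfs[0].T               # fields run across the top after transposing
    table.columns = table.iloc[0]  # first transposed row holds the field names
    frames.append(table.iloc[1:].assign(source=key))

combined = pd.concat(frames, ignore_index=True)
combined.to_excel("data.xlsx", index=False)
print(combined)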

scrape a table in a website with python (no table tag)

I'm trying to scrape the stock value of a product daily. This is the web page: https://funds.ddns.net/f.php?isin=ES0110407097. And this is the code I'm trying:
import pandas as pd
from bs4 import BeautifulSoup

html_string = 'https://funds.ddns.net/f.php?isin=ES0110407097'
soup = BeautifulSoup(html_string, 'lxml')
new_table = pd.DataFrame(columns=range(0, 2), index=[0])

row_marker = 0
column_marker = 0
for row in soup.find_all('tr'):
    columns = soup.find_all('td')
    for column in columns:
        new_table.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1

print(new_table)
I would like to get the same format in Python that I can see on the web, both the dates and the numbers. How can I get it, please?
There's a simpler way for that particular page:
import requests
import pandas as pd
url = 'https://funds.ddns.net/f.php?isin=ES0110407097'
resp = requests.get(url)
new_table = pd.read_html(resp.text)[0]
print(new_table.head(5))
Output:
0 1
0 FECHA VL:EUR
1 2019-12-20 120170000
2 2019-12-19 119600000
3 2019-12-18 119420000
4 2019-12-17 119390000
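The real header (FECHA / VL:EUR) lands in row 0 of that output. If the page uses a comma as the decimal separator (which would explain a quote rendering as one long integer like 120170000), read_html's header, decimal and thousands arguments can promote the first row and type the columns. A sketch, not verified against the live page:

import requests
import pandas as pd

url = 'https://funds.ddns.net/f.php?isin=ES0110407097'
resp = requests.get(url)

# header=0 uses the FECHA / VL:EUR row as column names; decimal/thousands
# handle a European-style quote such as "120,170000" (an assumption).
table = pd.read_html(resp.text, header=0, decimal=',', thousands='.')[0]
table['FECHA'] = pd.to_datetime(table['FECHA'])

print(table.dtypes)
print(table.head())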

Web scrape specific data from imdb project

So I want to extract the movie scores from IMDb with a score of 8.7 and above.
I have done it up to here like this, but I don't know what to do next:
import re
import requests
from bs4 import BeautifulSoup

l = list()
r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
res = soup.find_all('strong')
for x in res:
    q = re.sub(r'\s+', ' ', x.text)
    print(q)
It gives me all the scores; I just want 8.7 and above.
And thanks for answering!
Okay, so first of all:
The data presented is not consistent: there are two values, 200 and 206, which are indeed above the 8.7 rating but, I guess, don't fit the whole model of rating from 0 to 10.
I have edited the code so it is currently printing out the rating values.
Please note the expression:
if my_float >= 8.7 and my_float <= 10:
This expression not only makes sure that you get the scores equal to or higher than 8.7, but also that they cannot be higher than the maximum rating of 10.
import re
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
res = soup.find_all('strong')
for x in res:
    q = re.sub(r'\s+', ' ', x.text)
    try:
        my_float = float(q)
        if my_float >= 8.7 and my_float <= 10:
            print(q)
    except ValueError as error:
        print(error)
Hope this helps. If you wish to do anything else with the data instead of printing it out, be sure to check list comprehensions or use the other code block provided below
(I have noticed that you created an empty list l = list()).
import re
import requests
from bs4 import BeautifulSoup

l = list()
r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
res = soup.find_all('strong')
for x in res:
    q = re.sub(r'\s+', ' ', x.text)
    try:
        my_float = float(q)
        if my_float >= 8.7 and my_float <= 10:
            l.append(my_float)
            print(q)
    except ValueError as error:
        print(error)
Just extract the rating, convert it to float and do the comparison:
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = bs(r.content, 'lxml')
for tr in soup.select('.lister-list tr'):
    rating = float(tr.select_one('.imdbRating').text)
    if rating >= 8.7:
        print(tr.select_one('.titleColumn a').text.strip(), rating)
Less readable list comprehension:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = bs(r.content, 'lxml')
results = [(tr.select_one('.titleColumn a').text.strip(), float(tr.select_one('.imdbRating').text)) for tr in soup.select('.lister-list tr') if float(tr.select_one('.imdbRating').text) >= 8.7]
print(results)
The following code shows how you can create a list of movies whose rating is greater than or equal to 8.7.
import re
import requests
from bs4 import BeautifulSoup

l = list()
r = requests.get('https://www.imdb.com/chart/top?ref_=nv_mv_250')
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.find_all("tbody", class_="lister-list")

desired_list = []
for movie in s[0].findChildren("tr", recursive=False):
    title = movie.find_all("a")
    rating = movie.find_all("strong")
    q = re.sub(r'\s+', ' ', rating[0].text)
    try:
        rating = float(q)
        if rating >= 8.7:
            desired_list.append((rating, title[1].contents[0]))
    except Exception as e:
        print(e)

print(desired_list)

How do I fix the AttributeError: 'NoneType' object has no attribute 'text'...when looping

I am a beginner and answers on this forum have been invaluable. I am using Python 3 and Beautiful Soup to scrape (non-table) data from multiple web pages on the same website by looping over the page number. It works, but I keep getting AttributeError: 'NoneType' object has no attribute 'text' after the first iteration.
Here is the code I have tried thus far:
import requests
from bs4 import BeautifulSoup
import csv
import lxml

# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []

# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)

pages = [str(i) for i in range(1, 8)]

for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')

    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))

    for container in listing_containers:
        address = container.a.text
        addresses.append(address)

        geography = container.find('div', class_='_1dhrl').text
        geographies.append(geography)

        rent = container.find('div', class_='_3e12V').text
        rents.append(rent)

        unit = container.find('div', class_='_2tApa').text
        units.append(unit)

        availability = container.find('div', class_='_2P6xE').text
        availabilities.append(availability)

import pandas as pd

test_df = pd.DataFrame({'Street': addresses,
                        'City-State-Zip': geographies,
                        'Rent': rents,
                        'BR/BA': units,
                        'Units Available': availabilities
                        })
print(test_df)
Here is the output:
240 Properties
<class 'bs4.element.ResultSet'>
30
Street City-State-Zip Rent BR/BA Units Available
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+ 1–2 Beds • 1–2 Baths 2 Units Available
Traceback (most recent call last):
File "renttucktabletest.py", line 60, in <module>
availability = container.find('div', class_='_2P6xE').text
AttributeError: 'NoneType' object has no attribute 'text'
The result I am looking for is all 240 listings in the pandas dataframe exactly like the first iteration shown in the output above. Can anyone help to fix this error? Would be much appreciated. Thank you!
As pointed out, the issue is that some of the containers are missing certain div elements, e.g. no 'unit' or 'availability' information.
One way to deal with this would be to use if/else statements: append only if the element exists, else append a NaN value. Something like:
import requests
import numpy as np
from bs4 import BeautifulSoup
import csv
import lxml

# Lists to store the scraped data in
addresses = []
geographies = []
rents = []
units = []
availabilities = []

# Scraping all pages
pages_url = requests.get('https://www.rent.com/new-york/tuckahoe-apartments')
pages_soup = BeautifulSoup(pages_url.text, 'html.parser')
list_nums = pages_soup.find('div', class_='_1y05u').text
print(list_nums)

pages = [str(i) for i in range(1, 8)]

for page in pages:
    response = requests.get('https://www.rent.com/new-york/tuckahoe-apartments?page=' + page).text
    html_soup = BeautifulSoup(response, 'lxml')

    # Extract data from individual listing containers
    listing_containers = html_soup.find_all('div', class_='_3PdAH')
    print(type(listing_containers))
    print(len(listing_containers))

    for container in listing_containers:
        address = container.a
        if address:
            addresses.append(address.text)
        else:
            addresses.append(np.nan)

        geography = container.find('div', class_='_1dhrl')
        if geography:
            geographies.append(geography.text)
        else:
            geographies.append(np.nan)

        rent = container.find('div', class_='_3e12V')
        if rent:
            rents.append(rent.text)
        else:
            rents.append(np.nan)

        unit = container.find('div', class_='_2tApa')
        if unit:
            units.append(unit.text)
        else:
            units.append(np.nan)

        availability = container.find('div', class_='_2P6xE')
        if availability:
            availabilities.append(availability.text)
        else:
            availabilities.append(np.nan)

import pandas as pd

test_df = pd.DataFrame({'Street': addresses,
                        'City-State-Zip': geographies,
                        'Rent': rents,
                        'BR/BA': units,
                        'Units Available': availabilities
                        })
print(test_df)
Street City-State-Zip Rent \
0 Quarry Place at Tuckahoe 64 Midland PlaceTuckahoe, NY 10707 $2,490+
1 address not disclosed Tuckahoe, NY 10707 $2,510
2 address not disclosed Tuckahoe, NY 10707 $4,145
3 60 Washington St 1 60 Washington StTuckahoe, NY 10707 $3,500
4 269 Columbus Ave 5 269 Columbus AveTuckahoe, NY 10707 $2,700
BR/BA Units Available
0 1–2 Beds • 1–2 Baths 2 Units Available
1 1 Bed • 1 Bath NaN
2 2 Beds • 2 Bath NaN
3 3 Beds • 2 Bath NaN
4 2 Beds • 1 Bath NaN
If you pull the info from a script tag and treat it as JSON, that problem goes away. None or 0 is returned from the JSON where, had you been trying for a class name etc., you would have got an error.
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

def add_records(url, s):
    res = s.get(url)  # use the shared session passed in
    soup = bs(res.content, 'lxml')
    r = re.compile(r'window.__APPLICATION_CONTEXT__ = (.*)')
    data = soup.find('script', text=r).text
    script = r.findall(data)[0]
    items = json.loads(script)['store']['listings']['listings']
    for item in items:
        street = item['address']
        geography = ', '.join([item['city'], item['state'], item['zipCode']])
        rent = item['aggregates']['prices']['low']
        BR_BA = 'beds: ' + str(item['aggregates']['beds']['low']) + ' , ' + 'baths: ' + str(item['aggregates']['baths']['low'])
        units = item['aggregates']['totalAvailable']
        listingId = item['listingId']
        url = base_url + item['listingSeoPath']
        # all_info = item
        record = {'Street': street,
                  'Geography': geography,
                  'Rent': rent,
                  'BR/BA': BR_BA,
                  'Units Available': units,
                  'ListingId': listingId,
                  'Url': url}
        results.append(record)

url = 'https://www.rent.com/new-york/tuckahoe-apartments?page={}'
base_url = 'https://www.rent.com/'
results = []

with requests.Session() as s:
    for page in range(1, 9):
        add_records(url.format(page), s)

df = pd.DataFrame(results, columns=['Street', 'Geography', 'Rent', 'BR/BA', 'Units Available', 'ListingId', 'Url'])
print(df)
Here is another approach to achieve the same.
import pandas
import requests
from bs4 import BeautifulSoup

urls = ['https://www.rent.com/new-york/tuckahoe-apartments?page={}'.format(page) for page in range(1, 9)]

def get_content(links):
    for url in links:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'lxml')
        for items in soup.select("._3PdAH"):
            d = {}
            d['address'] = items.select_one("[data-tid='property-title']").text
            try:
                d['geographies'] = items.select_one("[data-tid='listing-info-address']").text
            except AttributeError: d['geographies'] = ""
            try:
                d['rent'] = items.select_one("[data-tid='price']").text
            except AttributeError: d['rent'] = ""
            try:
                d['units'] = items.select_one("[data-tid='beds-baths']").text
            except AttributeError: d['units'] = ""
            try:
                d['availabilities'] = items.select_one("[data-tid='property-unitAvailText']").text
            except AttributeError: d['availabilities'] = ""
            dataframe.append(d)
    return dataframe

if __name__ == '__main__':
    dataframe = []
    item = get_content(urls)
    df = pandas.DataFrame(item)
    df.to_csv("output.csv", index=False)
