I would like to scrape the infobox for a number of artist wikipedias. But I keep on getting the no attribute error. What am I doing wrong?
from bs4 import BeautifulSoup
from urllib.request import urlopen
url= "http://en.wikipedia.org/wiki/The_Beatles"
page = urlopen(url)
soup = BeautifulSoup(page.read(), "lxml")
table = soup.find('table', class_='infobox vcard plainlist')
result = {}
exceptional_row_count = 0
for tr in table.find_all('tr'):
if tr.find('th'):
result[tr.find('th').text] = tr.find('td').text
else:
exceptional_row_count += 1
if exceptional_row_count > 1:
print ('WARNING ExceptionalRow>1: ', table)
print (result)
The error is in this line:
---> 13 result[tr.find('th').text] = tr.find('td').text
AttributeError: 'NoneType' object has no attribute 'text'
Related
How do I resolve AttributeError: 'NoneType' object has no attribute 'text' when using BeautifulSoup?
below is the current code I have.
from bs4 import BeautifulSoup
import urllib.request
with open('websites_mn.txt') as f:
txtdata = f.readlines()
for raw_url in txtdata:
raw_url = raw_url.strip('\n')
url = urllib.request.urlopen(raw_url)
content = url.read()
soup = BeautifulSoup(content, 'lxml')
table = soup.findAll('div',attrs={"class":"journal-content-article"})
for x in table:
print(x.find('p').text)
here is the websites txt file.
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/24-avgusta-sostoalas-vstreca-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-s-ministrom-energetiki-mongolii-n-tavinbeh?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/19-avgusta-2020-goda-sostoalas-vstreca-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-s-zamestitelem-ministra-inostran?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/vsemirnye-zimnie-igry-special-noj-olimpiady-2022-goda-projdut-v-kazani?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/mezdunarodnyj-voenno-tehniceskij-forum-armia-2020-projdet-v-period-s-23-po-29-avgusta-2020-goda-na-territorii-kongressno-vystavocnogo-centra-patriot-?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/crezvycajnyj-i-polnomocnyj-posol-rossijskoj-federacii-v-mongolii-i-k-azizov-vstretilsa-s-ministrom-obrazovania-i-nauki-mongolii-l-cedevsuren-v-hode-be?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/10-iula-sostoalas-vstreca-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-i-ministra-inostrannyh-del-mongolii-n-enhtajv?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/prezident-rossijskoj-federacii-v-v-putin-pozdravil-prezidenta-mongolii-h-battulgu-s-nacional-nym-prazdnikom-naadam-?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/predsedatel-pravitel-stva-rossijskoj-federacii-m-v-misustin-pozdravil-prem-er-ministra-mongolii-u-hur?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/inistr-inostrannyh-del-rossijskoj-federacii-s-v-lavrov-pozdravil-ministra-inostrannyh-del-mongolii-n-enhtajvana-s-naznaceniem-i-nacional-nym-prazdniko?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/pozdravlenie-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-s-nacional-nym-prazdnikom-mongolii-naada-1?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
Instead of printing all the p tags under div with journal-content-article, it stops because of NoneType error.
In your loop you can check if the object exists before accessing an attribute:
for x in table:
if x.find('p'): # add this check
print(x.find('p').text)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pdfkit
import re
URL = 'https://timesofindia.indiatimes.com/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
all_links=set()
for link in soup.find_all('a'):
all_links.add(link.get('href'))
s = list(all_links)
print(s)
x=[i for i in s if i._contains_(URL)]
m=[]
find_words= ['cbse', 'first-day']
for s in x:
if any(f in s for f in find_words):
m.append(s)
print(m)
your contains line is not valid.
Try
x=[i for i in s if URL in i]
Code:
from selenium import webdriver
from bs4 import BeautifulSoup
driver=webdriver.Chrome('H:\datascience-python\selinium\chromedriver.exe')
driver.get('https://www.aljazeera.com/news/')
button = driver.find_element_by_id('btn_showmore_b1_418')
driver.execute_script("arguments[0].click();", button)
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
container = soup.select('div.topics-sec-item-cont')
titlelist = []
urllist = []
for items in container:
if items is not None:
title = items.find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a/h2').text
url = items.find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a')
titlelist.append(title)
urllist.append(url.get_attribute('href'))
print(str(titlelist) + '\n')
print(str(urllist) + '\n')
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-acf307cfccb3> in <module>
18 for items in container:
19 if items is not None:
---> 20 title = items.find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a/h2').text
21 url = items.find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a')
22
TypeError: 'NoneType' object is not callable
title = items.find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a/h2').text
the xpath provided in this line of code is not returning object.
This issue can arise only because of two reason:
Either the provided xpath is wrong.
The div that you are trying to extract from is not yet loaded completely.
I am trying to pull a table from a list of URL's. When I only input one URL it only prints out the first items in the table and when I add more URL's to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items and adding more URL's?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
"https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list =[]
cat_no_list = []
size_list = []
price_list =[]
for container in containers:
if (len(container) > 0):
#try:
title_container = container.findAll('td')
Product_name = title_container[0].text.strip()
product_name_list.append(Product_name)
CatNo_container = container.findAll('td')
CatNo = CatNo_container[1].text.strip()
cat_no_list.append(CatNo)
#Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
#Size = Size_container[0].text.strip()
#size_list.append(Size)
Price_container = container.findAll('td')
Price = Price_container[4].text.strip()
price_list.append(Price)
print('Product_name: '+ Product_name)
print('CatNo: ' + CatNo)
print('Size: ' + 'N/A')
print('Price: ' + Price)
print(" ")
time.sleep(timeDelay)
You are passing a list here, uClient = uReq(my_urls) as my_urls where a string is required.
You need to pass the individual element of the list i.e. the strings.
Here is the edited code that works for multiple urls.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
"https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
for url in my_urls:
print("URL using: ", url)
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list =[]
cat_no_list = []
size_list = []
price_list =[]
for container in containers:
if (len(container) > 0):
#try:
items = container.findAll('tr')
for item in items:
item = item.text.split('\n')
Product_name = item[1]
product_name_list.append(Product_name)
CatNo = item[2]
cat_no_list.append(CatNo)
#Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
#Size = Size_container[0].text.strip()
#size_list.append(Size)
Price = item[6]
price_list.append(Price)
print('Product_name: '+ Product_name)
print('CatNo: ' + CatNo)
print('Size: ' + 'N/A')
print('Price: ' + Price)
print(" ")
time.sleep(timeDelay)
I am using following code. Everything works except the 'affiliation' part.
It returns an error:
AttributeError: 'NoneType' object has no attribute 'text'
Without the .text, it returns everything--whole code inside the class
import requests
import bs4
import re
headers = {'User-Agent':'Mozilla/5.0'}
url = 'http://pubs.acs.org/toc/jacsat/139/5'
html = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(html.text, 'lxml')
tags = soup.findAll('a', href=re.compile("full"))
for tag in tags:
new_url = tag.get('href', None)
newurl = 'http://pubs.acs.org' + new_url
newhtml = requests.get(newurl, headers=headers)
newsoup = bs4.BeautifulSoup(newhtml.text, 'lxml')
article_title = newsoup.find(class_="articleTitle").text
print(article_title)
affiliations = newsoup.find(class_="affiliations").text
print(affiliations)
authors = newsoup.find(id="authors").text
print(authors)
citation_year = newsoup.find(class_="citation_year").text
print(citation_year)
citation_volume = newsoup.find(class_="citation_volume").text
print(citation_volume)
citation = newsoup.find(id="citation").text
print(citation)
pubdate = newsoup.find(id="pubDate").text
print(pubdate)
This exception was triggered because it did not find any element with the class "affiliation".
I have checked and could not find any element with this class value in the source HTML (or any other attribute to that matter) in the first url your script scrapes.
I would catch the error to avoid your script to break and return None or a default string when it does not find the element.
Something like that would work:
try:
affiliations = newsoup.find(class_="affiliations").text
print(affiliations)
except AttributeError:
affiliations = None