Webscraping with python to extract data - python-3.x

I am using the following code. Everything works except the 'affiliations' part.
It returns an error:
AttributeError: 'NoneType' object has no attribute 'text'
Without the .text, it returns everything--whole code inside the class
import requests
import bs4
import re
headers = {'User-Agent': 'Mozilla/5.0'}
url = 'http://pubs.acs.org/toc/jacsat/139/5'
html = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(html.text, 'lxml')
# Collect every link whose href contains "full".
tags = soup.findAll('a', href=re.compile("full"))
for tag in tags:
    # Each href is appended to the site root to build the page URL.
    new_url = tag.get('href', None)
    newurl = 'http://pubs.acs.org' + new_url
    newhtml = requests.get(newurl, headers=headers)
    newsoup = bs4.BeautifulSoup(newhtml.text, 'lxml')
    article_title = newsoup.find(class_="articleTitle").text
    print(article_title)
    # NOTE(review): find() returns None when nothing matches, so .text
    # raises the AttributeError quoted above on a page that has no
    # element with class "affiliations".
    affiliations = newsoup.find(class_="affiliations").text
    print(affiliations)
    authors = newsoup.find(id="authors").text
    print(authors)
    citation_year = newsoup.find(class_="citation_year").text
    print(citation_year)
    citation_volume = newsoup.find(class_="citation_volume").text
    print(citation_volume)
    citation = newsoup.find(id="citation").text
    print(citation)
    pubdate = newsoup.find(id="pubDate").text
    print(pubdate)

This exception was triggered because it did not find any element with the class "affiliations".
I have checked and could not find any element with this class value in the source HTML (or any other attribute, for that matter) in the first url your script scrapes.
I would catch the error so that your script does not break, and fall back to None or a default string when the element is not found.
Something like that would work:
try:
    # Only the lookup can raise here, so it is the only line guarded.
    affiliations = newsoup.find(class_="affiliations").text
except AttributeError:
    # find() returned None: no element with class "affiliations".
    affiliations = None
else:
    print(affiliations)

Related

Why is my company_url variable not being defined?

I am trying to scrape the rank, name and url of a company from a website. This involves two pages and I have nested functions to get all the information I need. However, when I try to print the details I get an error that the company_url variable is not defined. I thought that calling the company_button_url function within the main function would do the job, but something is wrong. I have tried calling company_button_url() at differing points in the code, but cannot get it to work.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
# Handle 403- Forbidden Error
url = 'https://www.b.co.uk/the-lists/mid-companies/'
req = Request(url, headers={'User-Agent': 'Mozilla'})
html = urlopen(req).read()
html_page = html.decode('utf-8')
soup = BeautifulSoup(html_page, 'html.parser') # create soup object
'''Main Function'''
def company_details():
    """Print the rank, name and website URL found on the listing page.

    Reads the module-level ``soup`` and takes the first element matching
    each class. Returns an empty tuple.
    """
    # find rank
    rank = soup.find('div', class_="company-score-redesign").text
    # find company name
    company_name = soup.find('div', class_="company-name-redesign").text
    # find company website
    ''' Find Button Url...Parse HTML from new Url...Find Company Website '''
    def company_button_url():
        """Follow the company's profile link and return its website href."""
        comp = soup.find('div', class_="company-name-redesign-mobile")
        comp_btn = comp.find('a', href = True)
        comp_btn_url = comp_btn['href']
        new_url = comp_btn_url
        # Handle 403- Forbidden Error
        new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
        nhtml = urlopen(new_req).read() # Getting new page
        nhtml_page = nhtml.decode('utf-8')
        nsoup = BeautifulSoup(nhtml_page, 'html.parser') # create new soup object
        div_company_url = nsoup.find('div', class_="profile-info")
        href_company_url = div_company_url.find('a', href = True)
        company_url = href_company_url['href']
        return company_url
    # NOTE(review): the value returned here is discarded, and the name
    # ``company_url`` exists only inside company_button_url(), so the
    # print() below raises the NameError described in the question.
    company_button_url()
    print(rank, company_name, company_url)
    return()
company_details()
Feel very free to pull my coding to pieces - I am very new to this!
Thanks in advance.

Error while using BeautifulSoup in Python, attribute error

How do I resolve AttributeError: 'NoneType' object has no attribute 'text' when using BeautifulSoup?
below is the current code I have.
from bs4 import BeautifulSoup
import urllib.request
# Read one URL per line from the input file.
with open('websites_mn.txt') as f:
    txtdata = f.readlines()
for raw_url in txtdata:
    raw_url = raw_url.strip('\n')
    url = urllib.request.urlopen(raw_url)
    content = url.read()
    soup = BeautifulSoup(content, 'lxml')
    table = soup.findAll('div',attrs={"class":"journal-content-article"})
    for x in table:
        # NOTE(review): x.find('p') is None for a div without a <p>,
        # which is where the AttributeError comes from.
        print(x.find('p').text)
here is the websites txt file.
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/24-avgusta-sostoalas-vstreca-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-s-ministrom-energetiki-mongolii-n-tavinbeh?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/19-avgusta-2020-goda-sostoalas-vstreca-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-s-zamestitelem-ministra-inostran?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/vsemirnye-zimnie-igry-special-noj-olimpiady-2022-goda-projdut-v-kazani?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/mezdunarodnyj-voenno-tehniceskij-forum-armia-2020-projdet-v-period-s-23-po-29-avgusta-2020-goda-na-territorii-kongressno-vystavocnogo-centra-patriot-?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/crezvycajnyj-i-polnomocnyj-posol-rossijskoj-federacii-v-mongolii-i-k-azizov-vstretilsa-s-ministrom-obrazovania-i-nauki-mongolii-l-cedevsuren-v-hode-be?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/10-iula-sostoalas-vstreca-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-i-ministra-inostrannyh-del-mongolii-n-enhtajv?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/prezident-rossijskoj-federacii-v-v-putin-pozdravil-prezidenta-mongolii-h-battulgu-s-nacional-nym-prazdnikom-naadam-?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/predsedatel-pravitel-stva-rossijskoj-federacii-m-v-misustin-pozdravil-prem-er-ministra-mongolii-u-hur?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/inistr-inostrannyh-del-rossijskoj-federacii-s-v-lavrov-pozdravil-ministra-inostrannyh-del-mongolii-n-enhtajvana-s-naznaceniem-i-nacional-nym-prazdniko?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
https://mongolia.mid.ru:443/en_US/novosti/-/asset_publisher/hfCjAfLBKGW0/content/pozdravlenie-crezvycajnogo-i-polnomocnogo-posla-rossijskoj-federacii-v-mongolii-i-k-azizova-s-nacional-nym-prazdnikom-mongolii-naada-1?inheritRedirect=false&redirect=https%3A%2F%2Fmongolia.mid.ru%3A443%2Fen_US%2Fnovosti%3Fp_p_id%3D101_INSTANCE_hfCjAfLBKGW0%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3Dcolumn-1%26p_p_col_count%3D1
Instead of printing all the p tags under div with journal-content-article, it stops because of NoneType error.
In your loop you can check if the object exists before accessing an attribute:
for x in table:
    paragraph = x.find('p')
    if paragraph is not None:  # add this check
        print(paragraph.text)

Web Scraping - iterating over

I'm looking to scrape a web-site hotel platform for reviews.
I cannot figure out two things:
1 - Why I cannot extract all reviews at one time? Say there are 14 reviews, I retrieve only 7 of them or so. I assume there is restriction by the server hosting the website?
2 - When I iterate over the object review_list, the child objects that are retrieved are the same each time - i.e. I retrieve the same review_item instead of iterating through the various objects that are li tags of class review_item (see second code snippet).
I'm running Python 3.7 and an example url is:
url example
Hope you can shed some light here.
Thanks!
Code Snippet 1:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# For ignoring SSL certificate errors
# NOTE(review): hostname and certificate checks are switched off for the
# request made with this context.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# url = input('Enter url - ' )
url = input("Enter Url - ")
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
html = soup.prettify("utf-8")
hotel_json_details = {}
hotel_json = {}
# Copy selected fields out of each JSON-LD <script> block on the page.
for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = json.loads(line.text.strip())
    rating = details["aggregateRating"]
    address = details["address"]
    hotel_json_details["name"] = details["name"]
    hotel_json_details["aggregateRating"] = {
        "ratingValue": rating["ratingValue"],
        "reviewCount": rating["reviewCount"],
    }
    hotel_json_details["address"] = {
        "Street": address["streetAddress"],
        "Locality": address["addressLocality"],
        "Region": address["addressRegion"],
        "Zip": address["postalCode"],
        "Country": address["addressCountry"],
    }
print(hotel_json_details)
div = soup.find_all(['li'], attrs={"class": "review_item"})
print(div)
Code Snippet 2:
hotel_reviews = []
for line in soup.find_all('li', class_='review_item'):
    review = {}
    review["review_metadata"] = {}
    review["review"] = {}
    # NOTE(review): every lookup below goes through ``soup`` (the whole
    # document), not through ``line`` (the current review item).
    review["review_metadata"]["review_date"] = soup.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = soup.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = soup.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = soup.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = soup.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = soup.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = soup.find('p', class_='review_neg').text.strip()
    scoreword = soup.find('span', class_='review_item_header_scoreword')
    if scoreword is not None:
        review["review_metadata"]["review_header"] = scoreword.text.strip()
    else:
        review["review_metadata"]["review_header"] = ""
    # Store the dict built above (the original appended an undefined name).
    hotel_reviews.append(review)
print(hotel_reviews)
When you are iterating over the review items, you need to use line.find() instead of soup.find(). This way, you'll be looking for review fields inside every review container as opposed to searching the whole HTML tree:
for line in soup.find_all('li', class_='review_item'):
    review = {"review_metadata": {}, "review": {}}
    # HERE: line.find(), not soup.find(), so the search stays inside this item.
    review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()

Scraping a lift with Python and BeautifulSoup

I am new to Python and trying to write some code that scrapes information from a website. I currently have:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'product-block__info')
    for item in items:
        for val in item.find_all('a', 'product-block'):
            # NOTE(review): the first positional argument of find_all is a
            # tag name, so this searches for <class> tags; indexing [0]
            # on an empty result raises IndexError.
            stock = item.find_all('class', 'count_product_stock hidden')[0].text
            brand = item.find_all('div', 'brand')[0].text
            price = item.find_all('span', 'selling_price')[0].text
    print(items)
Which returns the error IndexError: list index out of range. If I put 'product-block__info' in the place of 'product-block' then I am able to print off the full list of the content within the 'product-block__info' tag on the page, but I'd like to just select a handful of elements and return these.
Can anyone explain to me what's happening here and how I can select just the elements i want from inside 'product-block__info'?
When selecting attributes with find_all you should either use the attrs dictionary or the keyword arguments, otherwise bs4 is looking for tags.
for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        # Tag name first, class passed by keyword; find() gives the first match.
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        print(stock, brand, price)

Can't print tag 'content' anymore

I had a perfectly well working scraper for TripAdvisor, it met all my needs, then I tried to use it after a four day break and something went wrong, I quickly realized that TA had changed some of the tags, I made the appropriate changes and I still couldn't get it working as before. I want to grab the value of the 'content' tag within an element.
This is the element:
<div class="prw_rup prw_common_bubble_rating bubble_rating" data-prwidget-init="" data-prwidget-name="common_bubble_rating"><span alt="5 of 5 bubbles" class="ui_bubble_rating bubble_50" content="5" property="ratingValue" style="font-size:18px;"></span></div>
and here is the code:
for bubs in data.findAll('div', {'class': "prw_rup prw_common_bubble_rating bubble_rating"}):
    # NOTE(review): "img[content]" selects only <img> elements that have a
    # content attribute.
    print([img["content"] for img in bubs.select("img[content]")])
but now it only gives me an empty '[]' instead of the content which is '5'.
Anybody know what may have changed?
here is the rest of my code
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
theurl = "https://www.tripadvisor.com/Hotels-g147364-c3-Cayman_Islands-Hotels.html"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
base_url = "https://www.tripadvisor.com"
urls = []
init_info = []
# NOTE(review): this handle is not written to or closed in the code shown.
init_data = open('/Users/paribaker/Desktop/scrapping/TripAdvisor/Inv/speccaydata.txt', 'w')
for link in soup.findAll('a', href=re.compile('/Hotel_Review')):
    href = link.get('href')
    # str.strip("#REVIEWS") removes any of those characters from both ends
    # of the string; drop only the literal trailing fragment instead.
    if href.endswith("#REVIEWS"):
        href = href[:-len("#REVIEWS")]
    urls.append(base_url + href)
def remove_duplicates(urls):
    """Return the items of *urls* as a new list with repeats dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(urls))
urls2 = remove_duplicates(urls)
for url in urls2:
    # Load the page in a browser, click the language filter, and parse
    # the resulting page source.
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        driver.find_element_by_id("taplc_prodp13n_hr_sur_review_filter_controls_0_filterLang_ALL").click()
        print("succesfull")
        moreinfo = driver.page_source
        moresoup = BeautifulSoup(moreinfo,"html.parser")
        driver.close()
        #moreinfo = urllib
        #moreinfo = urllib.request.urlopen(url)
        #moresoup = BeautifulSoup(moreinfo,"html.parser")
    except Exception:
        print("none")
        # No fresh page for this URL: skip it rather than re-reading the
        # previous page's soup (or an undefined name on the first pass).
        continue
    for data in moresoup.findAll('div', {"class":"heading_2014 hr_heading"}):
        try:
            for title in data.findAll('h1',{'id':"HEADING"}):
                init_info.append(title.text.strip("\n")+ ",\t")
            for add_data in data.findAll('span',{'class':'format_address'}):
                print((add_data.find('span',{'class':'street-address'}).text +",\t"))
                init_info.append(add_data.find('span',{'class':'street-address'}).text +",\t")
                init_info.append(add_data.find('span',{'class':'locality'}).text + ",\t")
                init_info.append(add_data.find('span',{'class':'country-name'}).text + ",\t")
            for reviews in data.findAll('a',{'class':'more taLnk'}):
                # Strip the text before appending; list.append() returns None.
                init_info.append(reviews.text.strip("\n"))
                init_info.append(", \t")
            #init_info.append([img["alt"] for img in stars.select("img[alt]")])
            #init_info.append([img["content"] for img in stars.select("img[content]")])
        except Exception:
            # A missing element above ends up here; record a placeholder.
            init_info.append("N/A" + ", \t")
The element with the content="5" attribute is a span, not an img.
Does this get what you want?
for bubs in data.findAll('div', {'class': "prw_rup prw_common_bubble_rating bubble_rating"}):
    # Select <span> elements that carry a content attribute.
    print([elem["content"] for elem in bubs.select("span[content]")])

Resources