Why is my company_url variable not being defined? - python-3.x

I am trying to scrape the rank, name and url of a company from a website. This involves two pages and I have nested functions to get all the information I need. However, when I try to print the details I get an error that the company_url variable is not defined. I thought that calling the company_button_url function within the main function would do the job, but something is wrong. I have tried calling company_button_url() at differing points in the code, but cannot get it to work.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Handle 403 - Forbidden Error
url = 'https://www.b.co.uk/the-lists/mid-companies/'
req = Request(url, headers={'User-Agent': 'Mozilla'})
html = urlopen(req).read()
html_page = html.decode('utf-8')
soup = BeautifulSoup(html_page, 'html.parser')  # create soup object

'''Main Function'''
def company_details():
    # find rank
    rank = soup.find('div', class_="company-score-redesign").text
    # find company name
    company_name = soup.find('div', class_="company-name-redesign").text
    # find company website
    ''' Find Button Url...Parse HTML from new Url...Find Company Website '''
    def company_button_url():
        comp = soup.find('div', class_="company-name-redesign-mobile")
        comp_btn = comp.find('a', href=True)
        comp_btn_url = comp_btn['href']
        new_url = comp_btn_url
        # Handle 403 - Forbidden Error
        new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
        nhtml = urlopen(new_req).read()  # Getting new page
        nhtml_page = nhtml.decode('utf-8')
        nsoup = BeautifulSoup(nhtml_page, 'html.parser')  # create new soup object
        div_company_url = nsoup.find('div', class_="profile-info")
        href_company_url = div_company_url.find('a', href=True)
        company_url = href_company_url['href']
        return company_url

    company_button_url()
    print(rank, company_name, company_url)
    return()

company_details()
Feel very free to pull my coding to pieces - I am very new to this!
Thanks in advance.
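For reference, the usual fix here: company_url only exists inside company_button_url, so the outer function never sees it. A minimal sketch, assuming the selectors above still match the live page, is to bind the inner function's return value before printing:

def company_details():
    rank = soup.find('div', class_="company-score-redesign").text
    company_name = soup.find('div', class_="company-name-redesign").text

    def company_button_url():
        comp = soup.find('div', class_="company-name-redesign-mobile")
        new_url = comp.find('a', href=True)['href']
        new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
        nsoup = BeautifulSoup(urlopen(new_req).read().decode('utf-8'), 'html.parser')
        return nsoup.find('div', class_="profile-info").find('a', href=True)['href']

    # company_url is local to company_button_url, so capture the returned value here
    company_url = company_button_url()
    print(rank, company_name, company_url)

company_details()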

Related

How To Refactor Web Scraping Code In Python

I am scraping data from the URL below and it works, but I am looking for a more reliable and cleaner way to do it.
import pandas as pd
from bs4 import BeautifulSoup
import requests

pages = list(range(1, 548))
list_of_url = []
for page in pages:
    URL = "https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5" + "&page=" + str(page)
    #print(URL)
    list_of_url.append(URL)
print(list_of_url)

list_activities = []
#page_number = 1
for url in list_of_url:
    URL = url
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find('div', class_='view-content')
    #print(results.prettify())
    try:
        activities = results.find_all("tr", class_=["views-row-first odd", "even", "odd", "even", "views-row-last odd"])
    except:
        print("in the activities line this is a bad url", URL)
        continue
    try:
        for activity in activities:
            activity_section = activity.find('td', class_='views-field views-field-field-chapter-desc-en-et').text.strip()
            activity_name = activity.find("td", class_="views-field views-field-field-activity-description-en-et").text.strip()
            activity_code = activity.find("td", class_="views-field views-field-field-activity-code active").text.strip()
            list_activities.append([activity_section, activity_name, activity_code])
    except:
        print("url not found")
        continue
    page_number += 1

df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
Here is a shorter version of your code:
import pandas as pd
from bs4 import BeautifulSoup
import requests

list_activities = []
URLS = [f'https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5&page={page}' for page in range(1, 3)]

for URL in URLS:
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, "html.parser")
    results = soup.find('div', class_='view-content')
    activities = results.find_all("tr", class_=["views-row-first odd", "even", "odd", "even", "views-row-last odd"])
    list_activities += [[
        activity.find('td', class_='views-field views-field-field-chapter-desc-en-et').text.strip(),
        activity.find("td", class_="views-field views-field-field-activity-description-en-et").text.strip(),
        activity.find("td", class_="views-field views-field-field-activity-code active").text.strip()
    ] for activity in activities]

df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
However, as an engineer at WebScrapingAPI, I would recommend implementing a stealthier scraper if you want to scrape this website in the long run. As per my testing, it does not currently use any known bot-detection provider, but being a government website it might use a private detection system.
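If you go down that route, here is a minimal sketch of what a more polite, slightly stealthier fetch might look like. The User-Agent strings, delay values, and the polite_get helper are my own illustrative assumptions, not something tested against this site:

import random
import time

import requests

# Illustrative pool of User-Agent strings (assumed values; rotate or extend as needed)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def polite_get(url, session=None, min_delay=1.0, max_delay=3.0):
    """Fetch a URL with a randomized User-Agent and a small random delay between requests."""
    session = session or requests.Session()
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    time.sleep(random.uniform(min_delay, max_delay))  # avoid hammering the server
    response = session.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # surface HTTP errors early
    return response

You would then call polite_get(URL) in place of requests.get(URL) in the loop above.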

Unable to Scrape Product Price from Amazon

import os
import ssl
import requests

def scrapedPage(URL, user_agent):
    # Fall back to an unverified SSL context if certificate verification is not enforced
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    header = {"User-Agent": user_agent}
    proxy = {'https': 'using-proxy'}
    page = requests.get(URL, headers=header, proxies=proxy)
    return page
I am not able to scrape the price of the product which is Rs 12,300
user_agent = random.choice(user_agent_list)
pricelink='https://www.amazon.in/gp/offer-listing/B07NXTH1BD/ref=dp_olp_afts?ie=UTF8&condition=all'
new_page = scrapedPage(pricelink, user_agent)
new_soup = BeautifulSoup(new_page.content, 'html.parser')
print(new_soup.prettify())
find_price = new_soup.find_all('span',attrs={'style':'text-decoration: inherit; white-space: nowrap'})
I tried many methods like findAll(), select(), find(), etc., but I am not able to scrape the price; it returns an empty list. Please help me out.
Make the following changes in your code:
new_soup = BeautifulSoup(new_page.content, 'lxml') # html.parser > lxml
find_price = new_soup.select('span.olpOfferPrice > span') # change selector
print(find_price[0].text)
Output
Rs. 12,300.00

Web Scraping - iterating over

I'm looking to scrape a web-site hotel platform for reviews.
I cannot figure out two things:
1 - Why can I not extract all reviews at one time? Say there are 14 reviews; I retrieve only 7 of them or so. I assume there is a restriction imposed by the server hosting the website?
2 - When I iterate over the object review_list, the child objects that are retrieved are the same each time, i.e. I retrieve the same review_item instead of iterating through the various objects that are li tags of class review_item (see the second code snippet).
I'm running Python 3.7 and an example url is:
url example
Hope you can shed some light here.
Thanks!
Code Snippet 1:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# For ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# url = input('Enter url - ')
url = input("Enter Url - ")
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
html = soup.prettify("utf-8")

hotel_json_details = {}
hotel_json = {}

for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = line.text.strip()
    details = json.loads(details)
    hotel_json_details["name"] = details["name"]
    hotel_json_details["aggregateRating"] = {}
    hotel_json_details["aggregateRating"]["ratingValue"] = details["aggregateRating"]["ratingValue"]
    hotel_json_details["aggregateRating"]["reviewCount"] = details["aggregateRating"]["reviewCount"]
    hotel_json_details["address"] = {}
    hotel_json_details["address"]["Street"] = details["address"]["streetAddress"]
    hotel_json_details["address"]["Locality"] = details["address"]["addressLocality"]
    hotel_json_details["address"]["Region"] = details["address"]["addressRegion"]
    hotel_json_details["address"]["Zip"] = details["address"]["postalCode"]
    hotel_json_details["address"]["Country"] = details["address"]["addressCountry"]

print(hotel_json_details)
div = soup.find_all(['li'], attrs={"class": "review_item"})
print(div)
Code Snippet 2:
hotel_reviews = []
for line in soup.find_all('li', class_='review_item'):
    review = {}
    review["review_metadata"] = {}
    review["review"] = {}
    review["review_metadata"]["review_date"] = soup.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = soup.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = soup.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = soup.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = soup.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = soup.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = soup.find('p', class_='review_neg').text.strip()
    scoreword = soup.find('span', class_='review_item_header_scoreword')
    if scoreword != None:
        review["review_metadata"]["review_header"] = scoreword.text.strip()
    else:
        review["review_metadata"]["review_header"] = ""
    hotel_reviews.append(x)
print(hotel_reviews)
When you are iterating over the review items, you need to use line.find() instead of soup.find(). This way, you'll be looking for review fields inside every review container as opposed to searching the whole HTML tree:
for line in soup.find_all('li', class_='review_item'):
    review = {"review_metadata": {}, "review": {}}
    review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()
    # ^ HERE
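For completeness, a sketch of what the whole corrected loop could look like. The field classes are the ones from the question; the text_of helper and the empty-string fallback for missing fields are my own additions, and the final append uses the review dict rather than the undefined name x from the original snippet:

def text_of(parent, tag, cls):
    """Return the stripped text of the first matching child, or "" if the field is missing."""
    found = parent.find(tag, class_=cls)
    return found.text.strip() if found else ""

hotel_reviews = []
for item in soup.find_all('li', class_='review_item'):
    review = {
        "review_metadata": {
            "review_date": text_of(item, 'p', 'review_item_date'),
            "review_staydate": text_of(item, 'p', 'review_staydate'),
            "reviewer_name": text_of(item, 'p', 'reviewer_name'),
            "reviewer_country": text_of(item, 'span', 'reviewer_country'),
            "reviewer_score": text_of(item, 'span', 'review-score-badge'),
            "review_header": text_of(item, 'span', 'review_item_header_scoreword'),
        },
        "review": {
            "review_pos": text_of(item, 'p', 'review_pos'),
            "review_neg": text_of(item, 'p', 'review_neg'),
        },
    }
    hotel_reviews.append(review)  # append the dict, not an undefined name

print(hotel_reviews)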

Writing multiple files as output when webscraping - python bs4

To preface - I am quite new to Python and my HTML skills are kindergarten level.
So I am trying to save the quotes from this website, which has many links in it, one for each of the US election candidates.
I have managed to get the code to extract the quotes (with the help of some Stack Overflow users), but am lost on how to write these quotes into separate text files for each candidate.
For example, the first page, with all of Justin Amash's quotes, should be written to a file: JustinAmash.txt.
The second page, with all of Michael Bennet's quotes, should be written to MichaelBennet.txt (or something in that form), and so on. Is there a way to do this?
For reference, to scrape the pages, the following code works:
import bs4
from urllib.request import Request, urlopen as uReq, HTTPError
# Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re

# define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'

def make_soup(url):
    # set up known browser user agent for the request to bypass HTMLError
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()
    # html is jumbled at the moment, so call html using soup function
    soup = soup_(page_html, "lxml")
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.findAll("a", href=re.compile("javascript:pop\("))
#print(tags)

# open a text file and write it if it doesn't exist
file1 = open("Quotefile.txt", "w")

# get list of all URLS
for links in tags:
    link = links.get('href')
    if "java" in link:
        print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents  # Splitting up the page into contents for iterative access
            #text_data = []  # This list can be used to store data related to every person
            for item in content_collexn:
                # Accept an item if it belongs to the following classes
                if(type(item) == str):
                    print(item.get_text())
                elif(item.name == "h3"):
                    # Note that over here, every h3 tagged title has a string following it
                    print(item.get_text())
                    # Hence, grab that string too
                    print(item.next_sibling)
                elif(item.name in ["p", "ul", "ol"]):
                    print(item.get_text())
        except HTTPError:  # Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")
#print(text_data)
You can store that text data in the text_data list you had started with, join all those items, and then write them to a file. So something like:
import bs4
from urllib.request import Request, urlopen as uReq, HTTPError
# Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re

# define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'

def make_soup(url):
    # set up known browser user agent for the request to bypass HTMLError
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()
    # html is jumbled at the moment, so call html using soup function
    soup = soup_(page_html, "lxml")
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.findAll("a", href=re.compile("javascript:pop\("))
#print(tags)

# open a text file and write it if it doesn't exist
#file1 = open("Quotefile.txt", "w")

# get list of all URLS
candidates = []
for links in tags:
    link = links.get('href')
    if "java" in link:
        #print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
        candidate = link.split('/')[-1].split('_Free_Trade')[0]
        if candidate in candidates:
            continue
        else:
            candidates.append(candidate)
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents  # Splitting up the page into contents for iterative access
            text_data = []  # This list can be used to store data related to every person
            for item in content_collexn:
                # Accept an item if it belongs to the following classes
                if(type(item) == str):
                    #print(item.get_text())
                    text_data.append(item.get_text())
                elif(item.name == "h3"):
                    # Note that over here, every h3 tagged title has a string following it
                    #print(item.get_text())
                    text_data.append(item.get_text())
                    # Hence, grab that string too
                    #print(item.next_sibling)
                    text_data.append(item.next_sibling)
                elif(item.name in ["p", "ul", "ol"]):
                    #print(item.get_text())
                    text_data.append(item.get_text())
        except HTTPError:  # Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")
            candidates.remove(candidate)
            continue
        text_data = '\n'.join(text_data)
        with open("C:/%s.txt" % (candidate), "w") as text_file:
            text_file.write(text_data)
        print('Acquired: %s' % (candidate))
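One small detail worth guarding against (an assumption on my part, not something the answer above covers): the candidate slug pulled out of the href can contain characters that are awkward in file names, and hard-coding C:/ ties the script to one machine. A hedged sketch of the writing step using pathlib and a sanitized name:

import re
from pathlib import Path

def write_candidate_file(candidate, text_data, out_dir="quotes"):
    """Write one candidate's quotes to <out_dir>/<candidate>.txt next to the script."""
    safe_name = re.sub(r'[^A-Za-z0-9_-]', '_', candidate)  # replace awkward filename characters
    out_path = Path(out_dir)
    out_path.mkdir(exist_ok=True)
    (out_path / (safe_name + ".txt")).write_text(text_data, encoding="utf-8")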

Scraping a list with Python and BeautifulSoup

I am new to Python and trying to write some code that scrapes information from a website. I currently have:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'product-block__info')
    for item in items:
        for val in item.find_all('a', 'product-block'):
            stock = item.find_all('class', 'count_product_stock hidden')[0].text
            brand = item.find_all('div', 'brand')[0].text
            price = item.find_all('span', 'selling_price')[0].text
            print(items)
This returns the error IndexError: list index out of range. If I put 'product-block__info' in place of 'product-block', then I am able to print the full list of the content within the 'product-block__info' tag on the page, but I'd like to select just a handful of elements and return these.
Can anyone explain to me what's happening here and how I can select just the elements I want from inside 'product-block__info'?
When selecting attributes with find_all you should either use the attrs dictionary or the keyword arguments, otherwise bs4 looks for tags.
for i in range(1, 300):
    url = "[REMOVED]/footwear?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        print(stock, brand, price)
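If you prefer CSS selectors, the same lookups can be expressed with select() and select_one(). This is just a sketch under the same assumed class names; note that select_one() returns None when nothing matches, so guard for missing fields:

for item in soup.select('div.product-block__info'):
    stock = item.select_one('span.count_product_stock.hidden')
    brand = item.select_one('h4.brand')
    price = item.select_one('span.selling_price')
    if stock and brand and price:  # skip items missing any of the fields
        print(stock.text, brand.text, price.text)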
