Web Scraping - iterating over - python-3.x

I'm looking to scrape a web-site hotel platform for reviews.
I cannot figure out two things:
1 - Why can't I extract all the reviews at once? Say there are 14 reviews; I retrieve only 7 or so of them. I assume there is a restriction imposed by the server hosting the website?
2 - When I iterate over the object review_list, the child objects that are retrieved are the same each time, i.e. I retrieve the same review_item instead of iterating through the various objects that have the tag li and the class review_item (see the second code snippet).
I'm running Python 3.7 and an example url is:
url example
Hope you can shed some light here.
Thanks!
Code Snippet 1:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings
# Silence Python warnings unless the user asked for them explicitly (-W).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# For ignoring SSL certificate errors.
# NOTE: this switches off hostname and certificate verification entirely.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input("Enter Url - ")
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

hotel_json_details = {}
hotel_json = {}
# The hotel data is read from the page's JSON-LD <script> tags. If several
# such tags exist, each one overwrites the values taken from the previous one.
for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = json.loads(line.text.strip())
    hotel_json_details["name"] = details["name"]
    rating = details["aggregateRating"]
    hotel_json_details["aggregateRating"] = {
        "ratingValue": rating["ratingValue"],
        "reviewCount": rating["reviewCount"],
    }
    address = details["address"]
    hotel_json_details["address"] = {
        "Street": address["streetAddress"],
        "Locality": address["addressLocality"],
        "Region": address["addressRegion"],
        "Zip": address["postalCode"],
        "Country": address["addressCountry"],
    }
print(hotel_json_details)

# Every <li class="review_item"> element currently present in the fetched HTML.
div = soup.find_all(['li'], attrs={"class": "review_item"})
print(div)
Code Snippet 2:
# Build one dict per <li class="review_item"> element and collect them all.
hotel_reviews = []
for line in soup.find_all('li', class_='review_item'):
    review = {}
    review["review_metadata"] = {}
    review["review"] = {}
    # NOTE(review): every lookup below goes through `soup`, not `line`, so it
    # searches the whole document and returns the first match on the page on
    # every iteration -- this is the behaviour the question is asking about.
    review["review_metadata"]["review_date"] = soup.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = soup.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = soup.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = soup.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = soup.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = soup.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = soup.find('p', class_='review_neg').text.strip()
    # The score word is optional: fall back to an empty header when absent.
    scoreword = soup.find('span', class_='review_item_header_scoreword')
    if scoreword is not None:
        review["review_metadata"]["review_header"] = scoreword.text.strip()
    else:
        review["review_metadata"]["review_header"] = ""
    hotel_reviews.append(review)
print(hotel_reviews)

When you are iterating over the review items, you need to use line.find() instead of soup.find(). This way, you'll be looking for review fields inside every review container as opposed to searching the whole HTML tree:
# Search inside each review container (`line`) rather than the whole document.
for line in soup.find_all('li', class_='review_item'):
    review = {"review_metadata": {}, "review": {}}
    review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()
    #                                          ^^^^ HERE

Related

How To Refactor Web Scraping Code In Python

I am scraping data from the URL below and was able to do it correctly, but I am looking for a more reliable and more elegant way to do it.
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Build the URL of every result page (pages 1 to 547, five items per page).
pages = list(range(1, 548))
list_of_url = []
for page in pages:
    URL = "https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5" + "&page=" + str(page)
    list_of_url.append(URL)
print(list_of_url)

# Collect [section, name, code] for every table row on every page.
list_activities = []
for url in list_of_url:
    URL = url
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find('div', class_='view-content')
    try:
        # `results` is None when the page has no 'view-content' div, in
        # which case the attribute access raises AttributeError.
        activities = results.find_all("tr", class_=["views-row-first odd","even","odd","even","views-row-last odd"])
    except AttributeError:
        print("in the activities line thisis a pad url", URL)
        continue
    try:
        for activity in activities:
            activity_section = activity.find('td', class_='views-field views-field-field-chapter-desc-en-et').text.strip()
            activity_name = activity.find("td", class_="views-field views-field-field-activity-description-en-et").text.strip()
            activity_code = activity.find("td", class_="views-field views-field-field-activity-code active").text.strip()
            list_activities.append([activity_section,activity_name,activity_code])
    except AttributeError:
        # A row is missing one of the expected cells (find() returned None);
        # rows already collected from this page are kept.
        print("url not founf")
        continue

df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
I am scraping data from the URL below and was able to do it correctly, but I am looking for a more reliable and more elegant way to do it.
Here is a shorter version for your code:
import pandas as pd
from bs4 import BeautifulSoup
import requests
list_activities = []
# Only pages 1 and 2 are requested here; widen the range to scrape more pages.
URLS = [f'https://www.stats.gov.sa/ar/isic4?combine=&combine_1=All&items_per_page=5&page={page}' for page in range(1,3)]
for URL in URLS:
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, "html.parser")
    results = soup.find('div', class_='view-content')
    if results is None:
        # No results table on this page: skip it instead of crashing on None.
        print(f"no 'view-content' block found at {URL}, skipping")
        continue
    activities = results.find_all("tr", class_=["views-row-first odd","even","odd","even","views-row-last odd"])
    # One [section, name, code] triple per table row.
    list_activities += [[
        activity.find('td', class_='views-field views-field-field-chapter-desc-en-et').text.strip(),
        activity.find("td", class_="views-field views-field-field-activity-description-en-et").text.strip(),
        activity.find("td", class_="views-field views-field-field-activity-code active").text.strip()
    ] for activity in activities]

df = pd.DataFrame(list_activities, columns=["activity_section", "activity_name", "activity_code"])
df.head()
However, as an engineer at WebScrapingAPI, I would recommend that you implement a stealthier scraper if you want to scrape this website in the long run. As per my testing, it does not use any known bot-detection provider right now. But, being a government website, it might use a private detection system.

Getting incorrect link on parsing web page in BeautifulSoup

I'm trying to get the download link from the button in this page. But when I open the download link that I get from my code I get this message
I noticed that if I manually click the button and open the link in a new page the csrfKey part of the link is always same whereas when I run the code I get a different key every time. Here's my code
from bs4 import BeautifulSoup
import requests
import re
def GetPage(link):
    """Print the href of every 'ipsButton' element on a page that mentions "download".

    The page at *link* is fetched with a plain ``requests.get`` (no session),
    parsed with lxml, and each element with class ``ipsButton`` whose markup
    contains "download" (case-insensitive) has its ``href`` printed.
    Matching elements without an ``href`` attribute are skipped.
    """
    source_new = requests.get(link).text
    soup_new = BeautifulSoup(source_new, 'lxml')
    for button in soup_new.find_all(class_='ipsButton'):
        # Match against the element's full markup, not only its visible text.
        if not re.search('download', str(button), re.IGNORECASE):
            continue
        href = button.get("href")
        if href is not None:
            print(f'{href}\n')


if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    # One Session is used for every request so state carries over between them.
    with requests.Session() as connection:
        # Visit the home page first; the response itself is not needed
        # (presumably this lets the session pick up the site's cookies).
        connection.get("https://eci.gov.in/")

        # The full-width button on the file page links to the confirmation step.
        confirmation_url = BeautifulSoup(
            connection.get(link).text, 'lxml'
        ).select_one(".ipsApp .ipsButton_fullWidth")["href"]

        # Adding confirm=1 stands in for clicking "agree" on the terms page.
        fake_agree_to_continue = connection.get(
            confirmation_url.replace("?do=download", "?do=download&confirm=1")
        ).text

        # The first small button is skipped; the rest are taken as file links.
        download_links = [
            a["href"] for a in
            BeautifulSoup(
                fake_agree_to_continue, "lxml"
            ).select(".ipsApp .ipsButton_small")[1:]
        ]

        for download_link in download_links:
            response = connection.get(download_link)
            # Take the file name from the Content-Disposition header: strip
            # the quotes and keep the part after the last " - ".
            file_name = (
                response
                .headers["Content-Disposition"]
                .replace('"', "")
                .split(" - ")[-1]
            )
            print(f"Downloading: {file_name}")
            with open(file_name, "wb") as f:
                f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and a .xls. The latter one looks like this:

Why is my company_url variable not being defined?

I am trying to scrape the rank, name and url of a company from a website. This involves two pages and I have nested functions to get all the information I need. However, when I try to print the details I get an error that the company_url variable is not defined. I thought that calling the company_button_url function within the main function would do the job, but something is wrong. I have tried calling company_button_url() at differing points in the code, but cannot get it to work.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
# Handle 403- Forbidden Error: send the request with an explicit User-Agent
# header instead of urllib's default one.
url = 'https://www.b.co.uk/the-lists/mid-companies/'
req = Request(url, headers={'User-Agent': 'Mozilla'})
# Download the raw bytes of the list page and decode them as UTF-8 text.
html = urlopen(req).read()
html_page = html.decode('utf-8')
# Module-level parse tree; company_details() below reads it directly.
soup = BeautifulSoup(html_page, 'html.parser') # create soup object
def company_details():
    """Main function: look up a company's rank, name and website and print them.

    Reads the module-level ``soup`` (the parsed list page). Only the first
    matching element on the page is used for each field, because ``find``
    returns the first match.
    """
    # find rank
    rank = soup.find('div', class_="company-score-redesign").text
    # find company name
    company_name = soup.find('div', class_="company-name-redesign").text

    # find company website
    def company_button_url():
        """Find Button Url...Parse HTML from new Url...Find Company Website"""
        comp = soup.find('div', class_="company-name-redesign-mobile")
        comp_btn = comp.find('a', href = True)
        comp_btn_url = comp_btn['href']
        new_url = comp_btn_url
        # Handle 403- Forbidden Error
        new_req = Request(new_url, headers={'User-Agent': 'Mozilla'})
        nhtml = urlopen(new_req).read() # Getting new page
        nhtml_page = nhtml.decode('utf-8')
        nsoup = BeautifulSoup(nhtml_page, 'html.parser') # create new soup object
        div_company_url = nsoup.find('div', class_="profile-info")
        href_company_url = div_company_url.find('a', href = True)
        company_url = href_company_url['href']
        return company_url

    # NOTE(review): the value returned by company_button_url() is discarded
    # here. `company_url` is a local name inside that helper, so it is never
    # bound in this scope and the print below raises NameError -- the error
    # described in the question. Binding the result
    # (company_url = company_button_url()) would make the name available.
    company_button_url()
    print(rank, company_name, company_url)
    return()


company_details()
Feel very free to pull my coding to pieces - I am very new to this!
Thanks in advance.

How can I make the code follow another link after the first one?

I am trying to find the link at position 3 of the url (the first name is 1). Follow that link. Repeat this process 4 times. At the end I want to print the final link.
The issue that I am running into is getting the url to repeat. Please give me some advice how I can make this code run.
import urllib.request, urllib.parse, urllib.error
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
lst = list()
lst2 = list()
count = 0
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
count = int(input('Enter Count '))
position = int(input('Enter Position ')) -1  # 1-based input -> list index

while count >= 0:
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    # Collect the href of every <a> tag on the page, as strings.
    tags = soup('a')
    for tag in tags:
        values = tag.get('href', None)
        values = str(values)
        lst.append(values)
    count = count - 1
    lst2.append(lst[position:position+1])
    # NOTE(review): lst2[0] is a one-element list, so str() below produces the
    # list's repr (brackets and quotes included) rather than the bare link;
    # that string is what the next urlopen() call receives. This is the
    # "cannot get the url to repeat" problem the question describes.
    url = lst2[0]
    url = str(url)
    print(re.findall('http.+html',url))
    lst.clear()
    lst2.clear()
If I am parsing your question correctly, one way to do this (I'll leave error checking as an exercise for you; also code has not been run) is like this:
# NOTE(review): this fragment assumes `soup` already holds the parsed first
# page and that `position` is the 1-based position entered by the user
# (hence the "- 1" when indexing) -- confirm against the surrounding code.
# Loop count times; "_" effectively means ignore the counter
for _ in range(count):
    # Get an array of <a> elements,
    # then get the (position-1)th
    # then get the value of its 'href' attribute
    next_url = soup.find_all('a')[position-1]['href']
    # And repeat for the URL found there
    html = urlopen(next_url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
# Finally, print out the (position-1)th URL on the last page
print(soup.find_all('a')[position-1]['href'])
Of course, if there are not enough links on the page, or there are <a> tags without an href, or the href URL is malformed the program will crash.
I was able to answer my own question after playing with the code for a little while longer. I am sure there is a much more eloquent solution, but I am really happy I finally got it to run correctly.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
count = int(input('Enter Count '))
position = int(input('Enter Position ')) - 1  # 1-based input -> list index

# Follow the link at `position` exactly `count` times, printing each link.
# A bounded loop also terminates for count <= 0, where the earlier
# "decrement and break when count == 0" form would never stop.
for _ in range(count):
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    # href of every <a> tag on the current page, in document order
    lst = [str(tag.get('href', None)) for tag in soup('a')]
    # Raises IndexError if the page has fewer links than `position` requires.
    url = lst[position]
    print(url)

Writing multiple files as output when webscraping - python bs4

to preface - I am quite new to python and my HTML skills are kindergarten level.
So I am trying to save the quotes from this website which has many links in it for each member of the US Election candidates.
I have managed to get the actual code to extract the quotes (with the help of some Stack Overflow users), but I am lost on how to write these quotes into separate text files for each candidate.
For example, the first page, with all of Justin Amash's quotes should be written to a file: JustinAmash.txt.
The second page, with all of Michael Bennet's quotes should be written to MichaelBennet.txt (or something in that form). and so on.. Is there a way to do this?
For reference, to scrape the pages, the following code works:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'


def make_soup(url):
    """Download *url* and return its HTML parsed with the lxml parser.

    HTTP errors raised while opening the URL propagate to the caller.
    """
    # send a known browser User-Agent with the request (added by the
    # original author to get around an HTTP error on this site)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # open the connection and grab the page; the with-block closes the
    # connection even if read() raises
    with uReq(req) as uClient:
        page_html = uClient.read()
    # the raw html is jumbled at this point, so parse it into a soup object
    return soup_(page_html, "lxml")
soup = make_soup(my_url)
# Candidate pages are linked through javascript:pop(...) hrefs.
tags = soup.find_all("a", href=re.compile(r"javascript:pop\("))

# open a text file and write it if it doesn't exist
# NOTE: nothing below writes to this file yet; the with-block only makes sure
# the handle is closed again.
with open("Quotefile.txt", "w") as file1:
    # get list of all URLS
    for links in tags:
        link = links.get('href')
        if "java" not in link:
            continue
        # Slice the page path out of the javascript:pop(...) wrapper.
        main_url = "http://archive.ontheissues.org" + link[18:-3]
        print(main_url)
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
            for item in content_collexn:
                #Accept an item if it belongs to the following classes
                # NOTE(review): bs4 text nodes are NavigableString (a str
                # subclass), so this exact-type test presumably never
                # matches -- confirm before relying on this branch.
                if(type(item) == str):
                    print(item.get_text())
                elif(item.name == "h3"):
                    #Note that over here, every h3 tagged title has a string following it
                    print(item.get_text())
                    #Hence, grab that string too
                    print(item.next_sibling)
                elif(item.name in ["p", "ul", "ol"]):
                    print(item.get_text())
        except HTTPError: #Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")
You can store that text data into the list you had started with text_data. Join all those items and then write to file:
So something like:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'


def make_soup(url):
    """Download *url* and return its HTML parsed with the lxml parser.

    HTTP errors raised while opening the URL propagate to the caller.
    """
    # send a known browser User-Agent with the request (added by the
    # original author to get around an HTTP error on this site)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # open the connection and grab the page; the with-block closes the
    # connection even if read() raises
    with uReq(req) as uClient:
        page_html = uClient.read()
    # the raw html is jumbled at this point, so parse it into a soup object
    return soup_(page_html, "lxml")
soup = make_soup(my_url)
# Candidate pages are linked through javascript:pop(...) hrefs.
tags = soup.find_all("a", href=re.compile(r"javascript:pop\("))

# get list of all URLS
candidates = []
for links in tags:
    link = links.get('href')
    if "java" not in link:
        continue
    # Slice the page path out of the javascript:pop(...) wrapper.
    main_url = "http://archive.ontheissues.org" + link[18:-3]
    # The candidate's name is the last path segment up to "_Free_Trade".
    candidate = link.split('/')[-1].split('_Free_Trade')[0]
    # Scrape each candidate once, even when several links point to them.
    if candidate in candidates:
        continue
    candidates.append(candidate)

    try:
        sub_soup = make_soup(main_url)
    except HTTPError: #Takes care of missing pages and related HTTP exception
        print("[INFO] Resource not found. Skipping to next link.")
        candidates.remove(candidate)
        continue

    text_data = [] #This list stores the text collected for this candidate
    #Splitting up the page into contents for iterative access
    for item in sub_soup.body.contents:
        #Accept an item if it belongs to the following classes
        # NOTE(review): bs4 text nodes are NavigableString (a str subclass),
        # so this exact-type test presumably never matches -- confirm before
        # relying on this branch.
        if(type(item) == str):
            text_data.append(item.get_text())
        elif(item.name == "h3"):
            #Note that over here, every h3 tagged title has a string following it
            text_data.append(item.get_text())
            #Hence, grab that string too -- but only when the sibling really
            #is text: None or a tag would make the join below fail
            sibling = item.next_sibling
            if isinstance(sibling, str):
                text_data.append(sibling)
        elif(item.name in ["p", "ul", "ol"]):
            text_data.append(item.get_text())

    text_data = '\n'.join(text_data)
    # Explicit utf-8 so non-ASCII characters in the quotes do not depend on
    # the platform's default encoding.
    with open("C:/%s.txt" %(candidate), "w", encoding="utf-8") as text_file:
        text_file.write(text_data)
    print('Aquired: %s' %(candidate))

Resources