How to scrape price from booking.com using beautifulsoup? - python-3.x

I am trying to scrape prices from booking.com but have not been successful. Any suggestions?
My code is as follows:
#Importing necessary library
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager
price = []
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-1506909%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3Bss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufis%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&city=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_month=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1')
time.sleep(5)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
for item in soup.findAll('label', {'class': "tpi_price_label tpi_price_label__orange"}):
    price.append(item.get_text(strip=True))
print(price)
The above code is not showing any output. It gives an empty list.

You need to properly wait for the page to load.
This is done with WebDriverWait, which raises an exception if the page hasn't loaded within the specified timeout.
Try running my sample code below:
# test_scrape.py
import atexit

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

URL = ("https://www.booking.com/searchresults.en-gb.html?"
       "label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ"
       "&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb"
       "&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-"
       "1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff"
       "3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-150690"
       "9%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3"
       "Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcar"
       "d%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bs"
       "lp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3B"
       "ss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufi"
       "s%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&ci"
       "ty=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_m"
       "onth=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1")


class page_loaded:
    # Custom wait condition: document is ready and no jQuery requests are pending.
    def __call__(self, driver):
        document_ready = driver.execute_script("return document.readyState;") == "complete"
        jquery_ready = driver.execute_script("return jQuery.active == 0;")
        print(f"document ready: [({type(document_ready).__name__}){document_ready}]")
        print(f"jquery ready: [({type(jquery_ready).__name__}){jquery_ready}]")
        return document_ready and jquery_ready


def wait_for_page_to_load(driver, timeout_seconds=20):
    WebDriverWait(driver, timeout_seconds, 0.2).until(page_loaded(), f"Page could not load in {timeout_seconds} s.!")


def go_to_url(driver, url):
    driver.get(url)
    wait_for_page_to_load(driver)


def get_orange_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("label.tpi_price_label.tpi_price_label__orange")]


def get_normal_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("div[class*=bui-price-display__value]")]


def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver


def main():
    driver = start_driver()
    go_to_url(driver, URL)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    orange_prices = get_orange_prices(soup)
    print(orange_prices)

    normal_prices = get_normal_prices(soup)
    print(normal_prices)


if __name__ == '__main__':
    main()
If you're having issues with chromedriver not being discovered, try specifying the exact path to it like this:
def start_driver():
    driver = webdriver.Chrome(executable_path="/path/to/chromedriver")
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver
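As a lighter-weight alternative (not part of the original answer, and assuming the page still uses the tpi_price_label / bui-price-display__value markup), you could drop the jQuery check, which raises a JavaScript error on pages that don't load jQuery, and instead wait directly for the price elements using Selenium's built-in expected conditions. A rough sketch:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


def wait_for_prices(driver, timeout_seconds=20):
    # Block until at least one price element is present in the DOM, or time out.
    WebDriverWait(driver, timeout_seconds).until(
        EC.presence_of_all_elements_located((
            By.CSS_SELECTOR,
            "label.tpi_price_label, div[class*=bui-price-display__value]",
        ))
    )


def go_to_url(driver, url):
    driver.get(url)
    wait_for_prices(driver)
This avoids the fixed time.sleep(5) from the question and fails fast with a TimeoutException if booking.com changes its markup.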

Related

AWS Lambda, web scraping cannot find Chrome binary (Python)

I am trying to write a web scraping job with AWS Lambda (Python) and I get this error when I execute it:
Error
Message: unknown error: cannot find Chrome binary
How I am running it:
I downloaded chromedriver from the link below and zipped it together with the Python code shown here. Please let me know whether this approach works or whether I need to make modifications to my code.
https://chromedriver.storage.googleapis.com/index.html?path=111.0.5563.19/
import concurrent.futures
import requests
from selenium import webdriver
import os
import subprocess
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import boto3
from datetime import datetime


def scrape_data():
    try:
        years = [2023]
        states = ["alnb"]
        for state in states:
            """Creating s3 connection to write into state folder"""
            for year in years:
                url = 'https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'+state+'/'+str(year)+'/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
                options = webdriver.ChromeOptions()
                options.add_argument("headless")
                driver = webdriver.Chrome(executable_path='./chromedriver', chrome_options=options)
                driver.get(url)
                elements = WebDriverWait(driver, 2).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body"))
                )
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                bankruptcy_element = soup.findAll('div',{"class": "panel-collapse collapse in","class": "panel-title","class": "panel-body","class":"panel panel-default","class": "panel-collapse collapse in"})
                print("scraping data for state "+state.capitalize() +" for "+str(year).capitalize())
                data = []
                for i in bankruptcy_element:
                    for xmlfile in i.findAll('a', href=True):
                        if ("pdf" in (xmlfile['href'])):
                            xmlfile['href']=xmlfile['href'].replace(".pdf","/mods.xml")
                            xmlfile['href']=xmlfile['href'].replace("/pdf","")
                            xmlfile['href']=xmlfile['href'].replace("/pkg/","/")
                            xmlfile['href']=xmlfile['href'].replace("/content","")
                            xmlfile['href']="https://www.govinfo.gov/metadata/granule"+xmlfile['href']
                            data.append(xmlfile['href'])
                return data
    except Exception as e:
        pass
        print(e)


def lambda_handler(event, context):
    s3 = boto3.client('s3')
    today_date=datetime.today().strftime('%Y-%m-%d')
    s3.put_object(Bucket='w-zone', Key='Banktcy/'+today_date+"/xmlfiles.txt", Body=scrape_data())
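For reference, chromedriver alone is not enough on Lambda: Selenium also needs a Chrome/Chromium binary it can launch, which is what the "cannot find Chrome binary" message points to. Below is a minimal sketch of how that is usually wired up, assuming a headless Chromium build is bundled in the deployment package or a Lambda layer; the /opt/headless-chromium and /opt/chromedriver paths are placeholders, not something taken from the question:
from selenium import webdriver


def start_headless_chrome():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--single-process")
    # Point Selenium at the bundled browser binary; without this it reports
    # "cannot find Chrome binary" because no Chrome is installed on Lambda.
    options.binary_location = "/opt/headless-chromium"  # assumed layer path
    return webdriver.Chrome(executable_path="/opt/chromedriver", options=options)
The exact paths depend on how the browser and driver are packaged (layer vs. zip), so treat them as placeholders.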

How to get post links from the whole page using BeautifulSoup and Selenium

I'm having trouble web scraping with BeautifulSoup and Selenium. I want to pull data from pages 1-20, but only the data up to page 10 is pulled successfully. The actual number of pages could well be more than 20, yet the code I wrote only pulls 10 pages. Does anyone see the problem, so that I can pull all the data without a page limit?
options = webdriver.ChromeOptions()
options.add_argument('-headless')
options.add_argument('-no-sandbox')
options.add_argument('-disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=options)

apartment_urls = []
try:
    for page in range(1,20):
        print(f"Extraction Page# {page}")
        page="https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan&hlmn=" + str(page)
        driver.get(page)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
        for link in apart_info_list:
            get_url = '{0}{1}'.format('https://www.99.co', link['href'])
            print(get_url)
            apartment_urls.append(get_url)
except:
    print("Good Bye!")
This is the output of the code: from page 10, 11, 12 and so on I can't get the data anymore.
With the code below, pagination works fine without a page limit.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)

    next_button = driver.find_element(By.CSS_SELECTOR, 'li.next > a')
    if next_button:
        next_button.click()
        time.sleep(3)
    else:
        break
This uses webdriver-manager, if you would prefer to have chromedriver managed for you.
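One caveat with the loop above (an observation, not part of the original answer): driver.find_element raises NoSuchElementException when no "next" button exists, so the else: break branch is never reached on the last page. A sketch of a cleaner stop condition for the tail of the while loop, using find_elements, which returns an empty list instead of raising:
    next_buttons = driver.find_elements(By.CSS_SELECTOR, 'li.next > a')
    if next_buttons:
        next_buttons[0].click()
        time.sleep(3)
    else:
        break  # no "next" link on the page, so this is the last page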
Alternative solution: since the next-page URL isn't dynamic, building it directly from the "next" link also works fine.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)

    # Instead of clicking the button, read the "next" link and load its URL directly
    next_page = soup.select_one('li.next > a')
    if next_page:
        driver.get(f"https://www.99.co{next_page['href']}")
        time.sleep(3)
    else:
        break

Scrape livechat till the end of stream with selenium

I'm trying to scrape a YouTube live chat. I need to save all old and incoming messages. For this purpose I use a CSS selector and an infinite loop, but this results in duplicate entries and in previous messages being omitted. What is the proper way to do this? The target URL is the first command-line argument.
from selenium import webdriver
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import os,re,sys


def parseyt():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--executable_path="chromedriver.exe"')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-extensions')
    chrome_bin = os.getenv('GOOGLE_CHROME_SHIM', None)
    is_local = os.getenv('IS_LOCAL', None)
    chromedriver_path = r'chromedriver.exe'
    service_log_path = "{}/chromedriver.log".format('\.')
    service_args = ['"--verbose", "--log-path=scrape.log"']
    chromedriver_path = 'chromedriver.exe'
    chrome_options.binary_location = r'C:\Program Files (x86)\Chromium\Application\chrome.exe'
    browser = webdriver.Chrome(executable_path=chromedriver_path,chrome_options=chrome_options,service_args=service_args)
    url = sys.argv[1]
    url = url.replace(r'watch?',r'live_chat?')
    print(url)
    browser.get(url)
    browser.implicitly_wait(1)
    while True:
        innerHTML = browser.execute_script("return document.body.innerHTML")
        chats = []
        for chat in browser.find_elements_by_css_selector('yt-live-chat-text-message-renderer'):
            author_name = chat.find_element_by_css_selector("#author-name").get_attribute('innerHTML')
            message = chat.find_element_by_css_selector("#message").get_attribute('innerHTML')
            author_name_encoded = author_name.encode('utf-8').strip()
            message_encoded = message.encode('utf-8').strip()
            print(message+" "+author_name+"\n")
    browser.quit()
    return chats
It is better to use the YouTube API instead.
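To illustrate that suggestion, here is a rough sketch using the YouTube Data API v3 (liveChatMessages.list) via google-api-python-client. It assumes you have an API key and that the video is currently live; API_KEY and VIDEO_ID are placeholders, not values from the question:
import time
from googleapiclient.discovery import build

API_KEY = "YOUR_API_KEY"    # placeholder
VIDEO_ID = "YOUR_VIDEO_ID"  # placeholder

youtube = build("youtube", "v3", developerKey=API_KEY)

# Look up the live chat id attached to the live broadcast
video = youtube.videos().list(part="liveStreamingDetails", id=VIDEO_ID).execute()
live_chat_id = video["items"][0]["liveStreamingDetails"]["activeLiveChatId"]

page_token = None
while True:
    response = youtube.liveChatMessages().list(
        liveChatId=live_chat_id,
        part="snippet,authorDetails",
        pageToken=page_token,
    ).execute()
    for item in response["items"]:
        author = item["authorDetails"]["displayName"]
        message = item["snippet"]["displayMessage"]
        print(f"{author}: {message}")
    # The API tells us when to poll again and where to resume, so there are
    # no duplicates and no missed messages.
    page_token = response.get("nextPageToken")
    time.sleep(response.get("pollingIntervalMillis", 5000) / 1000)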

Python3 beautifulsoup4 and selenium

I wrote this code to scrape score details from livescore.com, but I have some problems. Maybe I wrote incorrect code; please help me.
Code run output:
Traceback (most recent call last):
  File "web.py", line 15, in <module>
    box2 = box.find_all('a',{'class' : 'match-row scorelink'})
AttributeError: 'NoneType' object has no attribute 'find_all'
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'class':'container'})
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
    test = data.find('div',{'class': 'sco'}).text.replace('\n', '')
    print (test)
Try This:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://livescore.com')
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
box = soup.find('div',{'class':'container'})
box2 = box.find_all('a',{'class' : 'match-row scorelink'})
for data in box2:
    test = data.find('div',{'class': 'sco'}).text.replace('\n', '')
    print (test)
Use the following CSS selector. Note, however, that 'container' is not a class attribute value; it is the value of the data-type='container' attribute.
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
soup = BeautifulSoup(res, 'lxml')
for item in soup.select("div[data-type='container'] .match-row.scorelink>.sco"):
    test=item.text.replace('\n', '')
    print(test)
Give this a go. I have skipped 'box2' as it's not really needed for getting the scores. Also, judging by the data I fetched, .replace('\n', '') is not needed either, but feel free to use it if you think a score might contain a "\n" character.
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'data-type':'container'})
scores=box.find_all('div',{'class': 'sco'})
for score in scores:
    print(score.text)
Thanks for the answers. I solved the problem:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://livescore.com')
res = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()
#page = requests.get('https://livescore.com')
soup = BeautifulSoup(res, 'lxml')
box = soup.find('div',{'data-type':'container'})
box2 = box.find_all('a',{'class' : 'match-row'})
for data in box2:
    test1 = data.find('div',{'class': 'sco'}).text.replace('\n', '')
    test2 = data.find('div',{'class': 'ply tright name'}).text.replace('\n', '')
    test3 = data.find('div',{'class': 'ply name'}).text.replace('\n', '')
    print(test2,test1,test3)

error while using selenium and writing to file

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube

browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")

no_of_pagedowns = 100
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns-=1

html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')

fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
    t = tag.get('href')
    text_file.write(t)
When I run the above code, I get this error:
TypeError: write() argument must be str, not None
When I am not using Selenium I am able to do it. I am using Selenium because I want to scroll down the entire page before parsing it with BeautifulSoup.
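For what it's worth, the error happens because tag.get('href') returns None for anchor tags that have no href attribute, and file.write() only accepts strings. A small sketch of one way to guard against that (this is a suggestion, not an answer from the original thread):
for tag in tags:
    t = tag.get('href')
    if t is not None:  # skip anchors without an href attribute
        text_file.write(t + "\n")
text_file.close()
Equivalently, soup.find_all('a', href=True) returns only the anchors that actually carry an href.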
