Selenium finds only a fraction of href links - python-3.x

I am trying to get all of the product URLs from this webpage, but I have managed to get only a fraction of them.
My first attempt was to scrape the webpage with BeautifulSoup, but then I realized Selenium would be better, as I needed to click the "Show more" button several times. I also added code to scroll down the page, as I thought that was the problem, but the result didn't change.
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def getListingLinks(link):
    # Open the driver
    driver = webdriver.Chrome(executable_path="")
    driver.maximize_window()
    driver.get(link)
    time.sleep(3)

    # scroll down: repeated to ensure it reaches the bottom and all items are loaded
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)

    listing_links = []
    while True:
        try:
            driver.execute_script("return arguments[0].scrollIntoView(true);", WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="main-content"]/div[2]/div[2]/div[4]/button'))))
            driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#main-content > div:nth-child(2) > div.main-column > div.btn-wrapper.center > button"))))
            print("Button clicked")
            links = driver.find_elements_by_class_name('fop-contentWrapper')
            for link in links:
                algo = link.find_element_by_css_selector('.fop-contentWrapper a').get_attribute('href')
                print(algo)
                listing_links.append(str(algo))
        except:
            print("No more Buttons")
            break
    driver.close()
    return listing_links

fresh_food = getListingLinks("https://www.ocado.com/browse/fresh-20002")
print(len(fresh_food))  ## Output: 228
As you can see, I get 228 URLs, while I would like to get 5605 links, which is the actual number of products on the page according to Ocado. I believe I have a problem with the order of my code, but I can't find the proper order. I would sincerely appreciate any help.
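One way to test that hunch about ordering, sketched below with the same locators as the code above (the 10-second timeout and 2-second pause are assumptions, not verified values): keep clicking "Show more" until it stops appearing, and only collect the links once, after the loop, so the list is built from the fully expanded page.

# Sketch: exhaust the "Show more" button first, then collect links once.
from selenium.common.exceptions import TimeoutException

while True:
    try:
        button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#main-content > div:nth-child(2) > div.main-column > div.btn-wrapper.center > button")))
        driver.execute_script("arguments[0].click();", button)
        time.sleep(2)  # assumed pause for the next batch of products to render
    except TimeoutException:
        break  # no "Show more" button left
listing_links = [a.get_attribute('href')
                 for a in driver.find_elements_by_css_selector('.fop-contentWrapper a')]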

Related

Selenium problem: no error shown, but only a few items are downloaded

I need help with my code. I am trying to scrape a dynamic web page called easy.cl and I only get 4 items, and sometimes none (and only the titles; I can't get the price because nothing is shown). I need a pointer to where my error is, because Selenium doesn't show me any error in my output (Sublime Text 3). Also, easy.cl is dynamic: it has a button to load more products. Finally, I'm thinking about a scrolling solution, but I can't tell. What would you do in my position? Any tip to help find a solution?
Thanks in advance.
import random
from time import sleep
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
#baseUrl = 'https://www.easy.cl'
driver.get('https://www.easy.cl/tienda/categoria/ceramicas?cur_page=1&cur_view=grid&bc_links=pisos-y-muros&bc_names=Pisos')
#boton = driver.find_element_by_xpath('//button[@class="primary_plp load-more-products"]')

inicio = []
for i in range(5):
    try:
        boton = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//button[@class="primary_plp load-more-products"]')))
        boton.click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//a[@class="product_image"]')))
    except:
        break

# Wait for the products to load...
links_productos = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//a[@class="product_image"]')))

# Collect the links to the product detail pages
links_pagina = []
for tag_a in links_productos:
    links_pagina.append(tag_a.get_attribute("href"))

for link in links_pagina:
    try:
        driver.get(link)
        titulo = driver.find_element(By.XPATH, '//h1[@class="product-details__title"]').text
        #if driver.find_element(By.XPATH, '//span[@class="priceNumber priceNumberAlignRight"]'):
        #    precio = driver.find_element_by_xpath('//span[@class="priceNumber priceNumberAlignRight"]').text
        #else:
        #    precio = "Sin precio M2"
        print(titulo)
        #print(precio)
    except:
        break
The "load more products" button appears at the bottom of the page, outside the visible screen, so possibly after the element is present (loaded) you need to scroll to it before clicking it:
from selenium.webdriver.common.action_chains import ActionChains

boton = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//button[@class="primary_plp load-more-products"]')))
actions = ActionChains(driver)
actions.move_to_element(boton).perform()
boton.click()
But the main issue here, as I see it, is:
links_productos = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//a[@class="product_image"]')))
You assume that this will give you all the elements on the page matching that locator, but this expected condition actually only waits for at least one matching element; as soon as it finds one, it returns the list of matching web elements present at that moment.
I see no expected condition in Python like Java's ExpectedConditions.numberOfElementsToBe or ExpectedConditions.numberOfElementsToBeMoreThan, which actually wait for the desired number of elements located by the passed locator. So what I can advise here is to add a delay of 1-2 seconds after boton.click() before reading links_productos. This should resolve your problem.
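If you would rather not rely on a fixed sleep, WebDriverWait.until() also accepts any callable that takes the driver, so you can approximate Java's numberOfElementsToBeMoreThan yourself. A minimal sketch, assuming the same locator and a 10-second timeout, to be used inside the click loop in place of the fixed delay:

# Sketch: wait until more product links are present than before the click.
previous_count = len(driver.find_elements(By.XPATH, '//a[@class="product_image"]'))
boton.click()
WebDriverWait(driver, 10).until(
    lambda d: len(d.find_elements(By.XPATH, '//a[@class="product_image"]')) > previous_count)
links_productos = driver.find_elements(By.XPATH, '//a[@class="product_image"]')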

TripAdvisor scraper using Selenium: unable to get all images having the same class name using find_elements_by_class_name

The problem I'm facing is that I'm using
driver.find_elements_by_class_name("a_classname_common_to_all_images_in_tripadvisor_hotels")
However, each time I run the script I get fewer, and different, results.
For instance, sometimes it scrapes the first 5 out of 30 images on the page, sometimes 4/30, and so on.
I'm scraping images from this link:
https://www.tripadvisor.in/Hotels-g304551-New_Delhi_National_Capital_Territory_of_Delhi-Hotels.html
images = driver.find_elements_by_class_name("_1a4WY7aS")
I am able to find all the hotel names using the class_name method; with images, however, the result varies.
Any help is appreciated, thanks :)
From How can I scroll a web page using selenium webdriver in python?
SCROLL_PAUSE_TIME = 0.5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
Then induce a WebDriverWait to load all your elements:
images = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_1a4WY7aS")))
Import
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
When it scrapes only 5 of the images present, it means only 5 images were loaded. You should do two things to get every image on the page.
Scroll down to the end of the page. You can do this by selecting the body element and then sending page-down keys:
from selenium.webdriver.common.keys import Keys
import time

for _ in range(10):
    driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
After scrolling, wait for the elements to be present
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, "img._1a4WY7aS")))

Python: Webdriver Scrolling Down Page Stopped Working

I have been using the function below to scroll down a page for over 2 years now, and on 31 December 2019 it just stopped working: no errors, it just stopped scrolling down.
I'm using Chrome version 79.0.3945.88 and ChromeDriver 2.36.540470. Any ideas or help are greatly appreciated.
def scrollToEndOfPage(self, driver):
    try:
        time.sleep(1)
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight;")
        while True:
            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(randint(2, 4))
            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight;")
            if new_height == last_height:
                break
            last_height = new_height
    except Exception as e:
        print(str(e))
Update 1:
I've run document.body.scrollHeight; on the website in question (an internal site) and it displays the page height, but when I try to execute driver.execute_script("return document.body.scrollHeight;") from a script, it hangs on this request, doesn't return anything, and there are no errors.
You can try waiting for the page to be fully loaded before scrolling.
For that you can use the code below to wait for the JavaScript to finish:
from selenium.webdriver.support.ui import WebDriverWait
# ...
WebDriverWait(browser, 30).until(lambda d: d.execute_script(
    'return (document.readyState == "complete" || document.readyState == "interactive")'))
Or use WebDriverWait and wait for the visibility/clickability of specific elements, like below:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_all_elements_located((By.XPATH, "some elements on locator")))
# or
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "some clickable element locator")))

Unable to click HREF under headers (invisible elements)

I want to click all the href links under the main headers and navigate to those pages to scrape them. For speed, I want to click the hrefs without having to click the headers first. My question is: is there a way to click these links even though they are not visible, like the page on the right? It does not seem to be working for me. It gives me:
Traceback (most recent call last):
  File "C:/Users/Bain3/PycharmProjects/untitled4/Centrebet2.py", line 58, in <module>
    EC.element_to_be_clickable((By.XPATH, '(//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a)[%s]' % str(index + 1)))).click()
  File "C:\Users\Bain3\Anaconda3\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
I have replaced
EC.element_to_be_clickable((By.XPATH, '(//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a)[%s]' % str(index + 1)))).click()
with
driver.find_element_by_xpath('(//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a)[%s]' % str(index + 1)).click()
This, however, does not seem to remedy it, as it only clicks visible elements.
My code below is:
from random import shuffle
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium import webdriver as web
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from random import randint
from time import sleep
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import requests
import time
from selenium import webdriver

success = False
while not success:
    try:
        driver = webdriver.Chrome()
        driver.set_window_size(1024, 600)
        driver.maximize_window()
        driver.get('http://centrebet.com/')
        success = True
    except:
        driver.quit()
        sleep(5)

sports = driver.find_element_by_id("accordionMenu1_ulSports")
if sports.get_attribute("style") == "display: none;":
    driver.find_element_by_xpath('//ul[@id="menu_acc"]/li[3]/a').click()

driver.find_element_by_xpath(".//*[@data-type ='sports_l1'][contains(text(), 'Soccer')]").click()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

options = driver.find_elements_by_xpath('//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a')
# Get list of integers [1, 2, ... n]
indexes = [index for index in range(len(options))]
# Shuffle them
shuffle(indexes)
for index in indexes:
    # Click on random option
    wait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '(//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a)[%s]' % str(index + 1)))).click()
I have also tried:
driver.execute_script('document.getElementByxpath("//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a").style.visibility = "visible";')
to remedy this, though this simply gives an error. Any ideas on how to resolve this issue of invisible elements?
driver.execute_script('document.getElementByxpath("//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a").style.visibility = "visible";')
gives you an error because that is not the correct way to use XPath in JavaScript. You can find the correct way here.
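For illustration, a hedged sketch of that pattern (document.evaluate run through execute_script), using the XPath from the question; whether unhiding the node is enough for the click to then work is an assumption:

# Sketch: resolve the XPath in JavaScript with document.evaluate, then unhide the node.
driver.execute_script("""
    var node = document.evaluate(
        '//*[@id="accordionMenu1_ulSports"]/li/ul/li/ul/li/a',
        document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
    if (node) { node.style.visibility = 'visible'; }
""")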
To scrape the required data you can use the code below:
import requests
import time
from selenium import webdriver

url = "http://centrebet.com/"
success = False
while not success:
    try:
        driver = webdriver.Chrome()
        driver.set_window_size(1024, 600)
        driver.maximize_window()
        driver.get(url)
        success = True
    except:
        driver.quit()
        time.sleep(5)

sports = driver.find_element_by_id("accordionMenu1_ulSports")
links = [url + link.get_attribute("onclick").replace("menulink('", "").replace("')", "") for link in sports.find_elements_by_xpath('.//a[starts-with(@onclick, "menulink")]')]
for link in links:
    print(requests.get(link).text)
Instead of clicking on each link, you can request the content of each page with an HTTP GET.
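If you then want specific fields rather than the raw HTML, each response can be parsed before printing; a minimal sketch with BeautifulSoup (the page <title> is just an example field, not a selector taken from the site):

# Sketch: parse each fetched page instead of printing raw HTML.
# Requires beautifulsoup4 (pip install beautifulsoup4).
from bs4 import BeautifulSoup

for link in links:
    soup = BeautifulSoup(requests.get(link).text, "html.parser")
    print(soup.title.get_text(strip=True) if soup.title else link)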
You can even try using the JavaScript executor.
Use the code below to set the style attribute to display: block;
driver.execute_script("arguments[0].style.display = 'block'", driver.find_element_by_xpath("//*[@id='accordionMenu1_ulSports']/li/ul/li/ul"))
Note: Make sure you are using the correct XPath. Your <ul> element is the hidden one, not the <a>, so take the XPath of that <ul> tag only and try.

How to Detect Popup and Close it Using Selenium Python (Google Chrome)

Hey, I am scraping Shopify review shop URLs, but while I am navigating through the search results, a pop-up appears and I have no idea how to detect it and close it.
Here's my code
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
url = 'https://apps.shopify.com/sales-pop'
driver.get(url)

#Loop and Navigate Through the Search Results
page_number = 2
while True:
    try:
        link = driver.find_element_by_link_text(str(page_number))
    except NoSuchElementException:
        break
    if page_number > 8:
        timeout = 20
        try:
            WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@title="close"]')))
        except TimeoutException:
            print("Timed out waiting for page to load")
            driver.quit()
        #Switch to the Popup
        driver.switch_to_alert()
        driver.find_element_by_xpath('//div[@title="close"]').click()
        driver.implicitly_wait(5)
        link.click()
        print(driver.current_url)
        page_number += 1
    else:
        driver.implicitly_wait(5)
        link.click()
        print(driver.current_url)
        page_number += 1

#Scraping Rating
stars = driver.find_elements_by_xpath('//figure[@class="resourcesreviews-reviews-star"]')
starstars = []
for star in stars:
    starstar = star.find_element_by_xpath('.//div/span')
    starstars.append(starstar.get_attribute('class'))

#Scraping URL
urls = driver.find_elements_by_xpath('//figcaption[@class="clearfix"]')
titles = []
for url in urls:
    title = url.find_element_by_xpath('.//strong/a')
    titles.append(title.get_attribute('href'))

#Print Titles and Rating Side by Side
for titless, starstarss in zip(titles, starstars):
    print(titless + " " + starstarss)
You can just use WebDriverWait and window_handles. Specifically, you can probably replace your #Switch to the Popup section with something like:
WebDriverWait(driver, 5).until(lambda d: len(d.window_handles) == 2)
driver.switch_to_window(driver.window_handles[1])
driver.close()
driver.switch_to_window(driver.window_handles[0])
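The same idea is also available as an expected condition in recent Selenium Python versions; a minimal sketch with EC.number_of_windows_to_be, assuming the pop-up really opens a second browser window rather than an in-page modal:

# Sketch: wait for a second window, close it, then return to the original window.
from selenium.webdriver.support import expected_conditions as EC

main_window = driver.current_window_handle
WebDriverWait(driver, 5).until(EC.number_of_windows_to_be(2))
for handle in driver.window_handles:
    if handle != main_window:
        driver.switch_to.window(handle)
        driver.close()
driver.switch_to.window(main_window)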
