Selenium fails to scroll down - python-3.x

I am using Selenium to scrape data from here. The website uses an animation to reveal the sections after you scroll down. I am trying to scroll down to the footer and wait for the animation so I can get the data from the page.
I am not sure that is the only approach that would get me the data, though, because I can see the animation only adds the class aos-animate to the main class, and if that class is not on the HTML element, the text is not returned.
In the get_service_data function I am trying to scroll down to the end of the page. I also tried to scroll down before starting the loop.
I tried:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = driver.find_element(By.CLASS_NAME, 'html')
html.send_keys(Keys.END)
html.send_keys(Keys.PAGE_DOWN)
copyright = driver.find_element(By.CLASS_NAME, 'copyright')
driver.execute_script("arguments[0].scrollIntoView();", copyright)
Here is my full script:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
language = "en" # to take this from the user
main_link = f"https://www.atlp.ae/{language}"
driver_path = os.path.join(os.getcwd(), "chromedriver")
# options = webdriver.ChromeOptions()
# options.headless = True
driver = webdriver.Chrome(driver_path) # options=options
driver.maximize_window()
def get_services_links():
    links = []
    driver.get(main_link)
    services_header_xpath = '//*[@id="fixed-header"]/div/div[2]/div/nav/ul/li[5]/button'
    driver.find_element(By.XPATH, services_header_xpath).click()
    services_menu_xpath = '//*[@id="serviceInfotitle"]/nav/ul'
    services_menu = driver.find_element(By.XPATH, services_menu_xpath)
    options = services_menu.find_elements(By.TAG_NAME, "li")
    for option in options:
        a_tag = option.find_element(By.TAG_NAME, "a")
        links.append(a_tag.get_attribute("href"))
    return links[:-1] if len(links) > 0 else []

def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # row serviceSubsetRow ng-star-inserted
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    for service in service_sections:
        textual_div = service.find_element(By.CLASS_NAME, 'textCol')
        something = textual_div.find_element(By.CLASS_NAME, 'serviceSubsetTitle')
        print("Text: ", something.text)

if __name__ == '__main__':
    # try:
    links = get_services_links()
    for link in links:
        get_service_data(link)
        break
    driver.quit()

What you need is this: something.get_attribute('innerText'), because, perhaps due to the added animation, the regular .text is not returning anything.
Also, I have removed a few lines, as I thought they were not needed (at least for this exercise), and I have added a loop directly over serviceSubsetTitle:
def get_service_data(link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    service_name_xpath = '//*[@id="main-scrollbar"]/div[1]/main/sc-placeholder/app-intro-section/section/div/div[1]/div[1]/div/p'
    wait.until(EC.visibility_of_element_located((By.XPATH, service_name_xpath)))
    service_name = driver.find_element(By.XPATH, service_name_xpath).text
    print("Service Name: ", service_name)
    # ---- removed these lines --------
    # row serviceSubsetRow ng-star-inserted
    # wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'ServiceSubsetWrapper')))
    # services_wrapper = driver.find_element(By.CLASS_NAME, 'ServiceSubsetWrapper')
    #
    # container = services_wrapper.find_element(By.CLASS_NAME, 'container')
    # service_sections = container.find_elements(By.CLASS_NAME, 'serviceSubsetRow')
    # ----- End of lines removal ----------
    # Clicking out the cookie acceptance button
    try:
        driver.find_element(By.XPATH, "//*[@class='cc-btn cc-allow']").click()
    except:
        print("nothing there")
    # --- removed these lines
    # for service in service_sections:
    #     textual_div = service.find_element(By.CLASS_NAME, 'textCol')
    #     time.sleep(3)
    # --- end of lines removal ---------
    # These are my lines here from below:
    somethings = driver.find_elements(By.XPATH, "//*[contains(@class, 'serviceSubsetTitle')]")
    print(len(somethings))
    for something in somethings:
        # time.sleep(2)
        title_txt = something.get_attribute('innerText')
        print(title_txt)
Here is the output:
Service Name: Sea Services
5
Vessel Management and Marine Services
Passenger Handling and Cargo Operations
Issuance of Certificates and Approvals in Ports
Ports Licensing
Property Leasing Services - Ports
Process finished with exit code 0

This is one way of scrolling that page down:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.atlp.ae/en'
browser.get(url)
browser.execute_script('window.scrollBy(0, 100);')
cookie_b = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='deny cookies']")))
cookie_b.click()
body = WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "main-scrollbar")))
body.click()
body.send_keys(Keys.END)
print('scrolled down')
The setup is Chrome/chromedriver on Linux, but it can be adapted to your system; just observe the imports and the code after the browser/driver is defined. Selenium docs: https://www.selenium.dev/documentation/

Related

How to have better structure in my webscraping output

Here's my script:
from selenium import webdriver
import pandas as pd
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

PATH = "chromedriver.exe"
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(executable_path=PATH, options=options)

url = 'https://www.google.com/maps/place/Lazeo+Paris+11e/@48.8532051,2.3859464,17z/data=!3m1!4b1!4m5!3m4!1s0x47e6738875606957:0xdfb3822564e33888!8m2!3d48.8532051!4d2.3881404'
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span/span[2]/span[1]/button').click()

# to make sure content is fully loaded we can use time.sleep() after navigating to each page
import time
time.sleep(3)

SCROLL_PAUSE_TIME = 5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

number = 0
while True:
    number = number + 1
    # Scroll down to bottom
    ele = driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
    driver.execute_script('arguments[0].scrollBy(0, 5000);', ele)
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    print(f'last height: {last_height}')
    ele = driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')
    new_height = driver.execute_script("return arguments[0].scrollHeight", ele)
    print(f'new height: {new_height}')
    if number == 2:
        break
    # if new_height == last_height:
    #     break
    print('cont')
    last_height = new_height

item = driver.find_elements_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[10]')
time.sleep(3)

name_list = []
stars_list = []
review_list = []
duration_list = []

for i in item:
    button = i.find_elements_by_tag_name('button')
    for m in button:
        if m.text == "More":
            m.click()
    time.sleep(5)
    name = i.find_elements_by_class_name("d4r55")
    stars = i.find_elements_by_class_name("kvMYJc")
    review = i.find_elements_by_xpath("//span[@class='wiI7pd']")
    duration = i.find_elements_by_class_name("rsqaWe")
    for j, k, l, p in zip(name, stars, review, duration):
        name_list.append(j.text)
        stars_list.append(p.text)
        review_list.append(k.text.strip())
        duration_list.append(l.text)

review = pd.DataFrame(
    {'name': name_list,
     'rating': stars_list,
     'review': review_list,
     'duration': duration_list})

review.to_csv('google_review.csv', index=False, encoding='utf-8-sig')
print(review)

driver.quit()
But sometimes the format of the review is not convenient, like this one on Google Maps (sorry, it's in French). How can I avoid that? It counts as several rows, and I cannot work with a structure like that. I hope the fact that it's in French isn't a problem for understanding the structure of the sentences.
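If the problem is that a single review contains line breaks (so it shows up as several rows when the CSV is opened), one option is to collapse the whitespace inside each review before building the DataFrame. A minimal sketch, assuming review_list already holds the raw texts collected above:

# Hypothetical cleanup step: collapse newlines and extra whitespace inside each review
# so one review stays on one CSV row.
cleaned_reviews = [" ".join(r.split()) for r in review_list]

review = pd.DataFrame(
    {'name': name_list,
     'rating': stars_list,
     'review': cleaned_reviews,
     'duration': duration_list})
review.to_csv('google_review.csv', index=False, encoding='utf-8-sig')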

How to click the next span value which has same class name

import urllib3
import certifi
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
import time
import ssl
http = urllib3.PoolManager(ca_certs=certifi.where())
chrome_options = Options()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
URL= "https://physicians.wustl.edu/"
driver.get(URL)
time.sleep(5)
driver.find_element_by_link_text("Find a Doctor").click()
find_doc = driver.current_url
print(find_doc)
driver.get(find_doc)
# content = driver.page_source
# print(content)
response = http.request('GET', find_doc)
url_text = response.data #text
time.sleep(10)
count = len(driver.find_elements_by_xpath("//span[@class='entry-title-link']"))
print(count)
s = driver.find_element_by_css_selector("span[class='entry-title-link']") #firstpage click
s.click()
urls = []
provider = []
print(driver.current_url)
urls.append(driver.current_url)
name = driver.find_element_by_css_selector("h1[class='washu-ppi-name entry-title']").text
print(name)
provider.append(name)
specialization = driver.find_element_by_css_selector("ul[class='wuphys-specialties']").text
print(specialization)
location= driver.find_element_by_css_selector("a[class='wuphys-addr name']").text
print(location)
time.sleep(5)
driver.find_element_by_css_selector("a[href='https://physicians.wustl.edu/find-a-doctor/']").click()
time.sleep(10)
The spans all have the same class name but sit in different divs, and I need to loop over them. On the page there is a list of doctors' names; after clicking one I get that doctor's details, and then I need to move on to the next doctor, which has the same class name.
I think you are looking for something of this kind (looping through all the doctor links and getting info from each). Here I have written a basic action which you can scale to add more data related to each doctor.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time

driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
driver.maximize_window()
driver.get("https://physicians.wustl.edu/")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.LINK_TEXT, "Find a Doctor"))).click()
print(driver.current_url)

doc_cnt = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[@class='entry-title-link']")))
print(len(doc_cnt))

doc_list = []  # to append all the doctor urls into the list for further processing, if required

for doc in doc_cnt:
    ActionChains(driver).key_down(Keys.CONTROL).click(doc).key_up(Keys.CONTROL).perform()
    driver.switch_to.window(driver.window_handles[1])
    doc_list.append(driver.current_url)
    # ... you could include any code of yours related to each doctor here...
    # After this one the tab terminates and a new doctor link would open
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    time.sleep(1)

print(doc_list)

Selenium Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'

I'm currently working on a Python bot that grabs the three-way odds (home, draw, away) of the same soccer match (same clubs) from two different bookie websites, compares the odds, and keeps the larger ones; the larger odds are then passed into an arbitrage calculator function named oddchecker.py to get the total implied probability. For example, for Osasuna vs Barcelona, bookie site one has the three odds 4.60, 3.60, 1.90 and bookie two has 4.70, 3.60, 1.87, so my bot will grab 4.70, 3.60, 1.90.
I'm using Python and Selenium for this project.
The bot is made up of four files, namely ab2.py, ab.py, main.py, and oddchecker.py.
I have imported ab2.py, ab.py, and oddchecker.py into main.py; main.py is the entry point that runs the entire bot.
ab2.py queries one bookie site and ab.py queries the other bookie site.
My code is able to grab the odds from the two sites and perform the comparison; however, it only prints the home-odds list (I'm storing the odds in lists for comparison purposes) for the two sites when I run main.py, then the code breaks and is unable to go back and grab the odds for draw and away.
Below is the error I'm getting. From my research it seems I am making too many requests to the target server, though I'm not sure whether it's localhost port overuse or the target bookie servers. The suggested resolution is to implement sleep waits using time.sleep(10); how can I implement that in my code?
HTTPConnectionPool(host='localhost', port=58202): Max retries exceeded with url: /session/1eaec216e17f72e3a4f30235093e0021/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001F16BC30100>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
None
Below is my code
ab2.py
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for implicit and explict waits
from selenium.webdriver.chrome.options import Options # for suppressing the browser
from selenium.webdriver.support import expected_conditions as EC

option = webdriver.ChromeOptions()
option.add_argument('headless')
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH, options=option)

import time

def betika_func():
    web_betika = 'https://www.betika.com/s/soccer/spain-laliga'
    driver.get(web_betika)
    driver.implicitly_wait(3)
    try:
        btk_team1 = driver.find_element(By.XPATH, '//div[@class="prebet-match"]//div[@class="prebet-match__odd-market__container"]//div[@class="prebet-match__teams"]//span[@class="prebet-match__teams__home"]').get_attribute('innerHTML')
        btk_team2 = driver.find_element(By.XPATH, '//div[@class="prebet-match"]//div[@class="prebet-match__odd-market__container"]//div[@class="prebet-match__teams"]/span[2]').get_attribute('innerHTML')
        # odds = driver.find_element(By.XPATH, '//div[@class="prebet-match"]//div[@class="prebet-match__odd-market__container"]//div[@class="prebet-match__odds__container"]//[@class="prebet-match__odds"]//button[@class="prebet-match__odd"]//span[@class="prebet-match__odd__odd-value"]')
        odds = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'prebet-match__odd__odd-value'))
        )
        hw = odds[0].get_attribute('innerHTML')
        draw = odds[1].get_attribute('innerHTML')
        aw = odds[2].get_attribute('innerHTML')
        print(btk_team1)
        print(btk_team2)
        betika_home_odd = print('Betika home odd: '+hw)
        betika_draw_odd = print('Betika draw odd: '+draw)
        betika_away_odd = print('Betika away odd: '+aw)
        betika_dict = {
            'btk_home': hw,
            'btk_draw': draw,
            'btk_away': aw
        }
        return betika_dict
    finally:
        driver.quit()

# Testing the function
# response = betika_func()
# print(response)
ab.py code
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for implicit and explict waits
from selenium.webdriver.chrome.options import Options # for suppressing the browser
from selenium.webdriver.support import expected_conditions as EC

option = webdriver.ChromeOptions()
option.add_argument('headless')
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH, options=option)

def odi_func():
    web_odi = 'https://odibets.com/'
    driver.get(web_odi)
    driver.implicitly_wait(3)
    try:
        # Login form =================================================
        # btn = driver.find_element(By.CSS_SELECTOR, 'button[id="mobile-web-login"]')
        # btn.click()
        # print('Login form opened')
        # driver.implicitly_wait(2)
        # phone_num = driver.find_element(By.XPATH, '//*[@placeholder="07xxxxxxxx or 01xxxxxxxx"]')
        # password = driver.find_element(By.XPATH, '//*[@placeholder="xxxxxxxx"]')
        # login = driver.find_element(By.XPATH, '//*[@id="modal"]/div/div/div/form/div[4]/button')
        # phone_num.send_keys()
        # password.send_keys()
        # login.click()
        # ==============================
        # elem = driver.find_element_by_xpath("(//a[@href='/league'] [.//span[contains(.,'League')]])[1]")
        # driver.execute_script("arguments[0].click()", elem)
        # Print the first soccer title
        # item = driver.find_element(By.XPATH, '//div[@class="l-events-games"]//div[@class="l-games-title"]//div[@class="l"]').get_attribute("innerHTML")
        # print('The first sport league or title is: '+item)
        # Get all clubs in the page.
        teams = driver.find_elements(By.XPATH, '//div[2][@class="l-events-games"]//div[@class="l-events-games-matches"]//div[@class="l-games-event"]//a[@class="inf"]//div[@class="t-i"]')
        # Get the first two club names in the page
        odi_team1 = teams[0].get_attribute("innerHTML")
        odi_team2 = teams[1].get_attribute("innerHTML")
        print(odi_team1)
        print(odi_team2)
        # Get three way odds
        odds = driver.find_elements(By.XPATH, '//div[2][@class="l-events-games"]//div[@class="l-events-games-matches"]//div[@class="l-games-event"]//div[@class="mar-cont"]//span[@class="b"]')
        odi_hw = odds[0].get_attribute("innerHTML")
        odi_draw = odds[1].get_attribute("innerHTML")
        odi_aw = odds[2].get_attribute("innerHTML")
        # print('Home odd for: '+team1+' is: '+odi_hw+' Draw odd is: '+odi_draw+ ' Away team is: '+team2+' Away win odd is: '+odi_aw)
        odi_home_odd = print('Odi home odd is: '+odi_hw)
        odi_draw_odd = print('Odi draw odd is: '+odi_draw)
        odi_away_odd = print('Odi away odd is: '+odi_aw)
        odi_dict = {
            'odi_home': odi_hw,
            'odi_draw': odi_draw,
            'odi_away': odi_aw
        }
        return odi_dict
    finally:
        driver.quit()

# Testing the function
# response = odi_func()
# print(response)
oddchecker.py code
def func(odd1, odd2, odd3):
    implied1 = (1/odd1) * 100
    implied2 = (1/odd2) * 100
    implied3 = (1/odd3) * 100
    roundedodd1 = round(implied1, 2)
    roundedodd2 = round(implied2, 2)
    roundedodd3 = round(implied3, 2)
    totalimpliedprob = roundedodd1 + roundedodd2 + roundedodd3
    print('Implied prob for: '+str(odd1)+' is '+str(roundedodd1))
    print('Implied prob for: '+str(odd2)+' is '+str(roundedodd2))
    print('Implied prob for: '+str(odd3)+' is '+str(roundedodd3))
    totalimpliedprob = round(totalimpliedprob, 2)
    # print('Total implied odd market probability is: '+str(totalimpliedprob))
    return totalimpliedprob

# Testing the arbitrage function
# response = func(large_home_odd, large_draw_odd, large_away_odd)
# print(response)
Main.py code
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for implicit and explict waits
from selenium.webdriver.chrome.options import Options # for suppressing the browser
from selenium.webdriver.support import expected_conditions as EC

option = webdriver.ChromeOptions()
option.add_argument('headless')
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH, options=option)

import time
import ab2 as betika
import ab as odi
import oddchecker as od

def main_func():
    try:
        home_list = []
        draw_list = []
        away_list = []

        time.sleep(20)
        odi_hm = odi.odi_func().get('odi_home')
        btk_hm = betika.betika_func().get('btk_home')
        home_list.append(odi_hm)
        home_list.append(btk_hm)
        print(home_list)

        time.sleep(20)
        odi_dr = odi.odi_func().get('odi_draw')
        btk_dr = betika.betika_func().get('btk_draw')
        home_list.append(odi_dr)
        draw_list.append(btk_dr)
        print(draw_list)

        time.sleep(20)
        odi_aw = odi.odi_func().get('odi_away')
        btk_aw = betika.betika_func().get('btk_away')
        home_list.append(odi_aw)
        draw_list.append(btk_aw)
        print(away_list)

        # Get the large home odd
        large_home_odd = max(home_list)
        large_draw_odd = max(draw_list)
        large_away_odd = max(away_list)
        # Pass the three maximum odds to the arbitrage calculator, to check if the total
        # implied probability is equal to or less than 100 percent
        final_response = od.func(large_home_odd, large_draw_odd, large_away_odd)
        print(final_response)
        print(str(large_home_odd))
    except Exception as e:
        print(e)

response = main_func()
print(response)
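One thing worth checking, given that traceback: ab.py and ab2.py each create a single module-level driver and call driver.quit() in the finally block, so the second call to odi_func() or betika_func() from main.py talks to a chromedriver session that has already been shut down, which can produce exactly this "connection refused" error on localhost. A minimal sketch of creating the driver inside the function instead (an assumption about the cause, not a verified fix; the scraping logic is trimmed down to the odds lookup):

# Sketch: build a fresh driver per call so quit() in the finally block
# does not break later calls to this function.
def betika_func():
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    driver = webdriver.Chrome(PATH, options=option)
    try:
        driver.get('https://www.betika.com/s/soccer/spain-laliga')
        odds = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'prebet-match__odd__odd-value'))
        )
        return {
            'btk_home': odds[0].get_attribute('innerHTML'),
            'btk_draw': odds[1].get_attribute('innerHTML'),
            'btk_away': odds[2].get_attribute('innerHTML'),
        }
    finally:
        driver.quit()

The time.sleep(...) waits can then stay in main.py between the per-bookie calls if rate limiting is also a concern.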

How to open and access multiple (nearly 50) tabs in Chrome using ChromeDriver and Selenium through Python

I'm trying to gather some information from certain webpages using Selenium and Python. I have working code for a single tab, but now I have a situation where I need to open 50 tabs in Chrome at once and process each page's data.
1) Open 50 tabs at once - I already have the code for this.
2) Switch control between the tabs, process the information from each page, close the tab, move to the next tab, and do the same there.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import datetime
final_results=[]
positions=[]
saerched_url=[]
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
#options.add_argument('--headless')
options.add_argument("—-incognito")
browser = webdriver.Chrome(executable_path='/users/user_123/downloads/chrome_driver/chromedriver', chrome_options=options)
browser.implicitly_wait(20)
#def db_connect():
try:
    DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5432'"
    TABLE_NAME = 'staging.search_url'
    conn = psycopg2.connect(DSN)
    print("Database connected...")
    cur = conn.cursor()
    cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
    print('database connection failed')
    quit()

def get_products(url):
    browser.get(url)
    names = browser.find_elements_by_xpath("//span[@class='pymv4e']")
    upd_product_name_list = list(filter(None, names))
    product_name = [x.text for x in upd_product_name_list]
    product = [x for x in product_name if len(x.strip()) > 2]
    upd_product_name_list.clear()
    product_name.clear()
    return product

links = ['https://www.google.com/search?q=Vitamin+D',
         'https://www.google.com/search?q=Vitamin+D3',
         'https://www.google.com/search?q=Vitamin+D+K2',
         'https://www.google.com/search?q=D3',
         'https://www.google.com/search?q=Vitamin+D+1000']

for link in links:
    # optional: we can wait for the new tab to open by comparing window handles count before & after
    tabs_count_before = len(browser.window_handles)
    # open a link
    control_string = "window.open('{0}')".format(link)
    browser.execute_script(control_string)
    # optional: wait for windows count to increment to ensure new tab is opened
    WebDriverWait(browser, 1).until(lambda browser: tabs_count_before != len(browser.window_handles))
    # get list of currently opened tabs
    tabs_list = browser.window_handles
    print(tabs_list)
    # switch control to newly opened tab (the last one in the list)
    last_tab_opened = tabs_list[len(tabs_list)-1]
    browser.switch_to_window(last_tab_opened)
    # now you can process data on the newly opened tab
    print(browser.title)

    for lists in tabs_list:
        last_tab_opened = tabs_list[len(tabs_list)-1]
        browser.switch_to_window(last_tab_opened)
        filtered = []
        filtered.clear()
        filtered = get_products(link)
        saerched_url.clear()
        if not filtered:
            new_url = link+'+kaufen'
            get_products(link)
            print('Modified URL :'+link)
        if filtered:
            print(filtered)
            positions.clear()
            for x in range(1, len(filtered)+1):
                positions.append(str(x))
                saerched_url.append(link)
            gobal_position = 0
            gobal_position = len(positions)
            print('global postion first: '+str(gobal_position))
            print("\n")
            company_name_list = browser.find_elements_by_xpath("//div[@class='LbUacb']")
            company = []
            company.clear()
            company = [x.text for x in company_name_list]
            print('Company Name:')
            print(company, '\n')
            price_list = browser.find_elements_by_xpath("//div[@class='e10twf T4OwTb']")
            price = []
            price.clear()
            price = [x.text for x in price_list]
            print('Price:')
            print(price)
            print("\n")
            urls = []
            urls.clear()
            find_href = browser.find_elements_by_xpath("//a[@class='plantl pla-unit-single-clickable-target clickable-card']")
            for my_href in find_href:
                url_list = my_href.get_attribute("href")
                urls.append(url_list)
            print('Final Result: ')
            result = zip(positions, filtered, urls, company, price, saerched_url)
            final_results.clear()
            final_results.append(tuple(result))
            print(final_results)
            print("\n")
            print('global postion end :'+str(gobal_position))
            i = 0
            try:
                for d in final_results:
                    while i <= gobal_position:
                        print(d[i])
                        cur.execute("""INSERT into staging.pla_crawler_results(position, product_name, url, company, price, searched_url) VALUES (%s, %s, %s, %s, %s, %s)""", d[i])
                        print('Inserted succesfully')
                        conn.commit()
                        i = i+1
            except (Exception, psycopg2.Error) as error:
                print(error)
                pass
        browser.close()
Ideally you shouldn't attempt to open 50 tabs at once, as:
Handling 50 concurrent tabs through Selenium will require complicated logic that is hard to maintain.
Additionally, you may run into CPU and memory usage issues:
Chrome maintains many processes.
Whereas at times Firefox uses too much RAM.
Solution
If you have a list of the URLs as follows:
['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
You can iterate over the list and open the URLs one by one for scraping, each in a fresh browser session (a variant that keeps a single browser and opens each URL in a new tab is sketched after the console output below):
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
links = ['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
for link in links:
    driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
    driver.get(link)
    print(driver.title)
    print("Perform webscraping here")
    driver.quit()
print("End of program")
Console Output:
Downloads
Perform webscraping here
The Selenium Browser Automation Project :: Documentation for Selenium
Perform webscraping here
End of program
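If you'd rather keep a single browser session and walk through the URLs in tabs, a minimal sketch of that variant (same links list as above; the actual scraping is left as a placeholder):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

links = ['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']

driver = webdriver.Chrome(executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get(links[0])  # use the initial window for the first URL

for link in links[1:]:
    handles_before = len(driver.window_handles)
    driver.execute_script("window.open(arguments[0]);", link)  # open the next URL in a new tab
    WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > handles_before)

for handle in driver.window_handles:
    driver.switch_to.window(handle)
    print(driver.title)
    # Perform webscraping here

driver.quit()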
Reference
You can find a relevant detailed discussion in:
WebScraping JavaScript-Rendered Content using Selenium in Python

Save to excel file using openpyxl instead of csv

The code below works and currently saves to a CSV file; however, I want to save to an Excel file instead, using openpyxl. I attempted it further below but had no success. I'd eventually like to save this to an existing sheet and be able to overwrite the existing data. Can anyone help? Thanks.
Working Code:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import csv
import urllib
def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default

# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)

for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time

search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]

with open('textfile.csv', 'w+', newline='') as f_output:
    csv_output = csv.writer(f_output)
    # Write header
    csv_output.writerow([name for name, xpath in search_entries])
    entries = []
    for name, xpath in search_entries:
        entries.append(get_elements_by_xpath(driver, xpath))
    csv_output.writerows(zip(*entries))
Tried this:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook
import urllib
wb = Workbook(write_only=True)
ws = wb.create_sheet()
def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

try:
    selector = '.js-field-total.tv-screener-table__field-value--total'
    condition = EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
    matches = WebDriverWait(driver, 10).until(condition)
    matches = int(matches.text.split()[0])
except (TimeoutException, Exception):
    print('Problem finding matches, setting default...')
    matches = 4895  # Set default

# The page loads 150 rows at a time; divide matches by
# 150 to determine the number of times we need to scroll;
# add 5 extra scrolls just to be sure
num_loops = int(matches / 150 + 5)

for _ in range(num_loops):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)  # Pause briefly to allow loading time

search_entries = [
    ("tickers", "//tbody/tr/td[1]/div/a"),
    ("rev annual", "//tbody/tr/td[10]"),
    ("income", "//tbody/tr/td[11]")]

entries = []
for name, xpath in search_entries:
    entries.append(get_elements_by_xpath(driver, xpath))

wb.save('new_big_file.xlsx')
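One thing to note about the attempt above: the scraped rows are never written to the worksheet, so the saved workbook comes out empty. A minimal sketch of appending the header and rows with ws.append() before saving (same search_entries and entries as above; a regular workbook is used instead of write-only mode for simplicity):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active

# Header row, then one row per zipped entry
# (mirrors csv_output.writerow / csv_output.writerows in the working CSV version)
ws.append([name for name, xpath in search_entries])
for row in zip(*entries):
    ws.append(list(row))

wb.save('new_big_file.xlsx')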
