The content of the floating window is not included in the page source; it only appears when the mouse hovers over the chart. So how can I obtain the value in this situation? I find it tricky because there's no anchor point for me to control the movement of the mouse.
The code is like below:
def button_click(driver, button_num):
    driver.execute_script("arguments[0].click();", button_num)

def catogory_obtain_tokyo(driver):
    time_waiting_max = 20
    try:
        page_kansai = WebDriverWait(driver, time_waiting_max).until(
            EC.presence_of_element_located((By.ID, 'snippet-13'))
        )
        buttons = WebDriverWait(page_kansai, time_waiting_max).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "mv-button-base.mv-hyperlink-button"))
        )
        return buttons
    except:
        print('catogory_obtain error')
        driver.quit()
        return ''
path = r'chromedriver.exe'
tokyo_url = r'https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%22EEX%20Japanese%20Power%20Futures%20-%20Tokyo%22%7D'
# --- time line ---
timeline = '//*[@id="null"]/div/div[2]/div'
# ------- price trade reg ----
pane_pr = '//*[@id="null"]/div/div[1]/div[1]/div[2]'
# --------volume trade registration ------
pane_vtr = '//*[@id="null"]/div/div[1]/div[3]/div[2]'
driver = webdriver.Chrome(path)
driver.get(tokyo_url)
btns = catogory_obtain_tokyo(driver)
button_click(driver, btns[0])
time.sleep(3)
# sep-03 btn
date = '//*[@id="symbolheader_jft"]/div/div[1]/div[2]/table/tbody/tr[1]/td[5]'
date_btn = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, date))
)
time.sleep(5)
date_btn.click()
# hit icon
icon_path = '//*[@id="baseloadwidget_jft"]/table/tbody/tr[2]/td[5]'
icon = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, icon_path))
)
time.sleep(5)
icon.click()
time.sleep(5)
# --------- click volume btn ------
vtr_path = '//*[@id="baseloadwidget_jft"]/table/tbody/tr[3]/td/div/div[2]/div[3]/div[2]'
vtr_btn = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, vtr_path))
)
time.sleep(5)
vtr_btn.click()
time.sleep(5)
tl = driver.find_element(By.XPATH, timeline)
webdriver.ActionChains(driver).move_to_element(tl).perform()
time.sleep(5)
pr = driver.find_element(By.XPATH, pane_pr)
webdriver.ActionChains(driver).move_to_element(pr).perform()
time.sleep(5)
vtr = driver.find_element(By.XPATH, pane_vtr)
webdriver.ActionChains(driver).move_to_element(vtr).perform()
time.sleep(5)
time.sleep(5)
driver.quit()
Basically, I tried the move_to_element method, but it only moves the mouse to the center of the element. However, the whole inline chart here is treated as a single element, so how can I control the movement of the mouse within one single web element?
I hope this is the answer you are looking for.
from selenium import webdriver
import time
from selenium.webdriver.common.action_chains import ActionChains
driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%22EEX%20Japanese%20Power%20Futures%20-%20Tokyo%22%7D")
time.sleep(30) # Manually selected the options.
blocks = driver.find_elements_by_xpath("//div[@class='mv-panes-host']/div[3]/div[2]//*[local-name()='svg']/*[name()='g'][2]//*[name()='rect']")
actions = ActionChains(driver)
for block in blocks:
    actions.move_to_element(block).perform()
    time.sleep(2)
    print(driver.find_element_by_xpath("//div[@id='null']/div/div[1]/div[3]/div[2]/div/div[contains(@class,'date')]").text)
    print(driver.find_element_by_xpath("//div[@id='null']/div/div[1]/div[3]/div[2]/div//div[contains(@class,'name')]").text)
    print(driver.find_element_by_xpath("//div[@id='null']/div/div[1]/div[3]/div[2]/div//div[contains(@class,'value')]").text)
driver.quit()
9/3/2021, 01:53:23 PM
Volume Trade Registration
840.000
9/3/2021, 01:56:26 PM
Volume Trade Registration
840.000
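As a side note on the original question about moving the mouse inside a single element: ActionChains also provides move_to_element_with_offset, which moves the pointer to an offset from the element instead of its center. Below is a minimal sketch that sweeps across the chart pane used above; the offsets and step size are illustrative, and note that Selenium 3 measures the offset from the element's top-left corner while Selenium 4 measures it from the center.
import time
from selenium.webdriver.common.action_chains import ActionChains

chart = driver.find_element_by_xpath("//div[@class='mv-panes-host']/div[3]/div[2]")  # same pane as above, assumed to be the chart area
width = int(chart.size['width'])

for x in range(10, width, 20):
    # hover a specific horizontal position inside the chart so its tooltip updates
    ActionChains(driver).move_to_element_with_offset(chart, x, 10).perform()
    time.sleep(1)
    # read the tooltip fields here, as in the loop above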
I tried to scrape the content of a table from wyscout.com, which seems to be built with React.
After logging in, the script selects the country (e.g. England), the league (e.g. Premier League), and the team (e.g. Arsenal), and then opens the Stats tab.
The page then shows the table whose data I want to scrape. Even though there is a button to export an Excel file, I want to scrape the content manually using Selenium or BeautifulSoup.
However, the script only gets 18 rows even though the table has more than 100 rows.
Please let me know the solution.
Thanks.
Here is my code.
from re import search
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import string
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
# Load Chrome Browser
show_browser = True
options = Options()
# options.add_argument('--headless')
scraped_data = []
def bot_driver(url, user_name, user_password):
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.get(url)
    driver.maximize_window()
    time.sleep(2)
    # Log in
    # login = driver.find_element_by_xpath("//ul[@id='avia-menu']/li[5]/a")
    # login.click()
    time.sleep(10)
    idd = driver.find_element_by_xpath("//input[@id='login_username']")
    idd.send_keys(user_name)
    passW = driver.find_element_by_xpath("//input[@id='login_password']")
    passW.send_keys(user_password)
    time.sleep(2)
    submit = driver.find_element_by_xpath("//button[@id='login_button']")
    submit.click()
    time.sleep(10)
    try:
        force_login = driver.find_element_by_xpath("//button[@class='btn2_zFM sc-jDwBTQ cUKaFo -block3u2Qh -primary1dLZk']")
        force_login.click()
        print('force login')
    except:
        print('force login error')
    return driver
def select_country(driver, country_name):
    # Specific Country
    # country = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@commandsource='list#area_list#30']")))
    # country = driver.find_element_by_xpath("//div[@commandsource='list#area_list#30']")
    # All the countries
    list_country = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='detail_0_home_navy']/div[1]/div/div")))
    time.sleep(3)
    for entry in list_country:
        if country_name == entry.text:
            print('country click here')
            entry.click()
            return driver, 1
    return driver, 0
def select_league(driver, league_name):
    # Specific League
    # league = driver.find_element_by_xpath("//div[@commandsource='list#competition_list#0']")
    # All the leagues
    list_league = driver.find_elements_by_xpath("//div[@id='detail_0_area_navy_0']/div[1]/div/div")
    for entry in list_league:
        if league_name == entry.text:
            entry.click()
            return driver, 1
    return driver, 0
def select_team(driver, team_names):
    # Specific Team
    # team = driver.find_element_by_xpath("//div[@commandsource='list#team_list#0']")
    flag_team = 0
    list_team = driver.find_elements_by_xpath("//div[@id='detail_0_competition_navy_0']/div[1]/div/div")
    for entry in list_team:
        if entry.text in team_names:
            flag_team = 1
            print('selected team = ', entry.text)
            entry.click()
            time.sleep(2)
            # Stats
            stats = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Stats')))
            stats.click()
            time.sleep(3)
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='detail_0_team_stats']/div/div/div/main/div[3]/div[2]/div/table")))
            content_stats = driver.page_source
            soup_stats = BeautifulSoup(content_stats, "html.parser")
            table_stats = soup_stats.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
            # print(table_stats)
            tbody_stats = table_stats.find('tbody')
            tr_stats = tbody_stats.find_all('tr')
            print('number of tr = ', len(tr_stats))
            # Return to team selection
            back_team = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@id='detail_0_team_back']")))
            back_team.click()
            time.sleep(5)
    if flag_team == 1:
        return driver, 1
    else:
        return driver, 0
if __name__ == "__main__":
    # User input
    # Login - wyscout_url = 'https://wyscout.com/'
    wyscout_url = 'https://platform.wyscout.com/app/?/'
    wyscout_user_name = ''  # username
    wyscout_user_password = ''  # password
    wyscout_driver = bot_driver(wyscout_url, wyscout_user_name, wyscout_user_password)
    time.sleep(10)
    # Select a Country
    country = 'England'  # .upper()
    wyscout_driver, succeed = select_country(wyscout_driver, country)
    if succeed == 0:
        print('NO country!')
    time.sleep(7)
    # Select a league
    league = 'Premier League'  # .upper()
    wyscout_driver, succeed = select_league(wyscout_driver, league)
    if succeed == 0:
        print('NO League!')
    time.sleep(7)
    # Select team
    team_options = ['Arsenal']
    wyscout_driver, succeed = select_team(wyscout_driver, team_options)
    time.sleep(7)
    if succeed == 0:
        print('NO Team!')
    time.sleep(7)
    print('!!!Wyscout END!!!')
    # wyscout_driver.quit()
Finally, I figured it out myself.
Here is my solution.
# Scroll down
print('scroll down')
# note: table_stats here must be the <table> WebElement located via driver
# (execute_script needs a WebElement argument), not the BeautifulSoup tag of the same name used earlier
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
time.sleep(3)
while True:
    driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight)", table_stats)
    time.sleep(5)
    new_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
    if new_height == last_height:
        break
    last_height = new_height
print('scroll end')
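If the table virtualizes its rows (i.e. only the rows currently scrolled into view exist in the DOM), parsing the page once after scrolling can still miss earlier rows. Here is a hedged follow-up sketch, not part of the solution above, that parses and de-duplicates rows on every scroll step; table_elem is assumed to be the <table> WebElement located with Selenium, not the BeautifulSoup tag:
from bs4 import BeautifulSoup
import time

seen_rows = []
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_elem)
while True:
    # parse whatever rows are currently rendered before scrolling further
    soup = BeautifulSoup(driver.page_source, "html.parser")
    for tr in soup.select("table tbody tr"):
        row = [td.get_text(strip=True) for td in tr.find_all("td")]
        if row and row not in seen_rows:
            seen_rows.append(row)
    driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight);", table_elem)
    time.sleep(3)
    new_height = driver.execute_script("return arguments[0].scrollHeight;", table_elem)
    if new_height == last_height:
        break
    last_height = new_height
print('rows collected =', len(seen_rows))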
The code below works and fills out the two forms that are required to get to the table on this page: https://forsikringsguiden.dk/#!/bilforsikring/resultatside
Once the forms are filled out, the table shows an overview of different insurance firms and how much you will pay yearly for car insurance, i.e. a comparison service. I need to scrape this overview once a week.
I am unsure how to do this. I know how to use BS4 for scraping, but I need the firm names as well, not only the other information, and the names are not available by inspecting the website in Chrome. If I dive into
the network tab and look at XHR, I find this link
https://forsikringsguiden.dk/signalr/poll?transport=longPolling&messageId=d-D7589F50-A%2C0%7C9%2C0%7C_%2C1%7C%3A%2C0&clientProtocol=1.4&connectionToken=fUYa3MT52oKf77Y6yU1sLnXiVzPw2CD4XgA8x50EfifJlz8XTPjBeP0klHUKt2uXmnisqO0KLk3fCb5bjOZ8k%2FeJl8zaXAgtRIALW9rzMF%2F8L7Pk3MOYwPRY4md1sDk5&connectionData=%5B%7B%22name%22%3A%22insuranceofferrequesthub%22%7D%5D&tid=9&_=1572505813840
This shows me all the data I need, but I cannot navigate to this page in Selenium.
How do I tackle this problem?
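One way to get at that XHR payload without navigating to the long-polling URL (an option I am adding here, not something from the original post) is to drive the same flow through the third-party selenium-wire package, which records the browser's network traffic and exposes each response body. A minimal sketch, assuming pip install selenium-wire and the form-filling steps from the script below:
from seleniumwire import webdriver  # drop-in replacement for selenium's webdriver

driver = webdriver.Chrome()
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/manuel")
# ... fill out the two forms exactly as in the script below ...

for request in driver.requests:
    if request.response and "signalr" in request.url:
        # the poll responses carry the JSON with firm names and yearly prices;
        # if the body is compressed, it may need decoding per the Content-Encoding header
        body = request.response.body.decode("utf-8", errors="replace")
        print(request.url)
        print(body[:500])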
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
# enable browser logging
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'browser':'ALL' }
driver = webdriver.Chrome(desired_capabilities = d, options=chrome_options)
driver.fullscreen_window()
wait = WebDriverWait(driver,1)
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/manuel")
#time.sleep(5)
#remove cookie bar
driver.find_element_by_id('cookieBarAccept').click()
maerke = driver.find_element_by_xpath('//*[@id="s2id_carSelectedMake"]/a').click()
driver.find_element_by_xpath('//*[@id="s2id_autogen1_search"]').send_keys("Hyundai")
driver.minimize_window()
driver.maximize_window()
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
model = driver.find_element_by_xpath('//*[@id="s2id_autogen2"]').click()
driver.find_element_by_xpath('//*[@id="s2id_autogen3_search"]').send_keys("i30")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
driver.execute_script("scrollBy(0,250)")
aargang = driver.find_element_by_xpath('//*[@id="s2id_autogen4"]/a').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="s2id_autogen5_search"]').send_keys("2009")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
driver.execute_script("scrollBy(0,250)")
motor_str = driver.find_element_by_xpath('//*[@id="s2id_autogen6"]/a').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="s2id_autogen7_search"]').send_keys("1,6")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
variant = driver.find_element_by_xpath('//*[@id="s2id_autogen8"]').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="s2id_autogen9_search"]').send_keys("1,6 CRDi 116HK 5d")
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
driver.execute_script("scrollBy(0,250)")
godkend_oplysninger = driver.find_element_by_xpath('//*[@id="content"]/div[4]/form/div[6]/div/button').click()
#Om dig siden
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/omdig")
alder = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="content"]/div/div[2]/div[2]/form/div[1]/div[1]/div/input')))
alder.send_keys("50")
adresse = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="adresse-autocomplete"]')))
adresse.send_keys("Havevang 8, 3. th, 4300 Holbæk", Keys.ENTER)
aar = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="content"]/div/div[2]/div[2]/form/div[2]/div/div/input')))
aar.send_keys("10")
driver.execute_script("scrollBy(0,250)")
#Antal skader
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div/div[2]/div[2]/form/div[3]/div/div/div[2]').click()
wait
driver.find_element_by_xpath('/html/body/div[11]/ul/li[3]').click()
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
time.sleep(1)
#skade 1
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div/div[2]/div[2]/form/div[4]/div/div[1]/div/div[2]').click()
wait
driver.find_element_by_xpath('/html/body/div[12]/ul/li[5]').click()
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
time.sleep(1)
#skade 2
driver.find_element_by_xpath('/html/body/div[6]/div/div[2]/div/div[2]/div[2]/form/div[4]/div/div[2]/div/div[2]').click()
wait
driver.find_element_by_xpath('/html/body/div[13]/ul/li[3]').click()
driver.minimize_window()
driver.maximize_window()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="select2-drop"]').click()
time.sleep(1)
find_daekning = driver.find_element_by_xpath('//*[@id="content"]/div/div[2]/div[2]/form/div[5]/div/button').click()
EDIT:
I added this to my code using Selenium:
##### Get Data #####
driver.get("https://forsikringsguiden.dk/#!/bilforsikring/resultatside")
wait = WebDriverWait(driver,10)
wait
res_element = driver.find_elements_by_xpath('/html/body/div[7]/div/div[2]/div[1]/div[2]/div[2]')
res = [x.text for x in res_element]
print(res, "\n")
But it doesn't get me the numbers, just some of the text.
Here is the result:
['Sortér efter: Forklaring\nGå til selskab\nDin dækning\nkr./år -\nMed samlerabat kr./år\nSelvrisiko\nSe detaljer\nSammenlign\nGå til selskab\nDin dækning\nkr./år -\nMed samlerabat kr./år\nSelvrisiko\nSe detaljer\nSammenlign\nGå til selskab\nDin dækning\nkr./år -\nMed samlerabat kr./år\nSelvrisiko\nSe detaljer\nSammenlign\n
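The empty "kr./år -" fields suggest the prices are filled in asynchronously after the result container appears, so waiting for the container alone is not enough. Here is a minimal sketch, reusing the XPath from the snippet above, that waits until the result text actually contains a digit before reading it (the 30-second timeout is an arbitrary choice):
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

result_xpath = '/html/body/div[7]/div/div[2]/div[1]/div[2]/div[2]'  # same locator as above

try:
    # keep polling the result block until some number shows up in its text
    WebDriverWait(driver, 30).until(
        lambda d: re.search(r'\d', d.find_element_by_xpath(result_xpath).text)
    )
except TimeoutException:
    print('prices never appeared')

res = [x.text for x in driver.find_elements_by_xpath(result_xpath)]
print(res, "\n")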
There is a div tag that I want to control-click using the Selenium ChromeDriver. As soon as the ActionChains sequence is performed, a new tab opens with the desired link (new URL), but the click is also triggered in the current/main window and the new URL opens in the main window as well.
Below is the code:
class Scraper:
    def __init__(self):
        self.driver = webdriver.Chrome('/usr/bin/chromedriver')
        self.wait = WebDriverWait(self.driver, 10)

    def get_last_line_number(self):
        return len(self.driver.find_elements_by_xpath('//div[contains(@class, "gwt-Label") and contains(@class, "WJSO") and contains(@class, "WPTO")]'))

    def get_links(self, max_company_count=15000):
        self.driver.get('https://ascenaretail.wd5.myworkdayjobs.com/us_corporate_jobs')
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.WN1N")))
        main_window = self.driver.current_window_handle
        last_line_number = 0
        while last_line_number < max_company_count:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            self.wait.until(lambda driver: self.get_last_line_number() != last_line_number)
            last_line_number = self.get_last_line_number()
            all_elements = self.driver.find_elements_by_xpath('//div[contains(@class, "WJYF") and contains(@class, "WHYF")]')
            data_array = []
            for element in all_elements:
                title_element = element.find_element_by_xpath('.//div[contains(@class, "gwt-Label") and contains(@class, "WPTO") and contains(@class, "WJSO")]')
                element_text = title_element.text
                location_element = element.find_element_by_xpath('.//span[contains(@class, "gwt-InlineLabel") and contains(@class, "WM-F") and contains(@class, "WLYF")]')
                ActionChains(self.driver).key_down(Keys.CONTROL).click(title_element).key_up(Keys.CONTROL).perform()
                self.driver.switch_to_window(self.driver.window_handles[1])
                self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.GWTCKEditor-Disabled")))
                print(self.driver.current_url)
                self.driver.switch_to_window(main_window)
                print(self.driver.current_url)
                break
            break
        return True


if __name__ == '__main__':
    scraper = Scraper()
    company_links = scraper.get_links(max_company_count=146)
The ActionChains call works correctly and opens the URL in a new tab, but it also opens the URL in the current tab, and I lose control over the tabs.
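A possible workaround, not from the original post: since the unwanted navigation happens in the main window, one can read the job URL from the new tab, close it, and then send the main window back. This is only a sketch under the assumption that driver.back() restores the listing; listing_url is a hypothetical variable holding the job-list URL saved before the click.
# sketch: recover from the CTRL+click also navigating the main window
ActionChains(self.driver).key_down(Keys.CONTROL).click(title_element).key_up(Keys.CONTROL).perform()
self.wait.until(lambda d: len(d.window_handles) > 1)

new_tab = [h for h in self.driver.window_handles if h != main_window][0]
self.driver.switch_to.window(new_tab)
self.wait.until(lambda d: d.current_url not in ('about:blank', ''))
job_url = self.driver.current_url          # the posting URL we actually wanted
self.driver.close()                        # close the extra tab

self.driver.switch_to.window(main_window)
if self.driver.current_url != listing_url:  # listing_url: hypothetical, saved before the click
    self.driver.back()                      # undo the unwanted navigation in the main window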
I am trying to grab a text element from a page. To get to this element my script clicks on two filters on the page. I need to crawl 5,000 pages. The script works in terms of collecting the text element; however, after a certain number of pages it always returns an "element not visible" error. I assume this is because the page didn't load in time, since I checked the pages where it breaks and the text element is there. (I already have time.sleep(3) after every click.) What can I use in my script to just skip a page if it doesn't load in time?
def yelp_scraper(url):
    driver.get(url)
    # get total number of restaurants
    total_rest_loc = '//span[contains(text(),"Showing 1")]'
    total_rest_raw = driver.find_element_by_xpath(total_rest_loc).text
    total_rest = int(re.sub(r'Showing 1.*of\s', '', total_rest_raw))
    button1 = driver.find_element_by_xpath('//span[@class="filter-label filters-toggle js-all-filters-toggle show-tooltip"]')
    button1.click()
    time.sleep(1)
    button2 = driver.find_element_by_xpath('//span[contains(text(),"Walking (1 mi.)")]')
    button2.click()
    time.sleep(2)
    rest_num_loc = '//span[contains(text(),"Showing 1")]'
    rest_num_raw = driver.find_element_by_xpath(rest_num_loc).text
    rest_num = int(re.sub(r'Showing 1.*of\s', '', rest_num_raw))
    if total_rest == rest_num:
        button3 = driver.find_element_by_xpath('//span[contains(text(),"Biking (2 mi.)")]')
        button3.click()
        time.sleep(2)
        button4 = driver.find_element_by_xpath('//span[contains(text(),"Walking (1 mi.)")]')
        button4.click()
        time.sleep(2)
        rest_num_loc = '//span[contains(text(),"Showing 1")]'
        rest_num_raw = driver.find_element_by_xpath(rest_num_loc).text
        rest_num = int(re.sub(r'Showing 1.*of\s', '', rest_num_raw))
    return rest_num
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
chrome_options = Options()
# add headless mode
chrome_options.add_argument("--headless")
# turn off image loading
prefs = {"profile.managed_default_content_settings.images":2}
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
for url in url_list:
    yelp_data[url] = yelp_scraper(url)

json.dump(yelp_data, open('../data/yelp_json/yelp_data.json', 'w'), indent="\t")
driver.close()
EXAMPLE:
from selenium.common.exceptions import NoSuchElementException

for item in driver.find_elements_by_class_name('item'):
    try:
        model = item.find_element_by_class_name('product-model')
        price = item.find_element_by_class_name('product-display-price')
        title = item.find_element_by_class_name('product-title')
        url = item.find_element_by_class_name('js-detail-link')
        items.append({'model': model, 'price': price, 'title': title, 'url': url})
        print(model.text, price.text, title.text, url.get_attribute("href"))
        c = (model.text, price.text, title.text, url.get_attribute("href"))
        a.writerow(c)
    except NoSuchElementException:
        # here you can do what you want when an element is not found; then it continues with the next one
        pass
b.close()
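To address the original question of skipping a page that does not load in time, the same pattern can wrap the whole per-page call. Here is a minimal sketch, assuming the yelp_scraper and url_list from the question above; the listed exceptions are the ones Selenium typically raises when the filters have not rendered yet:
from selenium.common.exceptions import (ElementNotVisibleException,
                                        NoSuchElementException,
                                        TimeoutException)

for url in url_list:
    try:
        yelp_data[url] = yelp_scraper(url)
    except (ElementNotVisibleException, NoSuchElementException, TimeoutException):
        # the page did not finish rendering in time; skip it and move on
        print('skipping', url)
        continue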
Here is my code:
for i in range(1, 20):
    driver.find_element_by_link_text('Next').click()
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    token_holders = token_holders + soup.find_all('tr')
    del token_holders[50*i]
    time.sleep(5)
I want the webdriver to keep clicking "Next" until it can't, rather than my entering the number of times it should be clicked in the for loop, but I'm not sure how I can do that.
Just replace the for loop with a while loop, as follows:
from selenium.common.exceptions import NoSuchElementException

i = 0
while True:
    try:
        driver.find_element_by_link_text('Next').click()
    except NoSuchElementException:
        break
    time.sleep(5)
    i += 1  # keep the page counter the original for loop provided, so the slicing below still works
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    token_holders = token_holders + soup.find_all('tr')
    del token_holders[50*i]
    time.sleep(5)
This should let you keep clicking the "Next" button for as long as it exists, and stop as soon as no such button is found on the page.
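One caveat worth noting: on some sites the "Next" link stays in the DOM on the last page and is merely disabled, so NoSuchElementException is never raised. A hedged variant that also checks the link's state (the "disabled" class check is an assumption about the page's markup):
from selenium.common.exceptions import NoSuchElementException

while True:
    try:
        next_btn = driver.find_element_by_link_text('Next')
    except NoSuchElementException:
        break
    # assumption: the site marks the last-page link with a "disabled" class or attribute
    if 'disabled' in (next_btn.get_attribute('class') or '') or not next_btn.is_enabled():
        break
    next_btn.click()
    time.sleep(5)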