I am trying to get the text out of a div class="description" element with Selenium.
from selenium import webdriver
import time
chrome_path = r"C:\Users\User1\Downloads\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://www.gigantti.fi/cms/gigantti-outlet/gigantti-outlet/")
time.sleep(10)
posts1 = driver.find_elements_by_class_name('description')
print(posts1)
posts2 = driver.find_elements_by_css_selector('description')
print(posts2)
posts3 = driver.find_elements_by_tag_name('description')
print(posts3)
posts4 = driver.find_elements_by_id('description')
print(posts4)
posts5 = driver.find_elements_by_name('description')
print(posts5)
posts6 = driver.find_elements_by_xpath("//div[@class='description']")
driver.close()
The output is like this:
DevTools listening on ws://127.0.0.1:58551/devtools/browser/4ebf2909-a977-43b8-b0bc-2824c2d371d7
[]
[]
[]
[]
[<selenium.webdriver.remote.webelement.WebElement (session="5a662510a9af8bec45752c91b4397d06", element="6e54f7d0-c586-4f97-a974-1af8d19610ce")>]
This is what Chrome shows when inspecting the site.
<div class="description">Samsug HD39J1230GW wifi sovitin, erä</div>
I am trying to extract the text "Samsug HD39J1230GW wifi sovitin, erä".
The elements are available inside an iframe with the name Gigantti Outlet. You need to switch to that iframe first. Try the code below.
Induce WebDriverWait with frame_to_be_available_and_switch_to_it()
Induce WebDriverWait with visibility_of_all_elements_located()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chrome_path = r"C:\Users\User1\Downloads\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://www.gigantti.fi/cms/gigantti-outlet/gigantti-outlet/")
WebDriverWait(driver,10).until(EC.frame_to_be_available_and_switch_to_it((By.NAME,"Gigantti Outlet")))
posts1=WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".description")))
for post in posts1:
    print(post.text)
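If you need to locate anything outside the iframe afterwards, remember to switch back to the main document first. A minimal follow-up sketch, reusing the same driver:
# Switch back out of the iframe before touching elements in the main page.
driver.switch_to.default_content()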
I am trying to collect data from a Google Maps page (the URL is in the code below).
This is the code I have tried. My idea is to scroll to the website name (you can find it once you scroll down the panel) once it is present in the browser, but it is not scrolling.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.maximize_window()
driver.get("https://www.google.com/maps/place/Amsterdamsche+Athleten+Club+Hercules/#52.36937,4.8049968,16.25z/data=!4m5!3m4!1s0x47c5e3c9cbb3d913:0xef85f93ef996cc06!8m2!3d52.3692292!4d4.8056684")
img_result = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,'//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[7]/div[5]/a/div[1]/div[2]/div[1]')))
driver.execute_script("arguments[0].scrollIntoView(true);",img_result)
print(img_result.text)
driver.close()
What is the solution for this?
EDIT: This is what I'm trying to get.
At least on my side I see no need to scroll.
The following code worked:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 20)
url = "https://www.google.com/maps/place/Amsterdamsche+Athleten+Club+Hercules/#52.36937,4.8049968,16z/data=!4m5!3m4!1s0x47c5e3c9cbb3d913:0xef85f93ef996cc06!8m2!3d52.3692292!4d4.8056684"
driver.get(url)
name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[data-item-id='authority']"))).text
print(name)
Output:
aachercules.nl
The same could be done with XPath instead of CSS Selectors.
This is the XPath I used:
name = wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@data-item-id='authority']"))).text
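If the element ever does sit below the fold on your side, a reasonable fallback (my sketch, reusing the wait and driver from the snippet above) is to scroll it into view with JavaScript before reading it:
# Fallback sketch: bring the element into the viewport before reading it.
element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[data-item-id='authority']")))
driver.execute_script("arguments[0].scrollIntoView(true);", element)
print(element.text)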
I'm trying to scrape a website with a "Show More" button, and I'm not able to click on it.
The website is: https://www.wtatennis.com/rankings/singles
And my code is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ActionChains
from tqdm import tqdm
import time
options = Options()
options.add_argument("--headless")
browser = webdriver.Chrome(ChromeDriverManager().install(),options=options)
browser.get('https://www.wtatennis.com/rankings/singles')
action = ActionChains(browser)
showmore = browser.find_elements_by_xpath(".//button[contains(@class, 'btn widget-footer__more-button rankings__show-more js-show-more-button')]")
action.move_to_element(showmore).perform()
showmore.click()
time.sleep(5)
Does anyone have any idea? Thanks!
Don't use './/' in your locator when you start the search from the root: since there is no current element, your locator won't find anything. Also, you can use any attribute that identifies the element uniquely. See the code below:
browser = webdriver.Chrome(options=options)
browser.get('https://www.wtatennis.com/rankings/singles')
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text="Accept Cookies"]'))).click()
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text="Show More"]'))).click()
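To illustrate the './/' point: a relative XPath is meant to be used from an element you already hold, while a search from the driver should start with '//'. A small sketch (the container selector here is made up for illustration):
# './/' searches relative to an element you already found:
footer = browser.find_element(By.CSS_SELECTOR, ".widget-footer")  # hypothetical container
button = footer.find_element(By.XPATH, ".//button")  # looks only inside `footer`
# '//' searches the whole document from the root:
buttons = browser.find_elements(By.XPATH, "//button")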
Use WebDriverWait and data attributes.
To use the wait, import:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
To wait till all elements are loaded, you have to make sure the last element is no longer changing; if it is still changing, keep scrolling.
browser.get('https://www.wtatennis.com/rankings/singles')
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text="Accept Cookies"]'))).click()
value = "start"
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text = "Show More"]'))).click()
while browser.find_element_by_xpath("(//tr[@class='rankings__row'])[last()]").text != value:
    # Remember the last row's text, scroll the "Loading" marker into view,
    # then wait until new rows have been appended.
    elem = browser.find_element_by_xpath('(//*[contains(text(),"Loading")])[2]')
    value = browser.find_element_by_xpath("(//tr[@class='rankings__row'])[last()]").text
    browser.execute_script("arguments[0].scrollIntoView()", elem)
    WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH,
        "//tr[@class='rankings__row']")))
    try:
        WebDriverWait(browser, 10).until_not(EC.text_to_be_present_in_element((By.XPATH,
            "(//tr[@class='rankings__row'])[last()]"), value))
    except:
        pass
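Once the loop exits (the last row's text has stopped changing), the table is fully loaded and can be read in one pass. A minimal sketch using the same browser:
# Collect every ranking row after the page has stopped growing.
rows = browser.find_elements_by_xpath("//tr[@class='rankings__row']")
for row in rows:
    print(row.text)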
I'm trying to get the links of the posts on this page, but they are apparently generated by clicking each of the post images. I'm using Selenium and beautifulsoup4 in Python 3.8.
Any idea how to get the links while Selenium continues to the next pages?
url: https://www.goplaceit.com/cl/mapa?id_modalidad=1&tipo_propiedad=1%2C2&selectedTool=list#12/-33.45/-70.66667
After clicking on an image, it opens a new tab with the following type of shortened URL: https://www.goplaceit.com/propiedad/6198212
which redirects me to a URL like:
https://www.goplaceit.com/cl/propiedad/venta/departamento/santiago/6198212-depto-con-1d-1b-y-terraza-a-pasos-del-metro-toesca-bodega
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import winsound
from timeit import default_timer as timer
from selenium.webdriver.common.keys import Keys
start = timer()
PROXY = "PROXY" # IP:PORT or HOST:PORT
path_to_extension = r"extension"
options = Options()
#options.add_argument("--incognito")
options.add_argument('load-extension=' + path_to_extension)
#options.add_argument('--disable-java')
options.headless = False
prefs = {"profile.default_content_setting_values.notifications" : 2}
prefs2 = {"profile.managed_default_content_settings.images": 2}
prefs.update(prefs2)
prefs3 = {"profile.default_content_settings.cookies": 2}
prefs.update(prefs3)
options.add_experimental_option("prefs",prefs)
options.add_argument("--start-maximized")
options.add_argument('--proxy-server=%s' % PROXY)
driver = webdriver.Chrome('chromedriver.exe', options=options)
driver.get('https://www.goplaceit.com/cl/')
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="root"]/nav/div/div[2]/div[1]/button'))).click()
correo = driver.find_element(By.XPATH, '//*[@id="email"]')
correo.send_keys("Mail")
contraseña = driver.find_element(By.XPATH, '//*[@id="password"]')
contraseña.send_keys("password")
contraseña.send_keys(Keys.ENTER)
time.sleep(7)
elem = driver.find_element(By.XPATH, '//*[@id="gpi-main-landing-search-input"]/div/input')
elem.click()
elem.send_keys("keywords")
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="gpi-main-landing-search-input"]/div/div[1]/ul/li[1]/a/div/div[1]'))).click()
buscador.send_keys(Keys.ENTER)
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="root"]/div/div/div[1]/div[2]/div/div[1]/div/div[1]/button'))).click()
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="custom-checkbox"]'))).click()
page_number = 0
max_page_number = 30
while page_number <= max_page_number:
    WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(),"paginator-btn-right")]'))).click()
    page_number += 1  # advance the counter so the loop can terminate
You can easily get the URLs by clicking on an image, saving the URL, coming back to the first tab, and repeating this for all the images:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
driver.get("https://www.goplaceit.com/cl/mapa?id_modalidad=1&tipo_propiedad=1%2C2&selectedTool=list#8/-33.958/-71.206")
images = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[#class='sc-iyvyFf ljSqTz']//img")))
urls = []
for i, image in enumerate(images):
window_before = driver.window_handles[0]
image.click()
driver.implicitly_wait(2)
window_after = driver.window_handles[i+1]
driver.switch_to.window(window_after)
urls.append(driver.current_url)
driver.switch_to.window(window_before)
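One caveat: every click leaves a tab open, so the handle list keeps growing. A variant sketch (same assumptions about the page) that closes each property tab before moving on:
# Grab each URL, then close the tab so only the list tab stays open.
main_window = driver.window_handles[0]
urls = []
for image in images:
    image.click()
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
    new_window = [h for h in driver.window_handles if h != main_window][0]
    driver.switch_to.window(new_window)
    urls.append(driver.current_url)
    driver.close()
    driver.switch_to.window(main_window)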
I have to extract the third line of the output below.
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import time

URL = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
browser = webdriver.Chrome()
browser.get(URL)
time.sleep(20)
content = browser.page_source
soup = BeautifulSoup(content, 'html.parser')
for link in soup.find_all('a'):
    n = link.get('href')
    n = re.sub(r"\#", '', n)
    print(n)
In this example I would only collect benjamn from the output and discard the rest.
OUTPUT:
/
https://developer.github.com/v3/
/benjamn
/BenjamNathan
/benjamni
/benjamnnzz
/BenjamnTal
/benjamncresnik
/benjamn1012990
/benjamnsmith
/benjamn77
/BENJAMNDO4FO
/benjamnzzzz
/benjamn25
/benjamnn
/benjamn2
/benjamnwilliams
https://github.com/simonsmith/github-user-search
You wrote:
for link in soup.find_all('a'):
Suppose you instead had:
links = list(soup.find_all('a'))
for link in links:
Then links[2] would contain the desired link.
Equivalently, you could use:
for i, link in enumerate(soup.find_all('a')):
and focus on the particular link where i == 2.
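A tiny self-contained example of that enumerate pattern (the HTML literal is made up for illustration):
from bs4 import BeautifulSoup

html = '<a href="/">home</a><a href="/docs">docs</a><a href="/benjamn">benjamn</a>'
soup = BeautifulSoup(html, 'html.parser')
for i, link in enumerate(soup.find_all('a')):
    if i == 2:  # the third link, counting from zero
        print(link.get('href'))  # -> /benjamn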
Instead, you can refine your selector and use a CSS selector.
if you use either
li .u-flex
or
[class^=User].u-flex
the first being faster. Either way you will get only the 15 links for the people. If you then use find_element_by_css_selector, only the first match is returned.
That is:
browser.find_element_by_css_selector("li .u-flex").get_attribute("href")
No need for BeautifulSoup but the equivalent is:
soup.select_one('li .u-flex')['href']
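And if you want all 15 people links rather than only the first, the plural finders take the same selector (a sketch under the same assumptions about the markup):
# Selenium: every match, not just the first.
for a in browser.find_elements_by_css_selector("li .u-flex"):
    print(a.get_attribute("href"))
# BeautifulSoup equivalent.
for a in soup.select("li .u-flex"):
    print(a["href"])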
To fetch the value benjamn, use WebDriverWait and element_to_be_clickable with the following XPath.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
browser = webdriver.Chrome()
browser.get(URL)
element = WebDriverWait(browser, 15).until(EC.element_to_be_clickable((By.XPATH, "(//a[starts-with(@class,'User_')]//p[starts-with(@class,'User_')])[1]")))
print(element.text)
Output printed on the console:
benjamn
To print all the text values, use the following code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
browser = webdriver.Chrome()
browser.get(URL)
elements = WebDriverWait(browser, 15).until(EC.visibility_of_all_elements_located((By.XPATH, "//a[starts-with(@class,'User_')]//p[starts-with(@class,'User_')]")))
for element in elements:
    print(element.text)
Output:
benjamn
BenjamNathan
benjamni
benjamnnzz
BenjamnTal
benjamncresnik
benjamn1012990
benjamnsmith
benjamn77
BENJAMNDO4FO
benjamnzzzz
benjamn25
benjamnn
benjamn2
benjamnwilliams
You can grab that link using Selenium, making use of an XPath and definitely not hardcoding an index, like the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
LINK = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get(LINK)
    expected_link = wait.until(EC.presence_of_element_located((By.XPATH, "//a[./*[contains(@class,'username')]]")))
    print(expected_link.get_attribute("href"))
Output:
https://simonsmith.github.io/github-user-search/#/benjamn
So I am trying to scrape some information from a website, and when I try to get an element by XPath I get an "Unable to locate element" error, even though the path I provide is copied directly from the inspection tool. I tried a couple of things, but they did not work, so I told myself I would try an easier path (TEST), but that still doesn't work. Is it possible that the website does not show all the HTML code when inspecting?
Here is the code, with the website and the xpath that I tried.
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
TEST = 'html/body/div[@id="app"]/div[@class="logged-out free"]/div[@class="client-components-app-app__wrapper undefined undefined"]'#/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[3]/td[3]/div/div/div/div[1]/span'
X_PATH = '//*[@id="app"]/div/div/div[2]/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[1]/td[3]/div/div/div/div[1]/span'
The main function is:
def trader_table():
    # Loading Chrome and getting to the website
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    text = driver.find_element_by_xpath(X_PATH).get_attribute('innerHTML')
    return text
I added a wait condition and used a CSS selector combination instead, but I think it targets the same element as your XPath:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
driver = webdriver.Chrome()
driver.get(url)
data = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".client-components-experts-infoTable-expertTable__table .client-components-experts-infoTable-expertTable__dataRow td:nth-child(3)"))).get_attribute('innerHTML')
print(data)
You have provided all the necessary details required to construct an answer, but you didn't explicitly mention which element you were trying to get.
However, the commented-out XPath within TEST hints that you were after the Price Target. To extract the text within those elements, which are rendered by JavaScript, you need to induce WebDriverWait for visibility_of_all_elements_located(), and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly")
print([element.get_attribute('innerHTML') for element in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='client-components-experts-infoTable-expertTable__isBuy']//span")))])
Console Output:
['$14.00', '$110.00', '$237.00', '$36.00', '$150.00', '$71.00', '$188.00', '$91.00', '$101.00', '$110.00']
I guess you are looking for the price. Here you go.
from selenium import webdriver
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
TEST = 'html/body/div[@id="app"]/div[@class="logged-out free"]/div[@class="client-components-app-app__wrapper undefined undefined"]'#/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[3]/td[3]/div/div/div/div[1]/span'
X_PATH = "//div[@class='client-components-experts-infoTable-expertTable__isBuy']/div/span"
def trader_table():
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    text = driver.find_element_by_xpath(X_PATH).get_attribute('innerHTML')
    print(text)
    return text
Edited for all rows:
from selenium import webdriver
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
X_PATH = "//div[#class='client-components-experts-infoTable-expertTable__isBuy']/div/span"
def trader_table():
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    list_ele = driver.find_elements_by_xpath(X_PATH)
    price_list = []
    for ele in list_ele:
        print(ele.text)
        price_list.append(ele.text)
    return price_list

prices = trader_table()  # renamed from `list` to avoid shadowing the built-in
print(prices)
from selenium import webdriver
import time
driver = webdriver.Chrome("your webdriver location")
driver.get("https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly")
time.sleep(10)
y = driver.find_element_by_id('app').get_attribute('innerHTML')
print(y)
This prints the full inner HTML.
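If you go this route, one option (my sketch, not part of the original answer; the class name is borrowed from the earlier answers) is to hand the rendered page to BeautifulSoup and filter the price cells out of that dump:
from bs4 import BeautifulSoup

# Parse the rendered page and pull out the price-target spans.
soup = BeautifulSoup(driver.page_source, 'html.parser')
for cell in soup.select("div.client-components-experts-infoTable-expertTable__isBuy span"):
    print(cell.get_text())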