I am trying to scrape the links of different poems from https://www.poets.org/poetsorg/poems but I am getting a StaleElementReferenceException. I have tried increasing the sleep time and using WebDriverWait as well, with no success. Any help will be greatly appreciated. My code is below.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def poem_scraper(url):
    driver = webdriver.Chrome("\chromedriver.exe")
    driver.get(url)
    all_links = []
    for _ in range(10):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_poems = soup.find_all('td', attrs={'class': "views-field views-field-title"})
        for div in total_poems:
            links = div.find_all('a')
            for a in links:
                all_links.append('https://www.poets.org' + a['href'])
        timeout = 15
        #time.sleep(6)
        try:
            element_present = EC.presence_of_element_located((By.LINK_TEXT, 'next'))
            WebDriverWait(driver, timeout).until(element_present)
        except TimeoutException:
            print("Timed out waiting for page to load")
        test = driver.find_element_by_link_text('next')
        time.sleep(6)
        test.click()
    return all_links
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
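A common cause here is that clicking 'next' re-renders the page, so an element located earlier is no longer attached to the DOM by the time it is used. Below is a minimal sketch of one way around it, re-locating the link immediately before clicking and retrying once on staleness (a general pattern, not verified against poets.org specifically):

from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Sketch: re-locate the 'next' link right before clicking and retry once if it goes stale.
def click_next(driver, timeout=15):
    wait = WebDriverWait(driver, timeout)
    for _ in range(2):  # one retry if the element goes stale mid-click
        try:
            wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'next'))).click()
            return True
        except StaleElementReferenceException:
            continue
    return False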
I have been struggling to scrape the contents/text of news articles from each URL. The extraction of the URLs works fine, but scraping the text from each URL has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []

# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)

    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)

print(article_link)
print(full_text)
I tried to use BeautifulSoup, but it doesn't seem to work. I will be grateful for any help.
First off, you should unindent the second for loop; it shouldn't be running inside the first one (otherwise you re-scrape the growing list of links on every iteration and collect the same information many extra times).
Second, the requests you send return a page with the content blocked (I could not find a way around this by adding headers to the request). What you can do instead is use the driver to load each of the links and grab the text from there; here is how you could do that:
for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []

# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)

for link in article_link[:5]:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))

print(article_link)
print(full_text)
The best course of action is to use Selenium throughout, as the site's content is protected by Cloudflare. Although @Andrew Ryan has already addressed the issue, I thought I'd post a shorter version of it, since this answer was already halfway written at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

link = 'https://www.iol.co.za/news/south-africa/eastern-cape'

def get_links_and_texts(driver, url):
    driver.get(url)
    for article_link in [i.get_attribute('href') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//article/a[starts-with(@class,'Link__StyledLink')]")))]:
        driver.get(article_link)
        art_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)
I'm trying to scrape moneycontrol.com. When I try to send a value to the search box, I keep getting the same message from the except block: "Element not found".
I tried using the XPath id as well as the full XPath, but it doesn't work in either case.
This is without maximizing the window.
XPath id - //*[@id="search_str"]
Full XPath - /html/body/div[1]/header/div[1]/div[1]/div/div/div[2]/div/div/form/input[5]
Attaching the full code below:
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

def search_stock():
    driver = webdriver.Chrome(
        r'./chromedriver')
    driver.get('https://www.moneycontrol.com/')
    time.sleep(5)
    search_icon = driver.find_element_by_xpath(
        '//*[@id="fixedheader"]/div[4]/span')
    search_icon.click()
    time.sleep(2)
    try:
        search_box = driver.find_element_by_xpath('//*[@id="search_str"]')
        print("Element is visible? " + str(search_box.is_displayed()))
        time.sleep(10)
        if search_box.is_displayed():
            search_box.send_keys('Zomato')
            search_box.send_keys(Keys.RETURN)
    except NoSuchElementException:
        print("Element not found")
    driver.close()

search_stock()
Sometimes it works, but most of the time it throws exceptions and errors. I have been struggling with this for 3 days, but none of the solutions are working.
Web scraping like that seems quite inefficient; it is probably better to use requests and bs4. However, if you want to do it with Selenium, you could try using ActionChains. Or you can do driver.get('https://www.moneycontrol.com/india/stockpricequote/consumer-food/zomato/Z') from the start instead of typing the query in.
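For reference, a minimal requests/bs4 sketch of the direct-URL approach (the User-Agent header and the title check are assumptions; the exact elements you would parse depend on the page's markup):

import requests
from bs4 import BeautifulSoup

# Sketch: fetch the Zomato quote page directly instead of driving the search box.
url = 'https://www.moneycontrol.com/india/stockpricequote/consumer-food/zomato/Z'
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.title.text if soup.title else 'No <title> found')  # sanity check that the page loaded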
You may want to try the code below:
def search_stock():
    driver = webdriver.Chrome(r'./chromedriver')
    driver.maximize_window()
    driver.implicitly_wait(30)
    driver.get('https://www.moneycontrol.com/')
    wait = WebDriverWait(driver, 10)
    time.sleep(5)
    try:
        ActionChains(driver).move_to_element(wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[id='search_str']")))).perform()
        search_box = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[id='search_str']")))
        print("Element is visible? ", search_box.is_displayed())
        time.sleep(10)
        if search_box.is_displayed():
            search_box.send_keys('Zomato')
            search_box.send_keys(Keys.RETURN)
    except NoSuchElementException:
        print("Element not found")
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
Try clicking on the search_box first and only sending text to it after that.
search_box = driver.find_element_by_xpath('//form[@id="form_topsearch"]//input[@id="search_str"]')
search_box.click()
time.sleep(0.1)
search_box.send_keys('Zomato')
search_box.send_keys(Keys.RETURN)
Additionally, I would advise using explicit waits with expected conditions instead of hardcoded sleeps.
With them your code will be faster and more reliable.
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def search_stock():
    driver = webdriver.Chrome(r'./chromedriver')
    wait = WebDriverWait(driver, 20)
    driver.get('https://www.moneycontrol.com/')
    wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="fixedheader"]/div[4]/span'))).click()
    search_box = wait.until(EC.element_to_be_clickable((By.XPATH, '//form[@id="form_topsearch"]//input[@id="search_str"]')))
    search_box.send_keys('Zomato')
    search_box.send_keys(Keys.RETURN)
    # I'm not sure you should close the driver immediately after invoking the search...
    # driver.close()

search_stock()
UPD
Let's try this
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

def search_stock():
    driver = webdriver.Chrome(r'./chromedriver')
    wait = WebDriverWait(driver, 20)
    actions = ActionChains(driver)
    driver.get('https://www.moneycontrol.com/')
    search_icon = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="fixedheader"]/div[4]/span')))
    time.sleep(0.5)
    driver.execute_script("arguments[0].scrollIntoView();", search_icon)
    driver.execute_script("arguments[0].click();", search_icon)
    search_box = wait.until(EC.presence_of_element_located((By.XPATH, '//form[@id="form_topsearch"]//input[@id="search_str"]')))
driver.execute_script("arguments[0].scrollIntoView();", search_icon)
driver.execute_script("arguments[0].click();", search_icon)
    time.sleep(0.5)
    search_box.send_keys('Zomato')
    search_box.send_keys(Keys.RETURN)
    # I'm not sure you should close the driver immediately after invoking the search...
    # driver.close()

search_stock()
If the above solution is still not working, then instead of
actions.move_to_element(search_box).click().perform()
try
driver.execute_script("arguments[0].click();", search_box)
I'm trying to extract data from articles using Selenium in Python. The code is identifying the articles, but while running the loop a few articles are skipped randomly. Any help resolving this issue will be appreciated.
#Importing libraries
import requests
import os
import json
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
import time
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import traceback
from webdriver_manager.chrome import ChromeDriverManager

#opening a chrome instance
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r"C:/selenium/chromedriver.exe")

#getting into the website
driver.get('https://academic.oup.com/rof/issue/2/2')

#getting the articles
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, '/html/body/div[3]/main/section/div/div/div[1]/div/div[3]/div[2]/div[3]/div/div/div/div/h5')))

#loop to get in and out of articles
for article in articles:
    try:
        ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
        window1 = driver.window_handles[1]
        driver.switch_to_window(window1)
        driver.close()
        driver.switch_to_window(window0)
    except:
        print("couldnt get the article")
First, to collect all the article elements, you can use this CSS selector:
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.customLink.item-title a')))
Second, this is the wrong method:
driver.switch_to_window(window1)
It should be:
driver.switch_to.window(window1)
See the difference between _ and . above.
Third, you forgot to initialize the window0 variable:
window0 = driver.window_handles[0]
And finally, try the following code:
#getting into the website
driver.get('https://academic.oup.com/rof/issue/2/2')

#getting the articles
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.customLink.item-title a')))

#loop to get in and out of articles
for article in articles:
    try:
        ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
        window1 = driver.window_handles[1]
        driver.switch_to.window(window1)
        driver.close()
        window0 = driver.window_handles[0]
        driver.switch_to.window(window0)
    except:
        print("couldnt get the article")

driver.quit()
So I am trying to scrape some information from a website, and when I try to get an element by XPath I am getting an "Unable to locate element" error, even though the path I provide is copied directly from the inspection tool. I tried a couple of things but they did not work, so I told myself I was going to try an easier path (TEST), but that still doesn't work. Is it possible that the website does not show all of the HTML code when inspecting?
Here is the code, with the website and the xpath that I tried.
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
TEST = 'html/body/div[@id="app"]/div[@class="logged-out free"]/div[@class="client-components-app-app__wrapper undefined undefined"]'  #/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[3]/td[3]/div/div/div/div[1]/span'
X_PATH = '//*[@id="app"]/div/div/div[2]/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[1]/td[3]/div/div/div/div[1]/span'
The main function is:
def trader_table():
    # Loading Chrome and getting to the website
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    text = driver.find_element_by_xpath(X_PATH).get_attribute('innerHTML')
    return text
I added a wait condition and used a CSS selector combination instead, but I think this targets the same element as your XPath:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
driver = webdriver.Chrome()
driver.get(url)
data = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".client-components-experts-infoTable-expertTable__table .client-components-experts-infoTable-expertTable__dataRow td:nth-child(3)"))).get_attribute('innerHTML')
print(data)
You have provided all of the details required to construct an answer, but you didn't explicitly mention which element you were trying to get.
However, the commented-out XPath within TEST hints that you were after the Price Target column. To extract the text within those elements, which are rendered by JavaScript, you need to induce WebDriverWait for visibility_of_all_elements_located(), and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly")
print([element.get_attribute('innerHTML') for element in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='client-components-experts-infoTable-expertTable__isBuy']//span")))])
Console Output:
['$14.00', '$110.00', '$237.00', '$36.00', '$150.00', '$71.00', '$188.00', '$91.00', '$101.00', '$110.00']
I guess you are looking for the price. Here you go.
from selenium import webdriver

URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
TEST = 'html/body/div[@id="app"]/div[@class="logged-out free"]/div[@class="client-components-app-app__wrapper undefined undefined"]'  #/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[3]/td[3]/div/div/div/div[1]/span'
X_PATH = "//div[@class='client-components-experts-infoTable-expertTable__isBuy']/div/span"

def trader_table():
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    text = driver.find_element_by_xpath(X_PATH).get_attribute('innerHTML')
    print(text)
    return text
Edited for all rows:
from selenium import webdriver

URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
X_PATH = "//div[@class='client-components-experts-infoTable-expertTable__isBuy']/div/span"

def trader_table():
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    list_ele = driver.find_elements_by_xpath(X_PATH)
    price_list = []
    for ele in list_ele:
        print(ele.text)
        price_list.append(ele.text)
    return price_list

list = trader_table()
print(list)
from selenium import webdriver
import time
driver = webdriver.Chrome("your webdriver location")
driver.get("https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly")
time.sleep(10)
y = driver.find_element_by_id('app').get_attribute('innerHTML')
print(y)
This prints the full inner HTML.
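If you then want only the price targets out of that HTML dump, a possible follow-up would be to feed it to BeautifulSoup (a sketch, reusing the span class mentioned in the other answers, which may change if the site updates):

from bs4 import BeautifulSoup

# Sketch: parse the innerHTML dump from above and pull out the price-target spans.
soup = BeautifulSoup(y, 'html.parser')
prices = [span.text for span in soup.select('.client-components-experts-infoTable-expertTable__isBuy span')]
print(prices)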
I am making a program for scraping the Amazon website's mobile phones section, but my program is giving me a timeout exception even though the page loads on time.
Here is my code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import urllib.request

class Amazon_all_mobile_scraper:
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.delay = 60
        self.url = "https://www.amazon.in/mobile-phones/b/ref=sd_allcat_sbc_mobcomp_all_mobiles?ie=UTF8&node=1389401031"

    def load_amazon(self):
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "acs-ln-link")))
            print("Page is ready.")
        except TimeoutException:
            print("Took too much time to load!")
        except:
            print("Something went wrong in loading part!!")

    def extract_list_of_mobiles(self):
        try:
            mobile_list = self.driver.find_element_by_xpath('//div[@class = "acs-ln-link"]')
            print(mobile_list)
        except NoSuchElementException:
            print("Sorry, Unable to get the requested element")

scraper = Amazon_all_mobile_scraper()
scraper.load_amazon()
scraper.extract_list_of_mobiles()
Please help me figure out what's wrong in this code.
Only changing acs-ln-link to acs-ln-links will not do the trick. Your XPath should look more like '//div[contains(@class,"acs-ln-nav-expanded")]//*[@class="acs-ln-links"]//a'. This, however, is what you can use to get the required output:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class Amazon_all_mobile_scraper:
    url = "https://www.amazon.in/mobile-phones/b/ref=sd_allcat_sbc_mobcomp_all_mobiles?ie=UTF8&node=1389401031"

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 15)

    def load_n_get_from_amazon(self):
        self.driver.get(self.url)
        mobile_list = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, '//div[contains(@class,"acs-ln-nav-expanded")]//*[@class="acs-ln-links"]//a')))
        return mobile_list

    def __del__(self):
        self.driver.close()

if __name__ == '__main__':
    scraper = Amazon_all_mobile_scraper()
    for item in scraper.load_n_get_from_amazon():
        print(f'{item.text}\n{item.get_attribute("href")}\n')
The class wasn't matching: "acs-ln-link" should be "acs-ln-links".
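For reference, a minimal sketch of that single change inside the question's load_amazon method (note the answer above points out this alone may not be enough to collect the links):

# Corrected class name in the wait condition (was "acs-ln-link"):
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "acs-ln-links")))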