Python Selenium screenshots the first element multiple times throughout the loop

I'm trying to take a screenshot of each comment in a Reddit post using Selenium with Python. All the comments have the same id/class, and that's what I have used to select them.
Here's my code:
import requests
from bs4 import BeautifulSoup
import pyttsx3, pyautogui
from PIL import Image
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome(executable_path='C:\Selenium_Drivers\chromedriver.exe')
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
driver.get(url)
driver.implicitly_wait(5)
total_height = int(driver.execute_script("return document.body.scrollHeight"))
u = 1
for i in range(1, total_height*2, 50):
    driver.execute_script(f"window.scrollTo(0, {i})")
    comment = driver.find_element(By.CSS_SELECTOR, 'div#t1_ikllxsq._3sf33-9rVAO_v4y0pIW_CH')
    comment.screenshot(f'E:\WEB SCRAPING PROJECTS\PROJECTS\Reddit Scraping\shot{u}.png')
    u += 1
My code scrolls down the page and saves screenshots to my desired path, but the problem is that all of the screenshots are of the first element (comment) in the Reddit post. I want my code to save a screenshot of each comment separately. I need help.

Here you have an example, including scrolling to the end of the page:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
wait = WebDriverWait(driver, 5)
driver.get(url)
# Wait for the reject-cookies button and click it
reject_cookies_button = wait.until(EC.presence_of_element_located((By.XPATH, "(//section[@class='_2BNSty-Ld4uppTeWGfEe8r']//button)[2]")))
reject_cookies_button.click()
# Scroll until the end of the page
while True:
    high_before_scroll = driver.execute_script('return document.body.scrollHeight')
    driver.execute_script('window.scrollTo(100, document.body.scrollHeight);')
    time.sleep(2)
    if driver.execute_script('return document.body.scrollHeight') == high_before_scroll:
        break
# Get all of the comment elements
comments = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'Comment')]")))
# Take a screenshot of every comment and save it
u = 1
for comment in comments:
    driver.execute_script("arguments[0].scrollIntoView();", comment)
    comment.screenshot(f'./shot{u}.png')
    u += 1
I hope the comments in the code help you understand what is happening.
My code was written for Linux; just initialize the driver with your own chromedriver.
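For instance, on Windows the initialization could look roughly like the question's own setup. A minimal sketch (newer Selenium versions prefer a Service object over executable_path; the path below is just the asker's example location):
from selenium.webdriver.chrome.service import Service
# sketch only: point this at your local chromedriver
driver = webdriver.Chrome(service=Service(r'C:\Selenium_Drivers\chromedriver.exe'))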

To get a screenshot of each comment, you need to identify the comment elements, scroll to each one, and then take the screenshot.
This approach works for me.
url='https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
driver.get(url)
# Dismiss the cookie consent banner
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'Reject non-essential')]"))).click()
#Get all the comments
comments = driver.find_elements(By.CSS_SELECTOR, "[data-testid='comment_author_link']")
print(len(comments))
for i in range(len(comments)):
    # Scroll to each comment
    comments[i].location_once_scrolled_into_view
    time.sleep(2)  # slow down the script so the screenshot can be taken
    driver.save_screenshot(f'E:\WEB SCRAPING PROJECTS\PROJECTS\Reddit Scraping\shot{i+1}.png')
Note: you already have all the required libraries; you only need to add import time.

Related

Unable to scrape texts from URLs

I have been struggling to scrape the contents/text of news articles from each URL. The extraction of the URLs works fine, but scraping the text from each URL has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    "//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)
    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)
print(article_link)
print(full_text)
I tried to use beautifulsoup, but it doesn't seem to work. I will be grateful for any help.
First off, you should probably unindent the second for loop; it shouldn't be running inside the first loop (you will be doubling up and collecting the same information countless extra times).
Second, the requests that you send are returning a webpage whose content is blocked (I could not figure out a way around this by inserting headers into the request). What you could do instead is use the driver to load each of the links and grab the text from there; here is how you could do that:
for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    "//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)
for link in article_link[:5]:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
print(article_link)
print(full_text)
The best course of action is to use Selenium throughout, as the site's content is Cloudflare-protected. Although @Andrew Ryan has already addressed the issue, I thought I'd offer a shorter version of it, since this answer was already halfway done at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
link = 'https://www.iol.co.za/news/south-africa/eastern-cape'
def get_links_and_texts(driver, url):
    driver.get(url)
    for article_link in [i.get_attribute('href') for i in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.XPATH,"//article/a[starts-with(@class,'Link__StyledLink')]")))]:
        driver.get(article_link)
        art_content = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}
if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)

Python/Selenium can't get title of youtube video after search

I'm learning how to use selenium and I want to make a program that opens up youtube, searches for a video and prints the title of the first video to appear. I don't know why but it instead prints the title of the first video on youtube's homepage.
from logging import exception
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
PATH = 'C:\\Program Files (x86)\\chromedriver.exe'
driver = webdriver.Chrome(PATH)
driver.get("https://www.youtube.com/")
search = driver.find_element(By.CSS_SELECTOR, 'input#search')
search.send_keys("busqueda")
time.sleep(1)
search.send_keys(Keys.RETURN)
time.sleep(5)
try:
    element = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="video-title"]')))
    print("título: ", element.get_attribute("innerHTML"))
except BaseException as e:
    print(e)
finally:
    driver.quit()
Please use the below XPath, which will print the YouTube video title:
'.//a[@class = "yt-simple-endpoint style-scope ytd-video-renderer"]//yt-formatted-string[@class = "style-scope ytd-video-renderer"]'
elem=driver.find_element(By.CSS_SELECTOR,"a#video-title yt-formatted-string.style-scope.ytd-video-renderer")
print(elem.text)
That should suffice for the first element with that selector, once you add a wait.
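A minimal sketch of what that wait could look like, reusing the selector above and the WebDriverWait/EC/By imports already present in the question:
# sketch only: wait for the first search-result title to be present, then read it
elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "a#video-title yt-formatted-string.style-scope.ytd-video-renderer")))
print(elem.text)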

TripAdvisor scraper using Selenium: unable to get all images having the same class name using find_elements_by_class_name

The problem I'm facing is that I'm using
driver.find_elements_by_class_name("a_classname_common_to_all_images_in_tripadvisor_hotels")
However, each time I run the script I get fewer and different results.
For instance, sometimes it scrapes the first 5 out of 30 images on the page, sometimes 4/30, and so on.
I'm scraping images from this link:
https://www.tripadvisor.in/Hotels-g304551-New_Delhi_National_Capital_Territory_of_Delhi-Hotels.html
images = driver.find_elements_by_class_name("_1a4WY7aS")
I am able to find all of the hotel names using the class_name method; however, with the images the result is variable.
Any help is appreciated, thanks :)
From "How can I scroll a web page using selenium webdriver in python?":
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
Then induce a WebDriverWait to load all of your elements:
images = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_1a4WY7aS")))
Import
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
When it scrapes only 5 of the images, that means only 5 images were loaded. You should do two things to get every image on the page.
Scroll down to the end of the page: you can do this by selecting the body element and then sending it page-down keys.
from selenium.webdriver.common.keys import Keys
import time
for _ in range(10):
    driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
After scrolling, wait for the elements to be present
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "img._1a4WY7aS")))

Scrape Table With JavaScript "Show More Button" In Webpage

I am attempting to pull a data table from an open-access NYT web article on the number of COVID-19 cases (the URL is in the code below). The table shows the top 10 states with the highest number of cases, and expands to all 50 states and U.S. territories upon clicking the "Show more" button.
The HTML portion of the table is as follows:
Using this tutorial, I have written the following code utilizing Selenium to try clicking this button and then pass the page off to BeautifulSoup to begin synthesizing it for use in pandas. My initial code looks as follows:
from bs4 import BeautifulSoup
import selenium
import time
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
driver.get("https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html")
At this juncture, I am not sure how to execute clicking the button (found in the HTML snippet: <button class="svelte-1tjczrs">Show more</button>), and stage it for BeautifulSoup.
Any help is greatly appreciated!
Try the following. It should fetch the required content by clicking the show more button and put it in a dataframe.
import pandas
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html"
with webdriver.Chrome() as driver:
    driver.get(link)
    datalist = []
    show_more = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR,"#g-cases-by-state button[class^='svelte']")))
    driver.execute_script("arguments[0].click();", show_more)
    for elem in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"#g-cases-by-state table[class^='svelte'] tr"))):
        data = [item.text for item in elem.find_elements_by_css_selector("th,td")]
        datalist.append(data)
    df = pandas.DataFrame(datalist)
    print(df)
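If you still prefer to stage the rendered page for BeautifulSoup, as planned in the question, a minimal sketch could run inside the same with block right after the click (the #g-cases-by-state table selector is borrowed from the wait above, not otherwise verified):
from bs4 import BeautifulSoup
# sketch only: parse the driver's rendered HTML once the full table has loaded
soup = BeautifulSoup(driver.page_source, "html.parser")
table = soup.select_one("#g-cases-by-state table")
rows = [[cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])] for tr in table.find_all("tr")]
df = pandas.DataFrame(rows)
print(df)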

How to locate elements through Selenium and Xpath

So I am trying to scrape some information from a website, and when I try to get an element by XPath I get an "Unable to locate element" error, even though the path I provide is copied directly from the inspection tool. I tried a couple of things but they did not work, so I told myself I would try an easier path (TEST), but that still doesn't work. Is it possible that the website does not show all of the HTML code when inspecting?
Here is the code, with the website and the xpath that I tried.
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
TEST = 'html/body/div[@id="app"]/div[@class="logged-out free"]/div[@class="client-components-app-app__wrapper undefined undefined"]'  # /div/div[1]/div/div[2]/div/section/main/table/tbody/tr[3]/td[3]/div/div/div/div[1]/span'
X_PATH = '//*[@id="app"]/div/div/div[2]/div/div[1]/div/div[2]/div/section/main/table/tbody/tr[1]/td[3]/div/div/div/div[1]/span'
The main function is:
def trader_table():
    # Loading Chrome and getting to the website
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    text = driver.find_element_by_xpath(X_PATH).get_attribute('innerHTML')
    return text
I added a wait condition and used a CSS selector combination instead, but I think it targets the same element as your XPath:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
driver = webdriver.Chrome()
driver.get(url)
data = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".client-components-experts-infoTable-expertTable__table .client-components-experts-infoTable-expertTable__dataRow td:nth-child(3)"))).get_attribute('innerHTML')
print(data)
You have provided all of the details needed to construct an answer, but you didn't explicitly mention which element you were trying to get.
However, the commented-out XPath within TEST hints that you were after the Price Target. As those elements are JavaScript-rendered, to extract their text you need to induce a WebDriverWait for visibility_of_all_elements_located(), and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly")
print([element.get_attribute('innerHTML') for element in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='client-components-experts-infoTable-expertTable__isBuy']//span")))])
Console Output:
['$14.00', '$110.00', '$237.00', '$36.00', '$150.00', '$71.00', '$188.00', '$91.00', '$101.00', '$110.00']
I guess you are looking for the price. Here you go.
from selenium import webdriver
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
TEST = 'html/body/div[@id="app"]/div[@class="logged-out free"]/div[@class="client-components-app-app__wrapper undefined undefined"]'  # /div/div[1]/div/div[2]/div/section/main/table/tbody/tr[3]/td[3]/div/div/div/div[1]/span'
X_PATH = "//div[@class='client-components-experts-infoTable-expertTable__isBuy']/div/span"
def trader_table():
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    driver.get(URL_TRADER)
    driver.implicitly_wait(10)
    text = driver.find_element_by_xpath(X_PATH).get_attribute('innerHTML')
    print(text)
    return text
Edited for all rows:
from selenium import webdriver
URL_TRADER = 'https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly'
X_PATH = "//div[#class='client-components-experts-infoTable-expertTable__isBuy']/div/span"
def trader_table():
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
driver.get(URL_TRADER)
driver.implicitly_wait(10)
list_ele= driver.find_elements_by_xpath(X_PATH)
price_list = []
for ele in list_ele:
print(ele.text)
price_list.append(ele.text)
return price_list
list=trader_table()
print(list)
from selenium import webdriver
import time
driver = webdriver.Chrome("your webdriver location")
driver.get("https://www.tipranks.com/analysts/joseph-foresi?benchmark=none&period=yearly")
time.sleep(10)
y = driver.find_element_by_id('app').get_attribute('innerHTML')
print(y)
This prints the full inner HTML.
