Scrape Table With JavaScript "Show More Button" In Webpage - python-3.x

I am attempting to pull a data table from an NYT open-access web article on the number of COVID-19 cases, which can be found here. The table shows the top 10 states with the highest number of cases, and expands to all 50 states and U.S. territories upon clicking the "Show more" button.
The HTML portion of the table is as follows:
Using this tutorial, I have written the following code, which uses Selenium to try to click this button and then passes the page off to BeautifulSoup for parsing into Pandas. My initial code looks as follows:
from bs4 import BeautifulSoup
import selenium
import time
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
driver.get("https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html")
At this juncture, I am not sure how to click the button (found in the HTML snippet: <button class="svelte-1tjczrs">Show more</button>) and stage the resulting page for BeautifulSoup.
Any help is greatly appreciated!

Try the following. It should click the "Show more" button, fetch the required content, and put it in a dataframe.
import pandas
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html"
with webdriver.Chrome() as driver:
    driver.get(link)
    datalist = []
    show_more = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#g-cases-by-state button[class^='svelte']")))
    driver.execute_script("arguments[0].click();", show_more)
    for elem in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#g-cases-by-state table[class^='svelte'] tr"))):
        data = [item.text for item in elem.find_elements_by_css_selector("th,td")]
        datalist.append(data)
df = pandas.DataFrame(datalist)
print(df)
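If you also want named columns and a CSV on disk, here is a short follow-up sketch (assuming the first scraped row is the table's header row; the output filename is just an example):
# Promote the first scraped row to column headers and save the rest to CSV
df = pandas.DataFrame(datalist[1:], columns=datalist[0])
df.to_csv("covid_cases_by_state.csv", index=False)  # example filename
print(df.head())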

Related

Python selenium just screenshots the first element multiple times throughout the loop

I'm trying to take a screenshot of each comment in a Reddit post using Selenium with Python. All comments have the same id/class, and that's what I have used to select them.
Here's my code:
import requests
from bs4 import BeautifulSoup
import pyttsx3, pyautogui
from PIL import Image
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome(executable_path='C:\Selenium_Drivers\chromedriver.exe')
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
driver.get(url)
driver.implicitly_wait(5)
total_height = int(driver.execute_script("return document.body.scrollHeight"))
u = 1
for i in range(1, total_height*2, 50):
    driver.execute_script(f"window.scrollTo(0, {i})")
    comment = driver.find_element(By.CSS_SELECTOR, 'div#t1_ikllxsq._3sf33-9rVAO_v4y0pIW_CH')
    comment.screenshot(f'E:\WEB SCRAPING PROJECTS\PROJECTS\Reddit Scraping\shot{u}.png')
    u += 1
My code scrolls down the page and saves screenshots to my desired path, but the problem is that all the screenshots are of the first element (comment) in the Reddit post.
I want my code to save a screenshot of each comment separately. Need help.
Here you have an example, including scrolling to the end of the page:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
wait = WebDriverWait(driver, 5)
driver.get(url)

# Wait for the reject-cookies button and click it
reject_cookies_button = wait.until(EC.presence_of_element_located((By.XPATH, "(//section[@class='_2BNSty-Ld4uppTeWGfEe8r']//button)[2]")))
reject_cookies_button.click()

# Scroll until the end of the page
while True:
    high_before_scroll = driver.execute_script('return document.body.scrollHeight')
    driver.execute_script('window.scrollTo(100, document.body.scrollHeight);')
    time.sleep(2)
    if driver.execute_script('return document.body.scrollHeight') == high_before_scroll:
        break

# Collect all the comment elements
comments = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'Comment')]")))

# Take a screenshot of every comment and save it
u = 1
for comment in comments:
    driver.execute_script("arguments[0].scrollIntoView();", comment)
    comment.screenshot(f'./shot{u}.png')
    u += 1
I hope the comments in the code help you understand what is happening.
My code was written for Linux; just initialize the driver with the chromedriver path for your own platform.
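For example, a minimal sketch of the driver initialization on Windows, reusing the chromedriver path from the question (adjust it to wherever chromedriver lives on your machine):
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service(r'C:\Selenium_Drivers\chromedriver.exe')  # path taken from the question
driver = webdriver.Chrome(service=service)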
To get a screenshot of each comment, you need to identify the comment elements, scroll to each one, and then take the screenshot.
This approach works for me.
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
driver.get(url)
# Dismiss the cookie banner
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'Reject non-essential')]"))).click()
# Get all the comments
comments = driver.find_elements(By.CSS_SELECTOR, "[data-testid='comment_author_link']")
print(len(comments))
for i in range(len(comments)):
    # Scroll to each comment
    comments[i].location_once_scrolled_into_view
    time.sleep(2)  # slow down the script so the screenshot can be taken
    driver.save_screenshot(f'E:\WEB SCRAPING PROJECTS\PROJECTS\Reddit Scraping\shot{i+1}.png')
Note: you already have all the required libraries in your own script; you only need to add an import for the time module.
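For completeness, a sketch of the imports and driver setup this snippet relies on, taken from the question's own code plus the time module:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path=r'C:\Selenium_Drivers\chromedriver.exe')  # path from the question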

Not being able to click on a button using Python selenium with error ElementNotInteractableException

I am using Python Selenium to open https://www.walmart.com/ and then want to click "Create an account" under "Sign In Account", as shown in this Walmart website pic. The source code of the button from the Walmart website, along with the image, is as follows:
<a link-identifier="Create an account" class="no-underline" href="/account/signup?vid=oaoh">
<button class="w_C8 w_DB w_DE db mb3 w-100" type="button" tabindex="-1">Create an account</button>
</a>
Walmart Website Source code-Pic
My Python code for opening https://www.walmart.com/, accessing the Create an account button, and clicking it is as follows:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
url = "https://www.walmart.com/"
s=Service('C:/Users/Samiullah/.wdm/drivers/chromedriver/win32/96.0.4664.45/chromedriver.exe')
driver = webdriver.Chrome(service=s)
driver.get(url)
driver.maximize_window()
elems = driver.find_element(By.XPATH, '//button[normalize-space()="Create an account"]')
time.sleep(3)
elems.click()
However, I am getting an ElementNotInteractableException, as shown by this Error Pic.
Can anyone tell me what is wrong with my code/approach?
Thanks in advance
You cannot click this element directly; you have to hover over the "Sign In" element to make it appear.
It is also highly recommended to use expected conditions.
This should work:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
url = "https://www.walmart.com/"
s=Service('C:/Users/Samiullah/.wdm/drivers/chromedriver/win32/96.0.4664.45/chromedriver.exe')
driver = webdriver.Chrome(service=s)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
driver.get(url)
driver.maximize_window()
sign_in_btn = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[text()='Sign In']")))
actions.move_to_element(sign_in_btn).perform()
time.sleep(0.5)
wait.until(EC.visibility_of_element_located((By.XPATH, '//button[normalize-space()="Create an account"]'))).click()
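If the button is still reported as not interactable in some sessions, one fallback (a sketch, not part of the original answer) is to locate it and click it through JavaScript, the same technique used in other answers on this page:
# Locate the hidden button and click it via JavaScript instead of a native click
create_account_btn = wait.until(EC.presence_of_element_located((By.XPATH, '//button[normalize-space()="Create an account"]')))
driver.execute_script("arguments[0].click();", create_account_btn)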

Why is my Selenium code returning only half of the data I requested

Recently, I wrote a Selenium web scraper that is meant to extract all the information from a table containing data on every presidential election held in the United States. The table is on this Wikipedia page.
The problem is that the code returns all the info I need when I write the result into a .txt file, but any time I try to print that same result in my text editor, it returns only half of the data I need. I do not understand what the problem is. Can someone help me out?
Here is my code.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas
# Using selenium and chromedriver to extract the JavaScript wikipage
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(r'web scraping master/chromedriver', options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
# Waiting for the JavaScript to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
    print(table)
    with open("loge.txt", "w") as f:  # Only part I added to the code
        f.write(str(table))
I'm not really sure what the problem was, but this works as expected. I changed loge.txt to loge.html and the code dumps the entire table.
Mind trying this?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
    with open("loge.html", "w") as f:
        f.write(str(table))
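Since the original script already imports pandas, one way to go straight to a DataFrame instead of writing HTML to disk is the following sketch (assuming lxml or html5lib is installed for the parser):
import pandas
# read_html returns a list of tables parsed from the HTML string; take the first one
df = pandas.read_html(str(table))[0]
print(df)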

How do I scrape food menu from zomato page?

I am trying to scrape food menu data from Zomato, using Selenium. While inspecting the elements, I can find the class 'category_heading', but using it in the code gives no result and shows an empty list. I am attaching a snippet of the code. Thanks.
I have tried using browser.find_element_by_xpath as well as find_element_by_class_name and by tag, but nothing seems to work.
order_now = browser.find_element_by_xpath("//*[@id='orig-search-list']/div[1]/div[2]/a").click()
browser.maximize_window()
browser.implicitly_wait(20)
food_item = browser.find_elements_by_class_name("category_heading")
print('food',food_item)
I need the food menu data so that I can store it in a csv.
The page can be slow to load. Try using a wait condition:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
d = webdriver.Chrome()
d.get('https://www.zomato.com/bangalore/burgers-kingdom-indiranagar-bangalore/order')
rows = WebDriverWait(d, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".row")))
soup = bs(d.page_source, 'lxml')
for category in soup.select('.category-container'):
    title = category.select_one('h3').text
    print(title)
    items = [i.text for i in category.select('.category_heading')]
    if items:
        print(items)
    else:
        print('No sub-headers')
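Since the goal is to store the menu in a CSV, here is a short follow-up sketch using the same selectors (the output filename is just an example):
import csv
# Write one row per (category, item) pair found on the page
with open('zomato_menu.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['category', 'item'])
    for category in soup.select('.category-container'):
        title = category.select_one('h3').text
        for item in category.select('.category_heading'):
            writer.writerow([title, item.text])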

How can I click second button on one webpage using selenium python3?

Working in Python and Selenium on Windows 7, I want to click the showcomment button on this web page once it becomes clickable (handled by a wait), and then click the "Show more comments" button so I can see and scrape more comments. After the first button, I am able to extract comments.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import selenium.common.exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import selenium.webdriver.support.ui as UI
driver = webdriver.Firefox()
driver.get("http://www.aljazeera.com/news/2016/07/erdogan-west-mind-business-160729205425215.html")
wait = UI.WebDriverWait(driver, 10)
next_page_link = wait.until(
    EC.element_to_be_clickable((By.ID, 'showcomment')))
next_page_link.click()
wait = UI.WebDriverWait(driver, 20)
next_page_link2 = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Show more comments')))
next_page_link2.click()
v = driver.find_elements_by_class_name('gig-comment-body')
print(v)
for h in v:
    print(h.text)
But the second button cannot be clicked; instead, the wait times out with this exception:
selenium.common.exceptions.TimeoutException:
What is the problem?
I think you should try using execute_script() to perform the click, as below:
next_page_link2 = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//div[contains(text(),"Show more comments")]')))
# Now click this using execute_script
driver.execute_script("arguments[0].click()", next_page_link2)
Hope it helps...:)
