Selenium web scraping in Python can't read .text of elements - python-3.x

I am trying to scrape reviews from the Verizon website, and I found the XPath of the reviews by inspecting the webpage. I am executing the code below, but review.text doesn't seem to work reliably. I get the correct text sometimes, and sometimes it just prints the error message.
Not sure what I am doing wrong.
from selenium import webdriver
url = 'https://www.verizonwireless.com/smartphones/samsung-galaxy-s7/'
browser = webdriver.Chrome(executable_path='/Users/userName/PycharmProjects/Verizon/chromedriver')
browser.get(url)
reviews = []
xp = '//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/ul/li[2]/a/span[2]'
# read first ten pages of reviews ==>
for j in range(10):
    reviews.extend(browser.find_elements_by_xpath('//*[@id="BVRRContainer"]/div/div/div/div/ol/li[*]/div/div[1]'
                                                  '/div/div[2]/div/div/div[1]/p'))
    try:
        next = browser.find_element_by_xpath(xp)
        next.click()
    except:
        print(j, "error clicking")
# Print reviews ===>
for i, review in enumerate(reviews):
    try:
        print(review.text)
    except:
        print("Error in:", review)

You should improve the logic of your code. Note that you cannot get the text of elements from the first page after moving on to the next page - you need to get the text before clicking the "Next" button.
Try the code below instead:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time
url = 'https://www.verizonwireless.com/smartphones/samsung-galaxy-s7/'
browser = webdriver.Chrome()
browser.get(url)
reviews = []
xp = '//a[span[@class="bv-content-btn-pages-next"]]'
# read first ten pages of reviews ==>
for i in range(10):
    for review in browser.find_elements_by_xpath('//div[@class="bv-content-summary-body-text"]/p'):
        reviews.append(review.text)
    try:
        next = browser.find_element_by_xpath(xp)
        next.location_once_scrolled_into_view
        time.sleep(0.5)  # To wait until scrolled down to "Next" button
        next.click()
        time.sleep(2)  # To wait for page "autoscrolling" to first review + until modal window disappeared
    except WebDriverException:
        print("error clicking")
for review in reviews:
    print(review)
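If the fixed sleeps turn out to be flaky, explicit waits are a common alternative. A minimal sketch, reusing the same Bazaarvoice locators as above (these may change whenever Verizon updates the page, and the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.verizonwireless.com/smartphones/samsung-galaxy-s7/')
wait = WebDriverWait(browser, 10)
review_xpath = '//div[@class="bv-content-summary-body-text"]/p'
next_xpath = '//a[span[@class="bv-content-btn-pages-next"]]'

reviews = []
for _ in range(10):
    # wait until at least one review body is present before reading .text
    wait.until(EC.presence_of_all_elements_located((By.XPATH, review_xpath)))
    reviews.extend(el.text for el in browser.find_elements_by_xpath(review_xpath))
    try:
        next_btn = wait.until(EC.element_to_be_clickable((By.XPATH, next_xpath)))
        next_btn.click()
    except Exception:
        break  # no more pages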

Related

web scraping data from glassdoor using selenium

Please, I need some help to run this code (https://github.com/PlayingNumbers/ds_salary_proj/blob/master/glassdoor_scraper.py) in order to scrape job offer data from Glassdoor.
Here's the code snippet:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
options = webdriver.ChromeOptions()
#Uncomment the line below if you'd like to scrape without a new Chrome window every time.
#options.add_argument('headless')
#Change the path to where chromedriver is in your home folder.
driver = webdriver.Chrome(executable_path=path, options=options)
driver.set_window_size(1120, 1000)
url = "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword="+'data scientist'+"&sc.keyword="+'data scientist'+"&locT=&locId=&jobType="
#url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword="' + keyword + '"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'
driver.get(url)
#Let the page load. Change this number based on your internet speed.
#Or, wait until the webpage is loaded, instead of hardcoding it.
time.sleep(5)
#Test for the "Sign Up" prompt and get rid of it.
try:
    driver.find_element_by_class_name("selected").click()
except ElementClickInterceptedException:
    pass
time.sleep(.1)
try:
    driver.find_element_by_css_selector('[alt="Close"]').click()  # clicking the X.
    print(' x out worked')
except NoSuchElementException:
    print(' x out failed')
    pass
#Going through each job in this page
job_buttons = driver.find_elements_by_class_name("jl")
I'm getting an empty list:
job_buttons
[]
Your problem is the wrong except argument.
With driver.find_element_by_class_name("selected").click() you are trying to click a non-existing element. There is no element matching the "selected" class name on that page. This raises a NoSuchElementException, as you can see for yourself, while you are only catching ElementClickInterceptedException.
To fix this you should use a correct locator, or at least catch the correct exception in except.
Like this:
try:
    driver.find_element_by_class_name("selected").click()
except NoSuchElementException:
    pass
Or even:
try:
    driver.find_element_by_class_name("selected").click()
except:
    pass
I'm not sure which elements you want to get into job_buttons.
The search results containing all the details for each job can be found with this:
job_buttons = driver.find_elements_by_css_selector("li.react-job-listing")
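A quick way to check that the locator actually returns something (note that "react-job-listing" is simply the class name Glassdoor happened to use at the time, so verify it against your own page source):

job_buttons = driver.find_elements_by_css_selector("li.react-job-listing")
print(len(job_buttons))        # should be greater than 0 once the results have loaded
for button in job_buttons[:3]:
    print(button.text[:80])    # peek at the start of each listing's text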

Find the Twitter text box element with python selenium

I made my own Twitter complaint bot that tweets at my ISP if the network drops.
The code works perfectly until it has to find the Twitter textbox to type the tweet.
The main error is:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
I have tried:
Adding time delays
Using the Firefox driver instead of Chrome
Adding page refreshes before tweet_at_provider() looks for the textbox
Clicking the "Tweet" button to bring up the textbox and then trying to type in it
Using find_element_by_id, but Twitter changes the id on every page load
When I comment out the first function call to test, it will find and type 6/10 times.
But when both functions are called, tweet_at_provider() always fails at grabbing the textbox and I get the StaleElement error.
import selenium, time, pyautogui
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, StaleElementReferenceException
PROMISED_DOWN = 200
PROMISED_UP = 10
CHROME_DRIVER_PATH = "C:\Development\chromedriver.exe"
GECKODRIVER_PATH = "C:\\Users\\meeha\\Desktop\\geckodriver\\geckodriver.exe"
TWITTER_USERNAME = "my_username"
TWITTER_PASSWORD = "my_password"
class InternetSpeedTwitterBot():
    def __init__(self, driver_path):
        self.driver = webdriver.Chrome(executable_path=driver_path)
        self.down = 0
        self.up = 0

    def get_internet_speed(self):
        self.driver.get("https://www.speedtest.net/")
        self.driver.maximize_window()
        time.sleep(2)
        go = self.driver.find_element_by_xpath("//*[@id='container']/div/div[3]/div/div/div/div[2]/div[3]/div[1]/a/span[4]")
        go.click()
        time.sleep(40)
        self.down = self.driver.find_element_by_xpath("//*[@id='container']/div/div[3]/div/div/div/div[2]/div[3]/div[3]/div/div[3]/div/div/div[2]/div[1]/div[2]/div/div[2]/span")
        self.up = self.driver.find_element_by_xpath("//*[@id='container']/div/div[3]/div/div/div/div[2]/div[3]/div[3]/div/div[3]/div/div/div[2]/div[1]/div[3]/div/div[2]/span")
        print(f"Download Speed: {self.down.text} Mbps")
        print(f"Upload Speed: {self.up.text} Mbps")
        time.sleep(3)

    def tweet_at_provider(self):
        self.driver.get("https://twitter.com/login")
        self.driver.maximize_window()
        time.sleep(3)
        username = self.driver.find_element_by_name("session[username_or_email]")
        password = self.driver.find_element_by_name("session[password]")
        username.send_keys(TWITTER_USERNAME)
        password.send_keys(TWITTER_PASSWORD)
        password.submit()
        time.sleep(5)
        tweet_compose = self.driver.find_element_by_xpath('//*[@id="react-root"]/div/div/div[2]/header/div/div/div/div[1]/div[3]/a/div/span/div/div/span/span')
        tweet_compose.click()
        time.sleep(2)
        textbox = self.driver.find_element_by_xpath('//*[@id="layers"]/div[2]/div/div/div/div/div/div[2]/div[2]/div/div[3]/div/div/div/div[1]/div/div/div/div/div[2]/div[1]/div/div/div/div/div/div/div/div/div/div[1]/div/div/div/div[2]/div/div/div/div')
        textbox.send_keys(f"Hey @Ask_Spectrum, why is my internet speed {self.down.text} down / {self.up.text} up when I pay for {PROMISED_DOWN} down / {PROMISED_UP} up???")
bot = InternetSpeedTwitterBot(CHROME_DRIVER_PATH)
bot.get_internet_speed()
bot.tweet_at_provider()
I had the same error and figured out that the HTML tag was changing instantly as soon as I typed something into the Twitter text box.
I tackled this problem using the XPath of the span tag that showed up after I typed a space. The br tag is the initial tag when you have not entered any text; only after you type anything does it turn into a span, and that's when you have to copy the XPath and use it in your application.
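For what it's worth, a more robust pattern is to wait for the compose box and retry once if the reference goes stale. A minimal sketch, assuming a data-testid="tweetTextarea_0" attribute on the compose box (this locator is an assumption - inspect your own page source and adjust it); the helper would replace the long hardcoded XPath in tweet_at_provider():

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

def type_tweet(driver, text, timeout=15):
    # Locator is an assumption: inspect the compose box and adjust if needed
    locator = (By.CSS_SELECTOR, 'div[data-testid="tweetTextarea_0"]')
    for _ in range(2):  # one retry if the node is replaced while typing
        try:
            box = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable(locator))
            box.click()
            box.send_keys(text)
            return
        except StaleElementReferenceException:
            continue
    raise StaleElementReferenceException("compose box kept going stale")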

python3 More button clickable in the 1st page but NOT clickable in the 2nd page

This is an extended question on how to click a 'More' button on a webpage.
Below is my previous question, which one person kindly answered.
Since I'm not that familiar with the 'find element by class name' function, I just added that person's revised code to my existing code, so my revised code may not be efficient (my apologies).
Python click 'More' button is not working
The situation is, there are two types of 'More' buttons. The 1st one is in the property description part and the 2nd one is in the text reviews part. If you click just one 'More' button in any of the reviews, all reviews will be expanded so that you can see the full text reviews.
The issue I run into is that I can click the 'More' button for the reviews on the 1st page but not for the reviews on the 2nd page.
Below is the error message I get, but my code still runs (it does not stop when it hits the error).
Message:
no such element: Unable to locate element: {"method":"tag name","selector":"span"}
Based on my understanding, there is an entry class and a corresponding span for every review. I don't understand why it says Python can't find it.
from selenium import webdriver
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import time
review_list=[]
review_appended_list=[]
review_list_v2=[]
review_appended_list_v2=[]
listed_reviews=[]
listed_reviews_v2=[]
listed_reviews_total=[]
listed_reviews_total_v2=[]
final_list=[]
#Incognito Mode
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
#Open Chrome
driver=webdriver.Chrome(executable_path="C:/Users/chromedriver.exe",options=option)
#url I want to visit (I'm going to loop over multiple listings but for simplicity, I just added one listing url).
lists = ['https://www.tripadvisor.com/VacationRentalReview-g30196-d6386734-Hot_51st_St_Walk_to_Mueller_2BDR_Modern_sleeps_7-Austin_Texas.html']
for k in lists:
    driver.get(k)
    time.sleep(3)
    #click 'More' on description part.
    link = driver.find_element_by_link_text('More')
    try:
        ActionChains(driver).move_to_element(link)
        time.sleep(1) # time to move to link
        link.click()
        time.sleep(1) # time to update HTML
    except Exception as ex:
        print(ex)
    time.sleep(3)
    # first "More" shows text in all reviews - there is no need to search other "More"
    try:
        first_entry = driver.find_element_by_class_name('entry')
        more = first_entry.find_element_by_tag_name('span')
        #more = first_entry.find_element_by_link_text('More')
    except Exception as ex:
        print(ex)
    try:
        ActionChains(driver).move_to_element(more)
        time.sleep(1) # time to move to link
        more.click()
        time.sleep(1) # time to update HTML
    except Exception as ex:
        print(ex)
    #begin parsing html and scraping data.
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    listing = soup.find_all("div", class_="review-container")
    all_reviews = driver.find_elements_by_class_name('wrap')
    for review in all_reviews:
        all_entries = review.find_elements_by_class_name('partial_entry')
        if all_entries:
            review_list = [all_entries[0].text]
            review_appended_list.extend([review_list])
    for i in range(len(listing)):
        review_id = listing[i]["data-reviewid"]
        listing_v1 = soup.find_all("div", class_="rating reviewItemInline")
        rating = listing_v1[i].span["class"][1]
        review_date = listing_v1[i].find("span", class_="ratingDate relativeDate")
        review_date_detail = review_date["title"]
        listed_reviews = [review_id, review_date_detail, rating[7:8]]
        listed_reviews.extend([k])
        listed_reviews_total.append(listed_reviews)
    for a, b in zip(listed_reviews_total, review_appended_list):
        final_list.append(a + b)
    #loop over from the 2nd page of the reviews for the same listing.
    for j in range(5, 20, 5):
        url_1 = '-'.join(k.split('-', 3)[:3])
        url_2 = '-'.join(k.split('-', 3)[3:4])
        middle = "-or%d-" % j
        final_k = url_1 + middle + url_2
        driver.get(final_k)
        time.sleep(3)
        link = driver.find_element_by_link_text('More')
        try:
            ActionChains(driver).move_to_element(link)
            time.sleep(1) # time to move to link
            link.click()
            time.sleep(1) # time to update HTML
        except Exception as ex:
            print(ex)
        # first "More" shows text in all reviews - there is no need to search other "More"
        try:
            first_entry = driver.find_element_by_class_name('entry')
            more = first_entry.find_element_by_tag_name('span')
        except Exception as ex:
            print(ex)
        try:
            ActionChains(driver).move_to_element(more)
            time.sleep(2) # time to move to link
            more.click()
            time.sleep(2) # time to update HTML
        except Exception as ex:
            print(ex)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        listing = soup.find_all("div", class_="review-container")
        all_reviews = driver.find_elements_by_class_name('wrap')
        for review in all_reviews:
            all_entries = review.find_elements_by_class_name('partial_entry')
            if all_entries:
                #print('--- review ---')
                #print(all_entries[0].text)
                #print('--- end ---')
                review_list_v2 = [all_entries[0].text]
                #print (review_list)
                review_appended_list_v2.extend([review_list_v2])
                #print (review_appended_list)
        for i in range(len(listing)):
            review_id = listing[i]["data-reviewid"]
            #print review_id
            listing_v1 = soup.find_all("div", class_="rating reviewItemInline")
            rating = listing_v1[i].span["class"][1]
            review_date = listing_v1[i].find("span", class_="ratingDate relativeDate")
            review_date_detail = review_date["title"]
            listed_reviews_v2 = [review_id, review_date_detail, rating[7:8]]
            listed_reviews_v2.extend([k])
            listed_reviews_total_v2.append(listed_reviews_v2)
        for a, b in zip(listed_reviews_total_v2, review_appended_list_v2):
            final_list.append(a + b)
        print(final_list)
        if len(listing) != 5:
            break
How can I enable clicking the 'More' button for the 2nd and remaining pages, so that I can scrape the full text reviews?
Edited below:
The error messages I get are these two lines:
Message: no such element: Unable to locate element: {"method":"tag name","selector":"span"}
Message: stale element reference: element is not attached to the page document
I guess my whole code still runs because I used try and except? Usually when Python runs into an error, it stops running.
Try it like:
driver.execute_script("""
arguments[0].click()
""", link)

How to automate the crawling without hardcoding any number to it?

I've written a script in Python with Selenium to scrape names of restaurants from a webpage. It works great if I hardcode the number of items I want to parse. The page uses lazy loading and displays 40 names per scroll, and my script can handle that. The only thing I would like to improve is that I do not wish to hardcode the number; rather, I want the script to detect by itself how many there are and parse them all. Hope there is someone to help. Here is the code:
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get('https://www.yellowpages.ca/search/si/1/pizza/Toronto')
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    links = [posts.text for posts in driver.find_elements_by_xpath("//div[@itemprop='itemListElement']//h3[@itemprop='name']/a")]
    if (len(links) == 240):
        break
for link in links:
    print(link)
driver.quit()
You can check if the number of links has changed in the last iteration
num_Of_links = -1
num = 0
while num != num_Of_links:
    num_Of_links = num
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    links = [posts.text for posts in driver.find_elements_by_xpath("//div[@itemprop='itemListElement']//h3[@itemprop='name']/a")]
    num = len(links)
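When the loop exits, links holds every name the page was able to load, so the hardcoded check against 240 in the original script is no longer needed; you can print the list exactly as before.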

Facing issues with my twitter scraper written using python and selenium

I've written a script in Python to parse the name, tweets, following and followers of the accounts listed in the "view all" section of my Twitter profile page. It is currently doing its job. However, I find two problems with this scraper:
Every page it opens to parse the documents stays open and piles up on the taskbar.
The scraper looks clumsy.
Here is what I've written:
from selenium import webdriver
import time
def twitter_data():
    driver = webdriver.Chrome()
    driver.get('https://twitter.com/?lang=en')
    driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
    driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    driver.implicitly_wait(15)
    #Clicking the viewall link
    driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
    time.sleep(10)
    for links in driver.find_elements_by_xpath("//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]"):
        processing_files(links.get_attribute("href"))

#going on to the each profile falling under viewall section
def processing_files(item_link):
    driver = webdriver.Chrome()
    driver.get(item_link)
    # getting information of each profile holder
    for prof in driver.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)

twitter_data()
I've used both implicitly_wait and time.sleep in my scraper, because when I found it was necessary to keep the bot waiting a bit longer I used the latter. Thanks in advance for taking a look.
You can use driver.quit() to close the pages, as shown below. This will reduce the number of pages on the taskbar.
from selenium import webdriver
import time
def twitter_data():
    driver = webdriver.Chrome()
    driver.get('https://twitter.com/?lang=en')
    driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
    driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    driver.implicitly_wait(15)
    #Clicking the viewall link
    driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
    time.sleep(10)
    for links in driver.find_elements_by_xpath("//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]"):
        processing_files(links.get_attribute("href"))
    driver.quit()

#going on to the each profile falling under viewall section
def processing_files(item_link):
    driver1 = webdriver.Chrome()
    driver1.get(item_link)
    # getting information of each profile holder
    for prof in driver1.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)
    driver1.quit()

twitter_data()
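As a further cleanup, you could collect all the profile hrefs first and then reuse a single Chrome driver to visit them one by one, instead of opening a new browser for every profile; that keeps only one window on the taskbar and also tidies up the structure the question calls clumsy.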
