how to get top 100 results from google search using selenium? - python-3.x

import time
from selenium import webdriver
webdriver.Chrome('/Users/name/PycharmProjects/untitled15/chromedriver')
driver.get('http://www.google.com/xhtml')
time.sleep(2)
search_box = driver.find_element_by_name('q')
search_box.send_keys('keywordtosearch')
search_box.submit()
time_sleep(10)
driver.quit()
this is what so far I have obtained. I want to get top x searches from google for any given keyword
please assist me with this. Thank you!

I did a similar mini-project, where:
to get results by their respective links:
results_list = browser.find_elements_by_tag_name('a')
to simply loop through results and print them out:
i=0
for item in results_list:
print (item.get_attribute('href'), '\n')
if i == 10:
break
else:
i+=1
Hope this helps! Good luck.

Related

How to scroll down google job page using selenium python

I am trying to scroll down the job posts using below lines, but it will give sometime correct results to scroll down to the end and sometimes it won't.
html = driver.find_element_by_tag_name('html')
time.sleep(5)
html.send_keys(Keys.END)
Can anyone suggest me how to scroll down to the end, please find the link and screenshot below.
https://www.google.com/search?q=upsc+jobs+in+india&rlz=1C1CHBF_enIN869IN869&oq=upsc+jo&aqs=chrome.1.69i57j0i433i512j0i131i433i512j0i512l3j0i131i433i512l2j0i512j0i433i512&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&sqi=2&ved=2ahUKEwjR27GN_qPzAhX4ppUCHb_0B_QQkd0GegQIORAB#fpstate=tldetail&sxsrf=AOaemvIxuJXh3if0tw7ezZfjkXRe5DSxsA:1632911697417&htivrt=jobs&htidocid=N1_BNfzt8n8auXjGAAAAAA%3D%3D
The more you scroll the more data you get, basically it's a dynamic web site. I have hardcoded 50 as a dummy number, you can have 100 or any other number for this matter.
You can use the sample code :
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(30)
driver.get("https://www.google.com/search?q=upsc+jobs+in+india&rlz=1C1CHBF_enIN869IN869&oq=upsc+jo&aqs=chrome.1.69i57j0i433i512j0i131i433i512j0i512l3j0i131i433i512l2j0i512j0i433i512&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&sqi=2&ved=2ahUKEwjR27GN_qPzAhX4ppUCHb_0B_QQkd0GegQIORAB#fpstate=tldetail&sxsrf=AOaemvIxuJXh3if0tw7ezZfjkXRe5DSxsA:1632911697417&htivrt=jobs&htidocid=N1_BNfzt8n8auXjGAAAAAA%3D%3D")
j = 1
for i in range(50):
element = driver.find_element(By.XPATH, f"(//div[#role='heading'])[{j}]")
driver.execute_script("arguments[0].scrollIntoView(true);", element)
j = j + 1
You can also try this to scroll till the end.
driver.get("https://www.google.com/search?q=upsc+jobs+in+india&rlz=1C1CHBF_enIN869IN869&oq=upsc+jo&aqs=chrome.1.69i57j0i433i512j0i131i433i512j0i512l3j0i131i433i512l2j0i512j0i433i512&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&sqi=2&ved=2ahUKEwjR27GN_qPzAhX4ppUCHb_0B_QQkd0GegQIORAB#fpstate=tldetail&sxsrf=AOaemvIxuJXh3if0tw7ezZfjkXRe5DSxsA:1632911697417&htivrt=jobs&htidocid=N1_BNfzt8n8auXjGAAAAAA%3D%3D")
i = 0
try:
while True:
options = driver.find_elements_by_xpath("//div[#role='treeitem']")
driver.execute_script("arguments[0].scrollIntoView(true);",options[i])
i+=1
time.sleep(.5)
except:
pass

web scraping data from glassdoor using selenium

please I need some help to run this code (https://github.com/PlayingNumbers/ds_salary_proj/blob/master/glassdoor_scraper.py)
In order to scrape job offer data from Glassdoor
Here's the code snippet:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
options = webdriver.ChromeOptions()
#Uncomment the line below if you'd like to scrape without a new Chrome window every time.
#options.add_argument('headless')
#Change the path to where chromedriver is in your home folder.
driver = webdriver.Chrome(executable_path=path, options=options)
driver.set_window_size(1120, 1000)
url = "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword="+'data scientist'+"&sc.keyword="+'data scientist'+"&locT=&locId=&jobType="
#url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword="' + keyword + '"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'
driver.get(url)
#Let the page load. Change this number based on your internet speed.
#Or, wait until the webpage is loaded, instead of hardcoding it.
time.sleep(5)
#Test for the "Sign Up" prompt and get rid of it.
try:
driver.find_element_by_class_name("selected").click()
except NoSuchElementException:
pass
time.sleep(.1)
try:
driver.find_element_by_css_selector('[alt="Close"]').click() #clicking to the X.
print(' x out worked')
except NoSuchElementException:
print(' x out failed')
pass
#Going through each job in this page
job_buttons = driver.find_elements_by_class_name("jl")
I'm getting an empty list
job_buttons
[]
Your problem is with wrong except argument.
With driver.find_element_by_class_name("selected").click() you are trying to click non-existing element. There is no element matching "selected" class name on that page. This causes NoSuchElementException exception as you can see yourself while you are trying to catch ElementClickInterceptedException exception.
To fix this you should use the correct locator or at least the correct argument in except.
Like this:
try:
driver.find_element_by_class_name("selected").click()
except NoSuchElementException:
pass
Or even
try:
driver.find_element_by_class_name("selected").click()
except:
pass
I'm not sure what elements do you want to get into job_buttons.
The search results containing all the details per each job can be found by this:
job_buttons = driver.find_elements_by_css_selector("li.react-job-listing")

How to get all comments in youtube with selenium?

The webpage shows that there are 702 Comments.
target youtube sample
I write a function get_total_youtube_comments(url) ,many codes copied from the project on github.
project on github
def get_total_youtube_comments(url):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--headless")
driver = webdriver.Chrome(options=options,executable_path='/usr/bin/chromedriver')
wait = WebDriverWait(driver,60)
driver.get(url)
SCROLL_PAUSE_TIME = 2
CYCLES = 7
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.PAGE_DOWN)
html.send_keys(Keys.PAGE_DOWN)
time.sleep(SCROLL_PAUSE_TIME * 3)
for i in range(CYCLES):
html.send_keys(Keys.END)
time.sleep(SCROLL_PAUSE_TIME)
comment_elems = driver.find_elements_by_xpath('//*[#id="content-text"]')
all_comments = [elem.text for elem in comment_elems]
return all_comments
Try to parse all comments on a sample webpage https://www.youtube.com/watch?v=N0lxfilGfak.
url='https://www.youtube.com/watch?v=N0lxfilGfak'
list = get_total_youtube_comments(url)
It can get some comments ,only small party of all comments.
len(list)
60
60 is much less than 702,how to get all comments in youtube with selenium?
#supputuri,i can extract all comments with your code.
comments_list = driver.find_elements_by_xpath("//*[#id='content-text']")
len(comments_list)
709
print(driver.find_element_by_xpath("//h2[#id='count']").text)
717 Comments
comments_list[-1].text
'mistake at 23:11 \nin NOT it should return false if x is true.'
comments_list[0].text
'Got a question on the topic? Please share it in the comment section below and our experts will answer it for you. For Edureka Python Course curriculum, Visit our Website: Use code "YOUTUBE20" to get Flat 20% off on this training.'
Why the comments number is 709 instead of 717 shown in page?
You are getting a limited number of comments as YouTube will load the comments as you keep scrolling down. There are around 394 comments left on that video you have to first make sure all the comments are loaded and then also expand all View Replies so that you will reach the max comments count.
Note: I was able to get 700 comments using the below lines of code.
# get the last comment
lastEle = driver.find_element_by_xpath("(//*[#id='content-text'])[last()]")
# scroll to the last comment currently loaded
lastEle.location_once_scrolled_into_view
# wait until the comments loading is done
WebDriverWait(driver,30).until(EC.invisibility_of_element((By.CSS_SELECTOR,"div.active.style-scope.paper-spinner")))
# load all comments
while lastEle != driver.find_element_by_xpath("(//*[#id='content-text'])[last()]"):
lastEle = driver.find_element_by_xpath("(//*[#id='content-text'])[last()]")
driver.find_element_by_xpath("(//*[#id='content-text'])[last()]").location_once_scrolled_into_view
time.sleep(2)
WebDriverWait(driver,30).until(EC.invisibility_of_element((By.CSS_SELECTOR,"div.active.style-scope.paper-spinner")))
# open all replies
for reply in driver.find_elements_by_xpath("//*[#id='replies']//paper-button[#class='style-scope ytd-button-renderer'][contains(.,'View')]"):
reply.location_once_scrolled_into_view
driver.execute_script("arguments[0].click()",reply)
time.sleep(5)
WebDriverWait(driver, 30).until(
EC.invisibility_of_element((By.CSS_SELECTOR, "div.active.style-scope.paper-spinner")))
# print the total number of comments
print(len(driver.find_elements_by_xpath("//*[#id='content-text']")))
There are a couple of things:
The WebElements within the website https://www.youtube.com/ are dynamic. So are the comments dynamically rendered.
With in the webpage https://www.youtube.com/watch?v=N0lxfilGfak the comments doesn't render unless user scrolls the following element within the Viewport.
The comments are with in:
<!--css-build:shady-->
Which applies, Polymer CSS Builder is used apply Polymer's CSS Mixin shim and ShadyDOM scoping. So some runtime work is still done to convert CSS selectors under the default settings.
Considering the above mentioned factors here's a solution to retrieve all the comments:
Code Block:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException, WebDriverException
import time
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get('https://www.youtube.com/watch?v=N0lxfilGfak')
driver.execute_script("return scrollBy(0, 400);")
subscribe = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[text()='Subscribe']")))
driver.execute_script("arguments[0].scrollIntoView(true);",subscribe)
comments = []
my_length = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//yt-formatted-string[#class='style-scope ytd-comment-renderer' and #id='content-text'][#slot='content']"))))
while True:
try:
driver.execute_script("window.scrollBy(0,800)")
time.sleep(5)
comments.append([my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//yt-formatted-string[#class='style-scope ytd-comment-renderer' and #id='content-text'][#slot='content']")))])
except TimeoutException:
driver.quit()
break
print(comment)
If you don't have to use Selenium I would recommend you to look at the google/youtube api.
https://developers.google.com/youtube/v3/getting-started
Example :
https://www.googleapis.com/youtube/v3/commentThreads?key=YourAPIKey&textFormat=plainText&part=snippet&videoId=N0lxfilGfak&maxResults=100
This would give you the first 100 results and gets you a token that you can append on the next request to get the next 100 results.
I'm not familiar with python, but I'll tell you the steps that I would do to get all comments.
First of all, if your code I think the main issue is with the
CYCLES = 7
According to this, you will be scrolling for 2 seconds 7 times. Since you are successfully grabbing 60 comments, fixing the above condition will solve your issue.
I assume you don't have any issue in finding elements on a website using locators.
You need to get the total comments to count to a variable as an int. (in your case, let's say it's COMMENTS = 715)
Define another variable called VISIBLECOUNTS = 0
The use a while loop to scroll if the COMMENTS > VISIBLECOUNTS
The code might look like this ( really sorry if there are syntax issues )
// python - selenium command to get all comments counts.
COMMENTS = 715
(715 is just a sample value, it will change upon the total comments count)
VISIBLECOUNTE = 0
SCROLL_PAUSE_TIME = 2
while VISIBLECOUNTS < COMMENTS :
html.send_keys(Keys.END)
time.sleep(SCROLL_PAUSE_TIME)
VISIBLECOUNTS = len(driver.find_elements_by_xpath('//ytm-comment-thread-renderer'))
With this, you will be scrolling down until the COMMENTS = VISIBLECOUNTS. Then you can grab all the comments as all of them share the same element attributes such as ytm-comment-thread-renderer
Since I'm not familiar with python I'll add the command to get the comments to count from js. you can try this on your browser and convert it into your python command
Run the bellow queries in your console and check.
To get total comments count
var comments = document.querySelector(".comment-section-header-text").innerText.split(" ")
//We can get the text value "Comments • 715" and split by spaces and get the last value
Number(comments[comments.length -1])
//Then convirt string "715" to int, you just need to do these in python - selenium
To get active comments count
$x("//ytm-comment-thread-renderer").length
Note: if it's hard to extract the values you still can use the selenium js executor and do the scrolling with js until all the comments are visible. But I guess it's not hard to do it in python since the logic is the same.
I'm really sorry about not being able to add the solution in python.
But hope this helped.
cheers.
The first thing you need to do is scroll down the video page to load all comments:
$actualHeight = 0;
$nextHeight = 0;
while (true) {
try {
$nextHeight += 10;
$actualHeight = $this->driver->executeScript('return document.documentElement.scrollHeight;');
if ($nextHeight >= ($actualHeight - 50 ) ) break;
$this->driver->executeScript("window.scrollTo(0, $nextHeight);");
$this->driver->manage()->timeouts()->implicitlyWait = 10;
} catch (Exception $e) {
break;
}
}

Python/Selenium - How to parse the URL and click next page?

I am trying to parse the hrefs and the titles of all articles from https://www.weforum.org/agenda/archive/covid-19 but I also want to pull information on the next page.
My code can only pull the current page but is not working on click() next page.
driver.get("https://www.weforum.org/agenda/archive/covid-19")
links =[]
titles = []
while True:
for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.tout__link'))):
links.append(elem.get_attribute('href'))
titles.append(elem.text)
try:
WebDriverWait(driver,5).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".pagination__nav-text"))).click()
WebDriverWait(driver,5).until(EC.staleness_of(elem))
except:
break
Can anyone help me with the issue? Thank you!
The class name 'pagination__nav-text' is not unique. As per the design, it clicks on the first found element which is "Prev" link. so you would not see that working.
Can you try with this approach,
driver.get("https://www.weforum.org/agenda/archive/covid-19")
wait = WebDriverWait(driver,10)
links =[]
titles = []
while True:
for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.tout__link'))):
links.append(elem.get_attribute('href'))
titles.append(elem.text)
try:
print('trying to click next')
WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"//div[#class='pagination__nav-text' and contains(text(),'Next')]"))).click()
WebDriverWait(driver,5).until(EC.staleness_of(elem))
except:
break
print(links)
print(titles)
driver.quit()

How to automate the crawling without hardcoding any number to it?

I've written a script using python with selenium to scrape names of restaurants from a webpage. It is working great if I hardcode the number of amount I want to parse. The page has got lazy-loading process and it displays 40 names in each scroll. However, my script can handle it. The only thing I would like to improve in my script is that I do not wish to hardcode the number; rather, I want it to detect itself how many are there and parse it successfully. Hope there is someone to help. Here is the code:
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get('https://www.yellowpages.ca/search/si/1/pizza/Toronto')
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
links = [posts.text for posts in driver.find_elements_by_xpath("//div[#itemprop='itemListElement']//h3[#itemprop='name']/a")]
if (len(links) == 240):
break
for link in links:
print(link)
driver.quit()
You can check if the number of links has changed in the last iteration
num_Of_links = -1
num = 0
while num != num_Of_links:
num_Of_links = num
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
links = [posts.text for posts in driver.find_elements_by_xpath("//div[#itemprop='itemListElement']//h3[#itemprop='name']/a")]
num = len(links)

Resources