I want to extract 3rd line from a text input - python-3.x

I have to extract the third line of a text.
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver

URL = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
browser = webdriver.Chrome()
browser.get(URL)
time.sleep(20)
content = browser.page_source
soup = BeautifulSoup(content, 'html.parser')
for link in soup.find_all('a'):
    n = link.get('href')
    n = re.sub(r"\#", '', n)
    print(n)
In this example I would only collect benjamn from the output and discard the rest.
OUTPUT:
/
https://developer.github.com/v3/
/benjamn
/BenjamNathan
/benjamni
/benjamnnzz
/BenjamnTal
/benjamncresnik
/benjamn1012990
/benjamnsmith
/benjamn77
/BENJAMNDO4FO
/benjamnzzzz
/benjamn25
/benjamnn
/benjamn2
/benjamnwilliams
https://github.com/simonsmith/github-user-search

You wrote:
for link in soup.find_all('a'):
Suppose you instead had:
links = list(soup.find_all('a'))
for link in links:
Then links[2] would contain the desired link.
Equivalently, you could use:
for i, link in enumerate(soup.find_all('a')):
and focus on the particular link where i == 2.
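For example, a minimal sketch of the enumerate variant that stops at the third link (assuming the page still renders the /benjamn link third, as in the output above):
for i, link in enumerate(soup.find_all('a')):
    if i == 2:
        print(link.get('href'))  # the third <a>; per the output above this is the /benjamn link
        break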

You can instead refine your selector and use a CSS selector.
If you use either
li .u-flex
or
[class^=User].u-flex
(the first being faster), you will get only the 15 links for people. If you then use find_element_by_css_selector, only the first match will be returned.
That is:
browser.find_element_by_css_selector("li .u-flex").get_attribute("href")
No need for BeautifulSoup, but the equivalent is:
soup.select_one('li .u-flex')['href']
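If you only want the bare username benjamn rather than the href, you can trim the leading characters yourself. A small sketch, assuming the raw href is '#/benjamn' (the question's output shows '/benjamn' only because its regex already stripped the '#'):
href = soup.select_one('li .u-flex')['href']  # e.g. '#/benjamn'
print(href.lstrip('#/'))                      # -> benjamn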

To fetch the value benjamn, use WebDriverWait and element_to_be_clickable with the following XPath.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
browser = webdriver.Chrome()
browser.get(URL)
element = WebDriverWait(browser, 15).until(EC.element_to_be_clickable((By.XPATH, "(//a[starts-with(@class,'User_')]//p[starts-with(@class,'User_')])[1]")))
print(element.text)
Output printed on the console:
benjamn
To print all the text values, use the following code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
browser = webdriver.Chrome()
browser.get(URL)
elements = WebDriverWait(browser, 15).until(EC.visibility_of_all_elements_located((By.XPATH, "//a[starts-with(@class,'User_')]//p[starts-with(@class,'User_')]")))
for element in elements:
    print(element.text)
Output:
benjamn
BenjamNathan
benjamni
benjamnnzz
BenjamnTal
benjamncresnik
benjamn1012990
benjamnsmith
benjamn77
BENJAMNDO4FO
benjamnzzzz
benjamn25
benjamnn
benjamn2
benjamnwilliams

You can grab that link using selenium, making use of XPath and without hardcoding an index, like the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
LINK = 'https://simonsmith.github.io/github-user-search/#/search?q=benjamn'
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get(LINK)
    expected_link = wait.until(EC.presence_of_element_located((By.XPATH, "//a[./*[contains(@class,'username')]]")))
    print(expected_link.get_attribute("href"))
Output:
https://simonsmith.github.io/github-user-search/#/benjamn

Related

Unable to scrape texts from URLs

I have been struggling to scrape the contents/text of news articles from each URL. The extraction of URLs works fine, but scraping the text from each URL has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)
    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)
print(article_link)
print(full_text)
I tried to use beautifulsoup, but it doesn't seem to work. I will be grateful for any help.
First off, you should probably unindent the second for loop; it shouldn't be running inside the first loop (you will be duplicating work and collecting the same information countless extra times).
Second, the requests you send are returning a webpage with its content blocked (I could not find a way around this by inserting headers into the request). What you can do instead is use the driver to load each of the links and grab the text from there. Here is how you could do that:
for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)
for link in article_link[:5]:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
print(article_link)
print(full_text)
The best course of action is to use selenium throughout, as the site's content is protected by Cloudflare. Although @Andrew Ryan has already addressed the issue, I thought I'd offer a shorter version, since this answer was already halfway written at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
link = 'https://www.iol.co.za/news/south-africa/eastern-cape'
def get_links_and_texts(driver, url):
    driver.get(url)
    for article_link in [i.get_attribute('href') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//article/a[starts-with(@class,'Link__StyledLink')]")))]:
        driver.get(article_link)
        art_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)

scroll google map webpage sidebar using selenium and python

I am trying to collect data on a Google Maps webpage; the link is in the code below.
This is the code I have tried. My idea is to scroll to the "website name" (you can find the website name once you scroll down) once it is present in the browser, but it is not scrolling.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.maximize_window()
driver.get("https://www.google.com/maps/place/Amsterdamsche+Athleten+Club+Hercules/#52.36937,4.8049968,16.25z/data=!4m5!3m4!1s0x47c5e3c9cbb3d913:0xef85f93ef996cc06!8m2!3d52.3692292!4d4.8056684")
img_result = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,'//*[#id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[7]/div[5]/a/div[1]/div[2]/div[1]')))
driver.execute_script("arguments[0].scrollIntoView(true);",img_result)
print(img_result.text)
driver.close()
What is the solution for this?
EDIT: This is what I'm trying to get.
At least on my side I see no need to scroll.
The following code worked:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 20)
url = "https://www.google.com/maps/place/Amsterdamsche+Athleten+Club+Hercules/#52.36937,4.8049968,16z/data=!4m5!3m4!1s0x47c5e3c9cbb3d913:0xef85f93ef996cc06!8m2!3d52.3692292!4d4.8056684"
driver.get(url)
name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[data-item-id='authority']"))).text
print(name)
Output:
aachercules.nl
The same could be done with XPath instead of CSS Selectors.
This is the XPath I used:
name = wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@data-item-id='authority']"))).text

Click on show more button; selenium scrape with python

I'm trying to scrape a website with show more button; and I'm not able to click on it.
The website is: https://www.wtatennis.com/rankings/singles
And my code is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ActionChains
from tqdm import tqdm
import time
options = Options()
options.add_argument("--headless")
browser = webdriver.Chrome(ChromeDriverManager().install(),options=options)
browser.get('https://www.wtatennis.com/rankings/singles')
action = ActionChains(browser)
showmore = browser.find_elements_by_xpath(".//button[contains(@class, 'btn widget-footer__more-button rankings__show-more js-show-more-button')]")
action.move_to_element(showmore).perform()
showmore.click()
time.sleep(5)
Has anyone any idea? Thanks!
Don't use './/' in your locator when you are starting the search from the root; since there is no current element, your locator won't find any element. Also, you can use any attribute to find elements uniquely. See the code below:
browser = webdriver.Chrome(options=options)
browser.get('https://www.wtatennis.com/rankings/singles')
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text="Accept Cookies"]'))).click()
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text = "Show More"]'))).click()
Use WebDriverWait and data attributes.
To use the wait, import:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
To wait till all elements are loaded, you have to make sure the last element is not changing; if it is changing, keep scrolling.
browser.get('https://www.wtatennis.com/rankings/singles')
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text="Accept Cookies"]'))).click()
value = "start"
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
    '//*[@data-text = "Show More"]'))).click()
while browser.find_element_by_xpath("(//tr[@class='rankings__row'])[last()]").text != value:
    elem = browser.find_element_by_xpath(
        '(//*[contains(text(),"Loading")])[2]')
    value = browser.find_element_by_xpath(
        "(//tr[@class='rankings__row'])[last()]").text
    browser.execute_script("arguments[0].scrollIntoView()", elem)
    WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.XPATH,
        "//tr[@class='rankings__row']")))
    try:
        WebDriverWait(browser, 10).until_not(EC.text_to_be_present_in_element((By.XPATH,
            "(//tr[@class='rankings__row'])[last()]"), value))
    except:
        pass
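An alternative sketch of the same "keep loading until nothing changes" idea, written against the same older-style element-finder API used above and assuming the cookie banner has already been accepted: click Show More, wait for the row count to actually grow, and stop once it no longer does.
from selenium.common.exceptions import TimeoutException

row_xpath = "//tr[@class='rankings__row']"
while True:
    rows = len(browser.find_elements_by_xpath(row_xpath))
    try:
        # click the button, then wait until new rows have really been appended
        WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,
            '//*[@data-text = "Show More"]'))).click()
        WebDriverWait(browser, 10).until(
            lambda d: len(d.find_elements_by_xpath(row_xpath)) > rows)
    except TimeoutException:
        break  # button gone or no new rows appeared: everything is loaded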

Selenium Python - Verify either of the title with EC.title_contains() method

I want to verify either of the page titles, "Apple" or "Mango", using the EC.title_contains() method.
I have tried the below, but it doesn't work for both pages.
WebDriverWait(driver, 10).until(EC.title_contains("Apple") or EC.title_contains("Mango"))
WebDriverWait(driver, 10).until(EC.title_contains("Apple" or "Mango"))
Hi, check if the lines below can help you identify the page by its title; change the driver path as needed.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
opt = webdriver.ChromeOptions()
opt.add_argument("--start-maximized")
driver = webdriver.Chrome(executable_path="C:\\chrome driver\\chromedriver.exe", options=opt)
driver.get("http://rera.rajasthan.gov.in/ProjectSearch")
WebDriverWait(driver,15).until(lambda driver: 'Mango' in driver.title or 'RER' in driver.title)
search_btn = driver.find_element_by_xpath('//*[#id="btn_SearchProjectSubmit"]')
# invoke the click() action
search_btn.click()
To apply the same approach to elements, the lines below need By imported:
from selenium.webdriver.common.by import By
WebDriverWait(driver,15).until(lambda driver: driver.find_element(By.XPATH,'xpath') or driver.find_element(By.XPATH,'xpath2nd'))
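As an aside, the original attempt fails because EC.title_contains("Apple") or EC.title_contains("Mango") is evaluated immediately: both condition objects are truthy, so only the first one is ever handed to until(). If you are on Selenium 4 or later, expected_conditions also provides any_of, which composes conditions properly; a minimal sketch:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# passes as soon as either title check passes (Selenium 4+)
WebDriverWait(driver, 10).until(
    EC.any_of(EC.title_contains("Apple"), EC.title_contains("Mango")))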

Python Selenium Vanguard: NoSuchElementException: no such element: Unable to locate element with Selenium and Python

I am trying to download historical data and click on the link to historical data. However, even though the XPath is correct, I get this error:
NoSuchElementException: no such element: Unable to locate element.
Code trials:
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
driver = webdriver.Chrome(executable_path='/Users/Documents/Coding/chromedriver')
url = "https://www.vanguardinvestor.co.uk/investments/vanguard-lifestrategy-100-equity-fund-accumulation-shares/price-performance?intcmpgn=blendedlifestrategy_lifestrategy100equityfund_fund_link"
driver.get(url)
wait = WebDriverWait(driver, 10)
elem = driver.find_element_by_xpath("//*[@id='prices-and-performance-tab']/div/div[4]/div[3]/div[1]/div[1]/div[3]/div/div/div[2]/div/table/tfoot/tr/td/a")
webdriver.ActionChains(driver).move_to_element(elem).click(elem).perform()
To click on the element, wait for the page to load and the element to be clickable, and then click. Try the entire snippet:
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(executable_path='/Users/Documents/Coding/chromedriver')
url = "https://www.vanguardinvestor.co.uk/investments/vanguard-lifestrategy-100-equity-fund-accumulation-shares/price-performance?intcmpgn=blendedlifestrategy_lifestrategy100equityfund_fund_link"
driver.get(url)
element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//span[text()[contains(.,'Price & Performance')]]")))
element.click()
To click on the element with the text Search for more historical prices, you need to induce WebDriverWait for the element to be clickable, and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path = r'C:\Utility\BrowserDrivers\chromedriver.exe' )
driver.get("https://www.vanguardinvestor.co.uk/investments/vanguard-lifestrategy-100-equity-fund-accumulation-shares/price-performance?intcmpgn=blendedlifestrategy_lifestrategy100equityfund_fund_link")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[@id='bannerButton']"))).click()
more_historical_prices = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.LINK_TEXT, "Search for more historical prices")))
driver.execute_script("arguments[0].scrollIntoView(true);", more_historical_prices)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.LINK_TEXT, "Search for more historical prices"))).click()
