selenium in python is skipping articles while trying to scrape the data - python-3.x

Im trying to extract data from articles using selenium in python, the code is identifying the articles but while running the loop a few articles are skipped randomly. Any help resolving this issue will be appreciated.
#Importing libraries
import requests
import os
import json
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
import time
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import traceback
from webdriver_manager.chrome import ChromeDriverManager
#opening a chrome instance
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r"C:/selenium/chromedriver.exe")
#getting into the website
driver.get('https://academic.oup.com/rof/issue/2/2')
#getting the articles
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, '/html/body/div[3]/main/section/div/div/div[1]/div/div[3]/div[2]/div[3]/div/div/div/div/h5')))
#loop to get in and out of articles
for article in articles:
try:
ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
window1 = driver.window_handles[1]
driver.switch_to_window(window1)
driver.close()
driver.switch_to_window(window0)
except:
print("couldnt get the article")

First, for collect all article element, you can use this css selector:
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.customLink.item-title a')))
Second, This is wrong method:
driver.switch_to_window(window1)
It's should:
driver.switch_to.window(window1)
See the difference between _ and . above.
Third, you forgot to initialize the window0 variable:
window0 = driver.window_handles[0]
And finally, try the following code:
#getting into the website
driver.get('https://academic.oup.com/rof/issue/2/2')
#getting the articles
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.customLink.item-title a')))
#loop to get in and out of articles
for article in articles:
try:
ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
window1 = driver.window_handles[1]
driver.switch_to.window(window1)
driver.close()
window0 = driver.window_handles[0]
driver.switch_to.window(window0)
except:
print("couldnt get the article")
driver.quit()

Related

Unable to scrape texts from URLs

I have been strugling to scrape the contents/text of news articles from each URLs. The extraction of URLs works fine, but scraping the texts from each URLs has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize drivver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
link = article.find_element(By.XPATH, f".//a")
news_link = link.get_attribute('href')
article_link.append(news_link)
for j in article_link:
news_response = requests.get(j)
news_data = news_response.content
news_soup = BeautifulSoup(news_data, 'html.parser')
art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
full_text.append(art_cont.text)
print(article_link)
print(full_text)
I tried to use beautifulsoup, but it doesn't seem to work. I will be grateful for any help.
First off you should probably unindent the second for loop, it shouldn't be running inside of the first loop (you will be doubling and getting all of the information countless extra times).
Second. The requests that you send are returning a webpage that has content blocked (I could not figure out a way around this with inserting headers into the request). What you could do is use the driver to load each of the links and grab the text from there, here is how you could do that.
for link in article_link:
driver.get(link)
news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize drivver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
link = article.find_element(By.XPATH, f".//a")
news_link = link.get_attribute('href')
article_link.append(news_link)
for link in article_link[:5]:
driver.get(link)
news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
full_text.append(news_data[0].get_attribute('textContent'))
print(article_link)
print(full_text)
The best course of action is to utilize selenium throughout as the site's content is cloudflare secured. Although #Andrew Ryan has already addressed the issue, I thought I'd come up with a shorter version of it since this answer was already halfway through at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
link = 'https://www.iol.co.za/news/south-africa/eastern-cape'
def get_links_and_texts(driver,url):
driver.get(url)
for article_link in [i.get_attribute('href') for i in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.XPATH,"//article/a[starts-with(#class,'Link__StyledLink')]")))]:
driver.get(article_link)
art_content = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,".article-content"))).text
yield {"Link":article_link,"article_content":art_content}
if __name__ == '__main__':
with webdriver.Chrome() as driver:
for item in get_links_and_texts(driver,link):
print(item)

Login into a Website with data from csv file with python selenium

I am trying to create a script that automatically logs in different accounts to a website, the login data should be taken from a CSV file. Unfortunately I get no result, maybe someone has a solution for me?
import pandas as pd
import os
from selenium import webdriver
from twocaptcha import TwoCaptcha
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
loginlist = pd.read_csv('/Volumes/GoogleDrive/Meine Ablage/Privat/py_scripts/login.csv', header=None, skiprows=\[0\], sep =',')
print(loginlist)
driver = webdriver.Chrome()
driver.get("https://freebitco.in/signup/?op=s")
time.sleep(6)
webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
time.sleep(1)
login = driver.find_element(By.CLASS_NAME, "login_menu_button")
login.click()
time.sleep(3)
def fulfill_form(email, password):
input_email = driver.find_element(By.ID, 'login_form_btc_address')
input_password = driver.find_element(By.ID, 'login_form_password')
input_email.send_keys(email)
time.sleep(1)
input_password.send_keys(password)
time.sleep(5)
failed_attempts = \[\]
for customer in loginlist:
try:
fulfill_form(str(loginlist\[0\]), str(loginlist\[1\]))
except:
failed_attempts.append(loginlist\[0\])
pass
if len(failed_attempts) \> 0:
print("{} cases have failed".format(len(failed_attempts)))
print("Procedure concluded")
I tried several older solutions from other posts, unfortunately they led nowhere.

How to click the next span value which has same class name

import urllib3
import certifi
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
import time
import ssl
http = urllib3.PoolManager(ca_certs=certifi.where())
chrome_options = Options()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
URL= "https://physicians.wustl.edu/"
driver.get(URL)
time.sleep(5)
driver.find_element_by_link_text("Find a Doctor").click()
find_doc = driver.current_url
print(find_doc)
driver.get(find_doc)
# content = driver.page_source
# print(content)
response = http.request('GET', find_doc)
url_text = response.data #text
time.sleep(10)
count = len(driver.find_elements_by_xpath("//span[#class='entry-title-link']"))
print(count)
s = driver.find_element_by_css_selector("span[class='entry-title-link']") #firstpage click
s.click()
urls = []
provider = []
print(driver.current_url)
urls.append(driver.current_url)
name = driver.find_element_by_css_selector("h1[class='washu-ppi-name entry-title']").text
print(name)
provider.append(name)
specialization = driver.find_element_by_css_selector("ul[class='wuphys-specialties']").text
print(specialization)
location= driver.find_element_by_css_selector("a[class='wuphys-addr name']").text
print(location)
time.sleep(5)
driver.find_element_by_css_selector("a[href='https://physicians.wustl.edu/find-a-doctor/']").click()
time.sleep(10)
I have same classname of span but I need to loop the same class name but the div is different. In the url there is doctors name with details after click I get details and I need to move to next doctor which has same class name
I think you are looking for something of this kind (to loop through all the doctor links and get info from there). Here I have written a basic action which you can scale to add more data related to each doctor.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
import time
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
driver.maximize_window()
driver.get("https://physicians.wustl.edu/")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.LINK_TEXT, "Find a Doctor"))).click()
print(driver.current_url)
doc_cnt = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[#class='entry-title-link']")))
print(len(doc_cnt))
doc_list=[] # to append all the doctor urls into the list for further processing, if required.
for doc in doc_cnt:
ActionChains(driver).key_down(Keys.CONTROL).click(doc).key_up(Keys.CONTROL).perform()
driver.switch_to.window(driver.window_handles[1])
doc_list.append(driver.current_url)
# ... you could include any code of yours related to each doctor here...
# After this one the tab terminates and a new doctor link would open
driver.close()
driver.switch_to.window(driver.window_handles[0])
time.sleep(1)
print(doc_list)

BeautifulSoup to Data Frame With Pandas

New to programming here, so forgive the silly questions. I've been trying to work out how to Python for web scraping and a lot of the YouTube videos and other questions kinda get me there, I'm having a hard time relating the answer to my actual code.
My code so far is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1"
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup_level1=soup(driver.page_source, 'lxml')
race_soup = soup_level1.find("tbody", class_="f_fs13")
print(race_soup.text.strip())
results_soup = soup_level1.find("tbody", class_="f_fs12")
print(results_soup.text.strip())
datalist = [] #empty list
x = 0 #counter
print('good')
driver.close()
This will generate the parsed data, but now I am stuck as how to move it from text to a data frame with pandas. I'm sure it is simple, but all of the instructional material I've seen isn't clicking for me.
Also, The code so far is just sort of copy and pasted chunks from different websites that I got to work with trial and error. I'm not sure if any of it is redundant, so if there is a neater way to go about it, I would appreciate that feedback as well!
Thanks in advance,
Spencer
Give this a try:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1"
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
df1 = pd.read_html(str(race_soup))[0]
print(df1)
df2 = pd.read_html(str(results_soup))[0]
print(df2)
datalist = [] #empty list
x = 0 #counter
print('good')
driver.close()

Scraping dynamic web page with selenium

I'm trying to get the links of the posts on this page, but they are apparently generated by clicking each of the post images. I'm using Selenium and beautifulsoup4 in Python 3.8.
Any idea how to get the links while selenium continues to the next pages?
url: https://www.goplaceit.com/cl/mapa?id_modalidad=1&tipo_pro//*[#id=%22gpi-property-list-container%22]/div[3]/div[1]/div[1]/imgpiedad=1%2C2&selectedTool=list#12/-33.45/-70.66667
after clicking on the image it opens a new tab with the following type of shortening url: https://www.goplaceit.com/propiedad/6198212
which sends me to a url type:
https://www.goplaceit.com/cl/propiedad/venta/departamento/santiago/6198212-depto-con-1d-1b-y-terraza-a-pasos-del-metro-toesca-bodega
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import winsound
from timeit import default_timer as timer
from selenium.webdriver.common.keys import Keys
start = timer()
PROXY = "PROXY" # IP:PORT or HOST:PORT
path_to_extension = r"extension"
options = Options()
#options.add_argument("--incognito")
options.add_argument('load-extension=' + path_to_extension)
#options.add_argument('--disable-java')
options.headless = False
prefs = {"profile.default_content_setting_values.notifications" : 2}
prefs2 = {"profile.managed_default_content_settings.images": 2}
prefs.update(prefs2)
prefs3 = {"profile.default_content_settings.cookies": 2}
prefs.update(prefs3)
options.add_experimental_option("prefs",prefs)
options.add_argument("--start-maximized")
options.add_argument('--proxy-server=%s' % PROXY)
driver = webdriver.Chrome('chromedriver.exe', options=options)
driver.get('https://www.goplaceit.com/cl/')
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="root"]/nav/div/div[2]/div[1]/button'))).click()
correo = driver.find_element(By.XPATH, '//*[#id="email"]')
correo.send_keys("Mail")
contraseña = driver.find_element(By.XPATH, '//*[#id="password"]')
contraseña.send_keys("password")
contraseña.send_keys(Keys.ENTER)
time.sleep(7)
elem.driver.find_element(By.XPATH, '//*[#id="gpi-main-landing-search-input"]/div/input')
elem.click()
elem.send_keys("keywords")
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="gpi-main-landing-search-input"]/div/div[1]/ul/li[1]/a/div/div[1]'))).click()
buscador.send_keys(Keys.ENTER)
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="root"]/div/div/div[1]/div[2]/div/div[1]/div/div[1]/button'))).click()
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="custom-checkbox"]'))).click()
page_number = 0
max_page_number = 30
while page_number<=max_page_number:
WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(),"paginator-btn-right")]'))).click()
You can get easily the urls by clicking on an image, saving your url, coming back to the first page and repeating this for all the images:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
driver.get("https://www.goplaceit.com/cl/mapa?id_modalidad=1&tipo_propiedad=1%2C2&selectedTool=list#8/-33.958/-71.206")
images = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[#class='sc-iyvyFf ljSqTz']//img")))
urls = []
for i, image in enumerate(images):
window_before = driver.window_handles[0]
image.click()
driver.implicitly_wait(2)
window_after = driver.window_handles[i+1]
driver.switch_to.window(window_after)
urls.append(driver.current_url)
driver.switch_to.window(window_before)

Resources