AWS Lambda web scraping: cannot find Chrome binary (Python) - python-3.x

I am trying to write a web scraping job as an AWS Lambda function (Python), and I get this error when I execute it.
Error
Message: unknown error: cannot find Chrome binary
How I am running it:
I downloaded chromedriver from the site below and zipped it together with the Python code shown here. Please let me know if this is the right approach, or whether I need to make any modifications to my code.
https://chromedriver.storage.googleapis.com/index.html?path=111.0.5563.19/
import concurrent.futures
import requests
from selenium import webdriver
import os
import subprocess
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import boto3
from datetime import datetime
def scrape_data():
    try:
        years = [2023]
        states = ["alnb"]
        for state in states:
            """Creating s3 connection to write into state folder"""
            for year in years:
                url = 'https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'+state+'/'+str(year)+'/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
                options = webdriver.ChromeOptions()
                options.add_argument("headless")
                driver = webdriver.Chrome(executable_path='./chromedriver', chrome_options=options)
                driver.get(url)
                elements = WebDriverWait(driver, 2).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body"))
                )
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                bankruptcy_element = soup.findAll('div', {"class": "panel-collapse collapse in", "class": "panel-title", "class": "panel-body", "class": "panel panel-default", "class": "panel-collapse collapse in"})
                print("scraping data for state "+state.capitalize()+" for "+str(year).capitalize())
                data = []
                for i in bankruptcy_element:
                    for xmlfile in i.findAll('a', href=True):
                        if ("pdf" in (xmlfile['href'])):
                            xmlfile['href'] = xmlfile['href'].replace(".pdf", "/mods.xml")
                            xmlfile['href'] = xmlfile['href'].replace("/pdf", "")
                            xmlfile['href'] = xmlfile['href'].replace("/pkg/", "/")
                            xmlfile['href'] = xmlfile['href'].replace("/content", "")
                            xmlfile['href'] = "https://www.govinfo.gov/metadata/granule"+xmlfile['href']
                            data.append(xmlfile['href'])
                return data
    except Exception as e:
        print(e)

def lambda_handler(event, context):
    s3 = boto3.client('s3')
    today_date = datetime.today().strftime('%Y-%m-%d')
    s3.put_object(Bucket='w-zone', Key='Banktcy/'+today_date+"/xmlfiles.txt", Body=scrape_data())
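For context, the error message points at the Chrome browser itself rather than at chromedriver: packaging only chromedriver is not enough, because the Lambda runtime does not ship a Chrome binary, so Selenium has nothing to launch. Below is a minimal sketch of the usual workaround, assuming a headless Chromium build is bundled in the deployment package or a Lambda layer; the paths /opt/chrome/chrome and /opt/chromedriver are assumptions, not fixed Lambda locations.

# Hedged sketch, not the poster's code: point Selenium at a bundled headless
# Chromium via binary_location. The binary/driver paths are assumptions for a
# Lambda layer; adjust them to wherever the files actually land in /opt or /var/task.
from selenium import webdriver

def build_driver():
    options = webdriver.ChromeOptions()
    options.binary_location = "/opt/chrome/chrome"    # assumed location of the Chrome binary
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")              # commonly needed in Lambda's restricted sandbox
    options.add_argument("--single-process")
    options.add_argument("--disable-dev-shm-usage")   # /dev/shm is very small in Lambda
    return webdriver.Chrome(executable_path="/opt/chromedriver", options=options)

Whichever Chromium build is bundled, its major version needs to match the chromedriver version (111 in this case).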

Related

Unable to scrape texts from URLs

I have been struggling to scrape the content/text of news articles from each URL. The extraction of the URLs works fine, but scraping the text from each URL has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)
    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)
print(article_link)
print(full_text)
I tried to use BeautifulSoup, but it doesn't seem to work. I would be grateful for any help.
First off, you should unindent the second for loop; it shouldn't run inside the first loop (otherwise you re-scrape the growing list of links on every iteration and collect the same information many extra times).
Second, the requests you send return a page with the content blocked (I could not find a way around this by inserting headers into the request). What you can do instead is use the driver to load each of the links and grab the text from there. Here is how you could do that:
for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)
# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))
article_link = []
full_text = []
# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)
for link in article_link[:5]:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
print(article_link)
print(full_text)
The best course of action is to use Selenium throughout, as the site's content is protected by Cloudflare. Although @Andrew Ryan has already addressed the issue, I thought I'd offer a shorter version, since this answer was already halfway written at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
link = 'https://www.iol.co.za/news/south-africa/eastern-cape'
def get_links_and_texts(driver, url):
    driver.get(url)
    for article_link in [i.get_attribute('href') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//article/a[starts-with(@class,'Link__StyledLink')]")))]:
        driver.get(article_link)
        art_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)

How to click the next span value which has same class name

import urllib3
import certifi
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
import time
import ssl
http = urllib3.PoolManager(ca_certs=certifi.where())
chrome_options = Options()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
URL= "https://physicians.wustl.edu/"
driver.get(URL)
time.sleep(5)
driver.find_element_by_link_text("Find a Doctor").click()
find_doc = driver.current_url
print(find_doc)
driver.get(find_doc)
# content = driver.page_source
# print(content)
response = http.request('GET', find_doc)
url_text = response.data #text
time.sleep(10)
count = len(driver.find_elements_by_xpath("//span[@class='entry-title-link']"))
print(count)
s = driver.find_element_by_css_selector("span[class='entry-title-link']") #firstpage click
s.click()
urls = []
provider = []
print(driver.current_url)
urls.append(driver.current_url)
name = driver.find_element_by_css_selector("h1[class='washu-ppi-name entry-title']").text
print(name)
provider.append(name)
specialization = driver.find_element_by_css_selector("ul[class='wuphys-specialties']").text
print(specialization)
location= driver.find_element_by_css_selector("a[class='wuphys-addr name']").text
print(location)
time.sleep(5)
driver.find_element_by_css_selector("a[href='https://physicians.wustl.edu/find-a-doctor/']").click()
time.sleep(10)
The spans share the same class name, but each sits inside a different div. On the page there is a list of doctors' names; after clicking one I get that doctor's details, and then I need to move on to the next doctor, whose link has the same class name.
I think you are looking for something like this (looping through all the doctor links and collecting info from each). Below is a basic version that you can extend to pull more data for each doctor.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
import time
driver = webdriver.Chrome(options=chrome_options, executable_path="D:\\python works\\driver\\chromedriver.exe")
driver.maximize_window()
driver.get("https://physicians.wustl.edu/")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.LINK_TEXT, "Find a Doctor"))).click()
print(driver.current_url)
doc_cnt = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[@class='entry-title-link']")))
print(len(doc_cnt))
doc_list=[] # to append all the doctor urls into the list for further processing, if required.
for doc in doc_cnt:
    ActionChains(driver).key_down(Keys.CONTROL).click(doc).key_up(Keys.CONTROL).perform()
    driver.switch_to.window(driver.window_handles[1])
    doc_list.append(driver.current_url)
    # ... you could include any code of yours related to each doctor here...
    # After this the tab is closed and the next doctor link is opened
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    time.sleep(1)
print(doc_list)
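As a hedged illustration of the "include any code of yours related to each doctor here" placeholder, the per-doctor block could reuse the selectors already present in the question (washu-ppi-name entry-title, wuphys-specialties). Treat it as a sketch to adapt, not a tested implementation:

# Hedged sketch for the per-doctor step inside the loop above, after switching
# to the newly opened tab. Selectors are taken from the question's own code.
doctor_details = []   # define once, before the for-loop

# ...inside the loop, right after driver.switch_to.window(driver.window_handles[1]):
name = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "h1.washu-ppi-name.entry-title"))
).text
specialties = driver.find_element(By.CSS_SELECTOR, "ul.wuphys-specialties").text
doctor_details.append({"url": driver.current_url, "name": name, "specialties": specialties})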

Why is my Selenium code returning only half of the data I requested

Recently, I wrote a Selenium web scraper that is meant to extract all the information from a table containing data on every presidential election held in the United States. The table is on this Wikipedia page.
The problem is that the code returns all the info I need when I write the result to a .txt file, but whenever I try to print that same result in my editor, it returns only half of the data I need. I do not understand what the problem is. Can someone help me out?
Here is my code.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas
# using selenium and chromedriver to extract the javascript wikipage
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(r'web scraping master/chromedriver', options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
# waiting for the javascript to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source

soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
print(table)

with open("loge.txt", "w") as f:  # Only part I added to the code
    f.write(str(table))
I'm not really sure what the problem was, but this works as expected. I've changed loge.txt to loge.html, and the code dumps the entire table.
Mind trying this?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source

soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})

with open("loge.html", "w") as f:
    f.write(str(table))
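As an optional follow-up (not part of the answer above): the question's script already imports pandas, so once the table HTML is captured it could be parsed into a DataFrame with pandas.read_html, roughly like this (read_html needs lxml or html5lib installed):

# Hedged follow-up sketch: turn the scraped <table> into a DataFrame.
from io import StringIO
import pandas as pd

tables = pd.read_html(StringIO(str(table)))   # read_html returns a list of DataFrames
elections = tables[0]                         # the wikitable parsed above
print(elections.shape)
print(elections.head())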

selenium in python is skipping articles while trying to scrape the data

I'm trying to extract data from articles using Selenium in Python. The code identifies the articles, but while running the loop a few articles are skipped seemingly at random. Any help resolving this issue would be appreciated.
#Importing libraries
import requests
import os
import json
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
import time
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import traceback
from webdriver_manager.chrome import ChromeDriverManager
#opening a chrome instance
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r"C:/selenium/chromedriver.exe")
#getting into the website
driver.get('https://academic.oup.com/rof/issue/2/2')
#getting the articles
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, '/html/body/div[3]/main/section/div/div/div[1]/div/div[3]/div[2]/div[3]/div/div/div/div/h5')))
#loop to get in and out of articles
for article in articles:
    try:
        ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
        window1 = driver.window_handles[1]
        driver.switch_to_window(window1)
        driver.close()
        driver.switch_to_window(window0)
    except:
        print("couldnt get the article")
First, to collect all the article elements, you can use this CSS selector:
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.customLink.item-title a')))
Second, this method is wrong:
driver.switch_to_window(window1)
It should be:
driver.switch_to.window(window1)
See the difference between _ and . above.
Third, you forgot to initialize the window0 variable:
window0 = driver.window_handles[0]
And finally, try the following code:
#getting into the website
driver.get('https://academic.oup.com/rof/issue/2/2')
#getting the articles
articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.customLink.item-title a')))
#loop to get in and out of articles
for article in articles:
    try:
        ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
        window1 = driver.window_handles[1]
        driver.switch_to.window(window1)
        driver.close()
        window0 = driver.window_handles[0]
        driver.switch_to.window(window0)
    except:
        print("couldnt get the article")
driver.quit()

Scrape a file behind a login using requests in Python 3.8 - connection problem

I'd like to connect to the website, then go to the account page and get my invoice.
I have already read some topics on this, but my script still does not work.
This is my code. Do you see something wrong, or am I missing something about the connection?
Edit: I tried it on another website and it works, so I think I am missing something about how this website handles the connection.
import requests
from bs4 import BeautifulSoup
payload = {'email_address': 'XXX#gmail.com', 'password': 'XXX'}
url = 'https://www.maxicoffee.com/login.php'
account_url = "https://www.maxicoffee.com/account.php"
with requests.Session() as session:
    session.get(url)
    session.post(url, data=payload)
    account_page = session.get(account_url)
    soup = BeautifulSoup(account_page.content, "html.parser")
    invoice = soup.find("Facture imprimable")
    print(invoice)
I think it may be a good idea to use Selenium instead. Read the documentation for the details, but here is a rough idea of how to use it.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import os
import time
chromedriver = "driver/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
actions = ActionChains(driver)
username_login = 'XXX#gmail.com'
password_login = 'XXX'
def login(driver, username_login, password_login):
    login_link = 'https://www.maxicoffee.com/account.php'
    driver.get(login_link)
    time.sleep(5)
    username_input = driver.find_element_by_xpath('that input element xpath')
    password_input = driver.find_element_by_xpath('that input element xpath')
    username_input.send_keys(username_login)
    password_input.send_keys(password_login)
    login_button = driver.find_element_by_xpath('that login button element xpath')
    login_button.click()
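The answer stops at the login step itself. As a hedged sketch of how it might continue (the partial link text "Facture imprimable" is taken from the question, and the XPath placeholders above still have to be filled in with the real selectors), the function could be called and followed by opening the account page:

# Hedged usage sketch, not part of the original answer.
login(driver, username_login, password_login)

# Once logged in, open the account page and look for the invoice link
# ("Facture imprimable" is the link text the question was searching for).
driver.get('https://www.maxicoffee.com/account.php')
time.sleep(5)
invoice_link = driver.find_element_by_partial_link_text('Facture imprimable')
print(invoice_link.get_attribute('href'))
driver.quit()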
