Python selenium - "No modal dialog is currently open exception in selenium" - python-3.x

I am reading input URLs and trying to load each one in the browser (Firefox, latest version). The script works for most of the URLs, but a few of them return this error:
"No modal dialog is currently open"
I tried adding driver.switch_to_alert().accept(), but it did not help. I usually end up closing the browser and restarting everything from scratch.
Please find the code :
capabilities = DesiredCapabilities.FIREFOX.copy()
capabilities['marionette'] = True
capabilities['acceptSslCerts'] = True
driver = webdriver.Firefox(capabilities=capabilities)
driver.implicitly_wait(3)
print ("Number of rows to be processed is:" + str(ws1.max_row))
for row in range(2,ws1.max_row+1):
try:
region = ws1["E"+str(row)].value
# Reading Row 'row'
print("Reading row #"+str(row))
url = ws1["C"+str(row)].value
regex = df[df['Country'] == region]['Regex'].values[0]
#Load mainpage or input URL
i = 0
while i < 3:
try:
driver.get("http://"+url)
driver.switch_to_alert().accept()
time.sleep(5)
break
except (TimeoutException,WebDriverException,NoSuchElementException) as e:
print(e,'Retrying...',i+1)
i += 1
Sample URLs :
afilias.info
www.nuwavenow.com
www.IntoTomorrow.com
www.picobrew.com

Related

Cannot get text from found list though text exists

I am trying to scrape reference texts from this: paper
When I go to the site, the references section does not show up. To see them, I should either click "References" or "+Show References". I am trying to find references link and click it.
Here is my code:
# Open the article page and look for the link that leads to its reference list.
browser.get('https://doi.org/10.3847/1538-4357/abb3c9')
refCheck = ["references", "cited literature", "literature cited", "refs"]
href = None
# '@' selects an attribute in XPath; '//a[#href]' is not a valid expression.
# The anchors are queried once instead of re-running the search for every index.
for h, anchor in enumerate(browser.find_elements(By.XPATH, '//a[@href]')):
    textSearch = anchor.text
    candidate = anchor.get_attribute("href")
    # get_attribute may return None, so test truthiness rather than len().
    if textSearch.lower() in refCheck and candidate:
        href = candidate
        print(h)
        print(textSearch)
        print(href)
        break
# Only navigate when a references link was actually found; otherwise stay on
# the article page instead of following whichever anchor happened to be last.
if href:
    browser.get(href)
# Find the first <ol>/<ul> whose attributes mention a "references" keyword and
# collect the text of its direct <li> children.
attrList = []
refCheck = ["references", "cited literature", "literature cited", "refs"]
tags = ["ol", "ul"]
refElement = None
for t in tags:
    for i, candidate in enumerate(browser.find_elements(By.TAG_NAME, t)):
        for attr in candidate.get_property('attributes'):
            attrName = attr['name']
            attrValue = attr['value']
            if any(rc in attrName.lower() or rc in attrValue.lower() for rc in refCheck):
                attrList.extend([t, i, attrName, attrValue])
                print(attrName)
                print(attrValue)
                # Keep the matched element itself so later code does not depend
                # on the loop variables still pointing at it.
                refElement = candidate
                break
        if refElement is not None:
            break
    # Stop scanning further tag names once a list matched; otherwise the next
    # tag would replace the match with its own first element.
    if refElement is not None:
        break

refList = []
cnt = 0
if refElement is not None:
    items = refElement.find_elements(By.XPATH, './li')
    print(len(items))
    for f in items:
        # .text only reports rendered text, so collapsed/hidden entries come
        # back empty; textContent returns the text regardless of visibility.
        itemText = (f.get_attribute("textContent") or "").strip()
        print(itemText)
        if itemText:
            refList.append(itemText)
            cnt += 1
print(cnt)
However, the returned text is always empty.
PS. By the way, I have tried to click href I reached instead of browser.get(href), however it does not work as well. When I tried to get the hyperlink through get_attributes("href"), it always returned a string so could not click.
How should I get that text?
EDIT:
Found the answer here: link
Using get_attribute("textContent") solved my issue.
Alternatively, you can use the crossref.org API: if you look up a work by its DOI, it returns a JSON response that contains a 'reference' field. You can then process the JSON however you want.
import requests
def _normalize_doi(doi):
    """Return *doi* without a leading resolver prefix such as 'doi.org/'."""
    doi = doi.strip()
    lowered = doi.lower()
    prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "dx.doi.org/",
        "doi.org/",
        "doi:",
    )
    for prefix in prefixes:
        if lowered.startswith(prefix):
            return doi[len(prefix):]
    return doi


def get_ref(doi):
    """Fetch the reference list Crossref holds for a DOI.

    Args:
        doi: The DOI to look up. A resolver prefix ('doi.org/...',
            'https://doi.org/...', 'doi:...') is stripped before the request.

    Returns:
        The value of message['reference'] from the JSON response, or None when
        the request does not return HTTP 200 or the record has no 'reference'
        entry.
    """
    url = f'https://api.crossref.org/works/{_normalize_doi(doi)}'
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None
    message = response.json().get('message', {})
    # Not every record carries a 'reference' entry; report that as None
    # instead of raising KeyError.
    return message.get('reference')
# The works endpoint is addressed by the bare DOI, so the resolver host
# ('doi.org/') is left out of the identifier.
doi = '10.3847/1538-4357/abb3c9'
# get_ref is the lookup function defined in this snippet; get_ref_count is not
# defined anywhere and raised NameError.
references = get_ref(doi)

Downloading files by crawling sub-URLs in python

I am trying to download documents (mainly in pdf) from a large number of web links like the following:
https://projects.worldbank.org/en/projects-operations/document-detail/P167897?type=projects
https://projects.worldbank.org/en/projects-operations/document-detail/P173997?type=projects
https://projects.worldbank.org/en/projects-operations/document-detail/P166309?type=projects
However, the pdf files are not directly accessible from these links. One needs to click on sub-URLs to access the pdfs. Is there any way to crawl the sub-URLs and download all the related files from them? I am trying it with the following codes but have not had any success so far specifically for these URLs listed here.
Please let me know if you need any further clarifications. I would be happy to do so. Thank you.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
class MySpider(Spider):
    # Spider that starts from three World Bank project pages, follows document
    # links found on them and tries to store each response as a PDF under ./pdfs.
    name = 'download_pdf'
    # NOTE(review): the start URLs are on projects.worldbank.org while only
    # www.worldbank.org is listed here — confirm how the framework applies
    # allowed_domains, this may filter out the pages of interest.
    allowed_domains = ["www.worldbank.org"]
    start_urls = [
        "https://projects.worldbank.org/en/projects-operations/document-detail/P167897?type=projects",
        "https://projects.worldbank.org/en/projects-operations/document-detail/P173997?type=projects",
        "https://projects.worldbank.org/en/projects-operations/document-detail/P166309?type=projects"
    ] # Entry page

    def afterResponse(self, response, url, error=None, extra=None):
        # Try to save the response as a PDF; when it is not saved, hand the
        # response back to the base Spider implementation.
        # Returns None when `extra` is missing, when the file was saved, or
        # when any exception was caught and printed.
        if not extra:
            print ("The version of library simplified_scrapy is too old, please update.")
            SimplifiedMain.setRunFlag(False)
            return
        try:
            path = './pdfs'
            # create folder start
            # When the source URL contains 'year/<value>?', <value> is appended
            # to the target directory and that directory is created.
            srcUrl = extra.get('srcUrl')
            if srcUrl:
                index = srcUrl.find('year/')
                year = ''
                if index > 0:
                    year = srcUrl[index + 5:]
                    index = year.find('?')
                    if index>0:
                        path = path + year[:index]
                        utils.createDir(path)
            # create folder end
            # File name = last path segment of the URL, without its query string.
            path = path + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else: # If it's not a pdf, leave it to the frame
                return Spider.afterResponse(self, response, url, error, extra)
        except Exception as err:
            # Any failure is only printed; the method then returns None.
            print(err)

    def extract(self, url, html, models, modelNames):
        # Collect the links to follow from a downloaded page.
        # Returns {"Urls": [...]} where every entry carries an absolute "url"
        # and the "srcUrl" of the page the crawl started from.
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            # Fallback selector used when the first one matched nothing.
            # NOTE(review): 'div.hidden-md hidden-lg' presumably targets a div
            # with both classes — verify against the selector syntax.
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            # Set root url start
            a["srcUrl"] = url.get('srcUrl')
            if not a['srcUrl']:
                a["srcUrl"] = url.url
            # Set root url end
            urls.append(a)
        return {"Urls": urls}

    # Download again by resetting the URL. Called when you want to download again.
    def resetUrl(self):
        Spider.clearUrl(self)
        Spider.resetUrlsTest(self)
SimplifiedMain.startThread(MySpider()) # Start download
There's an API endpoint that contains the entire response you see on the web-site along with... the URL to the document pdf. :D
So, you can query the API, get the URLS, and finally fetch the documents.
Here's how:
import requests
# For each project id, ask the World Bank search API for its documents and
# save every document that exposes a "pdfurl" into the working directory.
pids = ["P167897", "P173997", "P166309"]
for pid in pids:
    end_point = (
        "https://search.worldbank.org/api/v2/wds?"
        "format=json&includepublicdocs=1&"
        "fl=docna,lang,docty,repnb,docdt,doc_authr,available_in&"
        f"os=0&rows=20&proid={pid}&apilang=en"
    )
    search_result = requests.get(end_point).json()
    documents = search_result["documents"]
    for document_data in documents.values():
        try:
            pdf_url = document_data["pdfurl"]
            print(f"Fetching: {pdf_url}")
            # The local file is named after the last path segment of the URL.
            target_name = pdf_url.rsplit("/")[-1]
            with open(target_name, "wb") as pdf:
                pdf.write(requests.get(pdf_url).content)
        except KeyError:
            # Entries without a "pdfurl" key are skipped.
            continue
Output: (fully downloaded .pdf files)
Fetching: http://documents.worldbank.org/curated/en/106981614570591392/pdf/Official-Documents-Grant-Agreement-for-Additional-Financing-Grant-TF0B4694.pdf
Fetching: http://documents.worldbank.org/curated/en/331341614570579132/pdf/Official-Documents-First-Restatement-to-the-Disbursement-Letter-for-Grant-D6810-SL-and-for-Additional-Financing-Grant-TF0B4694.pdf
Fetching: http://documents.worldbank.org/curated/en/387211614570564353/pdf/Official-Documents-Amendment-to-the-Financing-Agreement-for-Grant-D6810-SL.pdf
Fetching: http://documents.worldbank.org/curated/en/799541612993594209/pdf/Sierra-Leone-AFRICA-WEST-P167897-Sierra-Leone-Free-Education-Project-Procurement-Plan.pdf
Fetching: http://documents.worldbank.org/curated/en/310641612199201329/pdf/Disclosable-Version-of-the-ISR-Sierra-Leone-Free-Education-Project-P167897-Sequence-No-02.pdf
and more ...

Using python, how to attach a contact in Whatsapp and send to multiple numbers?

I am struggling to click over an element. I am not able to click on the contact element from the attach icon in whatsapp chat. i am using url= "web.whatsapp.com".
Also is there any other option of doing the task?
I get "NoSuchElementException" error when i run the below code. Please help me to select it correctly.
# Open the chat, send the text message, then attach a contact and send it.
try:
    # '@' selects an attribute in XPath; "[#id = ...]" is not valid syntax and
    # can never match an element.
    driver.find_element(By.XPATH, "//a[@id = 'action-button']").click()
    time.sleep(2)
    msg_box_xpath = '//*[@id="main"]/footer/div[1]/div[2]/div/div[2]'
    element_presence(By.XPATH, msg_box_xpath, 30)
    msg_box = driver.find_element(By.XPATH, msg_box_xpath)
    msg_box.send_keys(message + Keys.ENTER)
    time.sleep(2)
    driver.find_element(By.XPATH, "//div[@title = 'Attach']").click()
    # <svg> nodes belong to the SVG namespace, so a plain "//svg" step does not
    # match them in an HTML page; select by element name instead.
    contact_icon = driver.find_element(
        By.XPATH, "//*[name() = 'svg'][@id = 'contact-Layer_1']")
    contact_icon.click()
    time.sleep(2)
    search_contact = driver.find_element(By.XPATH, "//input[@title = 'search']")
    search_contact.click()
    search_contact.send_keys("pravin tcs")
    driver.find_element(By.XPATH, "//div[@class = '_1kfc8_2uQfJ']").click()
    driver.find_element(By.XPATH, "//span[@data-icon = 'send-light']").click()
except Exception as e:
    print("Invalid phone no :"+str(phone_no))
    # Show the underlying error too: a locator failure is otherwise
    # indistinguishable from a genuinely invalid number.
    print("Reason: " + repr(e))

Selenium (Python) - waiting for a download process to complete using Chrome web driver

I'm using selenium and python via chromewebdriver (windows) in order to automate a task of downloading large amount of files from different pages.
My code works, but the solution is far from ideal: the function below clicks on the website button that initiating a java script function that generating a PDF file and then downloading it.
I had to use a static wait for the download to complete (ugly). I cannot check the file system to verify when the download has finished, since I'm using multithreading (downloading lots of files from different pages at once) and the file names are generated dynamically by the website itself.
My code:
def file_download(num, drivervar):
Counter += 1
try:
drivervar.get(url[num])
download_button = WebDriverWait(drivervar, 20).until(EC.element_to_be_clickable((By.ID, 'download button ID')))
download_button.click()
time.sleep(10)
except TimeoutException: # Retry once
print('Timeout in thread number: ' + str(num) + ', retrying...')
.....
Is it possible to determine download completion in webdriver? I want to avoid using time.sleep(x).
Thanks a lot.
You can get the status of each download by visiting chrome://downloads/ with the driver.
To wait for all the downloads to finish and to list all the paths:
def every_downloads_chrome(driver):
    """Wait predicate: report the downloaded files once all items are complete.

    Navigates `driver` to chrome://downloads/ unless it is already there, then
    runs a script that returns the file URLs of all items only when every
    item's state is "COMPLETE"; otherwise the script returns no value.
    """
    downloads_prefix = "chrome://downloads"
    all_complete_script = """
var items = document.querySelector('downloads-manager')
.shadowRoot.getElementById('downloadsList').items;
if (items.every(e => e.state === "COMPLETE"))
return items.map(e => e.fileUrl || e.file_url);
"""
    on_downloads_page = driver.current_url.startswith(downloads_prefix)
    if not on_downloads_page:
        driver.get("chrome://downloads/")
    return driver.execute_script(all_complete_script)
# waits for all the files to be completed and returns the paths
paths = WebDriverWait(driver, 120, 1).until(every_downloads_chrome)
print(paths)
Was updated to support changes till version 81.
I had the same problem and found a solution. You can check whether or not a .crdownload file is in your download folder. If there are no files with the .crdownload extension in the download folder, then all your downloads are complete. This only works for Chrome and Chromium, I think.
def downloads_done(download_dir="/downloads", poll_interval=0.5):
    """Block until no partial Chrome download is left in `download_dir`.

    Args:
        download_dir: Directory to watch; defaults to the "/downloads" path
            that was previously hard-coded.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        None, once no file name in the directory contains ".crdownload".
    """
    # Re-scan in a plain loop: the earlier version referenced an undefined
    # name, never left its `while True`, and recursed on every pending file.
    while any(".crdownload" in filename for filename in os.listdir(download_dir)):
        time.sleep(poll_interval)
Whenever you call downloads_done(), it keeps looping until all downloads are completed. If you are downloading massive files (e.g. 80 gigabytes), I don't recommend this, because the function can then reach the maximum recursion depth.
2020 edit:
def wait_for_downloads(download_dir="/downloads", poll_interval=2):
    """Print a progress line until no ".crdownload" file remains.

    Args:
        download_dir: Directory to watch; defaults to the "/downloads" path
            that was previously hard-coded.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        None. Prints "Waiting for downloads", one dot per poll, then "done!".
    """
    print("Waiting for downloads", end="")
    # A generator lets any() stop at the first partial file it sees.
    while any(name.endswith(".crdownload") for name in os.listdir(download_dir)):
        time.sleep(poll_interval)
        print(".", end="")
    print("done!")
The "end" keyword argument in print() defaults to a newline, but here we replace it with an empty string.
While there are filenames in the /downloads folder that end with .crdownload,
sleep for 2 seconds and print one dot to the console without a newline.
I don't really recommend using selenium anymore after finding out about requests but if it's a very heavily guarded site with cloudflare and captchas etc then you might have to resort to selenium.
With Chrome 80, I had to replace the answer from @florent-b with the code below:
def every_downloads_chrome(driver):
    """Wait predicate: return the paths of the downloads that are complete.

    Navigates `driver` to chrome://downloads/ unless it is already there, then
    runs a script that keeps the items whose state is 'COMPLETE' and maps each
    to the first available of filePath, file_path, fileUrl or file_url.
    """
    downloads_prefix = "chrome://downloads"
    completed_paths_script = """
return document.querySelector('downloads-manager')
.shadowRoot.querySelector('#downloadsList')
.items.filter(e => e.state === 'COMPLETE')
.map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
"""
    on_downloads_page = driver.current_url.startswith(downloads_prefix)
    if not on_downloads_page:
        driver.get("chrome://downloads/")
    return driver.execute_script(completed_paths_script)
I believe this is retro-compatible, I mean this shall be working with older versions of Chrome.
There are issues with opening chrome://downloads/ when running Chrome in headless mode.
The following function uses a composite approach that works whether the mode is headless or not, choosing the better approach available in each mode.
It assumes that the caller clears all files downloaded at file_download_path after each call to this function.
import os
import logging
from selenium.webdriver.support.ui import WebDriverWait
def wait_for_downloads(driver, file_download_path, headless=False, num_files=1):
    """Wait until `num_files` downloads have finished.

    In headless mode the download directory is polled; otherwise a temporary
    tab on chrome://downloads/ is used to read the download states.

    Args:
        driver: Selenium Chrome driver; only used when `headless` is False.
        file_download_path: Directory the browser downloads into. The caller
            is expected to clear it between calls.
        headless: Whether Chrome runs headless.
        num_files: Number of completed files to wait for.

    Returns:
        None. In headless mode a timeout is logged as an error; in normal mode
        the exception raised by WebDriverWait on timeout propagates.
    """
    # Imported here because the surrounding imports (os, logging,
    # WebDriverWait) do not provide `time`, which the polling loop needs.
    import time

    max_delay = 60
    interval_delay = 0.5
    if headless:
        total_delay = 0
        done = False
        while not done and total_delay < max_delay:
            # Ignore the .DS_Store file macOS adds to directories.
            files = [f for f in os.listdir(file_download_path) if f != '.DS_Store']
            still_downloading = any(f.endswith('.crdownload') for f in files)
            if len(files) == num_files and not still_downloading:
                done = True
            else:
                total_delay += interval_delay
                time.sleep(interval_delay)
        if not done:
            logging.error("File(s) couldn't be downloaded")
    else:
        def all_downloads_completed(driver, num_files):
            return driver.execute_script("""
                var items = document.querySelector('downloads-manager').shadowRoot.querySelector('#downloadsList').items;
                var i;
                var done = false;
                var count = 0;
                for (i = 0; i < items.length; i++) {
                    if (items[i].state === 'COMPLETE') {count++;}
                }
                if (count === %d) {done = true;}
                return done;
                """ % (num_files))

        # Remember the current window so it can be restored even when other
        # tabs are open; index 1/0 is only right for a single-window session.
        original_window = driver.current_window_handle
        driver.execute_script("window.open();")
        # switch_to.window replaces switch_to_window, which newer Selenium
        # releases no longer provide.
        driver.switch_to.window(driver.window_handles[-1])
        try:
            driver.get('chrome://downloads/')
            # Wait for downloads to complete
            WebDriverWait(driver, max_delay, interval_delay).until(
                lambda d: all_downloads_completed(d, num_files))
            # Clear all downloads from chrome://downloads/
            driver.execute_script("""
                document.querySelector('downloads-manager').shadowRoot
                .querySelector('#toolbar').shadowRoot
                .querySelector('#moreActionsMenu')
                .querySelector('button.clear-all').click()
                """)
        finally:
            # Always close the helper tab, including on timeout, so it is not
            # left open for the rest of the session.
            driver.close()
            driver.switch_to.window(original_window)
import os
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
class MySeleniumTests(unittest.TestCase):
    """Download test that waits first for the file to appear, then to stop growing."""

    selenium = None

    # unittest invokes setUpClass on the class itself, so it has to be a
    # classmethod; without the decorator the call fails for lack of `cls`.
    @classmethod
    def setUpClass(cls):
        cls.selenium = webdriver.Firefox(...)
        ...

    def test_download(self):
        os.chdir(self.download_path) # default download directory
        # click the button
        self.selenium.get(...)
        self.selenium.find_element_by_xpath(...).click()

        # waiting server for finishing inner task
        def download_begin(driver):
            # Wait predicate: true once anything shows up in the directory.
            if len(os.listdir()) == 0:
                time.sleep(0.5)
                return False
            else:
                return True
        WebDriverWait(self.selenium, 120).until(download_begin) # the max waiting time is 120s

        # waiting server for finishing sending.
        # if size of directory is changing, wait
        def download_complete(driver):
            # Compare the total size of the directory across 0.2 s intervals
            # and return once two consecutive measurements are equal.
            sum_before=-1
            sum_after=sum([os.stat(file).st_size for file in os.listdir()])
            while sum_before != sum_after:
                time.sleep(0.2)
                sum_before = sum_after
                sum_after = sum([os.stat(file).st_size for file in os.listdir()])
            return True
        WebDriverWait(self.selenium, 120).until(download_complete) # the max waiting time is 120s
You must do these thing
Wait for server to finish inner business( for example, query from database).
Wait for server to finish sending the files.
(my English is not very well)
To get back more than one item, I had to replace the answer from @thdox with the code below:
def every_downloads_chrome(driver):
    """Wait predicate: return all download paths once every item is complete.

    Navigates `driver` to chrome://downloads/ unless it is already there, then
    runs a script that returns one path per item (first available of filePath,
    file_path, fileUrl, file_url) only when every item's state is 'COMPLETE';
    otherwise the script returns no value.
    """
    downloads_prefix = "chrome://downloads"
    all_complete_script = """
var elements = document.querySelector('downloads-manager')
.shadowRoot.querySelector('#downloadsList')
.items
if (elements.every(e => e.state === 'COMPLETE'))
return elements.map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
"""
    on_downloads_page = driver.current_url.startswith(downloads_prefix)
    if not on_downloads_page:
        driver.get("chrome://downloads/")
    return driver.execute_script(all_complete_script)
This may not work for all usecases but for my simple need to wait for one pdf to download it works great. Based off of Walter's comment above.
def get_non_temp_len(download_dir):
    """Count the entries in `download_dir` that are not temporary download files.

    Entries whose names end in '.tmp' or '.crdownload' are ignored.
    """
    temp_suffixes = ('.tmp', '.crdownload')
    finished = 0
    for entry in os.listdir(download_dir):
        if entry.endswith(temp_suffixes):
            continue
        finished += 1
    return finished
download_dir = 'your/download/dir'
original_count = get_non_temp_len(download_dir) # get the file count at the start
# do your selenium stuff
while original_count == get_non_temp_len(download_dir):
time.sleep(.5) # wait for file count to change
driver.quit()
I had the same problem and this method worked for me.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from threading import Thread
import os
import datetime
def checkFilePresence(downloadPath, numberOfFilesInitially, artistName,
                      songTitle, pollInterval=0.5, timeout=None):
    """Block until a new .mp3 file shows up under `downloadPath`.

    A file counts as new when its creation/change time (os.path.getctime) is
    later than the moment this function was called.

    Args:
        downloadPath: Directory that is watched (searched recursively).
        numberOfFilesInitially: Entry count of `downloadPath` before the
            download started; the tree is only searched once the current
            count exceeds it.
        artistName: Unused; kept for interface compatibility.
        songTitle: Unused; kept for interface compatibility.
        pollInterval: Seconds to sleep between checks.
        timeout: Optional maximum number of seconds to wait. None (the
            default) waits indefinitely.

    Returns:
        None, as soon as a matching file is found.

    Raises:
        TimeoutError: If `timeout` is given and expires first.
    """
    timeNow = datetime.datetime.now()
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        if len(os.listdir(downloadPath)) > numberOfFilesInitially:
            for folder, _subfolders, files in os.walk(downloadPath):
                for file in files:
                    # Check the extension first so that timestamps are only
                    # read for .mp3 files, not for partial download files.
                    if not file.endswith('.mp3'):
                        continue
                    try:
                        changedAt = datetime.datetime.fromtimestamp(
                            os.path.getctime(os.path.join(folder, file)))
                    except OSError:
                        # The file disappeared between listing and stat.
                        continue
                    if changedAt > timeNow:
                        return
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError("No new .mp3 file appeared in " + str(downloadPath))
        # Sleep between checks instead of spinning at full CPU.
        time.sleep(pollInterval)
This code works in headless mode and returns the downloaded file name (based on
@protonum's code):
def wait_for_downloads(download_path, max_delay=30, interval_delay=0.5):
    """Wait for a Chrome download to finish and return the final file path.

    The download is tracked through its temporary ".crdownload" file: it has
    to be seen at least once and then disappear for the wait to succeed.

    Args:
        download_path: Directory the browser downloads into.
        max_delay: Maximum number of seconds to wait (previously fixed at 30).
        interval_delay: Seconds between directory scans (previously fixed
            at 0.5).

    Returns:
        download_path + '/' + the tracked file name without its ".crdownload"
        suffix. If no partial file was ever seen, the name part is empty and
        an error is logged.
    """
    suffix = ".crdownload"
    total_delay = 0
    file = ''
    done = False
    while not done and total_delay < max_delay:
        files = [f for f in os.listdir(download_path) if f.endswith(suffix)]
        # Any previously seen name counts, including one-character names.
        if not files and file:
            done = True
        if files:
            file = files[0]
        time.sleep(interval_delay)
        total_delay += interval_delay
    if not done:
        logging.error("File(s) couldn't be downloaded")
    # Strip only the trailing marker, not occurrences elsewhere in the name.
    final_name = file[:-len(suffix)] if file.endswith(suffix) else file
    return download_path + '/' + final_name
def wait_for_download_to_be_don(self, path_to_folder, file_name):
    """Wait up to 60 seconds for a downloaded file to exist.

    Args:
        path_to_folder: Directory path, concatenated directly with
            `file_name` (so it must end with a path separator).
        file_name: Name of the expected file.

    Raises:
        AssertionError: If the file still does not exist after the wait.
    """
    max_time = 60
    # The loop and the final check both read `time_counter`; it was previously
    # initialised under a different name, which raised NameError.
    time_counter = 0
    target = path_to_folder + file_name
    while not os.path.exists(target) and time_counter < max_time:
        sleep(0.5)
        time_counter += 0.5
    if time_counter >= max_time:
        assert os.path.exists(target), "The file wasn't downloaded"
When using test automation, its crucial that developers make the software testable. It is your job to check the software combined with the testability, meaning that you need to request a spinner or a simple HTML tag which indicates when the download is done successfully.
In a case as yours, where you cannot check it in the UI and you cannot check in system, this is the best way to solve it.

ERROR - Message: Service /usr/bin/safaridriver unexpectedly exited. Status code was: 1

I get this error when I run the open_hyperlink_1 function in a loop. The first and fourth elements of the loop load correctly. When I run the function without the loop, all links open correctly.
def open_hyperlink_1(inp, end = ''):
    """Open 'https://www.example.ru' + `end` in Safari and read the outcomes list.

    Prints '+' on success and '-' when any step fails; failures are logged to
    the "root.open_hyperlink_1" logger.

    Args:
        inp: Unused; kept for interface compatibility.
        end: Path appended to the base URL.
    """
    op_log = logging.getLogger("root.open_hyperlink_1")
    op_log.debug('*' + 'https://www.example.ru'+end + '*')
    driver = None
    try:
        driver = webdriver.Safari()
        driver.get('https://www.example.ru' + end)
        # '@' selects an attribute in XPath; '[#class=...]' is not valid syntax.
        select =driver.find_element_by_xpath('//div[@class="outcomesBlock firstBlock"]//ul')
        link_text = select.get_attribute("outerHTML")
        op_log.info('link opened')
        op_log.info('loading bets')
        print('+')
    except Exception as E:
        op_log.error(E)
        print('-')
    finally:
        # End the session on every path. A session left open after a failure
        # keeps safaridriver busy, so the next webdriver.Safari() call cannot
        # start and the driver service exits.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                op_log.error(quit_error)
for i in link:
open_hyperlink_1(inp, i)

Resources