how to download a file using selenium in python? - python-3.x

I want to download files using Python, but I am not able to do it. I searched for ways to do it, but I didn't find any relevant resources.
Here is my code:
from selenium import webdriver
driver = webdriver.Chrome('/home/user/Downloads/chromedriver')
#The below link is a pdf file and not an HTML file. I want to download this file directly.
driver.get("https://authlink-files-storage.ams3.digitaloceanspaces.com/authlink/transfered_certificates_related_docs/supporting_docs_17_2020_07_24_06_25_764ffb965d1b4ae287a0d3cc01c8dd03")
Now I want to download this file, but I am not able to do it.

If direct download doesn't work you can always workaround using the printing functionality:
You need to use the Chrome option --kiosk-printing, which automatically clicks the print button once the print dialog is opened
options = webdriver.ChromeOptions()
options.add_argument("--kiosk-printing")
Define chrome preferences as JSON string
prefs = {"savefile.default_directory": "your destination path", "printing.default_destination_selection_rules": {"kind": "local", "idPattern": ".*", "namePattern": "Save as PDF"}}
In above prefs, default directory will be used to save your pdf in required location. second pref will select the "save as pdf" option from print dialog automatically
Add pref as experimental options
options.add_experimental_option("prefs", prefs)
Define driver using chrome options and prefs
driver = webdriver.Chrome(chrome_options=options)
Once the pdf is opened in url, you can open print dialog using javascript
driver.execute_script("window.print()")
Your pdf will be saved in the destination path with the same title

Try This Code
from selenium import webdriver
download_dir = "C:\\Temp\\Dowmload" # for linux/*nix, download_dir="/usr/Public"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], # Disable Chrome's PDF Viewer
"download.default_directory": download_dir , "download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
driver = webdriver.Chrome('//Server/Apps/chrome_driver/chromedriver.exe', chrome_options=options)
driver.get("https://authlink-files-storage.ams3.digitaloceanspaces.com/authlink/transfered_certificates_related_docs/supporting_docs_17_2020_07_24_06_25_764ffb965d1b4ae287a0d3cc01c8dd03")

The solution to your problem is simple. To explain it better, consider a scenario in which a file is downloaded into a folder of the current framework without clicking the "Save as" button, and is then deleted after verification.
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Chrome start-up configuration: command-line switches first, then the
# download preferences that make Chrome save files without prompting.
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--verbose')
op.add_argument("--disable-notifications")
op.add_experimental_option("prefs", {
    "download.default_directory": "G:/Python/Download/",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
})
op.add_argument('--disable-gpu')
op.add_argument('--disable-software-rasterizer')

# The driver binary path is passed through a Service object and the options
# through `options=`; the positional path / `chrome_options=` form is gone
# from current Selenium releases.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=op,
)

# NOTE(review): no driver.get(...) precedes this click in the snippet; the
# page that contains the download button presumably has to be opened first.
# XPath attributes are written with '@', and the locator must be a plain
# ASCII-quoted string.
driver.find_element(By.XPATH, "//span[@type = 'button']").click()
def download_file_verify(self, filename, dir_path="G:/Python/Download/"):
    """Report whether a downloaded file whose name contains *filename* exists.

    Args:
        self: Unused; kept so existing call sites keep working.
        filename: Text expected to appear in the downloaded file's name.
        dir_path: Folder to inspect. Defaults to the download folder used
            by the original snippet.

    Returns:
        True if *dir_path* holds a regular file whose name contains
        *filename*, otherwise False (including when the folder is missing
        or empty).
    """
    try:
        entries = os.listdir(dir_path)
    except OSError:
        # A missing or unreadable folder means nothing was downloaded.
        print("file is not downloaded")
        return False

    # Check every entry (not only the first one) and resolve it against the
    # folder that was actually listed, so the result does not depend on the
    # current working directory.
    found = any(
        filename in entry and os.path.isfile(os.path.join(dir_path, entry))
        for entry in entries
    )
    if found:
        print("file downloaded successfully")
    else:
        print("file is not downloaded")
    return found
def delete_previous_file(self, filename, dir_path="G:/Python/Download/"):
    """Delete every file in *dir_path* whose name contains *filename*.

    Deletion is best effort: a file that cannot be removed is reported and
    skipped, and a missing folder is treated as "nothing to delete".

    Args:
        self: Unused; kept so existing call sites keep working.
        filename: Text that marks a file name for deletion.
        dir_path: Folder to clean. Defaults to the download folder used by
            the original snippet.

    Returns:
        None.
    """
    try:
        entries = os.listdir(dir_path)
    except OSError:
        # No readable folder, hence no previous download to remove.
        return

    for entry in entries:
        print("present file is: " + entry)
        if filename not in entry:
            continue
        try:
            # Remove from the folder that was listed, not from a path
            # relative to the current working directory.
            os.remove(os.path.join(dir_path, entry))
        except OSError as err:
            print(f"Could not delete {entry}: {err}")
        else:
            print("Present file is deleted")

Related

Selenium Python - How to load existed profile on chrome?

So, I want to use my existed profile on chrome to make easily to login and fetch some data from the website. So I tried this on my current codes but it doesn't load the profile for some reason,
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
class ClientDriver:
    """Chrome launcher that carries a fixed set of command-line switches."""

    def __init__(self):
        """Create the Options object and register every start-up switch."""
        opts = Options()
        for switch in (
            "--lang=en_US",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            r"user-data-dir=C:\Users\User\AppData\Local\Google\Chrome\User Data\Profile 1",
            "--profile-directory=Profile 1",
        ):
            opts.add_argument(switch)
        self.options = opts

    def check(self):
        """Start Chrome, open chrome://version/ and keep it open for 100 seconds."""
        chromedriver = Service(ChromeDriverManager().install())
        browser = webdriver.Chrome(service=chromedriver, options=self.options)
        browser.get("chrome://version/")
        # Long pause so the reported profile path can be read manually.
        time.sleep(100)
x = ClientDriver()
x.check()
As you can see in my current code, it navigates to chrome://version/ to check the Profile Path, and the path shown is not C:\Users\User\AppData\Local\Google\Chrome\User Data\Profile 1 but C:\Users\Users\AppData\Local\Google\Chrome\User Data\Profile 1\Profile 1. Can someone help me out?
You're checking chrome://version/ from your selenium test, of course you see a wrong path!
You should instead open a normal chrome window, insert chrome://version/ in the search bar, press enter and check from there your profile path.
You will see it will be something like C:\Users\User\AppData\Local\Google\Chrome\User Data\Profile 1
And so in the code you'll have to write:
self.options.add_argument(
r"user-data-dir=C:\Users\User\AppData\Local\Google\Chrome\User Data"
)
self.options.add_argument("--profile-directory=Profile 1")

How to download file in pdf with selenium edge web driver in specific custom folder in python selenium?

I am using selenium webdriver to automate downloading several PDF files. I get the PDF preview window (see below), and now I would like to download the file. How can I accomplish this using edge as the browser?
Sample Screenshot i want to download
Here's I've got so far but it's not working.
path = "F:\Anuzz\Desktop\sel\msedgedriver.exe"
options = EdgeOptions()
options.add_experimental_option('prefs', {
"download.default_directory": "F:\Anuzz\Desktop\sel\test.py",
"download.prompt_for_download": False,
"plugins.always_open_pdf_externally": True
})
driver = Edge(path, options=options)
driver.get('https://sscstudy.com/ssc-chsl-paper-pdf-download/')
driver.find_element_by_xpath('//*[#id="post-11490"]/div/div/p[4]/a/strong').click()
NEW (works on edge)
To use this you have to install pyautogui library with the command pip install pyautogui
import time
import pyautogui
from selenium import webdriver
driver = webdriver.Edge()
pdf_url = 'http://www.africau.edu/images/default/sample.pdf'
driver.get(pdf_url)
time.sleep(3)
pyautogui.hotkey('ctrl', 's')
time.sleep(2)
path_and_filename = r'C:\Users\gt\Desktop\test.pdf'
pyautogui.typewrite(path_and_filename)
pyautogui.press('enter')
OLD (works on chrome)
This is the code I use to automatically download a pdf to a specific path. If you have windows, just put your account name in r'C:\Users\...\Desktop'. Moreover, you have to put the path of your driver in chromedriver_path. The code below downloads a sample pdf.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
options = webdriver.ChromeOptions()
download_path = r'C:\Users\...\Desktop'
options.add_experimental_option('prefs', {
"download.default_directory": download_path, # change default directory for downloads
"download.prompt_for_download": False, # to auto download the file
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True # it will not show PDF directly in chrome
})
chromedriver_path = '...'
driver = webdriver.Chrome(options=options, service=Service(chromedriver_path))
pdf_url = 'http://www.africau.edu/images/default/sample.pdf'
driver.get(pdf_url)
After testing, I think that the problem is mainly caused by the site you provided, which seems to embed other PDF viewers instead of the one that comes with Edge.
So you may need code like this to achieve your needs( url splicing ):
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge import service

# Edge start-up configuration: maximised window and silent downloads into
# C:\Downloads.
edgeOption = webdriver.EdgeOptions()
edgeOption.use_chromium = True
edgeOption.add_argument("start-maximized")
edgeOption.add_experimental_option('prefs', {
    "download.default_directory": "C:\\Downloads",
    "download.prompt_for_download": False
})
edgeOption.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
s = service.Service(r'C:\Users\Administrator\Desktop\msedgedriver.exe')
driver = webdriver.Edge(service=s, options=edgeOption)

driver.get('https://sscstudy.com/ssc-chsl-paper-pdf-download/')
# XPath attributes are written with '@'; find_element(By.XPATH, ...) is the
# locator call that current Selenium 4 releases keep.
url = driver.find_element(By.XPATH, '//*[@id="post-11490"]/div/div/p[4]/a').get_attribute('href')

# The link is presumably a Google Drive viewer URL of the form
# https://drive.google.com/file/d/<id>/view?usp=sharing (the original fixed
# offsets 32 and 17 match exactly that prefix and suffix) -- TODO confirm.
# Prefer pulling the id out by pattern, and fall back to the original slice
# if the URL has a different shape.
match = re.search(r"/file/d/([^/?#]+)", url)
file_id = match.group(1) if match else url[32:(len(url) - 17)]

# Splice the id into the direct-download endpoint instead of the viewer page.
driver.get("https://drive.google.com/uc?id=" + file_id + "&export=download")
time.sleep(1)
Note: Test with Selenium 4.1.0 and Edge 101.0.1210.53. Please modify path of the Edge Driver and other possible parameters according to your own situation.

Can you actually change the default download directory for an already open chrome session using Selenium on Python?

I'm trying to download some zip files from this page to a specific path, using an already open Chrome browser session, with the code below:
import time
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
opt = Options() #the variable that will store the selenium options
opt.add_experimental_option("debuggerAddress", "localhost:9222") #this allows bulk-dozer to take control of your Chrome Browser in DevTools mode.
opt.add_experimental_option("prefs", {"download.default_directory": r"C:\Users\ResetStoreX\Downloads\Binance futures data\ADAUSDT-Mark_Prices_Klines_1h_Timeframe"}) #set the path to save the desired zipped data
s = Service(r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe') #Use the chrome driver located at the corresponding path
driver = webdriver.Chrome(service=s, options=opt) #execute the chromedriver.exe with the previous conditions
#Why using MarkPrices: https://support.btse.com/en/support/solutions/articles/43000557589-index-price-and-mark-price#:~:text=Index%20Price%20is%20an%20important,of%20cryptocurrencies%20on%20major%20exchanges.&text=Mark%20Price%20is%20the%20price,be%20fair%20and%20manipulation%20resistant.
# Only act when the attached browser is already showing the expected listing.
if driver.current_url == 'https://data.binance.vision/?prefix=data/futures/um/daily/markPriceKlines/ADAUSDT/1h/':
    counter = 0
    the_dictionary_links = {}  # file name -> download URL

    # XPath attributes are written with '@'. Rows are counted once with
    # len(); XPath positions are 1-based and the desired entries start at
    # row 2.
    row_count = len(driver.find_elements(By.XPATH, '//*[@id="listing"]/tr'))
    for number in range(2, row_count + 1):
        # Look the anchor up once and read both its text and its href.
        link = driver.find_element(By.XPATH, f'//*[@id="listing"]/tr[{number}]/td[1]/a')
        data_file_name = link.text
        if not data_file_name.endswith('CHECKSUM'):
            the_dictionary_links[data_file_name] = link.get_attribute('href')
            print(f'Saving {data_file_name} and its link for later use')
            counter += 1
    print(counter)

    # Navigating to each stored link triggers the download; the pause spaces
    # consecutive requests.
    for file_name, href in the_dictionary_links.items():
        driver.get(href)
        print(f'Downloading {file_name}')
        time.sleep(1.8)
And unfortunately it's not working, it throws the following error:
InvalidArgumentException: invalid argument: cannot parse capability:
goog:chromeOptions from invalid argument: unrecognized chrome option:
prefs
So, I would like to know what could have gone wrong? I coded the program above based on this solution but it only seems to work for a new chrome session, and I need the download default directory to be capable of being reset when needed for an already open session. Any ideas?

How to get the docx in the iframe with selenium?

I want to get the document in the url such as below:
document in the iframe
I tried the wget command, but the downloaded file contains no document.
The document contained in the webpage can't be printed to a PDF file in Chrome.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
driver = driver = webdriver.Chrome(options=options)
target_doc_url = "http://www.ibodao.com/OfficePreview?furl=/Public/uploads/files/2020/0219/5e4cc551729af.docx"
driver.get(target_doc_url)
iframeMsg = driver.find_element_by_id("office_iframe")
driver.switch_to_frame(iframeMsg);
with open('/tmp/target.html','w') as writer:
writer.write(driver.page_source)
When I open /tmp/target.html, there is no document in it.
How to get the document in the iframe whose id is office_iframe?
import re
import urllib.request
from selenium import webdriver
driver = webdriver.Chrome()
target_doc_url = "http://www.ibodao.com/OfficePreview?furl=/Public/uploads/files/2020/0219/5e4cc551729af.docx"
driver.get(target_doc_url)
iframeMsg = driver.find_element_by_id("office_iframe")
src=iframeMsg.get_attribute("src")
m = re.search('.*?url=(.+?)/vector-output', src)
doc = m.group(1)
print(doc)
urllib.request.urlretrieve(doc, "a.docx")
This will save the document as a .docx file. The src attribute of the iframe contains the actual document URL; you don't need the vector-output part of the source.
You can manually download it by going to :
http://static.ibodao.com/Public/uploads/files/2020/0219/5e4cc551729af.docx
Make it more simple after getting the src which contains real url:
target_url = src.split("=")[1]
urllib.request.urlretrieve(target_url, "target.docx")

Reusing previously opened selenium window

I have been testing various automation tasks using Selenium Python library. I faced a very serious issue in that process. That is the delay caused due to poor internet connection.
The starting steps are same for all tasks, namely
- open browser
- go to website
- login
- open a specific link after login
These tasks take a long time, so I was hoping to find a method by which I could reuse an already open session and continue testing on that window instead of opening new browser window every time I run the script.
Here's what I have tested so far:
Example Original File:
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(r'C:\testDir\chromedriver_win32\chromedriver.exe', chrome_options=chrome_options)
url = driver.command_executor._url
session_id = driver.session_id
print("URL:",url,"SESSION ID:",session_id)
driver.get('http://www.facebook.com')
username = "---"
password = "---"
driver.find_element_by_name('email').send_keys(username)
driver.find_element_by_name('pass').send_keys(password)
driver.find_element_by_name('login').click()
# time.sleep(2)
driver.maximize_window()
time.sleep(2)
driver.find_elements_by_name('q').send_keys('Ayush Mandowara')
# element = wait.until(EC.presence_of_element_located((By.xpath, '//input[#placeholder="Search"]'))
driver.find_element_by_xpath('//input[#placeholder="Search"]').send_keys('Ayush Mandowara' + Keys.RETURN)
time.sleep(4)
driver.find_element_by_xpath('//div[contains(text(), "Ayush Mandowara")]').click()
time.sleep(3)
driver.find_element_by_class_name('coverBorder').click()
time.sleep(2)
Connecting File:
from connectingToPrev import url, session_id
driver = webdriver.Remote(command_executor=url,desired_capabilities={})
driver.session_id = session_id
driver.get("http://www.google.com")
This connecting file follows everything that has already happened in the previous window; I was expecting it either to connect to the previous window or to open the last link with the correct credentials.
Answer in Python is appreciated!

Resources