How to get the docx in the iframe with selenium? - python-3.x

I want to get the document embedded in an iframe at the URL below.
Trying the wget command, the downloaded file contains no document, and the document embedded in the webpage can't be printed to a PDF file in Chrome either.
from selenium import webdriver

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
target_doc_url = "http://www.ibodao.com/OfficePreview?furl=/Public/uploads/files/2020/0219/5e4cc551729af.docx"
driver.get(target_doc_url)
iframeMsg = driver.find_element_by_id("office_iframe")
driver.switch_to.frame(iframeMsg)
with open('/tmp/target.html', 'w') as writer:
    writer.write(driver.page_source)
Opening /tmp/target.html, there is no document in it.
How can I get the document in the iframe whose id is office_iframe?

import re
import urllib.request
from selenium import webdriver

driver = webdriver.Chrome()
target_doc_url = "http://www.ibodao.com/OfficePreview?furl=/Public/uploads/files/2020/0219/5e4cc551729af.docx"
driver.get(target_doc_url)
# The iframe's src attribute carries the real document URL as a query parameter
iframeMsg = driver.find_element_by_id("office_iframe")
src = iframeMsg.get_attribute("src")
m = re.search('.*?url=(.+?)/vector-output', src)
doc = m.group(1)
print(doc)
urllib.request.urlretrieve(doc, "a.docx")
This will save the document as a .docx file. The src attribute of the iframe contains the actual document URL; you don't need the vector-output part of the source.
You can also download it manually by going to:
http://static.ibodao.com/Public/uploads/files/2020/0219/5e4cc551729af.docx
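Since that static URL serves the file directly, a minimal sketch (assuming the static.ibodao.com link above remains reachable) can skip Selenium entirely and fetch it with urllib:
import urllib.request

static_url = "http://static.ibodao.com/Public/uploads/files/2020/0219/5e4cc551729af.docx"
urllib.request.urlretrieve(static_url, "target.docx")  # saves the .docx next to the script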

Even simpler, after getting the src, which contains the real URL:
target_url = src.split("=")[1]  # take the part after '=' as the document URL
urllib.request.urlretrieve(target_url, "target.docx")
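Splitting on "=" breaks if the URL contains more than one equals sign; a slightly more robust sketch (assuming the document URL sits in a url= query parameter of the iframe's src, as above) parses the query string instead:
from urllib.parse import urlparse, parse_qs
import urllib.request

# Pull the 'url' query parameter out of the iframe's src explicitly
query = parse_qs(urlparse(src).query)
target_url = query["url"][0]  # assumes a single 'url' parameter
urllib.request.urlretrieve(target_url, "target.docx")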

Related

How to download a PDF file to a specific custom folder with the Selenium Edge webdriver in Python?

I am using Selenium WebDriver to automate downloading several PDF files. I get the PDF preview window, and now I would like to download the file. How can I accomplish this using Edge as the browser?
Here's what I've got so far, but it's not working:
path = "F:\Anuzz\Desktop\sel\msedgedriver.exe"
options = EdgeOptions()
options.add_experimental_option('prefs', {
"download.default_directory": "F:\Anuzz\Desktop\sel\test.py",
"download.prompt_for_download": False,
"plugins.always_open_pdf_externally": True
})
driver = Edge(path, options=options)
driver.get('https://sscstudy.com/ssc-chsl-paper-pdf-download/')
driver.find_element_by_xpath('//*[#id="post-11490"]/div/div/p[4]/a/strong').click()
NEW (works on Edge)
To use this, you have to install the pyautogui library with the command pip install pyautogui:
import time
import pyautogui
from selenium import webdriver

driver = webdriver.Edge()
pdf_url = 'http://www.africau.edu/images/default/sample.pdf'
driver.get(pdf_url)
time.sleep(3)                  # wait for the PDF viewer to load
pyautogui.hotkey('ctrl', 's')  # open the Save As dialog
time.sleep(2)                  # wait for the dialog to appear
path_and_filename = r'C:\Users\gt\Desktop\test.pdf'
pyautogui.typewrite(path_and_filename)
pyautogui.press('enter')
OLD (works on Chrome)
This is the code I use to automatically download a PDF to a specific path. If you have Windows, just put your account name in r'C:\Users\...\Desktop'. Moreover, you have to put the path of your driver in chromedriver_path. The code below downloads a sample PDF.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
download_path = r'C:\Users\...\Desktop'
options.add_experimental_option('prefs', {
    "download.default_directory": download_path,  # change the default directory for downloads
    "download.prompt_for_download": False,        # auto-download the file
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True    # don't show the PDF directly in Chrome
})
chromedriver_path = '...'
driver = webdriver.Chrome(options=options, service=Service(chromedriver_path))
pdf_url = 'http://www.africau.edu/images/default/sample.pdf'
driver.get(pdf_url)
After testing, I think the problem is mainly caused by the site you provided, which seems to embed another PDF viewer instead of the one that comes with Edge.
So you may need code like this to achieve your needs (URL splicing):
from selenium import webdriver
from selenium.webdriver.edge import service
import time

edgeOption = webdriver.EdgeOptions()
edgeOption.use_chromium = True
edgeOption.add_argument("start-maximized")
edgeOption.add_experimental_option('prefs', {
    "download.default_directory": "C:\\Downloads",
    "download.prompt_for_download": False
})
edgeOption.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
s = service.Service(r'C:\Users\Administrator\Desktop\msedgedriver.exe')
driver = webdriver.Edge(service=s, options=edgeOption)
driver.get('https://sscstudy.com/ssc-chsl-paper-pdf-download/')
url = driver.find_element_by_xpath('//*[@id="post-11490"]/div/div/p[4]/a').get_attribute('href')
# Splice the Google Drive file id out of the share link and request a direct download
driver.get("https://drive.google.com/uc?id=" + url[32:(len(url) - 17)] + "&export=download")
time.sleep(1)
Note: tested with Selenium 4.1.0 and Edge 101.0.1210.53. Please modify the path of the Edge driver and other parameters according to your own situation.
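The fixed-index slicing url[32:(len(url) - 17)] silently breaks if Google ever changes the link format; a more defensive sketch (assuming the href is a standard drive.google.com/file/d/<id>/... share link) extracts the id with a regex:
import re

url = "https://drive.google.com/file/d/1AbCdEfGhIjKlMnOp/view?usp=sharing"  # hypothetical share link
m = re.search(r"/file/d/([^/]+)", url)
if m:
    file_id = m.group(1)
    driver.get("https://drive.google.com/uc?id=" + file_id + "&export=download")  # assumes the Edge driver set up above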

How to download a file using Selenium in Python?

I wanted to download files using Python, but I am not able to do it. I tried searching for ways to do it, but I didn't find any relevant resource.
Here is my code:
from selenium import webdriver
driver = webdriver.Chrome('/home/user/Downloads/chromedriver')
#The below link is a pdf file and not an HTML file. I want to download this file directly.
driver.get("https://authlink-files-storage.ams3.digitaloceanspaces.com/authlink/transfered_certificates_related_docs/supporting_docs_17_2020_07_24_06_25_764ffb965d1b4ae287a0d3cc01c8dd03")
Now I want to download this file, but I am not able to do it.
If direct download doesn't work, you can always work around it using the printing functionality.
Use the Chrome option --kiosk-printing, which automatically confirms the print dialog once it opens:
options = webdriver.ChromeOptions()
options.add_argument("--kiosk-printing")
Define the Chrome preferences as a dict:
prefs = {
    "savefile.default_directory": "your destination path",
    "printing.default_destination_selection_rules": {
        "kind": "local", "idPattern": ".*", "namePattern": "Save as PDF"
    }
}
In the above prefs, savefile.default_directory is where your PDF will be saved, and the selection rule automatically picks the "Save as PDF" destination in the print dialog.
Add the prefs as an experimental option:
options.add_experimental_option("prefs", prefs)
Define the driver using the Chrome options and prefs:
driver = webdriver.Chrome(options=options)
Once the PDF is opened at the URL, you can open the print dialog using JavaScript:
driver.execute_script("window.print()")
Your PDF will be saved in the destination path with the same title as the page.
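Putting those steps together, a minimal end-to-end sketch (the destination path is a placeholder; the sample PDF URL is the one used elsewhere in this thread) might look like this:
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--kiosk-printing")  # auto-confirm the print dialog
prefs = {
    "savefile.default_directory": r"C:\your\destination\path",  # placeholder path
    "printing.default_destination_selection_rules": {
        "kind": "local", "idPattern": ".*", "namePattern": "Save as PDF"
    }
}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
driver.get("http://www.africau.edu/images/default/sample.pdf")
driver.execute_script("window.print()")  # saved as a PDF into the directory above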
Try this code:
from selenium import webdriver

download_dir = "C:\\Temp\\Download"  # for Linux/*nix, download_dir = "/usr/Public"
options = webdriver.ChromeOptions()
profile = {
    "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],  # disable Chrome's PDF viewer
    "download.default_directory": download_dir,
    "download.extensions_to_open": "applications/pdf"
}
options.add_experimental_option("prefs", profile)
driver = webdriver.Chrome('//Server/Apps/chrome_driver/chromedriver.exe', options=options)
driver.get("https://authlink-files-storage.ams3.digitaloceanspaces.com/authlink/transfered_certificates_related_docs/supporting_docs_17_2020_07_24_06_25_764ffb965d1b4ae287a0d3cc01c8dd03")
The solution to your problem is simple. To explain it better, consider a scenario of downloading a file into a preset framework folder without clicking the Save As button, and then deleting the file after verification.
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--verbose')
op.add_argument("--disable-notifications")
op.add_experimental_option("prefs", {
    "download.default_directory": "G:/Python/Download/",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
})
op.add_argument('--disable-gpu')
op.add_argument('--disable-software-rasterizer')
driver = webdriver.Chrome(ChromeDriverManager().install(), options=op)
# navigate to the page containing the download button first, then:
driver.find_element(By.XPATH, "//span[@type='button']").click()
def download_file_verify(self, filename):
    dir_path = "G:/Python/Download/"
    res = os.listdir(dir_path)
    name = False
    try:
        name = os.path.isfile(dir_path + res[0])
        if filename in res[0]:
            print("file downloaded successfully")
    except Exception:
        print("file is not downloaded")
        name = False
    return name

def delete_previous_file(self, filename):
    try:
        d_path = "G:/Python/Download/"
        for file in os.listdir(d_path):
            print("present file is: " + file)
            path = d_path + file
            if filename in file:
                os.remove(path)
                print("present file is deleted")
    except OSError:
        pass
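Both helpers are written as test-class methods (note the self parameter); a hypothetical usage sketch, with an assumed file name, could look like this:
class DownloadTest:
    # paste download_file_verify and delete_previous_file from above into this class

    def run(self):
        self.delete_previous_file("report.pdf")  # hypothetical file name
        # ... trigger the download with Selenium here ...
        if self.download_file_verify("report.pdf"):
            print("download verified")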

Get Value Outside a Tag using WebDriver

I am trying to get a value outside a tag using Python WebDriver, but I am getting both the inside and outside values.
This is what I am doing:
from selenium import webdriver

url = 'https://www.zattini.com.br/roupas/feminino?mi=ztt_hm_fem_cat1_roupas&psn=Banner_BarradeCategorias_1fem&fc=barradecategorias'
driver = webdriver.Chrome()
driver.get(url)
brands = driver.find_element_by_xpath("//a[@qa-automation='search-brand']")
print(brands.text)
But I am getting:
#MO
5
All I want is the "#MO" value so that, after that, I can get the "5" into another column of the array. What can I change to get them separately?
Since there is a child element (a span) inside the anchor, .text returns all of the text. Try this solution:
brands = driver.find_element_by_xpath("//a[@qa-automation='search-brand']")
brandcount = driver.find_element_by_xpath("//a[@qa-automation='search-brand']/span")
# Remove the child span's text from the anchor's full text, leaving only the brand name
print(brands.text.replace(brandcount.text, '').strip())
print(brandcount.text)
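To avoid string surgery altogether, a small sketch (assuming the brand name is the anchor's first text node, before the span) reads that text node directly via JavaScript:
# Return only the anchor's own leading text node, excluding child elements
brand_name = driver.execute_script(
    "return arguments[0].childNodes[0].nodeValue;", brands)
print(brand_name.strip() if brand_name else brands.text)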

How can I scrape data that is not present in the page source?

scrape.py
# code to scrape the links from the html
from bs4 import BeautifulSoup

data = open('scrapeFile', 'r')
html = data.read()
data.close()
soup = BeautifulSoup(html, features="html.parser")
# code to extract links
links = []
for div in soup.find_all('div', {'class': 'main-bar z-depth-1'}):
    # print(div.a.get('href'))
    links.append('https://godamwale.com' + str(div.a.get('href')))
print(links)
file = open("links.txt", "w")
for link in links:
    file.write(link + '\n')
    print(link)
file.close()
I have successfully got the list of links using this code. But when I want to scrape the data from those links, their HTML pages don't contain the source code that holds the data, which makes extracting it tough. I have used the Selenium driver, but it doesn't work well for me.
I want to scrape the data from the link below, which contains data in HTML sections: customer details, licence and automation, commercial details, floor-wise details, and operational details. I want to extract these data with name, location, contact number and type.
https://godamwale.com/list/result/591359c0d6b269eecc1d8933
If someone finds a solution, please share it.
Using the developer tools in your browser, you'll notice that whenever you visit that link there is a request to https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933 that returns a JSON response, probably containing the data you're looking for.
Python 2.x:
import urllib2, json
contents = json.loads(urllib2.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read())
print contents
Python 3.x:
import urllib.request, json
contents = json.loads(urllib.request.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read().decode('UTF-8'))
print(contents)
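For completeness, the same fetch works with the requests library, if you have it installed:
import requests

contents = requests.get("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").json()
print(contents)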
Here you go. The main problem with the site seems to be that it takes time to load, which is why it was returning an incomplete page source; you have to wait until the page loads completely. Notice the time.sleep(8) line in the code below:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

CHROMEDRIVER_PATH = r"C:\Users\XYZ\Downloads\Chromedriver.exe"
wd = webdriver.Chrome(CHROMEDRIVER_PATH)
wd.get("https://godamwale.com/list/result/591359c0d6b269eecc1d8933")
time.sleep(8)  # wait until the page loads completely
soup = BeautifulSoup(wd.page_source, 'lxml')
props_list = []
propvalues_list = []
div = soup.find_all('div', {'class': 'row'})
for childtags in div[6].findChildren('div', {'class': 'col s12 m4 info-col'}):
    props = childtags.find("span").contents
    props_list.append(props)
    propvalue = childtags.find("p", recursive=True).contents
    propvalues_list.append(propvalue)
print(props_list)
print(propvalues_list)
Note: the code will return the construction details in two separate lists.
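Instead of a fixed time.sleep(8), a sketch using Selenium's explicit waits (the info-col class name is taken from the code above) proceeds as soon as the data actually appears:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 20 seconds for at least one info column to be present, then parse
WebDriverWait(wd, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, "info-col")))
soup = BeautifulSoup(wd.page_source, 'lxml')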

Web Scraping reviews - Flipkart

I am trying to extract the entire review of a product (the remaining half of each review is displayed only after clicking READ MORE), but I am still not able to do so; the full content of a review, which appears only after clicking the READ MORE option, is not shown. Below is the code, which clicks the READ MORE option and also gets data from the website:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

response = requests.get("https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page=2&pid=MOBF85V7A6PXETAX")
data = BeautifulSoup(response.content, 'lxml')
chromepath = r"C:\Users\Mohammed\Downloads\chromedriver.exe"
driver = webdriver.Chrome(chromepath)
driver.get("https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page=2&pid=MOBF85V7A6PXETAX")
d = driver.find_element_by_class_name("_1EPkIx")
d.click()
title = data.find_all("p", {"class": "_2xg6Ul"})
text1 = data.find_all("div", {"class": "qwjRop"})
name = data.find_all("p", {"class": "_3LYOAd _3sxSiS"})
for t2, t, t1 in zip(title, text1, name):
    print(t2.text, '\n', t.text, '\n', t1.text)
To get the full reviews, it is necessary to click those READ MORE buttons to unwrap the rest. As you have already used Selenium in combination with BeautifulSoup, I've modified the script to follow that logic: it first clicks the READ MORE buttons, and once that is done it parses all the titles and reviews. You can now get the titles and reviews from multiple pages (up to 4 pages).
import time
from bs4 import BeautifulSoup
from selenium import webdriver

link = "https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page={}&pid=MOBF85V7A6PXETAX"
driver = webdriver.Chrome()  # if necessary, define the chromedriver path explicitly
for page_num in range(1, 5):
    driver.get(link.format(page_num))
    [item.click() for item in driver.find_elements_by_class_name("_1EPkIx")]
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for items in soup.select("._3DCdKt"):
        title = items.select_one("p._2xg6Ul").text
        review = ' '.join(items.select_one(".qwjRop div:nth-of-type(2)").text.split())
        print(f'{title}\n{review}\n')
driver.quit()
