How to download a webpage as .mhtml - python-3.x

I am able to successfully open a URL and save the resultant page as a .html file. However, I am unable to determine how to download and save a .mhtml (Web Page, Single File).
My code is:
import urllib.parse, time
from urllib.parse import urlparse
import urllib.request
url = ('https://www.example.com')
encoded_url = urllib.parse.quote(url, safe='')
print(encoded_url)
base_url = ("https://translate.google.co.uk/translate?sl=auto&tl=en&u=")
translation_url = base_url+encoded_url
print(translation_url)
req = urllib.request.Request(translation_url, headers={'User-Agent': 'Mozilla/6.0'})
print(req)
response = urllib.request.urlopen(req)
time.sleep(15)
print(response)
webContent = response.read()
print(webContent)
f = open('GoogleTranslated.html', 'wb')
f.write(webContent)
print(f)
f.close()
I have tried to use wget using the details captured in this question:
How to download a webpage (mhtml format) using wget in python, but the details are incomplete (or I am simply unable to understand them).
Any suggestions would be helpful at this stage.

Did you try using Selenium with a Chrome webdriver to save the page?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
import pyautogui
URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
FILE_NAME = ''
# open page with selenium
# (first need to download Chrome webdriver, or a firefox webdriver, etc)
driver = webdriver.Chrome()
driver.get(URL)
# wait until body is loaded
WebDriverWait(driver, 60).until(visibility_of_element_located((By.TAG_NAME, 'body')))
time.sleep(1)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if FILE_NAME != '':
    pyautogui.typewrite(FILE_NAME)
pyautogui.hotkey('enter')
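One limitation worth noting: pyautogui sends OS-level keystrokes, so the Chrome window must stay focused while the dialog is open, and the file is saved wherever the dialog points by default. A possible way to steer that location is Chrome's "savefile" preference; this is an assumption on my part and may vary across Chrome versions:
options = webdriver.ChromeOptions()
# hypothetical preference to pre-set the "Save as..." folder; verify on your Chrome build
options.add_experimental_option('prefs', {'savefile.default_directory': '/tmp/pages'})
driver = webdriver.Chrome(options=options)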

I have a better solution that avoids any manual operation and lets you specify the path where the .mhtml file is stored. I learned this from a Chinese blog. The key idea is to use a Chrome DevTools Protocol command.
The code is shown below as an example.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.qq.com/')
# Execute Chrome dev tool command to obtain the mhtml file
res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
# write the file locally
with open('./store/qq.mhtml', 'w', newline='') as f:
    f.write(res['data'])
driver.quit()
Hope this will help!
More details are available in the Chrome DevTools Protocol documentation.
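As a quick illustration of what else the same execute_cdp_cmd interface can do, here is a minimal sketch (assuming Selenium 4 and a local Chrome, and that the ./store directory already exists) that captures the page as a PDF instead of MHTML:
import base64
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.qq.com/')
# Page.printToPDF returns the document as base64-encoded PDF data
res = driver.execute_cdp_cmd('Page.printToPDF', {})
with open('./store/qq.pdf', 'wb') as f:
    f.write(base64.b64decode(res['data']))
driver.quit()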

To save as MHTML, you need to add the argument '--save-page-as-mhtml':
options = webdriver.ChromeOptions()
options.add_argument('--save-page-as-mhtml')
driver = webdriver.Chrome(options=options)
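For context, this flag changes what Chrome's "Save page as..." dialog produces, so it pairs naturally with the pyautogui approach from the first answer. A minimal sketch of the combination (the sleep values and accepting the suggested file name are assumptions):
import time
from selenium import webdriver
import pyautogui

options = webdriver.ChromeOptions()
options.add_argument('--save-page-as-mhtml')  # make "Save page as..." write a single .mhtml file
driver = webdriver.Chrome(options=options)
driver.get('https://www.example.com')
time.sleep(2)
pyautogui.hotkey('ctrl', 's')   # open the save dialog
time.sleep(1)
pyautogui.press('enter')        # accept the suggested name and location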

I wrote this the way I actually use it, so apologies if anything is off.
I wrapped it in a class so it is easy to reuse; example usage is in the three lines at the bottom.
You can change the number of seconds to sleep as you like.
Incidentally, non-English keyboards such as Japanese and Korean (Hangul) keyboards are also supported.
import time
import chromedriver_binary
from selenium import webdriver
import pyautogui
import pyperclip
import uuid

class DownloadMhtml(webdriver.Chrome):
    def __init__(self):
        super().__init__()
        self._first_save = True
        time.sleep(2)

    def save_page(self, url, filename=None):
        self.get(url)
        time.sleep(3)
        # open 'Save as...' to save html and assets
        pyautogui.hotkey('ctrl', 's')
        time.sleep(1)
        # paste the file name via the clipboard, so non-ASCII names work
        if filename is None:
            pyperclip.copy(str(uuid.uuid4()))
        else:
            pyperclip.copy(filename)
        time.sleep(1)
        pyautogui.hotkey('ctrl', 'v')
        time.sleep(2)
        # on the first save, adjust the save-type dropdown (e.g. to the single-file format)
        if self._first_save:
            pyautogui.hotkey('tab')
            time.sleep(1)
            pyautogui.press('down')
            time.sleep(1)
            pyautogui.press('up')
            time.sleep(1)
            pyautogui.hotkey('enter')
            time.sleep(1)
            self._first_save = False
        pyautogui.hotkey('enter')
        time.sleep(1)

# example
dm = DownloadMhtml()
dm.save_page('https://en.wikipedia.org/wiki/Python_(programming_language)', 'wikipedia_python') # creates a file named "wikipedia_python.mhtml"
dm.save_page('https://www.python.org/') # file named randomly based on uuid4
Tested with Python 3.8.10 and selenium==4.4.3.

Related

Collecting all links in the main page and sub-pages

I am trying to make a script that collects all links on the main page and on its sub-pages.
For example, I need to collect example.com, example.com/link1, example.com/link1/sub-link1, ...
Please check my code
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
# Instantiate webdriver
driver = webdriver.Chrome()
# Open Chrome maximized
driver.maximize_window()
# Open a link
driver.get("https://www.calculator.net/")
links = driver.find_elements(By.TAG_NAME, 'a')
header = ['URL', 'Name']
with open('/home/user/calculator.csv', 'w', encoding='UTF8') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)
    for link in links:
        href = link.get_attribute("href")
        data = [href, link.text]
        # write the data
        writer.writerow(data)
Please note that I tried to add driver.get(href) inside the for loop, but that did not work.
Try the BeautifulSoup module and let the driver sleep a little.
from bs4 import BeautifulSoup
...
driver.get("https://www.calculator.net/")
time.sleep(5)
html = driver.page_source
driver.close()
bs = BeautifulSoup(html, 'html.parser')
links = bs.find_all('a')
for link in links:
    print(link.attrs['href'])
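The snippet above only collects links from the main page. A rough sketch of one way to also visit each collected link and gather its links, one level deep (the domain filter and sleep values are assumptions, not part of the original answer):
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.calculator.net/")
time.sleep(5)
main_soup = BeautifulSoup(driver.page_source, 'html.parser')
main_links = [a.get('href') for a in main_soup.find_all('a') if a.get('href')]
all_links = set(main_links)

for href in main_links:
    full = urljoin("https://www.calculator.net/", href)
    if not full.startswith("https://www.calculator.net"):
        continue                      # stay on the same site
    driver.get(full)
    time.sleep(2)
    sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
    all_links.update(a.get('href') for a in sub_soup.find_all('a') if a.get('href'))

driver.quit()
print(sorted(all_links))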

Reading multiple URLs from a text file, processing each web page, and scraping the content inside

I have a .txt file with a list of URLs, one per line. My goal is to open this .txt file, access the URL on each line, scrape the content inside it, and append that content, together with the URL, to a "draft.csv" file.
When I tried other code, the requests-based result showed "Please turn on JavaScript and refresh the page", so I intended to use Selenium instead to get this resolved. I am able to fetch all the pages as wanted, but I am unable to see the desired content in each link.
Below is list of multiple URLs for example:
http://example.com/2267/15175/index.html
http://example.com/2267/16796/index.html
http://example.com/2267/17895/index.html
This is my current code using Selenium and Requests.
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
import pandas as pd
import urllib.request
import requests

frame = []
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)

with open("draft.txt", "r") as file:
    for line in file:
        url = line.rstrip("\n")
        print(url)
        driver.get(url)
        html = etree.HTML(driver.page_source)
        allurl = requests.get(url)
        htmltext = allurl.text
        extract_link = html.xpath('//span[@id="my_two"]/table/tbody/tr/td/table[2]')
        for i in extract_link:
            link = i.xpath('./tbody/tr/td/div/p/a/@href')
            content = 'http://example.com' + link[0]
            frame.append({
                'content': content,
            })

dfs = pd.DataFrame(frame)
dfs.to_csv('draft.csv', index=False, encoding='utf-8-sig')
Thank you in advance for helping me with this!
You need to call driver.get() for each URL inside a for loop, and you can use bs4 for the scraping:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()   # create the driver once, before the loop

f = open("urls.txt")
urls = [url.strip() for url in f.readlines()]

for url in urls:
    driver.get(url)
    ...
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    information = soup.find('title')
    page_url = url   # keep the URL alongside the scraped data
    ...
driver.quit()
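A more complete, runnable version of the same idea, writing the results to draft.csv as the question intends (a sketch only; the CSV columns, sleep value, and headless option are assumptions):
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)

with open('draft.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

rows = []
for url in urls:
    driver.get(url)
    time.sleep(2)                                  # give the JavaScript time to render
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    title = soup.title.string.strip() if soup.title and soup.title.string else ''
    rows.append({'url': url, 'title': title})

driver.quit()
pd.DataFrame(rows).to_csv('draft.csv', index=False, encoding='utf-8-sig')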

How to quit selenium chrome driver when the driver is not selected? [duplicate]

This question already has answers here:
Selenium: How to stop geckodriver process impacting PC memory, without calling driver.quit()? (1 answer)
PhantomJS web driver stays in memory (1 answer)
Closed 3 years ago.
I wrote some code in Python using Selenium and multiprocessing to parallelize data collection from YouTube. I have a method that initiates a Chrome webdriver, and I use multiprocessing to collect data faster. The issue is that when the multiprocessing timeout is reached, the function with the chromedriver exits before the driver.quit() command can run. This leads to an accumulation of idle chromedrivers which I cannot close from within Python, since (to my knowledge) there is no way to reference them. Is there any way to close all chromedrivers without explicitly using the driver objects?
I wrote the code in Python 3. The chromedriver is for Chrome version 72.
# Web related modules
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from urllib.request import urlopen
import html2text
# YouTube download module
from pytube import YouTube
# Multiprocessing tools
from multiprocessing import Lock, Manager, Queue, Pool
import multiprocessing as mp
# Misc modules
import time, re, pickle, os, shutil, argparse, glob, unicodedata, datetime
from argparse import RawTextHelpFormatter
# Irrelevant to the problem
def save_vids(vid_ids, save_loc):
    print('Irrelevant Function')

# Function that generates the initial list of urls to visit
def explore_home(chromedriver_path, chrome_options, caps):
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options, desired_capabilities=caps)
    driver.get('https://www.youtube.com')
    time.sleep(1)
    html_source = driver.page_source
    driver.close()
    parts = html_source.split('{"webCommandMetadata":{"url":"/watch_videos?')[1:]
    vids = []
    for part in parts:
        part = part[part.find('video_ids=')+10:]
        if part.find('\\u') != -1:
            if part.find('"') != -1:
                end = min(part.find('\\u'), part.find('"'))
            else:
                end = part.find('\\u')
        elif part.find('"') != -1:
            end = part.find('"')
        else:
            print('fuck')
        concat_list = part[:end]
        vids.extend(concat_list.split('%2C'))
    vids = [vid for vid in vids if len(re.findall(r'[0-9]|[a-z]|[A-Z]|_|-', vid)) == 11 and len(vid) == 11]
    return vids
# The function that generates chromedrivers and fails to quit if a multiprocessing timeout occurs.
def explore_vid(chromedriver_path, chrome_options, caps, vid, ads, save_loc, l):
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options, desired_capabilities=caps)
    driver.get('https://www.youtube.com/watch?v='+vid)
    time.sleep(2)
    sec_html = driver.page_source
    soup = BeautifulSoup(sec_html, 'lxml')
    mydivs = str(soup.findAll("div", {"class": "style-scope ytd-watch-next-secondary-results-renderer"}))
    inds = [m.start() for m in re.finditer('ytimg.com/vi/', mydivs)]
    rec_vids = ['https://www.youtube.com/watch?v='+mydivs[ind+13:ind+24] for ind in inds]
    browser_log = driver.get_log('performance')
    adInfo = find_ad(browser_log, vid)
    if adInfo:
        # Check if it is the first time this ad has been seen
        adID = adInfo[0]
        l.acquire()
        try:
            if adID in ads:
                ads[adID][0].append(adInfo[1])
            else:
                try:
                    element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".ytp-ad-button.ytp-ad-visit-advertiser-button.ytp-ad-button-link")))
                    element.click()
                    driver.switch_to.window(driver.window_handles[-1])
                    ad_website_URL = driver.current_url
                    ad_website_HTML = driver.page_source
                    clean_text = html2text.html2text(ad_website_HTML)
                    save_vids(adID, save_loc)
                    textName = os.path.join(save_loc, adID, 'adwebsite.txt')
                    file = open(textName, "w")
                    file.write(ad_website_URL)
                    file.write('\n')
                    file.write(clean_text)
                    file.close()
                    ads[adID] = [[adInfo[1]], ad_website_URL]
                except WebDriverException:
                    print('Button click failed: %s:%s' % (vid, adInfo[0]))
        finally:
            l.release()
    # The quit command for the chrome driver
    driver.quit()
    return rec_vids
def find_ad(browser_log, vid):
    for k in range(len(browser_log)):
        if browser_log[k]['message'].find('adunit') != -1 and browser_log[k]['message'].find(vid) != -1:
            ind = browser_log[k]['message'].find('https://www.youtube.com/get_video_info?html5=1&video_id=')
            vid_id = browser_log[k]['message'][ind+56:ind+67]
            return (vid_id, time.localtime())
    return None

def positive_int(argument):
    num = int(argument)
    if num < 1:
        msg = "Maximum depth parameter must be a positive number. You entered: %s" % argument
        raise argparse.ArgumentTypeError(msg)
    return num

def valid_pickle(argument):
    file = str(argument)
    if not file.endswith('.pickle'):
        msg = "ad_save_loc must end with .pickle You entered: %s" % file
        raise argparse.ArgumentTypeError(msg)
    return file

def valid_dir(argument):
    directory = str(argument)
    if not os.path.isdir(directory):
        msg = "vid_save_loc must be a valid directory. You entered: %s" % directory
        raise argparse.ArgumentTypeError(msg)
    return directory
if __name__ == '__main__':
    # Argument Parsing
    parser = argparse.ArgumentParser(description='Scrapes Youtube ads and advertising company websites. \nUse --restart to restart the scraping from scratch by deleting previous data\nExample Usage: python finalReader.py E:\ads\ads.pickle E:\ads --ncpu 2', formatter_class=RawTextHelpFormatter)
    parser.add_argument('ad_save_loc', help='Save Location for Ad Main Dictionary', type=valid_pickle)
    parser.add_argument('vid_save_loc', help='Save Location for Ad Videos', type=valid_dir)
    parser.add_argument('chromedriver_path', help='Path of the chrome executable', type=str)
    parser.add_argument('--restart', help='Restart collection', action="store_true", default=False, dest='restartCollection')
    parser.add_argument('--ncpu', nargs='?', help='Number of cores for multiprocessing, 1 by default', default=1, type=int, dest='mpcpu')
    parser.add_argument('--timeout', nargs='?', help='For how long the data collection will take place (in seconds), infinite by default', default=float('inf'), type=float, dest='time_limit')
    parser.add_argument('--max_depth', nargs='?', help='Depth of Youtube exploration tree', default=1, type=positive_int, dest='search_depth')
    args = parser.parse_args()

    ad_save_loc = args.ad_save_loc
    vid_save_loc = args.vid_save_loc
    vid_save_loc = os.path.join(vid_save_loc, 'ad_data')
    mpcpu = max(args.mpcpu, 1)
    time_limit = args.time_limit
    chromedriver_path = args.chromedriver_path
    search_depth = args.search_depth

    if not os.path.isdir(vid_save_loc):
        os.mkdir(vid_save_loc)

    if args.restartCollection:
        for the_file in os.listdir(vid_save_loc):
            file_path = os.path.join(vid_save_loc, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)
        if os.path.isfile(ad_save_loc):
            os.remove(ad_save_loc)
        ads = {}
    else:
        if os.path.isfile(ad_save_loc):
            pickle_in = open(ad_save_loc, "rb")
            ads = pickle.load(pickle_in)
        else:
            ads = {}

    # Chrome Driver Options
    chrome_options = Options()
    chrome_options.add_argument('--mute-audio')
    caps = DesiredCapabilities.CHROME
    caps['loggingPrefs'] = {'performance': 'ALL'}

    startTime = time.time()
    currentTime = time.time()

    # Data Collection Loop - Multiprocessing
    while currentTime - startTime < time_limit:
        print('Time from start: %s' % str(datetime.timedelta(seconds=currentTime-startTime)))
        rec_vids = explore_home(chromedriver_path, chrome_options, caps)
        while not rec_vids:
            time.sleep(60)
            rec_vids = explore_home(chromedriver_path, chrome_options, caps)
        m = Manager()
        lock = m.Lock()
        pool = Pool(processes=mpcpu)
        for depth in range(search_depth):
            print('Depth %s' % depth)
            multiple_results = [pool.apply_async(explore_vid, (chromedriver_path, chrome_options, caps, vid, ads, vid_save_loc, lock)) for vid in rec_vids]
            branching_vids = []
            for res in multiple_results:
                try:
                    branching_vids.append(res.get(timeout=30))
                    if time.time() - startTime < time_limit:
                        break
                except mp.TimeoutError:
                    print('Timeout')
            res_vids = branching_vids.copy()
        pickle_out = open(ad_save_loc, "wb")
        pickle.dump(ads, pickle_out)
        pickle_out.close()
        currentTime = time.time()
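Since the question was closed as a duplicate, no direct answer appears here. For reference, one common cleanup pattern when the driver objects are no longer reachable (a sketch, not taken from the linked answers; it assumes the psutil package is installed) is to kill leftover chromedriver processes by name once the pool is done:
import psutil

def kill_orphaned_chromedrivers():
    # terminate any chromedriver processes that outlived their worker
    for proc in psutil.process_iter(['name']):
        try:
            if proc.info['name'] and 'chromedriver' in proc.info['name'].lower():
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass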

How can I fix encoding problems without a metric-ton of .replace()? Python3 Chrome-Driver BS4?

The print() command prints the scraped website perfectly to the IDLE shell. However, write/writelines/print will not write to a file without throwing many encoding errors or producing mojibake.
I have tried various forms of .encode(encoding='...', errors='...') to no avail.
With many different encodings, the output turned into mojibake or rows of ?'s inside the text file.
If I were willing to spend ten years chaining .replace('...','...') calls, as shown in the line where text is defined, I could get this to work completely:
#! python3
import os
import os.path
from os import path
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
def Close():
    driver.stop_client()
    driver.close()
    driver.quit()
CHROMEDRIVER_PATH = 'E:\Downloads\chromedriver_win32\chromedriver.exe'
# start raw html
NovelName = 'Novel/Isekai-Maou-to-Shoukan-Shoujo-Dorei-Majutsu'
BaseURL = 'https://novelplanet.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, "N": NovelName}
options = Options()
options.add_experimental_option("excludeSwitches",["ignore-certificate-errors"])
#options.add_argument("--headless") # Runs Chrome in headless mode.
#options.add_argument('--no-sandbox') # Bypass OS security model
#options.add_argument('--disable-gpu') # applicable to windows os only
options.add_argument('start-maximized') #
options.add_argument('disable-infobars')
#options.add_argument("--disable-extensions")
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
driver.get(url)
# wait for title not be equal to "Please wait 5 seconds..."
wait = WebDriverWait(driver, 10)
wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
# End raw html
# Start get first chapter html coded
i = 0
for chapterLink in soup.find_all(class_='rowChapter'):
    i += 1
cLink = chapterLink.find('a').contents[0].strip()
print(driver.title)
# end get first chapter html coded
# start navigate to first chapter
link = driver.find_element_by_link_text(cLink)
link.click()
# end navigate to first chapter
# start copy of chapter and add to a file
def CopyChapter():
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    print(driver.title)
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    readables = soup.find(id='divReadContent')
    text = readables.text.strip().replace('混','').replace('魔','').replace('族','').replace('デ','').replace('イ','').replace('ー','').replace('マ','').replace('ン','').replace('☆','').replace('ッ','Uh').replace('『','[').replace('』',']').replace('“','"').replace('”','"').replace('…','...').replace('ー','-').replace('○','0').replace('×','x').replace('《',' <<').replace('》','>> ').replace('「','"').replace('」','"')
    name = driver.title
    file_name = (name.replace('Read ',"").replace(' - NovelPlanet',"")+'.txt')
    print(file_name)
    #print(text) # <-- This shows the correct text in the shell with no errors
    with open(file_name,'a+') as file:
        print(text,file=file) # <- this never works without a bunch of .replace() where text is defined
    global lastURL
    lastURL = driver.current_url
    NextChapter()
# end copy of chapter and add to a file
# start goto next chapter if exists then return to copy chapter else Close()
def NextChapter():
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    a = 0
    main = soup.find(class_='wrapper')
    for container in main.find_all(class_='container'):
        a += 1
    row = container.find(class_='row')
    b = 0
    for chapterLink in row.find_all(class_='4u 12u(small)'):
        b += 1
    cLink = chapterLink.find('a').contents[0].strip()
    link = driver.find_element_by_link_text(cLink)
    link.click()
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    global currentURL
    currentURL = driver.current_url
    if currentURL != lastURL:
        CopyChapter()
    else:
        print('Finished!!!')
        Close()
# end goto next chapter if exists then return to copy chapter else Close()
CopyChapter()
#EOF
The expected result is that the text file contains exactly the same output as the IDLE print(text), with no changes. Then I could test whether every chapter gets copied for offline viewing and whether it stops at the last chapter posted.
As it stands, unless I keep adding more and more .replace() calls for every novel and chapter, this will never work properly. I would not mind manually removing the ad descriptions with .replace(), but if there is a better way to do that, how can it be done?
Windows 10
Python 3.7.0
There was a reason for os and os.path in an earlier version of this script, but I no longer remember whether they are still needed.
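For reference, the usual cause of these errors on Windows is that open() defaults to the locale encoding (often cp1252), which cannot represent the Japanese characters in the scraped text. A minimal sketch of the usual fix, reusing file_name and text from the code above:
with open(file_name, 'a+', encoding='utf-8') as file:
    print(text, file=file)   # no .replace() chain needed just for encoding purposes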

Multiprocessing selenium in Python 3 using Chrome issues

I'm trying to scrape data from a website by running multiple Chrome browsers simultaneously to download these files and speed up the process. If I use a single window, this script runs fine. However, there are two issues I'm running into:
a) Many of the browser windows do not close.
b) While the program does run and downloads files for a while, it stops after some time. Error message - 'ERROR:shader_disk_cache.cc(238)] Failed to create shader cache entry -2'
My chromedriver is in 'D:\401\401k'
Script -
'''Downloading 5500 forms from ERISA'''

# Import Library
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time, os, timeit, shutil
import pandas as pd
from multiprocessing import Pool

# Clean up download folder
if os.path.exists('D:/Form5500_Downloads'):
    shutil.rmtree('D:/Form5500_Downloads')
    os.makedirs('D:/Form5500_Downloads')
else:
    os.makedirs('D:/Form5500_Downloads')

'''Function to download a single form using ACK ID'''
def download_form(ackid):
    # Setting Chrome preferences
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"download.default_directory": "D:/Form5500_Downloads"} # Download folder
    chromeOptions.add_experimental_option("prefs", prefs)
    path_to_chromedriver = 'D:/401/401k/chromedriver_2.35'
    browser = webdriver.Chrome(executable_path=r'D:\401\401k\chromedriver_2.35.exe', chrome_options=chromeOptions)
    browser.implicitly_wait(3) # Implicit wait until the element appears

    # Open ERISA website
    url = 'https://www.efast.dol.gov/portal/app/disseminate?execution=e1s4#'
    browser.get(url)

    # Search for a form using ACK ID
    browser.find_element_by_css_selector('#ackId').send_keys(ackid)
    browser.find_element_by_css_selector('.ui-icon-search').click()

    # Check if the form exists - if NOT, exit the function
    try:
        browser.find_element_by_css_selector('#form\:filingTreeTable\:0\:einLnk').click()
    except:
        browser.find_element_by_css_selector('#ackId').clear() # delete the ackid value
        browser.quit()

    # Wait until downloaded and rename using ackid
    print(ackid)
    while not os.path.exists("D:/Form5500_Downloads/filing.pdf"):
        time.sleep(5)
    os.rename("D:/Form5500_Downloads/filing.pdf", "D:/Form5500_Downloads/" + ackid + ".pdf")
    browser.quit()

def main():
    '''Download'''
    # Get list of ackids from csv file
    df = pd.read_csv('D:/401/401k/F_SCH_H_2015_latest.csv', usecols=[0], nrows=10000)
    ackid_list = df['ACK_ID'].tolist()

    if __name__ == '__main__':
        with Pool(10) as p:
            records = p.map(download_form, ackid_list)
main()
This issue was resolved using the following -
browser.stop_client()
browser.close()
instead of
browser.quit()
I still do not understand why this hack works though.
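A defensive variant worth considering (a sketch, not the fix the poster used): wrap the body of download_form() in try/finally, so that whichever teardown calls you prefer always run, even when the "form does not exist" branch or an exception exits the function early:
from selenium import webdriver

def download_form(ackid):
    browser = webdriver.Chrome()   # Chrome options omitted; see the full function above
    try:
        pass   # search by ACK ID, click, wait for the download, rename, as in the question
    finally:
        browser.stop_client()
        browser.close()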
