outputting python script results into text - python-3.x

I have a want to save my python script's result into a txt file.
My python code
from selenium import webdriver
bro = r"D:\Developer\Software\Python\chromedriver.exe"
driver=webdriver.Chrome(bro)
duo=driver.get('http://www.lolduo.com')
body=driver.find_elements_by_tag_name('tr')
for post in body:
print(post.text)
driver.close()
Some codes that I've tried
import subprocess
with open("output.txt", "w") as output:
subprocess.call(["python", "./file.py"], stdout=output);
I tried this code and it only makes a output.txt file and has nothing inside it
D:\PythonFiles> file.py > result.txt
Exception:
UnicodeEncodeError: 'charmap' codec can't encode character '\u02c9' in
position 0: character maps to
and only prints out 1/3 of the results of the script into a text file.

You can try below code to write data to text file:
from selenium import webdriver
bro = r"D:\Developer\Software\Python\chromedriver.exe"
driver = webdriver.Chrome(bro)
driver.get('http://www.lolduo.com')
body = driver.find_elements_by_tag_name('tr')
with open("output.txt", "w", encoding="utf8") as output:
output.write("\n".join([post.text for post in body]))
driver.close()

You can try this. This Is my Python Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
import time
bro = r"D:\Developer\Software\Python\chromedriver.exe"
driver = webdriver.Chrome(bro)
driver.get('http://www.lolduo.com')
body = driver.find_elements_by_tag_name('tr') .text
with open('output15.txt', mode='w') as f:
for post in body:
print(post)
f.write(post)
time.sleep(2)
driver.close()

Related

Python / Selenium looping problem with 'for'

im trying to loop this code with "for" but it gives a error. Can you help me to solve it, thank you.
Error Code:
IndentationError: expected an indented block
Source Code:
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
binary = FirefoxBinary("C:\\Program Files\\Mozilla Firefox\\firefox.exe")
profile = FirefoxProfile("C:/Users/Baran/AppData/Roaming/Mozilla/Firefox/Profiles/oy6k3nay.yarrak")
driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path="C:\WebDrivers\geckodriver.exe")
driver.get('http://www.ipsorgu.com/')
time.sleep(5)
driver.close()
What i tried:
.
.
.
.
driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path="C:\WebDrivers\geckodriver.exe")
#################################################################
for i in range(10):
###################################################################
driver.get('http://www.ipsorgu.com/')
time.sleep(5)
driver.close()
This is because every thing in a for loop has to be indented. So you final code could look like this:
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
binary = FirefoxBinary("C:\\Program Files\\Mozilla Firefox\\firefox.exe")
profile = FirefoxProfile("C:/Users/Baran/AppData/Roaming/Mozilla/Firefox/Profiles/oy6k3nay.yarrak")
driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path="C:\WebDrivers\geckodriver.exe")
for i in range(10):
driver.get('http://www.ipsorgu.com/')
time.sleep(5)
driver.close()

How to solve catch image and emoji in web crawler using selenium

I want to ask some questions.
I am using Python 3.7.6, web driver and selenium to do web crawler
And then, I used Visual Studio Code to finish my web crawler, and I output a csv file.
I used "find_elements_by_xpath" to catch some information. The following image is my part code:
from datetime import date,datetime
from selenium import webdriver #載入webdriver
from selenium.webdriver.common.keys import Keys #載入按鍵
from bs4 import BeautifulSoup #載入BeautifulSoup工具
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import xlrd
import csv
import codecs
import time
data = xlrd.open_workbook('B.xlsx')
table = data.sheets()[0]
print(table)
nrows = table.nrows
ncols = table.ncols
print(ncols)
print(nrows)
for i in range(1,nrows):
csv_post="Post_No_" + str(i) + ".csv"
with open(csv_post, 'a', newline='', encoding="utf-8") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['互動作者','發表時間','互動內容'])
print_link = table.cell_value(i,3)
print(i)
print(print_link)
driver_blank=webdriver.Chrome('./chromedriver') #使用chrome作為爬蟲輔助工具,把chromedriver載入進來
driver_blank.get(print_link)
time.sleep(1)
post_page_count = len(driver_blank.find_elements_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[4]/div[2]/div[2]/select/option"))
if(post_page_count != 0):
try_value=1
while(try_value):
try:
driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[2]")
print("測試顯示正常")
try_value=0
except NoSuchElementException as e:
print("測試顯示異常,現正刷新網頁")
driver_blank.refresh()
time.sleep(10)
print("總頁數:"+str(post_page_count))
table_rows=len(driver_blank.find_elements_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table"))
print("共有"+str(table_rows)+"個Table")
real_table_rows=table_rows+1
#only 1
post_author = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[1]/a")
post_content = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div")
post_time = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[2]/table/tbody/tr[4]/td/div[2]/span")
print("互動作者:"+post_author.text)
print("互動內容:")
print(post_content.text)
print("發表時間:"+post_time.text)
print("<<< --- >>>")
with open(csv_post, 'a', newline='', encoding="utf-8") as csvfile:
writer = csv.writer(csvfile)
writer.writerow([post_author.text,post_time.text,post_content.text])
enter image description here
The following is the forum post: (https://forumd.hkgolden.com/view.aspx?type=MB&message=7197409)
enter image description here
I want to catch text, emoji, and image.
I can catch only the text, but I cannot catch emoji and image.
I don't know what to do. Can anyone help me? Thank you.

How to Download webpage as .mhtml

I am able to successfully open a URL and save the resultant page as a .html file. However, I am unable to determine how to download and save a .mhtml (Web Page, Single File).
My code is:
import urllib.parse, time
from urllib.parse import urlparse
import urllib.request
url = ('https://www.example.com')
encoded_url = urllib.parse.quote(url, safe='')
print(encoded_url)
base_url = ("https://translate.google.co.uk/translate?sl=auto&tl=en&u=")
translation_url = base_url+encoded_url
print(translation_url)
req = urllib.request.Request(translation_url, headers={'User-Agent': 'Mozilla/6.0'})
print(req)
response = urllib.request.urlopen(req)
time.sleep(15)
print(response)
webContent = response.read()
print(webContent)
f = open('GoogleTranslated.html', 'wb')
f.write(webContent)
print(f)
f.close
I have tried to use wget using the details captured in this question:
How to download a webpage (mhtml format) using wget in python but the details are incomplete (or I am simply unabl eto understand).
Any suggestions would be helpful at this stage.
Did you try using Selenium with a Chrome Webdriver to save page?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
import pyautogui
URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
FILE_NAME = ''
# open page with selenium
# (first need to download Chrome webdriver, or a firefox webdriver, etc)
driver = webdriver.Chrome()
driver.get(URL)
# wait until body is loaded
WebDriverWait(driver, 60).until(visibility_of_element_located((By.TAG_NAME, 'body')))
time.sleep(1)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if FILE_NAME != '':
pyautogui.typewrite(FILE_NAME)
pyautogui.hotkey('enter')
I have a better solution, which will not involve any possible manual operation and specify the path to hold the mhtml file. I learn this from a chinese blog . The key idea is to use chrome-dev-tools command.
The code is shown below as an example.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.qq.com/')
# Execute Chrome dev tool command to obtain the mhtml file
res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
# 2. write file locally
with open('./store/qq.mhtml', 'w', newline='') as f:
f.write(res['data'])
driver.quit()
Hope this will help!
more things about chrome dev protocols
save as mhtml, need to add argument '--save-page-as-mhtml'
options = webdriver.ChromeOptions()
options.add_argument('--save-page-as-mhtml')
driver = webdriver.Chrome(options=options)
I wrote it just the way it was. Sorry if it's wrong.
I created a class, so you can use it. The example is in the three lines below.
Also, you can change the number of seconds to sleep as you like.
Incidentally, non-English keyboards such as Japanese and Hangul keyboards are also supported.
import chromedriver_binary
from selenium import webdriver
import pyautogui
import pyperclip
import uuid
class DonwloadMhtml(webdriver.Chrome):
def __init__(self):
super().__init__()
self._first_save = True
time.sleep(2)
def save_page(self, url, filename=None):
self.get(url)
time.sleep(3)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if filename is None:
pyperclip.copy(str(uuid.uuid4()))
else:
pyperclip.copy(filename)
time.sleep(1)
pyautogui.hotkey('ctrl', 'v')
time.sleep(2)
if self._first_save:
pyautogui.hotkey('tab')
time.sleep(1)
pyautogui.press('down')
time.sleep(1)
pyautogui.press('up')
time.sleep(1)
pyautogui.hotkey('enter')
time.sleep(1)
self._first_save = False
pyautogui.hotkey('enter')
time.sleep(1)
# example
dm = DonwloadMhtml()
dm.save_page('https://en.wikipedia.org/wiki/Python_(programming_language)', 'wikipedia_python') # create file named "wikipedia_python.mhtml"
dm.save_page('https://www.python.org/') # file named randomly based on uuid4
python3.8.10
selenium==4.4.3

python 3.6 ascii codec error in urllib request

I'm trying to download picture from website with my python script, but every time i use georgian alphabet in url it gets error "UnicodeEncodeError: 'ascii' codec can't encode characters"
here is my code:
import os
import urllib.request
def download_image(url):
fullfilename = os.path.join('/images', 'image.jpg')
urllib.request.urlretrieve(url, fullfilename)
download_image(u'https://example.com/media/სდასდსადადსაფა_8QXjrbi.jpg')
I think it's better to use requests library in your example which deals with utf-8 characters.
Here is the code:
import requests
def download_image(url):
request = requests.get(url)
local_path = 'images/images.jpg'
with open(local_path, 'wb') as file:
file.write(request.content)
my_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/81/ერეკლე_II_ბავშვობის_სურათი.jpgw/459px-ერეკლე_II_ბავშვობის_სურათი.jpg'
download_image(my_url)

Save HTML Source Code to File

How can I copy the source code of a website into a text file in Python 3?
EDIT:
To clarify my issue, here's what I have:
import urllib.request
def extractHTML(url):
f = open('temphtml.txt', 'w')
page = urllib.request.urlopen(url)
pagetext = page.read()
f.write(pagetext)
f.close()
extractHTML('http:www.google.com')
I get the following error for the f.write() function:
builtins.TypeError: must be str, not bytes
import urllib.request
site = urllib.request.urlopen('http://somesite.com')
data = site.read()
file = open("file.txt","wb") #open file in binary mode
file.writelines(data)
file.close()
Untested but should work.
EDIT: Updated for python3
Try this.
import urllib.request
def extractHTML(url):
urllib.request.urlretrieve(url, 'temphtml.txt')
It is easier, but if you still want to do it that way. This is the solution:
import urllib.request
def extractHTML(url):
f = open('temphtml.txt', 'w')
page = urllib.request.urlopen(url)
pagetext = str(page.read())
f.write(pagetext)
f.close()
extractHTML('https://www.google.com')
Your script gave an error saying it must be a string. Just convert bytes to a string with str().
Next I got an error saying no host was given. Google is a secured site so https: not http: and most importantly you forgot to include // at the end of https:.
probably you wanted to create something like that:
import urllib.request
class ExtractHtml():
def Page(self):
print("enter the web page name starting with 'http://': ")
url=input()
site=urllib.request.urlopen(url)
data=site.read()
file =open("D://python_projects/output.txt", "wb")
file.write(data)
file.close()
w=ExtractHtml()
w.Page()

Resources