outputting python script results into text

outputting python script results into text - python-3.x

I have a want to save my python script's result into a txt file.
My python code
from selenium import webdriver
bro = r"D:\Developer\Software\Python\chromedriver.exe"
driver=webdriver.Chrome(bro)
duo=driver.get('http://www.lolduo.com')
body=driver.find_elements_by_tag_name('tr')
for post in body:
print(post.text)
driver.close()
Some codes that I've tried
import subprocess
with open("output.txt", "w") as output:
subprocess.call(["python", "./file.py"], stdout=output);
I tried this code and it only makes a output.txt file and has nothing inside it
D:\PythonFiles> file.py > result.txt
Exception:
UnicodeEncodeError: 'charmap' codec can't encode character '\u02c9' in
position 0: character maps to
and only prints out 1/3 of the results of the script into a text file.

You can try below code to write data to text file:
from selenium import webdriver
bro = r"D:\Developer\Software\Python\chromedriver.exe"
driver = webdriver.Chrome(bro)
driver.get('http://www.lolduo.com')
body = driver.find_elements_by_tag_name('tr')
with open("output.txt", "w", encoding="utf8") as output:
output.write("\n".join([post.text for post in body]))
driver.close()

You can try this. This Is my Python Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
import time
bro = r"D:\Developer\Software\Python\chromedriver.exe"
driver = webdriver.Chrome(bro)
driver.get('http://www.lolduo.com')
body = driver.find_elements_by_tag_name('tr') .text
with open('output15.txt', mode='w') as f:
for post in body:
print(post)
f.write(post)
time.sleep(2)
driver.close()

Related

Python / Selenium looping problem with 'for'

im trying to loop this code with "for" but it gives a error. Can you help me to solve it, thank you.
Error Code:
IndentationError: expected an indented block
Source Code:
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
binary = FirefoxBinary("C:\\Program Files\\Mozilla Firefox\\firefox.exe")
profile = FirefoxProfile("C:/Users/Baran/AppData/Roaming/Mozilla/Firefox/Profiles/oy6k3nay.yarrak")
driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path="C:\WebDrivers\geckodriver.exe")
driver.get('http://www.ipsorgu.com/')
time.sleep(5)
driver.close()
What i tried:
.
.
.
.
driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path="C:\WebDrivers\geckodriver.exe")
#################################################################
for i in range(10):
###################################################################
driver.get('http://www.ipsorgu.com/')
time.sleep(5)
driver.close()

This is because every thing in a for loop has to be indented. So you final code could look like this:
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
binary = FirefoxBinary("C:\\Program Files\\Mozilla Firefox\\firefox.exe")
profile = FirefoxProfile("C:/Users/Baran/AppData/Roaming/Mozilla/Firefox/Profiles/oy6k3nay.yarrak")
driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path="C:\WebDrivers\geckodriver.exe")
for i in range(10):
driver.get('http://www.ipsorgu.com/')
time.sleep(5)
driver.close()

How to solve catch image and emoji in web crawler using selenium

I want to ask some questions.
I am using Python 3.7.6, web driver and selenium to do web crawler
And then, I used Visual Studio Code to finish my web crawler, and I output a csv file.
I used "find_elements_by_xpath" to catch some information. The following image is my part code:
from datetime import date,datetime
from selenium import webdriver #載入webdriver
from selenium.webdriver.common.keys import Keys #載入按鍵
from bs4 import BeautifulSoup #載入BeautifulSoup工具
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import xlrd
import csv
import codecs
import time
data = xlrd.open_workbook('B.xlsx')
table = data.sheets()[0]
print(table)
nrows = table.nrows
ncols = table.ncols
print(ncols)
print(nrows)
for i in range(1,nrows):
csv_post="Post_No_" + str(i) + ".csv"
with open(csv_post, 'a', newline='', encoding="utf-8") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['互動作者','發表時間','互動內容'])
print_link = table.cell_value(i,3)
print(i)
print(print_link)
driver_blank=webdriver.Chrome('./chromedriver') #使用chrome作為爬蟲輔助工具，把chromedriver載入進來
driver_blank.get(print_link)
time.sleep(1)
post_page_count = len(driver_blank.find_elements_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[4]/div[2]/div[2]/select/option"))
if(post_page_count != 0):
try_value=1
while(try_value):
try:
driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[2]")
print("測試顯示正常")
try_value=0
except NoSuchElementException as e:
print("測試顯示異常，現正刷新網頁")
driver_blank.refresh()
time.sleep(10)
print("總頁數："+str(post_page_count))
table_rows=len(driver_blank.find_elements_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table"))
print("共有"+str(table_rows)+"個Table")
real_table_rows=table_rows+1
#only 1
post_author = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[1]/a")
post_content = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div")
post_time = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[2]/table/tbody/tr[4]/td/div[2]/span")
print("互動作者："+post_author.text)
print("互動內容：")
print(post_content.text)
print("發表時間："+post_time.text)
print("<<< --- >>>")
with open(csv_post, 'a', newline='', encoding="utf-8") as csvfile:
writer = csv.writer(csvfile)
writer.writerow([post_author.text,post_time.text,post_content.text])
enter image description here
The following is the forum post: (https://forumd.hkgolden.com/view.aspx?type=MB&message=7197409)
enter image description here
I want to catch text, emoji, and image.
I can catch only the text, but I cannot catch emoji and image.
I don't know what to do. Can anyone help me? Thank you.

How to Download webpage as .mhtml

I am able to successfully open a URL and save the resultant page as a .html file. However, I am unable to determine how to download and save a .mhtml (Web Page, Single File).
My code is:
import urllib.parse, time
from urllib.parse import urlparse
import urllib.request
url = ('https://www.example.com')
encoded_url = urllib.parse.quote(url, safe='')
print(encoded_url)
base_url = ("https://translate.google.co.uk/translate?sl=auto&tl=en&u=")
translation_url = base_url+encoded_url
print(translation_url)
req = urllib.request.Request(translation_url, headers={'User-Agent': 'Mozilla/6.0'})
print(req)
response = urllib.request.urlopen(req)
time.sleep(15)
print(response)
webContent = response.read()
print(webContent)
f = open('GoogleTranslated.html', 'wb')
f.write(webContent)
print(f)
f.close
I have tried to use wget using the details captured in this question:
How to download a webpage (mhtml format) using wget in python but the details are incomplete (or I am simply unabl eto understand).
Any suggestions would be helpful at this stage.

Did you try using Selenium with a Chrome Webdriver to save page?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
import pyautogui
URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
FILE_NAME = ''
# open page with selenium
# (first need to download Chrome webdriver, or a firefox webdriver, etc)
driver = webdriver.Chrome()
driver.get(URL)
# wait until body is loaded
WebDriverWait(driver, 60).until(visibility_of_element_located((By.TAG_NAME, 'body')))
time.sleep(1)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if FILE_NAME != '':
pyautogui.typewrite(FILE_NAME)
pyautogui.hotkey('enter')

I have a better solution, which will not involve any possible manual operation and specify the path to hold the mhtml file. I learn this from a chinese blog . The key idea is to use chrome-dev-tools command.
The code is shown below as an example.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.qq.com/')
# Execute Chrome dev tool command to obtain the mhtml file
res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
# 2. write file locally
with open('./store/qq.mhtml', 'w', newline='') as f:
f.write(res['data'])
driver.quit()
Hope this will help!
more things about chrome dev protocols

save as mhtml, need to add argument '--save-page-as-mhtml'
options = webdriver.ChromeOptions()
options.add_argument('--save-page-as-mhtml')
driver = webdriver.Chrome(options=options)

I wrote it just the way it was. Sorry if it's wrong.
I created a class, so you can use it. The example is in the three lines below.
Also, you can change the number of seconds to sleep as you like.
Incidentally, non-English keyboards such as Japanese and Hangul keyboards are also supported.
import chromedriver_binary
from selenium import webdriver
import pyautogui
import pyperclip
import uuid
class DonwloadMhtml(webdriver.Chrome):
def __init__(self):
super().__init__()
self._first_save = True
time.sleep(2)
def save_page(self, url, filename=None):
self.get(url)
time.sleep(3)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if filename is None:
pyperclip.copy(str(uuid.uuid4()))
else:
pyperclip.copy(filename)
time.sleep(1)
pyautogui.hotkey('ctrl', 'v')
time.sleep(2)
if self._first_save:
pyautogui.hotkey('tab')
time.sleep(1)
pyautogui.press('down')
time.sleep(1)
pyautogui.press('up')
time.sleep(1)
pyautogui.hotkey('enter')
time.sleep(1)
self._first_save = False
pyautogui.hotkey('enter')
time.sleep(1)
# example
dm = DonwloadMhtml()
dm.save_page('https://en.wikipedia.org/wiki/Python_(programming_language)', 'wikipedia_python') # create file named "wikipedia_python.mhtml"
dm.save_page('https://www.python.org/') # file named randomly based on uuid4
python3.8.10
selenium==4.4.3

python 3.6 ascii codec error in urllib request

I'm trying to download picture from website with my python script, but every time i use georgian alphabet in url it gets error "UnicodeEncodeError: 'ascii' codec can't encode characters"
here is my code:
import os
import urllib.request
def download_image(url):
fullfilename = os.path.join('/images', 'image.jpg')
urllib.request.urlretrieve(url, fullfilename)
download_image(u'https://example.com/media/სდასდსადადსაფა_8QXjrbi.jpg')

I think it's better to use requests library in your example which deals with utf-8 characters.
Here is the code:
import requests
def download_image(url):
request = requests.get(url)
local_path = 'images/images.jpg'
with open(local_path, 'wb') as file:
file.write(request.content)
my_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/81/ერეკლე_II_ბავშვობის_სურათი.jpgw/459px-ერეკლე_II_ბავშვობის_სურათი.jpg'
download_image(my_url)

Save HTML Source Code to File

How can I copy the source code of a website into a text file in Python 3?
EDIT:
To clarify my issue, here's what I have:
import urllib.request
def extractHTML(url):
f = open('temphtml.txt', 'w')
page = urllib.request.urlopen(url)
pagetext = page.read()
f.write(pagetext)
f.close()
extractHTML('http:www.google.com')
I get the following error for the f.write() function:
builtins.TypeError: must be str, not bytes

import urllib.request
site = urllib.request.urlopen('http://somesite.com')
data = site.read()
file = open("file.txt","wb") #open file in binary mode
file.writelines(data)
file.close()
Untested but should work.
EDIT: Updated for python3

Try this.
import urllib.request
def extractHTML(url):
urllib.request.urlretrieve(url, 'temphtml.txt')
It is easier, but if you still want to do it that way. This is the solution:
import urllib.request
def extractHTML(url):
f = open('temphtml.txt', 'w')
page = urllib.request.urlopen(url)
pagetext = str(page.read())
f.write(pagetext)
f.close()
extractHTML('https://www.google.com')
Your script gave an error saying it must be a string. Just convert bytes to a string with str().
Next I got an error saying no host was given. Google is a secured site so https: not http: and most importantly you forgot to include // at the end of https:.

probably you wanted to create something like that:
import urllib.request
class ExtractHtml():
def Page(self):
print("enter the web page name starting with 'http://': ")
url=input()
site=urllib.request.urlopen(url)
data=site.read()
file =open("D://python_projects/output.txt", "wb")
file.write(data)
file.close()
w=ExtractHtml()
w.Page()

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

outputting python script results into text - python-3.x

Related

Python / Selenium looping problem with 'for'

How to solve catch image and emoji in web crawler using selenium

How to Download webpage as .mhtml

python 3.6 ascii codec error in urllib request

Save HTML Source Code to File

Categories

Resources