Python3 script stops working after scrapy update - python-3.x

I am on macOS 10.14.2 using "homebrewed" Python 3.7.2, Scrapy 1.5.1 and Twisted 18.9.0. As a Python novice, I use the following script to download old newspaper issues archived on a website:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A scrapy script to download issues of the Gaceta (1843-1961)
import errno
import json
import os
from datetime import datetime

import scrapy
from scrapy import FormRequest, Request

os.chdir("/Volumes/backup/Archives/Gaceta_Nicaragua_1843-1961")  # directory path
print(os.getcwd())

# date range, format DD/MM/YYYY
start = '01/01/1843'  # 01/01/1843
end = '31/12/1860'    # 31/12/1961
date_format = '%d/%m/%Y'
start = datetime.strptime(start, date_format)
end = datetime.strptime(end, date_format)


class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    allowed_domains = ['asamblea.gob.ni']
    start_urls = ['http://digesto.asamblea.gob.ni/consultas/coleccion/']

    papers = {
        "Diario Oficial": "28",
    }

    def parse(self, response):
        for key, value in list(self.papers.items()):
            yield FormRequest(
                url='http://digesto.asamblea.gob.ni/consultas/util/ws/proxy.php',
                headers={'X-Requested-With': 'XMLHttpRequest'},
                formdata={
                    'hddQueryType': 'initgetRdds',
                    'cole': value
                },
                meta={'paper': key},
                callback=self.parse_rdds
            )

    def parse_rdds(self, response):
        data = json.loads(response.body_as_unicode())
        for r in data["rdds"]:
            if not r['fecPublica']:
                continue
            r_date = datetime.strptime(r['fecPublica'], date_format)
            if start <= r_date <= end:
                r['paper'] = response.meta['paper']
                rddid = r['rddid']
                yield Request("http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=" + rddid,
                              callback=self.download_pdf, meta=r)

    def download_pdf(self, response):
        filename = "{paper}/{anio}/".format(**response.meta) + \
                   "{titulo}-{fecPublica}.pdf".format(**response.meta).replace("/", "_")
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        with open(filename, 'wb') as f:
            f.write(response.body)
It worked perfectly fine (although slowly); however, I have two persistent issues with the script.
Firstly, I have been getting the following error since the update:
2019-01-07 11:53:34 [scrapy.core.scraper] ERROR: Spider error processing <POST http://digesto.asamblea.gob.ni/consultas/util/ws/proxy.php> (referer: http://digesto.asamblea.gob.ni/consultas/coleccion/)
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/usr/local/lib/python3.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 30, in process_spider_output
    for x in result:
  File "/usr/local/lib/python3.7/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/usr/local/lib/python3.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/usr/local/lib/python3.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "gaceta_downloader.py", line 58, in parse_rdds
    if not r['fecPublica']:
KeyError: 'fecPublica'
Secondly, once the script ran (as it did some days ago, before I updated Python and the packages), it would sometimes complain that UnicodeEncodeError: 'ascii' codec can't encode character u'\xb0' in position 27: ordinal not in range(128), which I guess sometimes led to zero-byte files. Do you see the encoding error in the source code? Is it related to the above problem?
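On the first error: the KeyError means some rdds records now lack the fecPublica key entirely, so the truthiness test never even runs. A defensive guard using dict.get() (my sketch, assuming such records should simply be skipped like the empty ones) would be:

    def parse_rdds(self, response):
        data = json.loads(response.body_as_unicode())
        for r in data["rdds"]:
            # dict.get() returns None for a missing key instead of raising
            # KeyError, so keyless records are skipped like empty ones.
            if not r.get('fecPublica'):
                continue
            ...  # rest of the loop unchanged

As for the UnicodeEncodeError, u'\xb0' is the degree sign; errors like that typically surface when a non-ASCII character is encoded under an ASCII locale (for example while building a filename), so it is likely an environment issue rather than something visible in the source. This is a guess, not a confirmed diagnosis.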

Related

How to send PURGE method and check response in python

I am creating a project: an unauthenticated cache-purge scanner. I tried the following, but it does not work:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os

import requests

# Read the last line of new-target.txt (seek backwards until a newline).
with open('new-target.txt', 'rb') as f:
    f.seek(-2, os.SEEK_END)
    while f.read(1) != b'\n':
        f.seek(-2, os.SEEK_CUR)
    a = f.readline().decode()

filepath = a
with open(filepath) as fp:
    line = fp.readline()
    cnt = 1
    while line:
        x = "{}".format(line.strip())
        resp = requests.purge(x + "/")
        responsee = resp.status_code
        if responsee == 200:
            print("\033[1;32;40m [Method - PURGE] \033[0m\033[1;31;40m Vulnerable: \033[0m " + x)
        line = fp.readline()
        cnt += 1
It gave this error:
Traceback (most recent call last):
  File "/root/Masaüstü/Pentesting Tools/PURGE-PUT-Method-Scanner.py", line 23, in <module>
    resp = requests.purge(x + "/")
AttributeError: module 'requests' has no attribute 'purge'
The traceback you got seems pretty straightforward: the requests module does not have a purge attribute, which you are trying to call on line 23.
Please refer to the methods listed in the documentation:
https://docs.python-requests.org/en/latest/api/
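For what it's worth (my addition, not part of the answer above): requests has no per-verb shortcut for PURGE, but the generic requests.request() takes the HTTP method as a plain string, so a PURGE request can be sent like this (the URL is a placeholder):

    import requests

    # requests.request() accepts any HTTP method name as a string,
    # so non-standard verbs such as PURGE need no dedicated shortcut.
    resp = requests.request("PURGE", "http://example.com/", timeout=10)
    print(resp.status_code)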

Gooey from argument to read file

So the first argument is the file to open, and the second argument is the pattern (or text) to search for.
The program is made to scan a document, find items matching "Pattern", and print the detector address in "DetectorPattern". I got this program working without Gooey, but I thought about adding it for ease of use. My problem arises when the argument gets passed to the "with open(filename)" line.
This is the error I get:
Traceback (most recent call last):
  File "C:/Users/haral/Google Drive (synkroniseres ikke)/Programmering/Programmer/LogSearch/LogSearchGooey.py", line 42, in <module>
    main()
  File "C:\Users\haral\PycharmProjects\AutomateBoringStuff\venv\lib\site-packages\gooey\python_bindings\gooey_decorator.py", line 134, in <lambda>
    return lambda *args, **kwargs: func(*args, **kwargs)
  File "C:/Users/haral/Google Drive (synkroniseres ikke)/Programmering/Programmer/LogSearch/LogSearchGooey.py", line 27, in main
    with open(filename, 'r') as reader:
TypeError: expected str, bytes or os.PathLike object, not Namespace
import os
import re

from gooey import Gooey, GooeyParser

pattern = ""
# Chosen search pattern
detectorPattern = re.compile(r'\d\d\.\d\d\d')
# Fire alarm detector pattern, e.g. 03.040
filename = ""
foundDetector = []


@Gooey
def main():
    parser = GooeyParser(description="Testing")
    parser.add_argument(
        "Filename",
        help="Choose a file",
        widget="FileChooser"
    )
    parser.add_argument(
        "store",
        help="Choose a pattern to search for"
    )
    filename = parser.parse_args()
    with open(filename, 'r') as reader:
        # Read and print the entire file line by line
        for line in reader:
            findLine = re.search(pattern, line)
            if findLine is not None:
                mo = detectorPattern.search(findLine.string)
                mog = mo.group()
                if mog not in foundDetector:
                    foundDetector.append(mog)
    for x in foundDetector:
        print(x)


if __name__ == '__main__':
    main()
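The traceback shows open() receiving the whole Namespace that parse_args() returns; the individual arguments live as attributes on that object, named after the add_argument() calls. A sketch of the likely fix for the corresponding lines inside main() (assuming the positional arguments keep the names Filename and store):

    # parse_args() returns an argparse.Namespace; each add_argument()
    # name becomes an attribute holding that argument's value.
    args = parser.parse_args()
    filename = args.Filename  # string path chosen in the FileChooser widget
    pattern = args.store      # search text typed into the second field
    with open(filename, 'r') as reader:
        for line in reader:
            print(line, end="")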

'CommentedMap' object has no attribute '_context_manager' during data dump with ruamel.yaml

Here's my code:
import pathlib

import ruamel.yaml


class YamlLoader:
    @staticmethod
    def safe_load(filename):
        filepath = pathlib.Path(filename)
        with open(filepath) as stream:
            if ruamel.yaml.version_info < (0, 15):
                data = ruamel.yaml.safe_load(stream)
            else:
                yml = ruamel.yaml.YAML(typ='safe', pure=True)
                data = yml.load(stream)
        return data

    @staticmethod
    def save(yaml, filename):
        filepath = pathlib.Path(filename)
        if ruamel.yaml.version_info < (0, 15):
            ruamel.yaml.safe_dump(yaml, filepath)
        else:
            ruamel.yaml.YAML.dump(yaml, filepath)
My code in main.py:
data = YamlLoader.safe_load("data.yaml")
print(data)
I then get my YAML data in the variable.
However, when I then do:
YamlLoader.save(data, "output.yaml")
I get the error message:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.8/site-packages/ruamel/yaml/main.py", line 434, in dump
    if self._context_manager:
AttributeError: 'CommentedMap' object has no attribute '_context_manager'
Most likely I'm using the API in the wrong way, but I can't figure out where the issue is.
The last line of your code has a problem:

ruamel.yaml.YAML.dump(yaml, filepath)

as you are not creating an instance of YAML as you do when loading. Either do:

yml = ruamel.yaml.YAML()
yml.dump(yaml, filepath)

or:

ruamel.yaml.YAML().dump(yaml, filepath)
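As a side note (my sketch based on ruamel.yaml's documented behaviour, not part of the answer above): the new-style YAML.dump() accepts a pathlib.Path directly as its output target, so the save() method could be rewritten like this:

    @staticmethod
    def save(data, filename):
        filepath = pathlib.Path(filename)
        yml = ruamel.yaml.YAML()  # create an instance first, as when loading
        yml.dump(data, filepath)  # YAML.dump() writes straight to a pathlib.Path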

Using 'dpkg' in Python causes OSError: [Errno 9] Bad file descriptor

I built a script to find the most recent version of Visual Studio Code, download it, and install it on an Ubuntu machine with dpkg. I haven't found a decent Python library for this, so I am using subprocess.call() to invoke a shell command. This certainly may not be the best way to do it, but this is also a learning project.
It successfully downloads the file and places it in my ~/Downloads directory. When I invoke subprocess.call(), it spits back OSError: [Errno 9] Bad file descriptor.
I know my command string is correct: I can invoke it just fine from the CLI. But it doesn't work when called through subprocess.
Any advice on doing this more efficiently is welcome.
"""
Python 3 script
Downloads the latest .deb package for installing VSCode, and installs it
"""
import os # used to direct where to save downloaded file
import subprocess # used to derive filepath of CLI arg
import requests # py3 only
import platform # used to detect the OS
from urllib.request import urlopen, ContentTooShortError, urlretrieve # py3 version of 'import urllib2'
HOME = os.path.expanduser('~')
filePath = HOME + "/Downloads"
fileName = 'vs_code_most_recent_amd64.deb'
outputName = os.path.join(filePath, fileName)
alreadyDownloaded = False
# used in subprocess calls to suppress stdout or stderr
pipeToDevNull = open(os.devnull, 'w')
def IsDownloadable(url):
"""
Check of the link passed in is a downloadable file. Used to shortcut the
processing so that it doesn't attempt to download a URL that isn't
downloadable. Returns True or False.
"""
h = requests.head(url, allow_redirects=True)
header = h.headers
contentType = header.get('content-type')
if 'text' in contentType.lower():
return False
if 'html' in contentType.lower():
return False
return True
def DownloadVSCodePkg(url):
"""
Downloads the file at the specified URL, save it as the above-defined filename
"""
u = urlopen(url)
f = open(outputName, 'wb')
meta = u.info()
fileSize = int(meta.get_all("Content-Length")[0])
fileSizeDL = 0
#blockSize = 8192
blockSize = 16384
while True:
buffer = u.read(blockSize)
if not buffer:
break
fileSizeDL += len(buffer)
f.write(buffer)
status = r"%10d Bytes [%3.2f%%]" % (fileSizeDL, fileSizeDL * 100. / fileSize)
status = status + chr(8)*(len(status)+1)
print("Downloading: {0}".format(status), end="\r", flush=True)
print("Downloading: {0}".format(status))
print("Downloaded: {0}".format(fileName))
f.close()
del f
def CheckDownloadSuccess():
"""
returns bool value if the file we want is in the dir specified
"""
try:
subprocess.check_call("ls " + outputName, stdout=pipeToDevNull, stderr=pipeToDevNull, shell=True)
return True
except subprocess.CalledProcessError:
return False
def UnpackAndInstall():
"""
Invokes dpkg from the linux shell and installs VSCode.
"""
#Detect OS
linuxDistro = platform.linux_distribution()
OSType = linuxDistro[0]
if OSType == 'Ubuntu':
from apt.debfile import DebPackage
pkg = DebPackage(outputName)
command = 'sudo dpkg -i ' + outputName
#The thing that attempts to unpack:
try:
subprocess.check_call(command, stdout=subprocess.STDOUT, stderr=subprocess.STDOUT, shell=True)
except subprocess.CalledProcessError:
print("Install Failed.")
def main():
url = 'https://go.microsoft.com/fwlink/?LinkID=760868'
alreadyDownloaded = CheckDownloadSuccess()
if alreadyDownloaded is False:
if IsDownloadable(url):
DownloadVSCodePkg(url)
# check if the download succeeded, if file doesn't already exist.
if CheckDownloadSuccess():
print("Download Successful!\nFile location => " + outputName)
else:
print("Download Failed...")
else:
print('Link broken: need to update the package resource link.')
else:
print("File already exists.")
UnpackAndInstall()
if __name__ == "__main__":
main()
Here is the traceback and the error from the CLI:
$ python3 setupVSCode.py
Traceback (most recent call last):
  File "setupVSCode.py", line 192, in <module>
    main()
  File "setupVSCode.py", line 189, in main
    UnpackAndInstall()
  File "setupVSCode.py", line 95, in UnpackAndInstall
    subprocess.call(command, stdout=subprocess.STDOUT, stderr=subprocess.STDOUT, shell=True)
  File "/usr/lib/python3.6/subprocess.py", line 267, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/usr/lib/python3.6/subprocess.py", line 709, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1344, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
OSError: [Errno 9] Bad file descriptor
os.path.expanduser('~') is going to return something like 'C:\\Users\\user.name', to which you're appending '/Downloads', resulting in a bad path like 'C:\\Users\\user.name/Downloads'.
Instead of:

filePath = HOME + "/Downloads"

do:

filePath = HOME + "\Downloads"

or, preferably:

filePath = os.path.join(HOME, 'Downloads')
After speaking with @Steve, I tried removing the output redirectors on the subprocess.call().
The advice on removing all slashes in path construction and using os.path.join() instead has been implemented, and will be followed as a best practice from here on.
Since the command as constructed worked fine from the CLI, it was a matter of thinking about what subprocess.call() did differently: it redirected output. With that removed, things work fine.
It now looks like so:
HOME = os.path.expanduser('~')
filePath = os.path.join(HOME, "Downloads")
fileName = 'vs_code_most_recent_amd64.deb'
outputName = os.path.join(filePath, fileName)
alreadyDownloaded = False
...
command = 'sudo dpkg -i ' + outputName
try:
    subprocess.check_call(command, shell=True)
except subprocess.CalledProcessError:
    print("Install Failed.")

Get API response chunk encoded. ERROR: data byte not string (ubuntu)

I have some code that works when I run it on a Windows machine, but when it runs in Ubuntu on a Google Compute Engine VM I get the following error:
Traceback (most recent call last):
  File "firehose_get.py", line 43, in <module>
    print(json.dumps(json.loads(line), indent=2))
  File "/home/stuartkirkup/anaconda3/lib/python3.5/json/__init__.py", line 312, in loads
    s.__class__.__name__))
TypeError: the JSON object must be str, not 'bytes'
It's exactly the same code that runs fine on Windows. I've done quite a bit of reading, and it looks like an encoding issue; as you'll see from some of the commented-out sections in my code, I've tried a few ways to change the encoding, but without joy. I can't work out how to debug it, and I'm fairly new to Python.
I'm using Anaconda, which, some further reading says, has an ill-advised setdefaultencoding hack built in.
Here is the stream header, showing it's chunked data, which I believe is why the lines arrive as bytes:

{'Transfer-Encoding': 'chunked', 'Date': 'Thu, 17 Aug 2017 16:53:35 GMT', 'Content-Type': 'application/json', 'x-server': 'db220', 'Content-Encoding': 'gzip'}
Code file - firehose_requests.py (with API key info replaced by ####):
import requests

MAX_REDIRECTS = 1000


def get(url, **kwargs):
    kwargs.setdefault('allow_redirects', False)
    for i in range(0, MAX_REDIRECTS):
        response = requests.get(url, **kwargs)
        # response.encoding = 'utf-8'
        print("test")
        print(response.headers)
        if response.status_code == requests.codes.moved or \
           response.status_code == requests.codes.found:
            if 'Location' in response.headers:
                url = response.headers['Location']
                content_type_header = response.headers.get('content_type')
                print(content_type_header)
                continue
            else:
                print("Error when reading the Location field from HTTP headers")
    return response
Code file - firehose_get.py
import json
import argparse
# import ConfigParser
import io
from time import sleep

import requests
from requests.auth import HTTPBasicAuth

import firehose_requests

# Make it work for Python 2+3 and with Unicode
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# request a token from Adobe
request_access_token = requests.post('https://api.omniture.com/token',
                                     data={'grant_type': 'client_credentials'},
                                     auth=HTTPBasicAuth('##############-livestream-poc', '488##############1')).json()
# print(request_access_token)

# grab the token from the JSON returned
access_token = request_access_token["access_token"]
print(access_token)

url = 'https://livestream.adobe.net/api/1/stream/eecoukvanilla-##############'
sleep_sec = 0
rec_count = 10

bearer = "Bearer " + access_token
headers = {"Authorization": bearer, "accept-encoding": "gzip,deflate"}

r = firehose_requests.get(url, stream=True, headers=headers)

# open empty file
with open('output_file2.txt', 'w') as outfile:
    print('', file=outfile)

# Read the Stream
if r.status_code == requests.codes.ok:
    count = 0
    for line in r.iter_lines():
        if line:
            # write to screen
            print("\r\n")
            print(json.dumps(json.loads(line), indent=2))
            # append data to file
            with open('output_file2.txt', 'a') as outfile:
                print("\r\n", file=outfile)
                print(json.dumps(json.loads(line), ensure_ascii=False), file=outfile)
            # with io.open('output_file2.txt', 'w', encoding='utf8') as outfile:
            #     str_ = json.dumps(json.loads(line),
            #                       indent=4, sort_keys=True,
            #                       separators=(',', ': '), ensure_ascii=False)
            #     outfile.write(to_unicode(str_))
            # Break the loop if there is a -n argument
            if rec_count is not None:
                count = count + 1
                if count >= rec_count:
                    break
            # How long to wait between writes
            if sleep_sec is not None:
                sleep(sleep_sec)
else:
    print("There was a problem with the Request")
    print("Returned Status Code: " + str(r.status_code))
Thanks
OK, I worked it out. I found a lot of people also getting this error, but no solutions posted, so this is how I did it.
Parse and decode the JSON like this:
json_parsed = json.loads(line.decode("utf-8"))
Full code:
import json
import requests
from time import sleep
import argparse
#import ConfigParser
import firehose_requests
from requests.auth import HTTPBasicAuth
# Make it work for Python 2+3 and with Unicode
import io
try:
to_unicode = unicode
except NameError:
to_unicode = str
#request a token from Adobe
request_access_token = requests.post('https://api.omniture.com/token', data={'grant_type':'client_credentials'}, auth=HTTPBasicAuth('##########-livestream-poc','488################1')).json()
#print(request_access_token)
#grab the token from the JSON returned
access_token = request_access_token["access_token"]
print(access_token)
url = 'https://livestream.adobe.net/api/1/stream/##################'
sleep_sec=0
rec_count=10
bearer = "Bearer " + access_token
headers = {"Authorization": bearer,"accept-encoding":"gzip,deflate"}
r = firehose_requests.get(url, stream=True, headers=headers, )
#open empty file
with open('output_file.txt', 'w') as outfile:
print('', file=outfile)
#Read the Stream
if r.status_code == requests.codes.ok:
count = 0
for line in r.iter_lines():
if line:
#parse and decode the JSON
json_parsed = json.loads(line.decode("utf-8"))
#write to screen
#print (str(json_parsed))
#append data to file
with open('output_file.txt', 'a') as outfile:
#write to file
print(json_parsed,file=outfile)
#Break the loop if there are is a -n argument
if rec_count is not None:
count = count + 1
if count >= rec_count:
break
#How long to wait between writes
if sleep_sec is not None :
sleep(sleep_sec)
else:
print ("There was a problem with the Request")
print ("Returned Status Code: " + str(r.status_code))
