Error while parsing lxml - python-3.x

While parsing XML using lxml, I get the error "reading file objects must return bytes objects". Here's the code:
from lxml import etree
from io import StringIO

def parseXML(xmlFile):
    """
    parse the xml
    """
    data = open(xmlFile)
    xml = data.read()
    data.close()

    tree = etree.parse(StringIO(xml))
    context = etree.iterparse(StringIO(xml))
    for action, elem in context:
        if not elem.text:
            text = "None"
        else:
            text = elem.text
        print(elem.tag + "=>" + text)

if __name__ == "__main__":
    parseXML("C:\\Users\\karthik\\Desktop\\xml_path\\bgm.xml")
bgm.xml:
<?xml version="1.0" ?>
<zAppointments reminder="15">
    <appointment>
        <begin>1181251680</begin>
        <uid>040000008200E000</uid>
        <alarmTime>1181572063</alarmTime>
        <state></state>
        <location></location>
        <duration>1800</duration>
        <subject>Bring pizza home</subject>
    </appointment>
    <appointment>
        <begin>1234360800</begin>
        <duration>1800</duration>
        <subject>Check MS Office website for updates</subject>
        <location></location>
        <uid>604f4792-eb89-478b-a14f-dd34d3cc6c21-1234360800</uid>
        <state>dismissed</state>
    </appointment>
</zAppointments>
Error:
Traceback (most recent call last):
File "C:/Users/karthik/source/ChartAttributes/crecords", line 34, in <module>
parseXML("C:\\Users\\karthik\\Desktop\\xml_path\\bgm.xml")
File "C:/Users/karthik/source/ChartAttributes/crecords", line 26, in parseXML
for action, elem in context:
File "src\lxml\iterparse.pxi", line 208, in lxml.etree.iterparse.__next__ (src\lxml\lxml.etree.c:150010)
File "src\lxml\iterparse.pxi", line 193, in lxml.etree.iterparse.__next__ (src\lxml\lxml.etree.c:149708)
File "src\lxml\iterparse.pxi", line 221, in lxml.etree.iterparse._read_more_events (src\lxml\lxml.etree.c:150208)
TypeError: reading file objects must return bytes objects
Process finished with exit code 1

I think you need the XML as a byte array rather than a character string.
Open the file in binary mode to get a bytes object:
data=open(xmlFile, 'rb')
But it's probably just easier to pass the filename to lxml and let it take care of opening and reading the file:
from lxml import etree

def parseXML(xmlFile):
    for action, elem in etree.iterparse(xmlFile):
        text = elem.text or "None"
        print(elem.tag + "=>" + text)
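If you want to keep the read-the-whole-file-into-memory approach from the question, a minimal sketch (assuming the same bgm.xml path) is to hand iterparse a bytes buffer via io.BytesIO instead of StringIO:
from io import BytesIO
from lxml import etree

# Read the file as bytes so iterparse receives bytes, not str
with open("C:\\Users\\karthik\\Desktop\\xml_path\\bgm.xml", "rb") as data:
    xml = data.read()

for action, elem in etree.iterparse(BytesIO(xml)):
    print(elem.tag + "=>" + (elem.text or "None"))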

Related

Unable to take input from a text file in Python crawler

I have created a basic crawler in Python, and I want to take input from a text file.
I used open/raw_input but there was an error.
When I used the input("") function it prompted for input and worked fine.
The problem is only with reading from a file:
import re
import urllib.request

url = open('input.txt', 'r')
data = urllib.request.urlopen(url).read()
data1 = data.decode("utf8")
print(data1)

file = open('output.txt', 'w')
file.write(data1)
file.close()
Error output below:
Traceback (most recent call last):
File "scrape.py", line 8, in <module>
data = urllib.request.urlopen(url).read()
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 518, in open
protocol = req.type
AttributeError: '_io.TextIOWrapper' object has no attribute 'type'
The method open returns a file object, not the content of the file as a string. If you want url to contain the content as a string, change the line to:
url = open('input.txt', 'r').read()
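If input.txt contains more than one URL, a small variant (assuming one URL per line, which is not stated in the question) would fetch them one at a time:
import urllib.request

# Assumption: input.txt holds one URL per line
with open('input.txt', 'r') as f:
    for line in f:
        url = line.strip()
        if not url:
            continue
        data = urllib.request.urlopen(url).read().decode("utf8")
        print(data)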

Editing PDF metadata fields with Python3 and pdfrw

I'm trying to edit the metadata Title field of PDFs, to include the ASCII equivalents when possible. I'm using Python3 and the module pdfrw.
How can I do string operations that replace the metadata fields?
My test code is here:
from pdfrw import PdfReader, PdfWriter, PdfString
import unicodedata

def edit_title_metadata(inpdf):
    trailer = PdfReader(inpdf)
    # this statement is breaking pdfrw
    trailer.Info.Title = unicode_normalize(trailer.Info.Title)
    # also have tried:
    # trailer.Info.Title = PdfString(unicode_normalize(trailer.Info.Title))
    PdfWriter("test.pdf", trailer=trailer).write()
    return

def unicode_normalize(s):
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

if __name__ == "__main__":
    edit_title_metadata('Anadon-2011-Scientific Opinion on the safety e.pdf')
And the traceback is:
Traceback (most recent call last):
File "get_metadata.py", line 68, in <module>
main()
File "get_metadata.py", line 54, in main
edit_title_metadata(pdf)
File "get_metadata.py", line 11, in edit_title_metadata
trailer.Info.Title = PdfString(unicode_normalize(trailer.Info.Title))
File "get_metadata.py", line 18, in unicode_normalize
return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
File "/path_to_python/python3.7/site-packages/pdfrw/objects/pdfstring.py", line 550, in encode
if isinstance(source, uni_type):
TypeError: isinstance() arg 2 must be a type or tuple of types
Notes:
This issue at GitHub may be related.
FWIW, I'm also getting the same error with Python 3.6.
I've shared the PDF (which has non-ASCII hyphens, unicode char \u2010):
wget https://gist.github.com/philshem/71507d4e8ecfabad252fbdf4d9f8bdd2/raw/cce346ab39dd6ecb3a718ad3f92c9f546761e87b/Anadon-2011-Scientific%2520Opinion%2520on%2520the%2520safety%2520e.pdf
You have to use the .decode() method on the metadata fields:
trailer.Info.Title = unicode_normalize(trailer.Info.Title.decode())
And full working code:
from pdfrw import PdfReader, PdfWriter
import unicodedata

def edit_title_metadata(inpdf):
    trailer = PdfReader(inpdf)
    trailer.Info.Title = unicode_normalize(trailer.Info.Title.decode())
    PdfWriter("test.pdf", trailer=trailer).write()
    return

def unicode_normalize(s):
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

if __name__ == "__main__":
    edit_title_metadata('Anadon-2011-Scientific Opinion on the safety e.pdf')
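As a quick sanity check (a hedged suggestion, not part of the original answer), you can read test.pdf back and print the rewritten title:
from pdfrw import PdfReader

# Read the rewritten file back and inspect the Title field
print(PdfReader("test.pdf").Info.Title)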

How to find the exact line of error from python 3.6.5?

from lxml import etree
import os
import copy
import xml.etree.ElementTree as ET

XMLDoc = etree.parse(open('aa.xml'))
XSLDoc = etree.parse(open('aa.xsl'))

try:
    transform = etree.XSLT(XSLDoc)
except:
    for error in etree.XSLT.error_log:
        print(error.message, error.line)

v = '/person/name'
for Node in XMLDoc.xpath(v):
    m2 = copy.deepcopy(Node)
    m3 = etree.tostring(m2, method="xml", xml_declaration=True, encoding="utf-8", with_tail=False)
    m3 = m3.decode("utf-8")
    dc = open('pq.xml', 'w')
    dc.write(str(m3))
    dc.close()
    xm = etree.parse(open('pq.xml'))
    q = transform(xm)
    print(q)
I have used lxml to transform our XML into another XML via XSLT, but I am getting a parsing error in our XSLT.
Traceback (most recent call last):
File "C:\Anil\PTest\08\qqq.py", line 13, in <module>
for error in etree.XSLT(XSLDoc).error_log:
File "src\lxml\xslt.pxi", line 410, in lxml.etree.XSLT.__init__
lxml.etree.XSLTParseError: Failed to compile predicate
Please suggest how to find the exact problem in our XSLT.
XSLT should have a log attached to it, so you can do something like the following:
try:
    # Your Code
except:
    for error in YourXSLTObject.error_log:
        print(error.message, error.line)
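In this question the failure happens while compiling the stylesheet, so no XSLT object exists yet; a minimal sketch (relying on lxml attaching an error_log to the raised exception) reads the log off the XSLTParseError itself:
from lxml import etree

XSLDoc = etree.parse(open('aa.xsl'))
try:
    transform = etree.XSLT(XSLDoc)
except etree.XSLTParseError as exc:
    # lxml attaches the stylesheet compilation log to the exception
    for entry in exc.error_log:
        print(entry.message, entry.line)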

Proper Use Of Python 3.x AMFY Module

How am I supposed to use the Amfy module? I try to use it like the JSON module (amfy.loads or amfy.load), but it just gives me errors:
C:\Users\Other>"C:\Users\Other\Desktop\Python3.5.2\test amf.py"
Traceback (most recent call last):
File "C:\Users\Other\Desktop\Python3.5.2\test amf.py", line 4, in <module>
print(amfy.load(cn_rsp.text))
File "C:\Users\Other\Desktop\Python3.5.2\lib\site-packages\amfy\__init__.py", line 9, in load
return Loader().load(input, proto=proto)
File "C:\Users\Other\Desktop\Python3.5.2\lib\site-packages\amfy\core.py", line 33, in load
return self._read_item3(stream, context)
File "C:\Users\Other\Desktop\Python3.5.2\lib\site-packages\amfy\core.py", line 52, in _read_item3
marker = stream.read(1)[0]
AttributeError: 'str' object has no attribute 'read'
This is what I wrote:
import requests
import amfy
cn_rsp = requests.get("http://realm498.c10.castle.rykaiju.com/api/locales/en/get_serialized_new")
print(amfy.load(cn_rsp.text))
After tinkering around and googling some stuff, I found a fix:
New code:
import amfy, requests, json

url = "http://realm416.c9.castle.rykaiju.com/api/locales/en/get_serialized_static"
req = requests.get(url)
if req.status_code == 200:
    ret = req.json() if "json" in req.headers["content-type"] else amfy.loads(req.content)
else:
    ret = {"failed": req.reason}

with open("doa manifest.txt", 'w', encoding='utf-8') as dump:
    json.dump(ret, dump)
The terminal throws a UnicodeEncodeError, but I was able to fix that by running chcp 65001 and then set PYTHONIOENCODING=utf-8.
The load method expects an input stream, but you are providing a string. Just convert your string into a memory buffer that supports the read method, like this:
import io
print(amfy.load(io.BytesIO(cn_rsp.text.encode())))
Unfortunately, serialization fails when using this. Is there another URL where it would work, a test URL maybe?
File "C:\Python34\lib\site-packages\amfy\core.py", line 146, in _read_vli
byte = stream.read(1)[0]
IndexError: index out of range
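Given that IndexError, a hedged thing to try (an assumption about the payload, not verified against this endpoint) is to feed amfy the raw response bytes instead of re-encoding the decoded text, since decoding binary AMF as text can corrupt it:
import io
import amfy
import requests

cn_rsp = requests.get("http://realm498.c10.castle.rykaiju.com/api/locales/en/get_serialized_new")
# Pass the raw bytes of the response body, not the decoded text
print(amfy.load(io.BytesIO(cn_rsp.content)))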

TypeError: Can't convert 'bytes' object to str implicitly for tweepy

from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

ckey = ''
csecret = ''
atoken = ''
asecret = ''

class listener(StreamListener):
    def on_data(self, data):
        print(data)
        return True
    def on_error(self, status):
        print(status)

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track="cricket")
This code filters the Twitter stream based on the filter, but I am getting the following traceback after running it. Can somebody please help?
Traceback (most recent call last):
File "lab.py", line 23, in <module>
twitterStream.filter(track="car".strip())
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 430, in filter
self._start(async)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 346, in _start
self._run()
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 286, in _run
raise exception
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 255, in _run
self._read_loop(resp)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 298, in _read_loop
line = buf.read_line().strip()
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 171, in read_line
self._buffer += self._stream.read(self._chunk_size)
TypeError: Can't convert 'bytes' object to str implicitly
I'm assuming you're using tweepy 3.4.0. The issue you've raised is open on GitHub (https://github.com/tweepy/tweepy/issues/615).
Two work-arounds:
1)
In streaming.py:
I changed line 161 to
self._buffer += self._stream.read(read_len).decode('UTF-8', 'ignore')
and line 171 to
self._buffer += self._stream.read(self._chunk_size).decode('UTF-8', 'ignore')
and then reinstalled via python3 setup.py install on my local copy of tweepy.
2)
Remove the tweepy 3.4.0 module and install 3.3.0 using the command: pip install -I tweepy==3.3.0
Hope that helps,
-A
You can't do twitterStream.filter(track="car".strip()). Why are you adding the strip()? It's serving no purpose there.
track must be a str type before you invoke a connection to Twitter's Streaming API, and tweepy is preventing that connection because you're trying to add strip().
If for some reason you need it, you can do track_word='car'.strip() and then track=track_word, but even that is unnecessary because:
>>> print('car'.strip())
car
Also, the error you're getting does not match the code you have listed; the code in your question should work fine.
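For context, the TypeError comes from concatenating bytes read off the network onto a str buffer inside tweepy's streaming code; a stripped-down illustration (not tweepy's actual code) of the failure and of the decode work-around from the first answer:
chunk = b'{"text": "some tweet"}'  # bytes, as read from a network socket

buffer = ""
try:
    buffer += chunk  # raises TypeError: str and bytes cannot be concatenated
except TypeError as exc:
    print(exc)

buffer += chunk.decode("utf-8", "ignore")  # the decode work-around
print(buffer)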
