While parsing XML using lxml, I get the error "reading file objects must return bytes objects". Here's the code:
from lxml import etree
from io import StringIO
def parseXML(xmlFile):
"""
parse the xml
"""
# The file is opened in text mode, so read() returns str rather than bytes.
data=open(xmlFile)
xml=data.read()
data.close()
# The parsed tree is built here but is not used by the rest of the function.
tree=etree.parse(StringIO(xml))
# iterparse is handed a StringIO, whose read() returns str; the traceback
# quoted after this code ("reading file objects must return bytes objects")
# is raised while iterating over this context.
context=etree.iterparse(StringIO(xml))
for action, elem in context:
# Elements with no text are printed with the placeholder "None".
if not elem.text:
if not elem.text:
text="None"
else:
text=elem.text
print(elem.tag + "=>" + text)
if __name__ == "__main__":
    # Every path separator is written as an escaped backslash; a lone "\D"
    # is an invalid escape sequence and only works by accident.
    parseXML("C:\\Users\\karthik\\Desktop\\xml_path\\bgm.xml")
BGM xml
<?xml version="1.0" ?>
<zAppointments reminder="15">
<appointment>
<begin>1181251680</begin>
<uid>040000008200E000</uid>
<alarmTime>1181572063</alarmTime>
<state></state>
<location></location>
<duration>1800</duration>
<subject>Bring pizza home</subject>
</appointment>
<appointment>
<begin>1234360800</begin>
<duration>1800</duration>
<subject>Check MS Office website for updates</subject>
<location></location>
<uid>604f4792-eb89-478b-a14f-dd34d3cc6c21-1234360800</uid>
<state>dismissed</state>
</appointment>
</zAppointments>
Error:
Traceback (most recent call last):
File "C:/Users/karthik/source/ChartAttributes/crecords", line 34, in <module>
parseXML("C:\\Users\\karthik\\Desktop\\xml_path\\bgm.xml")
File "C:/Users/karthik/source/ChartAttributes/crecords", line 26, in parseXML
for action, elem in context:
File "src\lxml\iterparse.pxi", line 208, in lxml.etree.iterparse.__next__ (src\lxml\lxml.etree.c:150010)
File "src\lxml\iterparse.pxi", line 193, in lxml.etree.iterparse.__next__ (src\lxml\lxml.etree.c:149708)
File "src\lxml\iterparse.pxi", line 221, in lxml.etree.iterparse._read_more_events (src\lxml\lxml.etree.c:150208)
TypeError: reading file objects must return bytes objects
Process finished with exit code 1
I think you need the XML as a byte array rather than a character string.
Open the file in binary mode to get a bytes object:
data=open(xmlFile, 'rb')
But it's probably just easier to pass the filename to LXML and let it take care of opening and reading the file:
from lxml import etree
def parseXML(xmlFile):
    """Print ``tag=>text`` for every element of the XML file at *xmlFile*.

    The path is passed straight to ``etree.iterparse`` so that lxml opens
    and reads the file itself. Elements whose text is empty or missing are
    printed with the placeholder ``"None"``.
    """
    for action, elem in etree.iterparse(xmlFile):
        text = elem.text or "None"
        print(elem.tag + "=>" + text)
Related
I have created a basic crawler in Python, and I want it to take its input from a text file.
I tried open/raw_input, but there was an error.
When I used the input("") function, it prompted for input and worked fine.
The problem occurs only when reading from a file.
import re
import urllib.request
url = open('input.txt', 'r')
data = urllib.request.urlopen(url).read()
data1 = data.decode("utf8")
print(data1)
file =open('output.txt' , 'w')
file.write(data1)
file.close()
error output below.
Traceback (most recent call last):
File "scrape.py", line 8, in <module>
data = urllib.request.urlopen(url).read()
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 518, in open
protocol = req.type
AttributeError: '_io.TextIOWrapper' object has no attribute 'type'
The open function returns a file object, not the content of the file as a string. If you want url to contain the content as a string, change the line to:
# Read the URL from the file; the with-block closes the file handle, and
# strip() removes the trailing newline so urlopen receives a clean URL.
with open('input.txt', 'r') as f:
    url = f.read().strip()
I'm trying to edit the metadata Title field of PDFs, to include the ASCII equivalents when possible. I'm using Python3 and the module pdfrw.
How can I do string operations that replace the metadata fields?
My test code is here:
from pdfrw import PdfReader, PdfWriter, PdfString
import unicodedata
def edit_title_metadata(inpdf):
trailer = PdfReader(inpdf)
# this statement is breaking pdfrw
trailer.Info.Title = unicode_normalize(trailer.Info.Title)
# also have tried:
#trailer.Info.Title = PdfString(unicode_normalize(trailer.Info.Title))
PdfWriter("test.pdf", trailer=trailer).write()
return
def unicode_normalize(s):
    """Return *s* reduced to its ASCII equivalent, as ``bytes``.

    *s* is first NFKD-normalized, which splits characters that have a
    decomposition into simpler code points (for example an accented letter
    into base letter plus combining mark). Encoding to ASCII with the
    ``'ignore'`` error handler then drops every code point that has no
    ASCII representation.
    """
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
if __name__ == "__main__":
edit_title_metadata('Anadon-2011-Scientific Opinion on the safety e.pdf')
And the traceback is:
Traceback (most recent call last):
File "get_metadata.py", line 68, in <module>
main()
File "get_metadata.py", line 54, in main
edit_title_metadata(pdf)
File "get_metadata.py", line 11, in edit_title_metadata
trailer.Info.Title = PdfString(unicode_normalize(trailer.Info.Title))
File "get_metadata.py", line 18, in unicode_normalize
return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
File "/path_to_python/python3.7/site-packages/pdfrw/objects/pdfstring.py", line 550, in encode
if isinstance(source, uni_type):
TypeError: isinstance() arg 2 must be a type or tuple of types
Notes:
This issue at GitHub may be related.
FWIW, Also getting same error with Python3.6
I've shared the pdf (which has non-ascii hyphens, unicode char \u2010)
.
wget https://gist.github.com/philshem/71507d4e8ecfabad252fbdf4d9f8bdd2/raw/cce346ab39dd6ecb3a718ad3f92c9f546761e87b/Anadon-2011-Scientific%2520Opinion%2520on%2520the%2520safety%2520e.pdf
You have to use the .decode() method on the metadata fields:
trailer.Info.Title = unicode_normalize(trailer.Info.Title.decode())
And full working code:
from pdfrw import PdfReader, PdfWriter, PdfReader
import unicodedata
def edit_title_metadata(inpdf):
    """Replace the Title metadata of *inpdf* with its ASCII equivalent.

    The PDF at *inpdf* is read with ``PdfReader``, its ``Info.Title`` field
    is rewritten, and the result is written to ``test.pdf`` in the current
    directory. Returns ``None``.
    """
    trailer = PdfReader(inpdf)
    # The Title field is decoded with its .decode() method first, so that
    # unicode_normalize receives the decoded text rather than the raw field.
    trailer.Info.Title = unicode_normalize(trailer.Info.Title.decode())
    PdfWriter("test.pdf", trailer=trailer).write()
def unicode_normalize(s):
    """Return *s* reduced to its ASCII equivalent, as ``bytes``.

    *s* is first NFKD-normalized, which splits characters that have a
    decomposition into simpler code points (for example an accented letter
    into base letter plus combining mark). Encoding to ASCII with the
    ``'ignore'`` error handler then drops every code point that has no
    ASCII representation.
    """
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
if __name__ == "__main__":
edit_title_metadata('Anadon-2011-Scientific Opinion on the safety e.pdf')
from lxml import etree
import os
import copy
import xml.etree.ElementTree as ET
XMLDoc = etree.parse(open('aa.xml'))
XSLDoc = etree.parse(open('aa.xsl'))
try:
transform = etree.XSLT(XSLDoc)
except:
for error in etree.XSLT.error_log:
print(error.message, error.line)
v = '/person/name'
for Node in XMLDoc.xpath(v):
m2 = copy.deepcopy(Node)
m3 = etree.tostring(m2, method="xml", xml_declaration=True, encoding="utf-8", with_tail=False)
m3 = m3.decode("utf-8")
dc = open('pq.xml', 'w')
dc.write(str(m3))
dc.close()
xm = etree.parse(open('pq.xml'))
q = transform(xm)
print(q)
I am using lxml to transform our XML into another XML through XSLT, but I get a parsing error in our XSLT.
Traceback (most recent call last):
File "C:\Anil\PTest\08\qqq.py", line 13, in <module>
for error in etree.XSLT(XSLDoc).error_log:
File "src\lxml\xslt.pxi", line 410, in lxml.etree.XSLT.__init__
lxml.etree.XSLTParseError: Failed to compile predicate
Please suggest how to find the exact problem in our XSLT.
XSLT should have a log attached to it, so you can do something like the following:
try:
# Your Code
except:
for error in YourXSLTObject.error_log:
print(error.message, error.line)
How am I supposed to use the Amfy module? I try to use it like the JSON module (amfy.loads or amfy.load), but it just gives me errors:
C:\Users\Other>"C:\Users\Other\Desktop\Python3.5.2\test amf.py"
Traceback (most recent call last):
File "C:\Users\Other\Desktop\Python3.5.2\test amf.py", line 4, in <module>
print(amfy.load(cn_rsp.text))
File "C:\Users\Other\Desktop\Python3.5.2\lib\site-packages\amfy\__init__.py", line 9, in load
return Loader().load(input, proto=proto)
File "C:\Users\Other\Desktop\Python3.5.2\lib\site-packages\amfy\core.py", line 33, in load
return self._read_item3(stream, context)
File "C:\Users\Other\Desktop\Python3.5.2\lib\site-packages\amfy\core.py", line 52, in _read_item3
marker = stream.read(1)[0]
AttributeError: 'str' object has no attribute 'read'
this is what I wrote:
import requests
import amfy
cn_rsp = requests.get("http://realm498.c10.castle.rykaiju.com/api/locales/en/get_serialized_new")
print(amfy.load(cn_rsp.text))
After tinkering around and googling some stuff, I found a fix:
New code:
import amfy
import requests
import json

url = "http://realm416.c9.castle.rykaiju.com/api/locales/en/get_serialized_static"
req = requests.get(url)
if req.status_code == 200:
    # A JSON content type is parsed by requests itself; any other body is
    # handed to amfy as raw bytes (req.content, not the decoded req.text).
    ret = req.json() if "json" in req.headers["content-type"] else amfy.loads(req.content)
else:
    ret = {"failed": req.reason}
with open ("doa manifest.txt", 'w', encoding = 'utf-8') as dump:
    # json.dump writes to the open file; json.dumps only returns a string
    # and does not accept a file object as a second positional argument.
    json.dump(ret, dump)
The Terminal throws a UnicodeEncodeError, but I was able to fix that by entering chcp 65001 and then set PYTHONIOENCODING=utf-8
The load method expects an input stream, you provide it a string. Just convert your string into a memory buffer which supports read method like this:
import io
print(amfy.load(io.BytesIO(cn_rsp.text.encode())))
unfortunately serialization fails when using this. Is there another url where it would work, a test URL maybe?
File "C:\Python34\lib\site-packages\amfy\core.py", line 146, in _read_vli
byte = stream.read(1)[0]
IndexError: index out of range
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
ckey=''
csecret=''
atoken=''
asecret=''
class listener(StreamListener):
    """Stream listener that echoes everything it receives to stdout."""

    def on_data(self, data):
        """Print the payload *data* and return ``True``.

        NOTE(review): the ``True`` return presumably tells tweepy to keep
        the stream open; confirm against the tweepy documentation.
        """
        print(data)
        return True

    def on_error(self, status):
        """Print the error *status* reported by the stream."""
        print(status)
auth = OAuthHandler(ckey,csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track="cricket")
This code filters the Twitter stream based on the track keyword, but I get the following traceback after running it. Can somebody please help?
Traceback (most recent call last):
File "lab.py", line 23, in <module>
twitterStream.filter(track="car".strip())
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 430, in filter
self._start(async)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 346, in _start
self._run()
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 286, in _run
raise exception
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 255, in _run
self._read_loop(resp)
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 298, in _read_loop
line = buf.read_line().strip()
File "C:\Python34\lib\site-packages\tweepy\streaming.py", line 171, in read_line
self._buffer += self._stream.read(self._chunk_size)
TypeError: Can't convert 'bytes' object to str implicitly
I'm assuming you're using tweepy 3.4.0. The issue you've raised is 'open' on GitHub (https://github.com/tweepy/tweepy/issues/615).
Two work-arounds :
1)
In streaming.py:
I changed line 161 to
self._buffer += self._stream.read(read_len).decode('UTF-8', 'ignore')
and line 171 to
self._buffer += self._stream.read(self._chunk_size).decode('UTF-8', 'ignore')
and then reinstalled via python3 setup.py install on my local copy of tweepy.
2)
remove the tweepy 3.4.0 module, and install 3.3.0 using command: pip install -I tweepy==3.3.0
Hope that helps,
-A
You can't do twitterStream.filter(track="car".strip()). Why are you adding the strip()? It serves no purpose there.
track must be a str type before you invoke a connection to Twitter's Streaming API, and tweepy is preventing that connection because you're trying to add strip().
If for some reason you need it, you can do track_word='car'.strip() and then track=track_word, but even that is unnecessary because:
>>> print('car'.strip())
car
Also, the error you're getting does not match the code you have listed, the code that's in your question should work fine.