How i can parse efficiency Html in Python?

How i can parse efficiency Html in Python? - python-3.x

I will parse the Html code efficiency without an external libarry.
I have all ready tried with for and had check which symbol is it.
It was this:
list = []
html = """<html><p>Hello</p></html>"""
m = 0
for a in html:
if a == "<":
m = 1
list.append([])
elif a == ">":
m = 0
list.append([])
else:
list[-1] = a
print(list)
The code was on 50KB Files very slow.

May I recommend starting with a simple HTML parser like the one shown below? It uses the standard library that comes with Python and has no external dependencies. You may need to alter and extend it according to your needs, but it gives you a basic DOM API that should be a good beginning point to work from. The code works for the simple case it is meant to tackle; but depending on your needs, you may need to add further functionality to accomplish whatever your end goal may be.
#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom
def main():
# noinspection PyPep8
document = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
parser = DocumentParser()
parser.feed(document)
parser.close()
model = parser.document.documentElement
model.normalize()
print(model.toprettyxml())
first_title = model.getElementsByTagName('title')[0]
print(first_title.toxml())
print(first_title.tagName)
print(first_title.firstChild.data)
print(first_title.parentNode.tagName)
first_p = model.getElementsByTagName('p')[0]
print(first_p.toxml())
print(first_p.getAttribute('class'))
all_a = model.getElementsByTagName('a')
print(all_a[0].toxml())
pprint.pprint([element.toxml() for element in all_a])
pprint.pprint([element.toxml() for element in find(model, id='link3')])
for element in all_a:
print(element.getAttribute('href'))
print(*get_text(model), sep='\n')
class DocumentParser(html.parser.HTMLParser):
# noinspection SpellCheckingInspection
def __init__(self, *, convert_charrefs=True):
super().__init__(convert_charrefs=convert_charrefs)
self.document = self.focus = xml.dom.minidom.DOMImplementation() \
.createDocument(None, None, None)
#property
def document_has_focus(self):
return self.document is self.focus
def handle_starttag(self, tag, attrs):
element = self.document.createElement(tag)
for name, value in attrs:
element.setAttribute(name, value)
self.focus.appendChild(element)
self.focus = element
def handle_endtag(self, tag):
while self.focus.tagName != tag:
self.focus = self.focus.parentNode
self.focus = self.focus.parentNode
def handle_data(self, data):
if not self.document_has_focus and not data.isspace():
self.focus.appendChild(self.document.createTextNode(data.strip()))
def error(self, message):
raise RuntimeError(message)
def close(self):
super().close()
while not self.document_has_focus:
self.focus = self.focus.parentNode
def find(element, **kwargs):
get_attribute = getattr(element, 'getAttribute', None)
if get_attribute and \
all(get_attribute(key) == value for key, value in kwargs.items()):
yield element
for child in element.childNodes:
yield from find(child, **kwargs)
def get_nodes_by_type(node, node_type):
if node.nodeType == node_type:
yield node
for child in node.childNodes:
yield from get_nodes_by_type(child, node_type)
def get_text(node):
return (node.data for node in get_nodes_by_type(node, node.TEXT_NODE))
if __name__ == '__main__':
main()

Related

XML parsing with class will not find the values

I will split a large XML to small branches and than parse only this parts.
I search modified timestamp "mod_time" tag which is avaliable in "contacts" tag, but my object function call, doesn't find the value. In some contacts is also some tags missing completly.
I tried iterfind('tag_name'), iter(), findall('tag_name'), but my program shows no result and I can't figure out for hours, where my failure is.
Here is my XML reduced to two elements:
<?xml version="1.0" encoding = "utf-8"?>
<phonebooks>
<phonebook name="Telefonbuch">
<contact>
<category>0</category>
<person>
<realName>Dummy, Name, Street</realName>
</person>
<telephony nid="1">
<number type="work" prio="1" id="0">012345678</number>
</telephony>
<services />
<setup />
<features doorphone="0" />
<mod_time>1587477163</mod_time>
<uniqueid>358</uniqueid>
</contact>
<contact>
<category>0</category>
<person>
<realName>Foto Name</realName>
</person>
<telephony nid="1">
<number type="home" prio="1" id="0">067856743</number>
</telephony>
<services />
<setup />
<features doorphone="0" />
<mod_time>1547749691</mod_time>
<uniqueid>68</uniqueid>
</contact>
</phonebook>
</phonebooks>
and her what I have done so fare:
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
def __init__(self, xml_file, tag_node):
"""Split tree in contact branches """
self.xml_file = xml_file
self.tag_node = tag_node
# For furter parsing
contacts = []
i = 0
events =('start','end','start-ns','end-ns')
for event, elem in ET.iterparse(self.xml_file, events=events):
if event == 'end' and elem.tag == self.tag_node[0]:
#print(elem.tag)
contacts.append(elem)
par = Contact(elem, i)
par.parse_node(elem, i)
i += 1
elem.clear()
print("Amount of contacts:", len(contacts))
class Contact:
def __init__(self, branch, i):
self.tree = branch
#print(i, self.tree)
def parse_node(self, branch, i):
for node in branch.iterfind('.//mod_time'):
print(node.text)
def main():
elem = Phonebook('new _dummy1.xml',['contact'])
if __name__ == '__main__':
""" Input XML file definition """
starttime=timeit.default_timer()
main()
print('Finished')
print("Runtime:", timeit.default_timer()-starttime)
Output:
Amount of contacts: 2 Finished Runtime: 0.0006361000050674193
Expected output:
1587477163
1547749691

Code
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
def __init__(self, xml_file, selector):
self.xml_file = xml_file
self.selector = selector
root = ET.parse(xml_file)
contacts = root.findall(selector)
print("Amount of contacts:", len(contacts))
for mod_time in contacts:
print(mod_time.text)
def main():
Phonebook('./_dummy1.xml','.//contact/mod_time')
if __name__ == '__main__':
starttime=timeit.default_timer()
main()
print('Finished')
print("Runtime:", timeit.default_timer()-starttime)
Output
$ python test.py
Amount of contacts: 2
1587477163
1547749691
Finished
Runtime: 0.0006627999973716214

I solved now my issue with the handshake of the object data. I create now an instance of Contact which inherit the values from the parent class Phonbook, instead of call Contact from Phonbook Object. Very helpful was the python documentation about the super() function, which refers to this great page. I post my solution, because it's maybe interessting for others who run in similar issues.
Thanks to all who tried to help!
My changed code:
import psutil
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
def __init__(self, file_path):
"""Split tree in contact branches """
self.file_path = file_path
def contacts_list(self, file_path):
contacts = []
events =('start','end','start-ns','end-ns')
for event, elem in ET.iterparse(self.file_path, events=events):
if event == 'end' and elem.tag == 'contact':
contact = elem
contacts.append(contact)
elem.clear()
return contacts
#print("Superclass:",contacts)
class Contact(Phonebook):
def __init__(self, file_path):
super().__init__(file_path)
def search_node(self, contact, searched_tag):
contact_template =['category','person', 'telephony', 'services', 'setup', 'features', 'mod_time', 'uniqueid' ]
node_tag_list = []
list_difference = []
search_list = []
for node in contact:
if node.tag not in node_tag_list:
node_tag_list.append(node.tag)
for element in contact_template:
if element not in node_tag_list:
list_difference.append(element)
for node in contact:
if node.tag == searched_tag and node.tag not in list_difference:
search_list.append(node.text)
#print(node.text)
else:
if len(list_difference) != 0 and searched_tag in list_difference:
message = self.missed_tag(list_difference)
#print(message)
if message not in search_list:
search_list.append(message)
return search_list
def missed_tag(self, list_difference):
for m in list_difference:
message = f'{m} - not assigned'
return message
def main():
con = Contact('dummy.xml')
contacts = con.contacts_list(('dummy.xml'))
mod_time_list =[]
for contact in contacts:
mod_time = con.search_node(contact, 'mod_time')
mod_time_list.append(mod_time)
print(len(mod_time_list))
print(mod_time_list)
if __name__ == '__main__':
""" Input XML file definition """
starttime=timeit.default_timer()
main()
print('Finished')
# Getting % usage of virtual_memory ( 3rd field)
print('RAM memory % used:', psutil.virtual_memory()[2])
# Getting usage of virtual_memory in GB ( 4th field)
print('RAM Used (GB):', psutil.virtual_memory()[3]/1000000000)
print("Runtime:", timeit.default_timer()-starttime)

How could I create a docstring decorator in the presence of properties?

I have a collection of ever more specialized classes which correspond to collections of the same kind of data (temperature, density, etc) but for different drifts, for example, one subclass has dimensions (nx, ny) and a different suclass has dimensions (ncv), and I want to reflect that in the docstrings, for having a better documentation using Sphinx.
After reading many very useful threads here in Stack Overflow, I have arrived to this model:
import numpy as np
from functools import wraps
def class_decorator(cls):
import ipdb; ipdb.set_trace()
clsdict = {}
mro = cls.mro()
mro.reverse()
for tmp in mro[1:]: ##Ignore object class parent.
clsdict.update(tmp.__dict__)
for name, method in clsdict.items():
if hasattr(method, '__og_doc__'):
try:
method.__doc__ = method.__og_doc__.format(**clsdict)
except:
pass
else:
try:
method.__og_doc__ = method.__doc__
method.__doc__ = method.__doc__.format(**clsdict)
except:
pass
return cls
def mark_documentation(fn):
if not hasattr(fn, '__og_doc__'):
try:
fn.__og_doc__ = fn.__doc__
except:
pass
#wraps(fn)
def wrapped(*args, **kwargs):
return fn(*args, **kwargs)
return wrapped
def documented_property(fn):
if not hasattr(fn, '__og_doc__'):
try:
fn.__og_doc__ = fn.__doc__
except:
pass
#wraps(fn)
def wrapped(*args, **kwargs):
return fn(*args, **kwargs)
prp= property(wrapped)
prp.__og_doc__ = fn.__og_doc__
return prp
#class_decorator
class Base(object):
_GRID_DIM = 'nx, ny'
_TYPE = 'BaseData'
def __init__(self, name):
self.name = name
def shape(self):
""" This docstring contains the type '{_TYPE}' of class."""
print('Simple')
def operation(self, a, b, oper=np.sum, **kwargs):
""" Test for functions with args and kwargs in {_TYPE}"""
return oper([a,b])
#classmethod
def help(cls, var):
try:
print(get(cls, var).__doc__)
except:
print("No docstring yet.")
#class_decorator
class Advanced(Base):
_GRID_DIM = 'ncv'
_TYPE = 'AdvancedData'
def __init__(self,name):
super().__init__(name)
#property
#mark_documentation
# #documented_property
def arkansas(self):
"""({_GRID_DIM}, ns): Size of Arkansaw."""
return 'Yeah'
I am aiming to get the correctly formatted docstring when I call the help method or I use Sphinx, so that:
> adv = Advanced('ADV')
> adv.help("arkansas")
(ncv, ns): Size of Arkansaw.
> adv.help("operation")
Test for functions with args and kwargs in AdvancedData
I have managed to make it work so far, except for properties, because I assigned __og_doc__ to the function, but the property does not have that attribute. My last attempt at monkeypatching this, documented_property, fails because property is inmutable (as expected), and I cannot come up with any way to avoid this roadblock.
Is there any way around this problem?

Nested function in Python class

i have a little "basic understanding" Python problem.
So let me explain my problem.
At first a very simple code snippet.
class Revert:
__sentence = ""
def __init__(self, sentence: str):
self.__sentence = sentence
def get_sentence(self):
return self.__sentence
def revert_sentence(self):
return self.__sentence[::-1]
if __name__ == '__main__':
print(Revert("Stackoverflow").get_sentence())
print(Revert("Stackoverflow").revert_sentence())
So this show normal function calling of python functions.
But how can i transform this code so i can call the revert function like this:
print(Revert("Stackoverflow").get_sentence().revert_sentence())
Maybe I'm miss the forest through the trees. But I didn't get it how to do this.
I already tried to solve the problem with innermethods but this didn't work for me
...
def get_sentence(self):
def revert_sentence():
self.revert_sentence()
return self.__sentence
...
Many thanks in advance

Implement __str__ to return the actual string. Then in the existing methods, return the object. This way you can chain. But when print is applied to it, that __str__ method will kick in:
class Revert:
__sentence = ""
def __init__(self, sentence: str):
self.__sentence = sentence
def get_sentence(self):
return self
def revert_sentence(self):
return Revert(self.__sentence[::-1])
# Some more such methods ...
def upper(self):
return Revert(self.__sentence.upper())
def first(self, count):
return Revert(self.__sentence[:count])
def dotted(self):
return Revert(".".join(self.__sentence))
# For getting a string
def __str__(self):
return self.__sentence
print(Revert("Stackoverflow").get_sentence().revert_sentence())
print(Revert("Stackoverflow")
.revert_sentence()
.first(8)
.upper()
.revert_sentence()
.first(4)
.dotted()) # "O.V.E.R"
Note that now the .get_sentence() method is not really doing much, and you can always strip it from a chain.

Here You go:
class Revert:
__sentence = ""
def __init__(self, sentence: str):
self.__sentence = sentence
def get_sentence(self):
return self.__sentence
def revert_sentence(self):
# It's important to know that you are making changes in the same instance of the object
self.__sentence = self.__sentence[::-1]
return self
def pseudo_revert(self):
# Return a new object with reverted string, but this instance still has original string intact.
return Revert(self.__sentence[::-1])
if __name__ == '__main__':
r1 = Revert("Stackoverflow")
r2 = Revert("Stackoverflow")
print(r1.get_sentence()) # Stackoverflow
print(r1.revert_sentence().get_sentence()) # wolfrevokcatS
print(r1.get_sentence()) # wolfrevokcatS
print(r2.get_sentence()) # Stackoverflow
print(r2.pseudo_revert().get_sentence()) # wolfrevokcatS
print(r2.get_sentence()) # Stackoverflow
Hope this helps you understand the object, instance of an object, and method of object distinctly.

Iterator to iterate over a generator

I am parsing an XML file, one of xml tag has a structure like this:
<product>
<item seq="division-sec">Division</item>
<item seq="dept-sec">Dept Info</item>
<item seq="label01">Label 01</item>
<item seq="label02">Label 02</item>
...
<item seq="labelN">Label N</item>
<item seq="date-mfg">27-11-2017</item>
<item seq="date-exp">28-11-2019</item>
</product>
I have written a generator function to access product labels in a sequence:
def product_labels(xmlpage):
#... parsed xml here
for item in xmlpage:
#-- process item for validation such as case sensitivity, etc
yield item # ("division-sec", "Division")
Now I'm looking for a function or may be an iterator class, which would allow me to get first, last, prev, next and search methods on it:
class ProdcutReader(object):
def __init__(self, product_labels):
self.product_labels = product_labels
def __iter__(self):
return self
def __prev__(self):
return prev(self.product_labels ) #-- Dont know how to do this :(
def __next__(self):
current = self.current(next(self.product_labels))
return current
def current(self, obj=None):
if not obj:
return self.first()
return obj
def first(self):
return list(self.product_labels)[0]
# search by label seq
def search(self, seq):
# Not sure if this is the correct way
for i in self.product_labels:
if i[0] == seq:
self.current(i)
return i
... # With some more methods (if search works I can have some more methods)
So assume I need to use that object, it should work like this:
from product_reader import ProdcutReader as Reader
>>>r = Reader(product_labels)
>>>r.first()
("division-sec", "Division")
>>>r.last()
("date-exp", "28-11-2019")
>>>r.current() # Let us say current is ("label01", "Label 01")
("label01", "Label 01")
>>>r.next() # or next(r)
("label02", "Label 02")
>>>r.prev()
("dept-sec", "Dept Info")
>>r.search("date-mfg") # This should also set the searched as r.current()
("date-mfg", "27-11-2017")
If I can get how to write for prev, next and search I will be able to write remaining methods like, first, last, current etc.

You apparently need random access to any element at any time. To my eyes, I don't see why you don't simply use a list.
If you need the current/next/previous functionality without having to keep track of a counter variable, you could still base your construct on a list:
class ProductReader(object):
def __init__(self, product_labels):
self.generator = product_labels
self.active_generator = self.generator()
self.element = None
self.cur = -1
def at_n(self, index):
if self.cur > index:
self.active_generator = self.generator()
while len(self.storage) < index+1:
self.element = next(self.active_generator)
self.cur = index
return self.element
def current(self):
if self.cur > -1:
return self.element
def last(self):
try:
while True:
self.element = next(self.generator)
self.cur += 1
except StopIteration:
return self.element
def first(self):
return self.at_n(0)
def next(self):
return self.at_n(self.cur+1)
__next__ = next
def prev(self):
return self.at_n(self.cur-1)
def search(self, query):
oldcur = self.cur
self.active_generator = self.generator()
for i, element in enumerate(self.generator()):
if query in element:
self.cur = i
self.element = element
return element
self.at_n(self.cur) # reset to old state
return None

Python Binary Tree

I'm working on a binary tree in Python3 and so far almost everything has been working like expected; however, I have a function that is supposed to return a list of all children for any given node and for whatever reason I'm only getting a list of the object addresses, and not calling my overridden __str__(self) method.
from collections import deque # http://docs.python.org/3.1/tutorial/datastructures.html
class BinaryNode: # binary tree functionality via iterative means
def __init__(self, name, data):
self.Left = None
self.Right = None
self.Parent = None
self.Name = name
self.Data = data
return
def AddNew(self, name, data):
q = []
q.append(self)
while q:
i = q.pop()
if i.Name == name:
i.Data = data
return i
elif name < i.Name:
if i.Left:
q.append(i.Left)
else:
i.Left = BinaryNode(name, data)
i.Left.Parent = i
return i.Left
else:
if i.Right:
q.append(i.Right)
else:
i.Right = BinaryNode(name, data)
i.Right.Parent = i
return i.Right
def Find(self, name):
q = deque()
q.append(self)
'''if self.Left: q.append(self.Left)
if self.Right: q.append(self.Right)'''
while q:
i = q.pop()
print(i)
if i.Name == name:
return i
elif name < i.Name:
if i.Left: q.append(i.Left)
else: return None
else:
if i.Right: q.append(i.Left)
else: return None
def Children(self):
children = []
q = deque()
if self.Left: q.append(self.Left)
if self.Right: q.append(self.Right)
while q:
i = q.popleft()
if i.Left: q.append(i.Left)
if i.Right: q.append(i.Right)
children.append(i)
return children
def Parents(self):
lst = []
i = self.Parent
while i is not None:
lst.append(i)
i = i.Parent
return lst
def __str__(self): return "{} : {}".format(self.Name, self.Data)
and I'm testing it by calling
test = BinaryNode("Jesse", 21)
print(test)
print(test.AddNew("David", 22))
print(test.AddNew("Marli", 23))
print(str(test.Children()))
print(test.Find("David"))
print(test.Find("David").Children())
print(test.Find("Gary")) #Will return None
with the resulting console output
Jesse : 21
David : 22
Marli : 23
[<__main__.BinaryNode object at 0x000000000333E160>, <__main__.BinaryNode object at 0x000000000333E1D0>, <__main__.BinaryNode object at 0x000000000333E198>]
David : 22
[<__main__.BinaryNode object at 0x000000000333E1D0>]
None
UPDATE:
Here is the answer I implemented:
def __repr__ (self): return str(self)

Python containers always use the representation of contained objects.
Implement a __repr__ method too and that'll be used when printing the list; you can make it an alias for __str__ if you wish:
__repr__ = __str__
or explicitly print each element in your list:
print(', '.join(map(str, test.Children())))

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

How i can parse efficiency Html in Python? - python-3.x

Related

XML parsing with class will not find the values

How could I create a docstring decorator in the presence of properties?

Nested function in Python class

Iterator to iterate over a generator

Python Binary Tree

Categories

Resources