XML parsing with class will not find the values - python-3.x

I will split a large XML to small branches and than parse only this parts.
I search modified timestamp "mod_time" tag which is avaliable in "contacts" tag, but my object function call, doesn't find the value. In some contacts is also some tags missing completly.
I tried iterfind('tag_name'), iter(), findall('tag_name'), but my program shows no result and I can't figure out for hours, where my failure is.
Here is my XML reduced to two elements:
<?xml version="1.0" encoding = "utf-8"?>
<phonebooks>
<phonebook name="Telefonbuch">
<contact>
<category>0</category>
<person>
<realName>Dummy, Name, Street</realName>
</person>
<telephony nid="1">
<number type="work" prio="1" id="0">012345678</number>
</telephony>
<services />
<setup />
<features doorphone="0" />
<mod_time>1587477163</mod_time>
<uniqueid>358</uniqueid>
</contact>
<contact>
<category>0</category>
<person>
<realName>Foto Name</realName>
</person>
<telephony nid="1">
<number type="home" prio="1" id="0">067856743</number>
</telephony>
<services />
<setup />
<features doorphone="0" />
<mod_time>1547749691</mod_time>
<uniqueid>68</uniqueid>
</contact>
</phonebook>
</phonebooks>
and her what I have done so fare:
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
def __init__(self, xml_file, tag_node):
"""Split tree in contact branches """
self.xml_file = xml_file
self.tag_node = tag_node
# For furter parsing
contacts = []
i = 0
events =('start','end','start-ns','end-ns')
for event, elem in ET.iterparse(self.xml_file, events=events):
if event == 'end' and elem.tag == self.tag_node[0]:
#print(elem.tag)
contacts.append(elem)
par = Contact(elem, i)
par.parse_node(elem, i)
i += 1
elem.clear()
print("Amount of contacts:", len(contacts))
class Contact:
def __init__(self, branch, i):
self.tree = branch
#print(i, self.tree)
def parse_node(self, branch, i):
for node in branch.iterfind('.//mod_time'):
print(node.text)
def main():
elem = Phonebook('new _dummy1.xml',['contact'])
if __name__ == '__main__':
""" Input XML file definition """
starttime=timeit.default_timer()
main()
print('Finished')
print("Runtime:", timeit.default_timer()-starttime)
Output:
Amount of contacts: 2 Finished Runtime: 0.0006361000050674193
Expected output:
1587477163
1547749691

Code
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
def __init__(self, xml_file, selector):
self.xml_file = xml_file
self.selector = selector
root = ET.parse(xml_file)
contacts = root.findall(selector)
print("Amount of contacts:", len(contacts))
for mod_time in contacts:
print(mod_time.text)
def main():
Phonebook('./_dummy1.xml','.//contact/mod_time')
if __name__ == '__main__':
starttime=timeit.default_timer()
main()
print('Finished')
print("Runtime:", timeit.default_timer()-starttime)
Output
$ python test.py
Amount of contacts: 2
1587477163
1547749691
Finished
Runtime: 0.0006627999973716214

I solved now my issue with the handshake of the object data. I create now an instance of Contact which inherit the values from the parent class Phonbook, instead of call Contact from Phonbook Object. Very helpful was the python documentation about the super() function, which refers to this great page. I post my solution, because it's maybe interessting for others who run in similar issues.
Thanks to all who tried to help!
My changed code:
import psutil
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
def __init__(self, file_path):
"""Split tree in contact branches """
self.file_path = file_path
def contacts_list(self, file_path):
contacts = []
events =('start','end','start-ns','end-ns')
for event, elem in ET.iterparse(self.file_path, events=events):
if event == 'end' and elem.tag == 'contact':
contact = elem
contacts.append(contact)
elem.clear()
return contacts
#print("Superclass:",contacts)
class Contact(Phonebook):
def __init__(self, file_path):
super().__init__(file_path)
def search_node(self, contact, searched_tag):
contact_template =['category','person', 'telephony', 'services', 'setup', 'features', 'mod_time', 'uniqueid' ]
node_tag_list = []
list_difference = []
search_list = []
for node in contact:
if node.tag not in node_tag_list:
node_tag_list.append(node.tag)
for element in contact_template:
if element not in node_tag_list:
list_difference.append(element)
for node in contact:
if node.tag == searched_tag and node.tag not in list_difference:
search_list.append(node.text)
#print(node.text)
else:
if len(list_difference) != 0 and searched_tag in list_difference:
message = self.missed_tag(list_difference)
#print(message)
if message not in search_list:
search_list.append(message)
return search_list
def missed_tag(self, list_difference):
for m in list_difference:
message = f'{m} - not assigned'
return message
def main():
con = Contact('dummy.xml')
contacts = con.contacts_list(('dummy.xml'))
mod_time_list =[]
for contact in contacts:
mod_time = con.search_node(contact, 'mod_time')
mod_time_list.append(mod_time)
print(len(mod_time_list))
print(mod_time_list)
if __name__ == '__main__':
""" Input XML file definition """
starttime=timeit.default_timer()
main()
print('Finished')
# Getting % usage of virtual_memory ( 3rd field)
print('RAM memory % used:', psutil.virtual_memory()[2])
# Getting usage of virtual_memory in GB ( 4th field)
print('RAM Used (GB):', psutil.virtual_memory()[3]/1000000000)
print("Runtime:", timeit.default_timer()-starttime)

Related

Instantiation of objects crashes the kernel

I have tried creating instances of the class eachItem from a csv file, however, my code keeps crashing.
class eachItem():
pattern_iphone_7 = '.*[iI]*[pP]hone 7'
list_of_all_objects = []
def __init__(self, item_name, item_price, item_link, item_image, item_location):
self.dictionary = {}
self.dictionary['name'] = item_name
self.dictionary['price'] = item_price
self.dictionary['link'] = item_link
self.dictionary['image'] = item_image
self.dictionary['location'] = item_location
print(self.dictionary)
#self.__class__.list_of_all_objects.append(self.dictionary)
if re.match(self.__class__.pattern_iphone_7, self.dictionary['name']):
print(self.dictionary['name'])
item_iPhone_7(
self.dictionary['name'],
self.dictionary['price'],
self.dictionary['link'],
self.dictionary['image'],
self.dictionary['location'])
else:
self.__class__.list_of_all_objects.append(self.dictionary)
#classmethod
def from_csvfile(cls):
with open("scraped facebook marketplace data.csv", "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
items = list(reader)
i=0
for item in items:
print(f"Instantiated instance number: {i}")
i+= 1
eachItem(
item_name = item.get('name'),
item_price = item.get('price'),
item_link = item.get('link'),
item_image = item.get('image'),
item_location = item.get('location')
)
def __repr__(self):
return f"name: {self.dictionary['name']}, price: {self.dictionary['price']}, link: {self.dictionary['link']}, image: {self.dictionary['image']}, location: {self.dictionary['location']}"
eachItem.from_csvfile()
class item_iPhone_7(eachItem):
list_of_instances=[]
def __init__(self, item_name, item_price, item_link, item_image, item_location):
super().__init__(item_name, item_price, item_link, item_image, item_location)
item_iPhone_7.list_of_instances.append(self.dictionary)
if __name__ == "__main__":
eachItem.from_csvfile()
I am trying to do it this way because as I am trying to create instances of the class eachItem, I would also want to automatically create instances of the class item_iPhone_7. That is the reason why I try to create instance of the child class in the parent class.
How could I try to do this in a safer way, without crashing the kernel?

Implementation of MVC Design Pattern in Python3

I am trying to implement MVC using Python3.8. I have used this https://www.tutorialspoint.com/python_design_patterns/python_design_patterns_model_view_controller.htm Python2's example for practice.
But, I am receiving the following error:
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
My code is as following:
model.py
import json
class Person:
def __init__(self, first = None, last = None):
self.first = first
self.last = last
def name(self):
return ('%s %s' %(self.first, self.last))
#classmethod
def getAll(self):
database = open('data.txt', 'r')
result = []
jsonList = json.loads(database.read())
for item in jsonList:
item = json.loads(item)
person = Person(item['first'], item['last'])
result.append(person)
return result
view.py
from model import Person
def showAllView(list):
print ('In our db we have %i users. Here they are:' % len(list))
for item in list:
print (item.name())
def startView():
print ('MVC - the simplest example')
print ('Do you want to see everyone in my db?[y/n]')
def endView():
print ('Goodbye!')
controller.py
from model import Person
import view
def showAll():
#gets list of all Person objects
people_in_db = Person.getAll()
return view.showAllView(people_in_db)
def start():
view.startView()
answer = input('Enter y or n')
if answer == 'y':
return showAll()
else:
return view.endView()
if __name__ == "__main__":
start()
Data.txt
[{
"first": "abc",
"last": "xyz"
}]
Please, guide me in this and help me find the error. Thanks in advance.
I have solved the problem myself. The main problem was loading JSON elements twice in model.py, like below:
jsonList = json.loads(database.read())
for item in jsonList:
item = json.loads(item)
Now I have solved it by removing item = json.loads(item).

How i can parse efficiency Html in Python?

I will parse the Html code efficiency without an external libarry.
I have all ready tried with for and had check which symbol is it.
It was this:
list = []
html = """<html><p>Hello</p></html>"""
m = 0
for a in html:
if a == "<":
m = 1
list.append([])
elif a == ">":
m = 0
list.append([])
else:
list[-1] = a
print(list)
The code was on 50KB Files very slow.
May I recommend starting with a simple HTML parser like the one shown below? It uses the standard library that comes with Python and has no external dependencies. You may need to alter and extend it according to your needs, but it gives you a basic DOM API that should be a good beginning point to work from. The code works for the simple case it is meant to tackle; but depending on your needs, you may need to add further functionality to accomplish whatever your end goal may be.
#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom
def main():
# noinspection PyPep8
document = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
parser = DocumentParser()
parser.feed(document)
parser.close()
model = parser.document.documentElement
model.normalize()
print(model.toprettyxml())
first_title = model.getElementsByTagName('title')[0]
print(first_title.toxml())
print(first_title.tagName)
print(first_title.firstChild.data)
print(first_title.parentNode.tagName)
first_p = model.getElementsByTagName('p')[0]
print(first_p.toxml())
print(first_p.getAttribute('class'))
all_a = model.getElementsByTagName('a')
print(all_a[0].toxml())
pprint.pprint([element.toxml() for element in all_a])
pprint.pprint([element.toxml() for element in find(model, id='link3')])
for element in all_a:
print(element.getAttribute('href'))
print(*get_text(model), sep='\n')
class DocumentParser(html.parser.HTMLParser):
# noinspection SpellCheckingInspection
def __init__(self, *, convert_charrefs=True):
super().__init__(convert_charrefs=convert_charrefs)
self.document = self.focus = xml.dom.minidom.DOMImplementation() \
.createDocument(None, None, None)
#property
def document_has_focus(self):
return self.document is self.focus
def handle_starttag(self, tag, attrs):
element = self.document.createElement(tag)
for name, value in attrs:
element.setAttribute(name, value)
self.focus.appendChild(element)
self.focus = element
def handle_endtag(self, tag):
while self.focus.tagName != tag:
self.focus = self.focus.parentNode
self.focus = self.focus.parentNode
def handle_data(self, data):
if not self.document_has_focus and not data.isspace():
self.focus.appendChild(self.document.createTextNode(data.strip()))
def error(self, message):
raise RuntimeError(message)
def close(self):
super().close()
while not self.document_has_focus:
self.focus = self.focus.parentNode
def find(element, **kwargs):
get_attribute = getattr(element, 'getAttribute', None)
if get_attribute and \
all(get_attribute(key) == value for key, value in kwargs.items()):
yield element
for child in element.childNodes:
yield from find(child, **kwargs)
def get_nodes_by_type(node, node_type):
if node.nodeType == node_type:
yield node
for child in node.childNodes:
yield from get_nodes_by_type(child, node_type)
def get_text(node):
return (node.data for node in get_nodes_by_type(node, node.TEXT_NODE))
if __name__ == '__main__':
main()

In OOP in python, are different instances of an object when initialised with a default value the same?

I am trying to understand object oriented programming. I am doing this by creating a small poker like program. I have come across a problem whose minimal working example is this:
For this code:
import random
class superthing(object):
def __init__(self,name,listthing=[]):
self.name = name
self.listthing = listthing
def randomlyadd(self):
self.listthing.append(random.randint(1,50))
def __str__(self):
return '\nName: '+str(self.name)+'\nList: '+str(self.listthing)
Aboy = superthing('Aboy')
Aboy.randomlyadd()
print(Aboy)
Anotherboy = superthing('Anotherboy')
Anotherboy.randomlyadd()
print(Anotherboy)
I expect this output :
Name: Aboy
List: [44]
(some number between 1 and 50)
Name: Anotherboy
List: [11]
(again a random number between 1 and 50)
But what I get is:
Name: Aboy
List: [44]
(Meets my expectation)
Name: Anotherboy
List: [44,11]
(it appends this number to the list in the previous instance)
Why is this happening? The context is that two players are dealt a card from a deck. I am sorry if a similar question exists, if it does, I will read up on it if you can just point it out. New to stack overflow. Thanks in advance.
For the non minimal example, I am trying this:
import random
class Card(object):
def __init__(self, suit, value):
self.suit = suit
self.value = value
def getsuit(self):
return self.suit
def getval(self):
return self.value
def __str__(self):
if(self.suit == 'Clubs'):
suitstr = u'\u2663'
elif(self.suit == 'Diamonds'):
suitstr = u'\u2666'
elif(self.suit == 'Hearts'):
suitstr = u'\u2665'
elif(self.suit == 'Spades'):
suitstr = u'\u2660'
if((self.value<11)&(self.value>1)):
valuestr = str(self.value)
elif(self.value == 11):
valuestr = 'J'
elif(self.value == 12):
valuestr = 'Q'
elif(self.value == 13):
valuestr = 'K'
elif((self.value == 1)|(self.value == 14)):
valuestr = 'A'
return(valuestr+suitstr)
class Deck(object):
def __init__(self,DeckCards=[]):
self.DeckCards = DeckCards
def builddeck(self):
suits = ['Hearts','Diamonds','Clubs','Spades']
for suit in suits:
for i in range(13):
self.DeckCards.append(Card(suit,i+1))
def shuffle(self):
for i in range(len(self)):
r = random.randint(0,len(self)-1)
self.DeckCards[i],self.DeckCards[r] = self.DeckCards[r],self.DeckCards[i]
def draw(self):
return self.DeckCards.pop()
def __str__(self):
return str([card.__str__() for card in self.DeckCards])
def __len__(self):
return len(self.DeckCards)
class Player(object):
def __init__(self,Name,PlayerHandcards = [],Balance = 1000):
self.Name = Name
self.Hand = PlayerHandcards
self.Balance = Balance
def deal(self,deck):
self.Hand.append(deck.draw())
def __str__(self):
return 'Name :'+str(self.Name)+'\n'+'Hand: '+str([card.__str__() for card in self.Hand])+'\n'+'Balance: '+str(self.Balance)
deck1 = Deck()
deck1.builddeck()
deck1.shuffle()
Alice = Player('Alice')
Alice.deal(deck1)
print(Alice)
Bob = Player('Bob')
Bob.deal(deck1)
print(Bob)
And after dealing to Bob they both have the same hands. If you have some other suggestions regarding the code, you are welcome to share that as well.
This is a duplicate of “Least Astonishment” and the Mutable Default Argument as indicated by #Mad Physicist. Closing this question for the same.

Why are my class functions executed when importing the class?

it's probably a very basic question but I was unable to find an answer that I could thoroughly understand.
In my main program main_program.py, I'm importing a class that itself imports another class:
in main_program.py:
from createTest import *
in createTest.py:
print("TEST")
from recordRecallQused import *
print("TEST")
now in recordRecallQused:
class recordRecallQused:
def __init__(self, path):
self.path = path
try:
with open(self.path, 'r',newline = '') as question_used:
question_used.closed
except IOError:
#if file doesnt exist
print("the file doesn't exist")
with open(self.path, 'w',newline = '') as question_used:
question_used.closed
def recallQused(self):
list_Qused = []
print("I'm being executed")
with open(self.path, 'r',newline = '') as question_used:
questionused = csv.reader(question_used)
for item in questionused:
if len(item)>0:
list_Qused.append(item[0])
question_used.closed
return list_Qused
What I obtain in the kernel:
>TEST
>I'm being executed
>TEST
so functions inside the class are executed even though they are not called, but I have read that it's "normal", "def" are no statements but "live" things.
Still, I have tried something much more simple:
in main_program_TEST.py
from class1 import *
a = class1()
in class1.py:
print("t")
from class2 import *
print("t")
class class1:
def __init__(self):
pass
def message(self):
print("prout")
in class2.py:
class class2:
def __init__(self):
pass
def message(self):
print("prout2")
When executing main_program_TEST.py the kernel displays
>t
>t
so this time the functions in class2.py have not been executed, otherwise the kernel would show instead:
>t
>prout2
>t
I really wonder why.
Stephen Rauch you are right, part of my code in recordRecallQused.py was calling the function.
"""#load all list
print("loading questions info")
# questions info: answers, category
list_AllQ = []
with open('questionsInfoTo130.csv', newline = '') as csvfile:
questionsInfo = csv.reader(csvfile)
# loop over the questions information rows
for (i,row) in enumerate(questionsInfo):
if(i!=0):
list_AllQ.append(row)
csvfile.close()
path = 'question_used.csv'"""
list_AllQ = [[0,1,2,1,"que"],[0,1,2,2,"que"],[0,1,2,3,"que"],[0,1,2,4,"que"],[0,1,2,55,"que"],[0,1,2,6,"que"],[0,1,2,7,"que"],[0,1,2,8,"que"],[0,1,2,9,"que"]]
a = recordRecallQused('question_used.csv')
list_Qused = a.recallQused()
list_avQ = a.createListavQ(list_Qused, list_AllQ)
list_Qtest = a.createListQtest(list_avQ)
a.recordQused(list_Qtest)

Resources