PdfMiner: Erro processing the page literal required: /b'begin' - python-3.x

I am trying to read .pdf file using python3 with package called pdfminer which I have done successfully but for some of the page in .pdf file while reading the page using interpreter.process_page in getAllPages() of the following code I am getting an errors as follows:
error processing the page literal required: /b'begin'.
error processing the page Unknown operator: 'Qq'.
This is happening only for few docs but not able to find out what is the problem , in which case this could happen?
Code:-
class PDFDoc():
def __init__(self):
self.rsrcmgr = PDFResourceManager()
self.laparams = LAParams()
self.device = PDFPageDetailedAggregator(self.rsrcmgr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.doc_values = []
self.total_no_of_pages = 0
self.doc_page_dict = collections.OrderedDict()
# self.doc = None
"""
Read PDF Document
"""
def readDoc(self, doc_name):
fp = open(doc_name, 'rb')
self.parser = PDFParser(fp)
self.doc = PDFDocument(self.parser)
"""
Read all pages in the document and saved in List of tuples format.
It contains the text and their coordinate info along with page number
"""
def getAllPages(self):
for page in PDFPage.create_pages(self.doc):
self.interpreter.process_page(page)
# receive the LTPage object for this page
self.device.get_result()
self.doc_values = self.device.rows
"""
Get the total number of pages
"""
def getTotalPages(self):
self.total_no_of_pages = max(self.doc_page_dict)+1
"""
Convert the document info into Page-wise dict. {Key:Value}-->{Page no:[Page text, coordinates]}
"""
def getPageDict(self):
for i in range(len(self.doc_values)):
left = self.doc_values[i][1]
bottom = self.doc_values[i][2]
content = self.doc_values[i][-1]
if self.doc_page_dict.get(self.doc_values[i][0]):
self.doc_page_dict[self.doc_values[i][0]].append({'left':left, 'bottom':bottom, 'content':content})
else:
self.doc_page_dict[self.doc_values[i][0]]=[{'left':left, 'bottom':bottom, 'content':content}]
"""
Align the page text in case they are misaligned
"""
def create_page_table_modified(self, pagedict_list):
# ##print(pagedict_list)
page_dict = collections.OrderedDict()
page_table_1 = []
page_table = []
exc_arr = []
count = 0
for line in pagedict_list:
row = []
temp_key = float(line['bottom'])
if not line in exc_arr and line["content"]:
row.append(line)
exc_arr.append(line)
for line_1 in pagedict_list:
if not line_1 in exc_arr and line_1["content"]:
# #print('last_top:', last_top, each_dict_adjusted['bottom'])
if abs(int(line["bottom"]) - int(line_1["bottom"])) <= 6:
row.append(line_1)
exc_arr.append(line_1)
if row:
page_dict[temp_key] = row
page_table.append(row)
count += 1
# ##print("\n\nPage:",page_table)
page_dict_keys = sorted(page_dict, reverse=True)
for i in page_dict_keys:
# i = sorted(i, key=lambda k: k['left'])
page_table_1.append(page_dict[i])
return page_table_1
"""
Sort the line elements based on its position coordinates
"""
def sortRowElements(self,row_list):
return sorted(row_list, key=lambda k:k['left'])
"""
Combine line elements to form the line text
"""
def combineText(self, row):
temp_ = []
# for i in range(len(row)):
text = [k['content'] for k in row]
temp_.append(' '.join(text))
return ' '.join(temp_)
"""
To call aligning and sorting functions
"""
def sortText(self):
for page in self.doc_page_dict:
self.doc_page_dict[page] = self.create_page_table_modified(self.doc_page_dict[page])
self.doc_page_dict[page] = [self.sortRowElements(line) for line in self.doc_page_dict[page]]
"""
To get text from particular page of the document --> List of line text
"""
def pageText(self, page_no):
page_text = [self.combineText(line) for line in self.doc_page_dict[page_no]]
return page_text
read_document = PDFDoc()

Related

Read data from txt file, store it, use it for analyzing, write it to the txt file

The task is to read from given txt file the data add the numbers in there to the list[], so that every number in a row will be a element/object in this list. After reading the file created list will be sent to the main().
this list with the objects will be parameters for the def Analyze part in where at the same time
will be found min, max, average and sum.
def lueTiedosto(data):
Tiedosto = open("L07T4D1.txt", 'r', encoding="UTF-8")
Rivi = Tiedosto.readline()
while (len(Rivi) > 0):
data.append(int(Rivi))
Rivi = Tiedosto.readline()
for element in data:
print(element)
print(f"Tiedosto L07T4D1.txt luettu.")
Tiedosto.close()
return element
The fixed code which works:
def lueTiedosto(data):
Lue = input("Luettavan tiedoston nimi on ''.\n")
print(f"Anna uusi nimi, enter säilyttää nykyisen: ", end='')
Tiedosto = open(Lue, 'r', encoding="UTF-8")
Rivi = Tiedosto.readline()
while (len(Rivi) > 0):
data.append(int(Rivi))
Rivi = Tiedosto.readline()
print(f"Tiedosto '{Lue}' luettu.")
Tiedosto.close()
return data
Making an assumption that your input file is similar to the following:
10000
12345
10008
12000
I would do the following:
filepath = r".......\L07T4D1.txt" # Path to file being loaded
def readData(filepath: str) -> list[int]:
# Returns a list of integers from file
rslt = []
with open (filepath, 'r') as f:
data = f.readline().strip()
while data:
data = data.split(' ')
rslt.append(int(data[0]))
data = f.readline().strip()
return rslt
def analyze(data: list[int]) -> None:
# prints results of data analysis
print(f'Max Value = {max(data)}')
print(f'Min Value = {min(data)}')
print(f'Sum Value = {sum(data)}')
print(f'Avg Value = {sum(data)/len(data)}')
Running analyze(readData(filepath)) Yields:
Max Value = 12345
Min Value = 10000
Sum Value = 44353
Avg Value = 11088.25

Method reads properly but the written text file only has 1 line. Is \n not working?

The goal is to extract specific data from a text file under a folder
then write that data into another file under different folder
The extraction part works, save to variables and can even print them
The rises when you try to write them to a file
The file is empty
Need to write in this format
{self.title};;;{self.author};;;{self.release_date};;;
{self.last_update_date};;;{self.language};;;{self.producer};;;{self.book_path}
# This class includes all the operations related to a book
class Operation:
"""
Need to include these class variables
book_title_list (List of all books titles such as “[title1, title2, title3, …]”)
book_info_dict = “{title1: obj1, title2:obj2, title3:obj3…….}”)
"""
book_folder_path = './data/books_data/'
book_info_path = './data/result_data/books.txt'
def extract_book_info(self):
directory_files = os.listdir(self.book_folder_path) # Stores the .txt files under books_data folder
try:
for i in directory_files:
with open(f'{self.book_folder_path}/{i}', 'r', encoding='utf8') as f:
f_contents = f.readlines()
f_line_free = list(map(lambda x: x.strip(), f_contents))
f_lists = f_line_free[10:22] # Slicing only the required elements of the list
"""
Extracting only the necessary part and storing them
under proper variables
"""
title = f_lists[0]
author = f_lists[2]
release_date = f_lists[4]
last_update_date = f_lists[5]
language = f_lists[7]
producer = f_lists[11]
"""
Extracting the desired values
"""
title_data = title[7:]
author_data = author[8:]
release_date_data = release_date[14:]
last_update_date_data = last_update_date[24:-1]
language_data = language[10:]
producer_data = producer[13:]
print(title_data)
with open(self.book_info_path, 'w', encoding="utf8") as wf:
wf.write(f'{title_data};;;{author_data};;;{release_date_data};;;'
f'{last_update_date_data};;;{language_data};;;{producer_data};;;{self.book_info_path}\n')
return True
except FileNotFoundError:
return False
except Exception:
return False

Converting a textfile to a dictionary

My file:
Goal: to read a txt file and return dictionaries
What I have:
def load_snacks(snack_file: TextIO) -> Tuple[Dict[str, List[str]],
Dict[str, List[str]]]:
"""Return a two-item tuple containing a "healthysnack_to_junkfood" dictionary
and a "healthysnack_to_healthysnack" dictionary with the data from snack_file.
"""
snack_H2J = {}
snack_H2H = {}
line = snack_file.readline().strip()
while line != '':
# due to structure of the file, line contains a healthy snack name
healthysnack_name = line
# properly format the 1st healthy snack name, use helper fcn (see below for helper fcn)
flip_name_and_del_comma(healthysnack_name)
healthy_list = []
junk_list = []
line = snack_file.readline().strip()
while line != '\n':
if ',' in line:
snack_H2J[healthysnack_name] = line
a = flip_name_and_del_comma(line)
healthy_list.append(a)
else:
snack_H2H[healthysnack_name] = line
junk_list.append(line)
line = snack_file.readline().strip()
return (snack_H2J, snack_H2H)
Below is my helperfcn; I have verified that this works
def flip_name_and_del_comma(s: str) -> str:
""" Retrun a new str that reverses the format name order from 'colour, healthy snack name' to
'healthy snack name to colour'
>>> flip_name_and_del_comma('orange, carrot')
'carrot orange'
>>> flip_name_and_del_comma('yellow, mango')
'mango yellow'
"""
s_reversed = ', '.join(reversed(s.split(', ')))
s_comma_delete = s_reversed.replace(', ', ' ')
return s_comma_delete

Properly using dataclasses to return values of items

The project is to sort items - using a particular algorithm - into boxes. I am having trouble after assigning each items to the proper class, to return to another function and use and modify the data held within the object in the data class.
My testing file looks like this:
17 10 4
Abacus 3
Blender 5
Chessboard 3
Dishes 6
My classes:
#dataclass
class InventoryItem:
name: str
weight: float
#dataclass
class BoxInventory:
name: str
maxWeight: float
remainingWeight: float
contents: dict = ""
"""
def listContents(self, contents):
self.listContents = contents
def remainingWeight(self, remainingWeight):
self.remainingWeight = remainingWeight
def addItemWeight(self, itemWeight):
self.remainingWeight -= itemWeight
def addItemList(self, itemName, itemWeight, contents):
self.contents = contents[itemName] = contents[itemWeight]
"""
Here is where I read my text file and transfer it to a class:
"""
Take the given txt file and format into proper list for boxes and items
:param filename: The filename of the text file
:return: Send lists to to be used by an algo.
"""
with open(filename, 'r') as myFile: # Open the correct file
itemDict = {}
boxDict = {}
myList = [line.split() for line in myFile.readlines()]
boxLine = ' '.join(myList[0])
for line in range(1, len(myList)):
lines = ''.join(myList[line])
itemName = lines[:-1]
weight = lines[len(lines) - 1:]
item = InventoryItem(itemName, int(weight))
itemDict[itemName] = [item]
boxString = ""
count = 0
for char in boxLine:
if char != " ":
boxString = boxString + char
else:
boxName = "Box" + str(count)
box = BoxInventory(boxName, int(boxString), int(boxString))
boxDict[boxName] = [box]
boxString = ""
count += 1
myReturn = {}
myReturn['boxDict'] = boxDict
myReturn['itemDict'] = itemDict
return myReturn
Non-implemented algorithm:
def roomiest(myReturnDict):
"""
For each item find the box with the greatest remaining allowed weight that can support the item and place the item in that box
:param boxList: The list of boxes in the class from the given file
:param itemList: The list of items in the class from the given file
:return: If boxes were able to fit all items(1); items in box with individual weights(2); Box name with max
weight(3); items with their weights that were left behind(4)
"""
itemList = myReturnDict.get("itemDict")
boxList = myReturnDict.get("boxDict")
My problem is that I do know how to read the parsed data from my
fileReader function in my algo. function.
Your input function is a little strange as you're storing the objects in a list of length 1 inside a dictionary. So your data looks like:
'Dishes': [InventoryItem(name='Dishes', weight=6)]
instead of
'Dishes': InventoryItem(name='Dishes', weight=6)
You might have a reason for it, but changing itemDict[itemName] = [item] to itemDict[itemName] = item makes your code a little easier to follow (and the same for boxDict[boxName] = [box]). With that change you can access the parsed data easily with the following:
for item_name, item in itemList.items():
print(item.name)
print(item.weight)
This iterates through the itemList dictionary, getting the key, value pairs which in this case is itemName, item (or [item] in your original code. If you don't want to change that, replace item with item[0] in the code above). Then you can access attributes of your Class directly by calling their label.
You can get the box with most space remaining, using
sorted_box_list = (sorted(boxList.values(), key=operator.attrgetter('remainingWeight'), reverse=True))
What I have done is rather than using a dictionay I am using a list to pass on the data to a new function.
Text File --> List --> Dict --> List --> sortedList
Here is my new fileReader function:
def fileReader(filename):
"""
Take the given txt file and format into proper list for boxes and items
:param filename: The filename of the text file
:return: Send lists to to be used by an algo.
"""
with open(filename, 'r') as myFile: # Open the correct file
itemList = []
boxList = []
myList = [line.split() for line in myFile.readlines()]
boxLine = ' '.join(myList[0])
for line in range(1, len(myList)):
lines = ''.join(myList[line])
itemName = lines[:-1]
weight = lines[len(lines) - 1:]
item = InventoryItem(itemName, int(weight))
itemList.append(item)
boxString = ""
count = 0
for char in boxLine:
if char != " ":
boxString = boxString + char
else:
boxName = "Box" + str(count)
box = BoxInventory(boxName, int(boxString), int(boxString))
boxList.append(box)
boxString = ""
count += 1
I then read and sort the data in each algotithm using this same method:
def roomiest(myReturnDict):
"""
For each item find the box with the greatest remaining allowed weight that can support the item and place the item in that box
:param boxList: The list of boxes in the class from the given file
:param itemList: The list of items in the class from the given file
:return: If boxes were able to fit all items(1); items in box with individual weights(2); Box name with max
weight(3); items with their weights that were left behind(4)
"""
itemData = list(myReturnDict.get("itemList"))
boxData = list(myReturnDict.get("boxList"))
sortedItemList = sorted(itemData, key=lambda x: x.weight, reverse=True)
sortedBoxList = sorted(boxData, key=lambda x: x.remainingWeight, reverse=True)
myReturn = {}
myReturn['boxList'] = boxList
myReturn['itemList'] = itemList
return myReturn
My dataclasses look like the following:
#dataclass
class InventoryItem:
name: str
weight: float
#dataclass
class BoxInventory:
name: str
maxWeight: float
remainingWeight: float
contents: dict = ""
def itemWeight(item):
print("Weight of", item.name, "is: ", item.weight, "\n")
return item.weight
def remainWeight(box):
print("Rem. weight in ", box.name, "is: ", box.remainingWeight, "\n")
return box.remainingWeight

Never resets list

I am trying to create a calorie counter the standard input goes like this:
python3 calories.txt < test.txt
Inside calories the food is the following format: apples 500
The problem I am having is that whenever I calculate the values for the person it seems to never return to an empty list..
import sys
food = {}
eaten = {}
finished = {}
total = 0
#mappings
def calories(x):
with open(x,"r") as file:
for line in file:
lines = line.strip().split()
key = " ".join(lines[0:-1])
value = lines[-1]
food[key] = value
def calculate(x):
a = []
for keys,values in x.items():
for c in values:
try:
a.append(int(food[c]))
except:
a.append(100)
print("before",a)
a = []
total = sum(a) # Problem here
print("after",a)
print(total)
def main():
calories(sys.argv[1])
for line in sys.stdin:
lines = line.strip().split(',')
for c in lines:
values = lines[0]
keys = lines[1:]
eaten[values] = keys
calculate(eaten)
if __name__ == '__main__':
main()
Edit - forgot to include what test.txt would look like:
joe,almonds,almonds,blue cheese,cabbage,mayonnaise,cherry pie,cola
mary,apple pie,avocado,broccoli,butter,danish pastry,lettuce,apple
sandy,zuchini,yogurt,veal,tuna,taco,pumpkin pie,macadamia nuts,brazil nuts
trudy,waffles,waffles,waffles,chicken noodle soup,chocolate chip cookie
How to make it easier on yourself:
When reading the calories-data, convert the calories to int() asap, no need to do it every time you want to sum up somthing that way.
Dictionary has a .get(key, defaultvalue) accessor, so if food not found, use 100 as default is a 1-liner w/o try: ... except:
This works for me, not using sys.stdin but supplying the second file as file as well instead of piping it into the program using <.
I modified some parsings to remove whitespaces and return a [(name,cal),...] tuplelist from calc.
May it help you to fix it to your liking:
def calories(x):
with open(x,"r") as file:
for line in file:
lines = line.strip().split()
key = " ".join(lines[0:-1])
value = lines[-1].strip() # ensure no whitespaces in
food[key] = int(value)
def getCal(foodlist, defValueUnknown = 100):
"""Get sum / total calories of a list of ingredients, unknown cost 100."""
return sum( food.get(x,defValueUnknown ) for x in foodlist) # calculate it, if unknown assume 100
def calculate(x):
a = []
for name,foods in x.items():
a.append((name, getCal(foods))) # append as tuple to list for all names/foods eaten
return a
def main():
calories(sys.argv[1])
with open(sys.argv[2]) as f: # parse as file, not piped in via sys.stdin
for line in f:
lines = line.strip().split(',')
for c in lines:
values = lines[0].strip()
keys = [x.strip() for x in lines[1:]] # ensure no whitespaces in
eaten[values] = keys
calced = calculate(eaten) # calculate after all are read into the dict
print (calced)
Output:
[('joe', 1400), ('mary', 1400), ('sandy', 1600), ('trudy', 1000)]
Using sys.stdin and piping just lead to my console blinking and waiting for manual input - maybe VS related...

Resources