How to force pdfminer to analyze layout from left to right and top to bottom? - layout

I use a common piece of code for converting a pdf to text using pdfminer, something like this:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF file at ``path`` using pdfminer.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``TextConverter`` that writes into an in-memory ``BytesIO`` buffer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The accumulated contents of the output buffer, i.e. whatever
        ``BytesIO.getvalue()`` holds after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0      # 0 is pdfminer's "no page limit" value
        caching = True
        pagenos = set()   # empty set: do not restrict to specific pages
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        text = retstr.getvalue()
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        device.close()
        retstr.close()
    # The original computed ``text`` and then discarded it; hand it back.
    return text
It's a good library and works well on many files, but sometimes it shows stupid layout analysis. For example in a two-columns pdf like the following, it first shows the text of the second column.
I guess in all or most English articles, the text runs from top to bottom and left to right. So, why doesn't pdfminer follow this simple rule? Anyway, how can I force it to process the text in this order?

Related

Reading a ini (text) file in Python 3 and use the data to print strings on the screen independently from the text file encode format

In recent days I asked a couple of questions related to string encoding and file encoding in Python, but the real problem is harder than I supposed it to be, and now I have a clearer understanding of it.
Windows encodes text files in different formats depending on the language or the language group. So, I would like to be able to read an ini (text) file encoded in any of these formats, because some keys contain strings that I need to display on forms and menus on the screen.
So I would like to write something (that has to work with any text file and encode format) similar to this code example:
from configparser import ConfigParser
import magic
def detect(iniFileName):
    """Return what a ``magic.Magic(mime_encoding=True)`` detector reports
    for the file ``iniFileName`` (presumably its encoding name)."""
    encoding_detector = magic.Magic(mime_encoding=True)
    return encoding_detector.from_file(iniFileName)
#---------------------------------------------------------------------------
# Detect the file's encoding first, then let ConfigParser decode with it.
encoding = detect('config.ini')
ini = ConfigParser()
ini.read('config.ini', encoding=encoding)

# The title read from the ini file is then passed to the tk form.
title = ini.get('USERENTRY-FORM', 'TITLE')
UserEntryForm.setTitle(title)

if _DEBUG == True:
    print("USERENTRY-FORM title=",title)
This is the new solution, which seems to work better because it recognizes the encoding format more reliably; AIniParser is a wrapper class around ConfigParser.
from chardet import detect
def chardetPrint(filename):
    """Print ``filename`` followed by the full result of ``detect`` on its bytes.

    Args:
        filename: Path of the file whose raw bytes are analysed.
    """
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename, 'rb') as f:
        text = f.read()
    print(filename,": ",detect(text))
#---------------------------------------------------------------------------
def chardet(filename):
    """Return the ``'encoding'`` entry of ``detect``'s result for a file.

    Args:
        filename: Path of the file whose raw bytes are analysed.

    Returns:
        The value stored under ``'encoding'`` in the detection result.
    """
    # Close the file promptly rather than leaking the handle.
    with open(filename, 'rb') as f:
        text = f.read()
    # Run the detection once and reuse the result for both the debug
    # print and the return value.
    encoding = detect(text)['encoding']
    print(filename,": ",encoding) # only for test
    return encoding
if __name__ == '__main__':
    from ainiparser import AIniParser

    def testIniRead(filename, section, key):
        """Print the first raw line of ``filename``, then the string stored
        under ``section``/``key`` when read with the detected encoding."""
        with open(filename, "rb") as raw_file:
            first_line = raw_file.readline()
            print("line 1: ", first_line)
        detected_encoding = chardet(filename)
        parser = AIniParser(filename, encoding=detected_encoding)
        parser._open()
        print(parser.getString(section, key))

    def main():
        """Run ``testIniRead`` over the sample ini files, in order."""
        samples = (
            ("../cdata/config-cro-16.ini", "CGUIINTF", "game-turn-text"),
            ("../bat/output-lva-16/idata/config-lva-16.ini", "CGUIINTF", "game-turn-text"),
            ("../idata/config-lva-16.ini", "CGUIINTF", "game-turn-text"),
            ("../idata/domande.ini", "D2", "B"),
        )
        for ini_path, section, key in samples:
            testIniRead(ini_path, section, key)

    main()
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
This solution seems to recognize better how files are encoded, but I am still testing it.

extracting text from pdf - PyPDF2

I am following the tutorial on this page to extract text from a pdf:
http://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2/
And I can print the pdf information, but I cannot print the content of the pages. It doesn't throw any error, but I can't see the text of the pdf either
What could be the problem?
from PyPDF2 import PdfFileReader
def get_info(path):
    """Print the author, creator, producer, subject and title stored in the
    document information of the PDF at ``path``, one value per line."""
    with open(path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        metadata = reader.getDocumentInfo()
        reader.getNumPages()  # page count is queried but not used
        # Collect every field first, then print, mirroring the
        # read-all-then-print order of the snippet.
        field_names = ("author", "creator", "producer", "subject", "title")
        field_values = [getattr(metadata, name) for name in field_names]
        for value in field_values:
            print(value)
def text_extractor(path):
    """Print the first page object of the PDF at ``path``, its type, and the
    text returned by ``extractText()`` for that page."""
    with open(path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        first_page = reader.getPage(0)  # pages are zero-indexed
        print(first_page)
        page_type = str(type(first_page))
        print('Page type: {}'.format(page_type))
        extracted = first_page.extractText()
        # Expected to print the page's text; for the asker's PDF nothing shows up.
        print(extracted)
if __name__ == '__main__':
    # Sample document: https://oficinavirtual.ugr.es/apli/solicitudPAU/test.pdf
    pdf_path = 'test.pdf'
    # First the metadata, then two blank lines as a separator, then the text.
    get_info(pdf_path)
    print("\n" * 2)
    text_extractor(pdf_path)
Although this is not the solution, you can simply install pdfminer3 with pip and use the minimal reproducible example here

How to use PDFminer.six with python 3?

I want to use pdfminer.six which is a tool, that can be used with Python3 for extracting information from PDF documents. The problem is there is no good documentation at all and no source code example on how to use the tool.
I have already tried some code from StackOverflow but it didn't work. Below is my code.
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as a single string.

    Each page from ``PDFPage.get_pages`` is fed through a ``TextConverter``
    writing into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text accumulated in the buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0      # 0 is pdfminer's "no page limit" value
    caching = True
    pagenos = set()   # empty set: do not restrict to specific pages
    try:
        # ``with`` closes the PDF even when a page raises during processing.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Evaluated before the finally clause closes the buffer.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
I want some code example on how to use this tool to get data from PDFs.
Install pdfminer.six or pdfminer3 (https://github.com/gwk/pdfminer3/)
install: pip install pdfminer3
I switched to pdfminer3 when I upgraded to 3.7 from 3.6
I use on ubuntu and macos with python 3.7.3
pdfminer3 comes with two handy tools: pdf2txt.py and dumppdf.py
examine the source. Fairly small and easy to understand.
Following is a working example (once the location of the pdf file is added)
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io
# Assemble the pipeline: the interpreter drives the converter, which writes
# the extracted text into an in-memory string buffer.
fake_file_handle = io.StringIO()
resource_manager = PDFResourceManager()
layout_params = LAParams()
converter = TextConverter(resource_manager, fake_file_handle, laparams=layout_params)
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open('/path/to/file.pdf', 'rb') as fh:
    pages = PDFPage.get_pages(fh, caching=True, check_extractable=True)
    for page in pages:
        page_interpreter.process_page(page)

# Grab the text before the buffer is closed.
text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

print(text)
Full disclosure, I am one of the maintainers of pdfminer.six. It is a community-maintained version of pdfminer for python 3.
Nowadays, it has multiple api's to extract text from a PDF, depending on your needs. Behind the scenes, all of these api's use the same logic for parsing and analyzing the layout.
(All the examples assume your PDF file is called example.pdf)
Commandline
If you want to extract text just once you can use the commandline tool pdf2txt.py:
$ pdf2txt.py example.pdf
High-level api
If you want to extract text (properties) with Python, you can use the high-level api. This approach is the go-to solution if you want to programmatically extract information from a PDF.
# extract_pages must be imported as well; the snippet calls it below and
# would otherwise fail with a NameError.
from pdfminer.high_level import extract_pages, extract_text

# Extract text from a pdf.
text = extract_text('example.pdf')

# Extract iterable of LTPage objects.
pages = extract_pages('example.pdf')
Composable api
There is also a composable api that gives a lot of flexibility in handling the resulting objects. For example, it allows you to create your own layout algorithm. This method is suggested in the other answers, but I would only recommend this when you need to customize some component.
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# The converter pipeline does not depend on the input file, so build it first.
output_string = StringIO()
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)

with open('example.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    # Feed every page of the parsed document through the interpreter.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

print(output_string.getvalue())
Similar question and answers here. I'll try to keep them in sync.

python pdfminer converts pdf file into one chunk of string with no spaces between words

I was using the following code mainly taken from DuckPuncher's answer to this post Extracting text from a PDF file using PDFMiner in python? to convert pdfs to text files:
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as a single string.

    Each page from ``PDFPage.get_pages`` is fed through a ``TextConverter``
    writing into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text accumulated in the buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0      # 0 is pdfminer's "no page limit" value
    caching = True
    pagenos = set()   # empty set: do not restrict to specific pages
    try:
        try:
            # ``with`` closes the PDF even when a page raises.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Same order as before: the device is closed before the buffer is read.
            device.close()
        # Named ``text`` rather than ``str`` so the builtin is not shadowed.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
The pdfs are downloaded and stored in my local directory using the following code. It worked fine.
import requests
url = 'link_to_the_pdf'
file_name = './name.pdf'
response = requests.get(url)
# Fail loudly on an HTTP error status instead of silently saving the
# server's error page under a ".pdf" name.
response.raise_for_status()
# Binary mode: the body is written exactly as received.
with open(file_name, 'wb') as f:
    f.write(response.content)
However, for some pdfs, the convert_pdf_to_txt() returned the content as almost one chunk of string with no spaces between words. For example, after downloading the following pdf from http://www.ece.rochester.edu/~gsharma/papers/LocalImageRegisterEI2005.pdf, and applying the convert_pdf_to_txt() function, I got a text file in which the words are not separated by spaces. An excerpt of the text file is
3Predominantmethodsinthelattergrouparefromcomputervisionarea,e.g.,plane+p
arallax4methodfor3-Dscenestructurecomputation.Inthispaper,weproposeanewlocalimageregistrationtechnique,inthefirstclass,basedonadaptivefilteringtechniques.Adaptivefiltershavebeenutilizedsuccessfullyforsystemidentificationpurposesin1-D.
Can someone help me fix this problem please? Is it the format of this particular pdf that's causing the problem or something else, because with some other pdfs, the convert_pdf_to_txt() function is working fine.
According to this thread some pdfs mark the entire text as figure and by default PDFMiner doesn't try to perform layout analysis for figure text. To override this behavior the all_texts parameter needs to be set to True.
Here is an example that works for me based on this post.
import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
# Import the submodule explicitly: ``import pdfminer`` alone does not
# guarantee that ``pdfminer.layout`` is available as an attribute.
import pdfminer.layout

# Perform layout analysis for all text, including text inside figures.
laparams = pdfminer.layout.LAParams(all_texts=True)
def extract_text_from_pdf(pdf_path):
    """Extract the text of the PDF at ``pdf_path``.

    Uses the module-level ``laparams`` for the converter's layout analysis.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text, or ``None`` when the extraction produced an
        empty string.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    try:
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        # Read the buffer before it is closed below.
        text = fake_file_handle.getvalue()
    finally:
        # close open handles, also when a page fails to parse
        converter.close()
        fake_file_handle.close()

    # Empty output maps to None, as before, but now explicitly.
    return text or None
# Example usage: run the extractor on a sample file.
text = extract_text_from_pdf('test.pdf')

pdfminer reads numbers incorrectly when the pdf text is in hebrew

I use this pdfminer code to read from pdf:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as a single string.

    Pages are read with ``PDFPage.get_pages`` and rendered to text by a
    ``TextConverter`` that writes into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text accumulated in the buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0      # 0 is pdfminer's "no page limit" value
    caching = True
    pagenos = set()   # empty set: do not restrict to specific pages
    try:
        # The context manager closes the PDF even if page processing raises.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Evaluated before the finally clause closes the buffer.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
The extracted text seems correct including numbers.
I tried to use the same code to read PDFs in Hebrew (by adding [::-1] to the results so that it appears from right to left). With regards to letters, the text appears correctly. However, when it comes to numbers, although the code extracts them, the wrong numbers show up.
Any ideas why?

Resources