Python/PyPDF4: How do I specify the /PageLabels in the created PDF?

I am using PyPDF4 to create an offline-readable version of the journal "Nature".
I use the PyPDF4 PdfFileReader to read the individual article PDFs and PdfFileWriter to create a single, merged output.
The problem that I am trying to solve is that the page numbers of some issues do not start at 1, for example, issue 7805 starts with page 563.
How do I specify the desired /PageLabels in the document catalog?
for pdf_file in pdf_files:
    input_pdf = PdfFileReader(open(pdf_file, 'rb'))
    page_indices = file_page_dictionary[pdf_file]
    for page_index in page_indices:
        page = input_pdf.getPage(page_index)
        # Specify actual page number here:
        # page.setPageNumber(actual_page_numbers[page_index])
        output.addPage(page)

with open(pdf_output_name, 'wb') as f:
    output.write(f)

After exploring the PDF standard and a bit of hacking, I found that the following function will add a single PageLabels entry that creates page labels starting from offset (i.e. the first page will be labelled with the offset, the second page with offset+1, etc.).
# output_pdf is an instance of PdfFileWriter().
# offset is the desired page offset.
def add_pagelabels(output_pdf, offset):
    number_type = PDF.DictionaryObject()
    number_type.update({PDF.NameObject("/S"): PDF.NameObject("/D")})
    number_type.update({PDF.NameObject("/St"): PDF.NumberObject(offset)})

    nums_array = PDF.ArrayObject()
    nums_array.append(PDF.NumberObject(0))  # physical page index
    nums_array.append(number_type)

    page_numbers = PDF.DictionaryObject()
    page_numbers.update({PDF.NameObject("/Nums"): nums_array})

    page_labels = PDF.DictionaryObject()
    page_labels.update({PDF.NameObject("/PageLabels"): page_numbers})

    root_obj = output_pdf._root_object
    root_obj.update(page_labels)
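A minimal usage sketch (it reuses output and pdf_output_name from the question's snippet above; 563 is the offset from issue 7805 mentioned earlier):

output = PdfFileWriter()
# ... output.addPage(...) calls as in the question's loop ...
add_pagelabels(output, 563)  # the first page will be labelled 563
with open(pdf_output_name, 'wb') as f:
    output.write(f)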
Additional page label entries can be created (e.g. with different offsets or different numbering styles).
Note that the first PDF page has an index of 0.
# Use PyPDF to manipulate pages
from PyPDF4 import PdfFileWriter, PdfFileReader
# To manipulate the PDF dictionary
import PyPDF4.pdf as PDF

def pdf_pagelabels_roman():
    number_type = PDF.DictionaryObject()
    number_type.update({PDF.NameObject("/S"): PDF.NameObject("/r")})
    return number_type

def pdf_pagelabels_decimal():
    number_type = PDF.DictionaryObject()
    number_type.update({PDF.NameObject("/S"): PDF.NameObject("/D")})
    return number_type

def pdf_pagelabels_decimal_with_offset(offset):
    number_type = pdf_pagelabels_decimal()
    number_type.update({PDF.NameObject("/St"): PDF.NumberObject(offset)})
    return number_type

...

# Each /Nums entry consists of a physical page index followed by a page-label
# dictionary that applies from that page until the next entry's index.
nums_array = PDF.ArrayObject()
nums_array.append(PDF.NumberObject(0))  # Page index 0:
nums_array.append(pdf_pagelabels_roman())  # Roman numerals
nums_array.append(PDF.NumberObject(1))  # Page indices 1 through 9:
nums_array.append(pdf_pagelabels_decimal_with_offset(first_offset))  # Decimal numbers, with offset
nums_array.append(PDF.NumberObject(10))  # Page indices 10 onward:
nums_array.append(pdf_pagelabels_decimal_with_offset(second_offset))

page_numbers = PDF.DictionaryObject()
page_numbers.update({PDF.NameObject("/Nums"): nums_array})

page_labels = PDF.DictionaryObject()
page_labels.update({PDF.NameObject("/PageLabels"): page_numbers})

root_obj = output._root_object
root_obj.update(page_labels)
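For example, with first_offset = 1 and second_offset = 563 (hypothetical values; 563 matches issue 7805 above), a viewer would label the first physical page i, the next nine pages 1 through 9, and the remaining pages 563, 564, and so on.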

Related

Extracting data from multiple PDFs and putting that data into an Excel table

I am taking data extracted from multiple pdfs that were merged into one pdf.
The data is based on clinical measurements taken from a sample at different time points. Some time points have certain measurement values while others are missing.
So far, I've been able to merge the PDFs and extract the text and specific data from the text, but now I want to put it all into a corresponding Excel table.
Below is my current code:
import PyPDF2
from PyPDF2 import PdfFileMerger
from glob import glob

# merge all pdf files in current directory
def pdf_merge():
    merger = PdfFileMerger()
    allpdfs = [a for a in glob("*.pdf")]
    [merger.append(pdf) for pdf in allpdfs]
    with open("Merged_pdfs1.pdf", "wb") as new_file:
        merger.write(new_file)

if __name__ == "__main__":
    pdf_merge()

# scan pdf
text = ""
with open("Merged_pdfs1.pdf", "rb") as pdf_file, open("sample.txt", "w") as text_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    for page_number in range(0, number_of_pages):
        page = read_pdf.getPage(page_number)
        text += page.extractText()
    text_file.write(text)

# turn text script into list, separated by newlines
def Convert(text):
    li = list(text.split("\n"))
    return li

li = Convert(text)
filelines = []
for line in li:
    filelines.append(line)
print(filelines)

# extract data from text and put into dictionary
full_data = []
test_data = {"Sample": [], "Timepoint": [], "Phosphat (mmol/l)": [], "Bilirubin, total (µmol/l)": [],
             "Bilirubin, direkt (µmol/l)": [], "Protein (g/l)": [], "Albumin (g/l)": [],
             "AST (U/l)": [], "ALT (U/l)": [], "ALP (U/l)": [], "GGT (U/l)": [], "IL-6 (ng/l)": []}
for line2 in filelines:
    # For each data item, extract it from the line and strip whitespace
    if line2.startswith("Phosphat"):
        test_data["Phosphat (mmol/l)"].append(line2.split(" ")[-2].strip())
    if line2.startswith("Bilirubin,total"):
        test_data["Bilirubin, total (µmol/l)"].append(line2.split(" ")[-2].strip())
    if line2.startswith("Bilirubin,direkt"):
        test_data["Bilirubin, direkt (µmol/l)"].append(line2.split(" ")[-4].strip())
    if line2.startswith("Protein "):
        test_data["Protein (g/l)"].append(line2.split(" ")[-2].strip())
    if line2.startswith("Albumin"):
        test_data["Albumin (g/l)"].append(line2.split(" ")[-2].strip())
    if line2.startswith("AST"):
        test_data["AST (U/l)"].append(line2.split(" ")[-2].strip())
    if line2.startswith("ALT"):
        test_data["ALT (U/l)"].append(line2.split(" ")[-4].strip())
    if line2.startswith("Alk."):
        test_data["ALP (U/l)"].append(line2.split(" ")[-2].strip())
    if line2.startswith("GGT"):
        test_data["GGT (U/l)"].append(line2.split(" ")[-4].strip())
    if line2.startswith("Interleukin-6"):
        test_data["IL-6 (ng/l)"].append(line2.split(" ")[-4].strip())
    for sampnum in range(100):
        num = str(sampnum)
        sampletype = "T" and "H"  # note: this expression evaluates to just "H"
        if line2.startswith(sampletype + num):
            sample = sampletype + num
            test_data["Sample"] = sample
    for time in range(0, 360):
        timepoint = str(time) + "h"
        word_list = list(line2.split(" "))
        for word in word_list:
            if word == timepoint:
                test_data["Timepoint"].append(word)
full_data.append(test_data)

import pandas as pd

df = pd.DataFrame(full_data)
df.to_excel("IKC4.xlsx", sheet_name="IKC", index=False)
print(df)
The issue: how do I move the individual items in each list to their own cells in Excel, aligned with the proper timepoint? As extracted, they don't necessarily correspond to the right timepoint. For example, timepoints 1 and 3 can have protein measurements while timepoint 2 is missing this info, so timepoint 3's measurement sits at position 2 in the list and will likely land in the wrong row of the Excel table.
I figured maybe I need to make an alternative dictionary for the timepoints and attach the corresponding measurements to the proper timepoint. I'm starting to get confused about how to do all this and am now asking for help!
Thanks in advance :)
I tried adding an "else" clause after every "if" to append a "-" when a measurement wasn't present for that timepoint, but I got far too many dashes, since the code iterates through the lines of the entire PDF.
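One way to realize the "alternative dictionary for the timepoints" idea: key everything by timepoint, so a missing measurement is simply an absent key, and let pandas align the columns. A minimal sketch (record() and the way the current timepoint is tracked are hypothetical placeholders, since the exact PDF layout isn't shown):

import pandas as pd

# Sketch: one dict per timepoint; a missing measurement is an absent key.
rows = {}  # timepoint -> {column name: value}

def record(timepoint, column, value):
    rows.setdefault(timepoint, {"Timepoint": timepoint})[column] = value

# Inside the line loop one would call, e.g. (hypothetical):
# record(current_timepoint, "Protein (g/l)", line2.split(" ")[-2].strip())

df = pd.DataFrame(list(rows.values()))
df = df.fillna("-")  # dashes appear only where a timepoint truly lacks a value
df.to_excel("IKC4.xlsx", sheet_name="IKC", index=False)

Because each row dict carries its own timepoint, pandas puts every measurement in the right row, and fillna supplies the dashes without the over-counting that the per-line "else" approach produced.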

Writing from a list to a Word document

I have written code (shown below) that reads in a Word document and writes each "run" with its formatting into a list, creating a list of formatted runs that I can use in part or in full later. Next, I wrote code to write those runs from the list into a new document. The issue is that I keep getting an error stating "'Run' object is not iterable". I don't know whether the problem is that paragraph information cannot be stored in a list and recalled this way, or that I am writing it to the document the wrong way.
import tkinter as tk
from tkinter.filedialog import askopenfilename
from docx import Document  # Invokes Document command from docx

def get_para_data(output_doc_name, paragraph):
    """
    Write the run to the new file and then set its font, bold, alignment, color etc. data.
    """
    output_run = []
    output_para = output_doc_name.add_paragraph()
    for run in paragraph.runs:
        if paragraph:
            output_run = output_para.add_run(run.text)
            # Run's bold data
            output_run.bold = run.bold
            # Run's italic data
            output_run.italic = run.italic
            # Run's underline data
            output_run.underline = run.underline
            # Run's color data
            output_run.font.color.rgb = run.font.color.rgb
            # Run's font data
            output_run.style.name = run.style.name
            # Paragraph's alignment data
            output_para.paragraph_format.alignment = paragraph.paragraph_format.alignment
        else:
            output_run = []
    return output_run

# IMPORT WORD DOCUMENT
root = tk.Tk()
root.withdraw()
# returns the file path as variable for future use.
doc_path = askopenfilename(title="Choose Word File")
# Imports Word Document to Modify.
document = Document(doc_path)
# Number of paragraphs in document.
t = len(document.paragraphs)
# Preallocation of list.
output_paragraph = [None] * t
result = Document()

# Begin loop to create list of paragraph data using function created above.
i = 0
for para in document.paragraphs:
    output_paragraph[i] = get_para_data(result, document.paragraphs[i])

# Write desired portion of document into a new document.
document_new = Document()
new_line = []
a = 0
for out in output_paragraph:
    # Check to Verify it is not a blank line/return.
    if output_paragraph:
        new_line[a] = document_new.add_paragraph(output_paragraph[a])
        a += 1
    # if it is a blank line/return write blank line return.
    else:
        document_new.add_paragraph(text='\r', style=None)
        a += 1
My expected result was that the text in the new document would be the same as in the previous document, but in an order I choose. Similar to copy and paste, but with the ability to choose which portions I "paste" and when.
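From a reading of the code above (not a verified diagnosis): add_paragraph() expects a string, but output_paragraph[a] holds the Run object (or empty list) returned by get_para_data, which is the likely source of "'Run' object is not iterable"; note also that the first loop never increments i, and new_line[a] = ... would raise IndexError on an empty list. A minimal sketch of a run-by-run copy that avoids the intermediate list entirely (the file names are hypothetical):

from docx import Document

def copy_paragraph(target_doc, paragraph):
    # Copy one paragraph into target_doc run by run, keeping basic formatting.
    new_para = target_doc.add_paragraph()
    new_para.paragraph_format.alignment = paragraph.paragraph_format.alignment
    for run in paragraph.runs:
        new_run = new_para.add_run(run.text)  # pass the text, not the Run object
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
    return new_para

source = Document('source.docx')  # hypothetical input path
target = Document()
for para in source.paragraphs:  # select or reorder paragraphs here as desired
    copy_paragraph(target, para)
target.save('copy.docx')  # hypothetical output path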

Extract specific pages into PDF file

I need to extract sets of pages from a PDF that contains several sets. The sets are distinguished by shipment markers. Inside, the PDF has the following information:
1 - Set of 3 shipments
Page: 1/continued
Page: 2/continued
Page: 3/last
2 - Set of 2 shipments
Page: 1/continued
Page: 2/last
3 - Set of 1 shipment
Page: 1/1
This is to speed up my service, since I have to separate these sets manually.
from PyPDF2 import PdfFileWriter, PdfFileReader
import re

output = PdfFileWriter()
input1 = PdfFileReader(open("pdf_teste.PDF", "rb"))
totalPages = input1.getNumPages()
print("total pages to process: " + str(totalPages))
for i in range(totalPages):
    p = i
    print("processing page %s" % str(i))
    output.addPage(input1.getPage(p))
    p = input1.getPage(p).extractText()  # extract text to search for identifier
    pr = re.search("Diretor", p)  # search for the identifier; to be replaced with a list
    # if there's a match, do work
    if pr:
        outputStream = open("test" + str(i) + ".pdf", "wb")
        output.write(outputStream)
        outputStream.close()
        print('match on page %s' % str(i))
        print('\n')
This code almost does what I want. It splits off the first set correctly, but from the second set onward each output file also repeats the previous sets. I want a separate PDF for each set.
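A possible fix (a sketch, not tested against the actual file): start a fresh PdfFileWriter after writing each set, so pages stop accumulating across output files. This assumes the marker "Diretor" appears on the last page of each set.

from PyPDF2 import PdfFileWriter, PdfFileReader
import re

input1 = PdfFileReader(open("pdf_teste.PDF", "rb"))
output = PdfFileWriter()
set_number = 0
for i in range(input1.getNumPages()):
    output.addPage(input1.getPage(i))
    # "Diretor" is assumed to mark the last page of each set
    if re.search("Diretor", input1.getPage(i).extractText()):
        set_number += 1
        with open("set" + str(set_number) + ".pdf", "wb") as outputStream:
            output.write(outputStream)
        output = PdfFileWriter()  # fresh writer so sets don't accumulate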

No space between words while reading and extracting the text from a pdf file in python?

Hello Community Members,
I want to extract all the text from an e-book with .pdf as the file extension. I came to know that Python has a package, PyPDF2, to do the necessary action. I have tried it and am able to extract text, but the extraction puts inappropriate spaces between the words, and sometimes the result is 2-3 words merged into one.
Further, I want to extract the text from page 3 onward, as the initial pages are the cover page and preface. Also, I don't want to include the last 5 pages, as they contain the glossary and index.
Does there exist any other way to read a .pdf binary file with NO ENCRYPTION?
The code snippet, whatever I have tried up to now is as follows.
import PyPDF2

def Read():
    pdfFileObj = open('book1.pdf', 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # discerning the number of pages will allow us to parse through all the pages
    num_pages = pdfReader.numPages
    count = 0
    global text
    text = []
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count += 1
        text += pageObj.extractText().split()
    print(text)

Read()
This is a possible solution:
import PyPDF2

def Read(startPage, endPage):
    global text
    text = []
    cleanText = ""
    pdfFileObj = open('myTest2.pdf', 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    while startPage <= endPage:
        pageObj = pdfReader.getPage(startPage)
        text += pageObj.extractText()
        startPage += 1
    pdfFileObj.close()
    for myWord in text:
        if myWord != '\n':
            cleanText += myWord
    text = cleanText.split()
    print(text)

Read(0, 0)
Read() parameters: Read(first page to read, last page to read).
Note: page indices start from 0, not from 1 (as in an array), so Read(0, 0) reads only the first page.
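To apply this to the constraints in the question (start at page 3, drop the last 5 pages), a hypothetical call might look like this, assuming the same file:

import PyPDF2

with open('myTest2.pdf', 'rb') as f:
    num_pages = PyPDF2.PdfFileReader(f).numPages

# Page indices are 0-based and endPage is inclusive, so index 2 is the third
# page and num_pages - 6 is the last page before the final five.
Read(2, num_pages - 6)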

Trouble Writing to a new excel file

I'm very new to Python and got an assignment asking me to:
Design your own code in the "do something here" part to save the title, id, share count
and comment count of each news media site in separate columns of an Excel (.xls) file.
Design your own code to read the share count and comment count from the Excel
file created in step 3, and calculate the average share count and comment count of
those news media websites.
Here is my current code:
from urllib import request
import json
from pprint import pprint
import xlwt
'''
import xlrd
from xlutils import copy
'''

website_list = [
    'http://www.huffingtonpost.com/',
    'http://www.cnn.com/',
    'https://www.nytimes.com/',
    'http://www.foxnews.com/',
    'http://www.nbcnews.com/'
]  # place your list of website urls, e.g., http://jmu.edu

for website in website_list:
    url_str = 'https://graph.facebook.com/' + website  # create the url for facebook graph api
    response = request.urlopen(url_str)  # read the response into computer
    html_str = response.read().decode("utf-8")  # convert the response into string
    json_data = json.loads(html_str)  # convert the string into json
    pprint(json_data)

book = xlwt.Workbook()
sheet_test = book.add_sheet('keys')
sheet_test.write(0, 0, 'Title')
sheet_test.write(0, 1, 'ID')
sheet_test.write(0, 2, 'Share Count')
sheet_test.write(0, 3, 'Comment Count')
for i in range(0, 5):
    for website in website_list[i]:
        sheet_test.write(i, 0, json_data['og_object']['title'])
        sheet_test.write(i, 1, json_data['id'])
        sheet_test.write(i, 2, json_data['share']['share_count'])
        sheet_test.write(i, 3, json_data['share']['comment_count'])
book.save('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')

'''
reading_book = xlrd.open_workbook('C:\\Users\\stinesr\\Downloads\\Excel\\key.xls')
sheet_read = reading_book.sheet_by_name('keys')
num_record = sheet_read.nrows
writing_book = copy(reading_book)
sheet_write = writing_book.get_sheet(0)
print(sheet_write.name)
for i in range(num_record):
    row = sheet_read.row_values(i)
    if i == 0:
        sheet_write.write(0, 4, 'Share Count Average')
        sheet_write.write(0, 5, 'Comment Count Average')
    else:
        sheet_write.write(i, 4, row[2])
        sheet_write.write(i, 5, row[3])
writing_book.save('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
'''
Any and all help is appreciated, thank you.
The traceback shows that the nested for-loops on lines 40-45 attempt to overwrite row 0, which the previous lines already filled. You need to start from row 1, since row 0 already contains the header.
But before that, json_data only keeps the last response; you'll want to create a list of responses and append each response to that list.
You then need only one for-loop at line 40. In summary:
website_list = [
    'http://www.huffingtonpost.com/',
    'http://www.cnn.com/',
    'https://www.nytimes.com/',
    'http://www.foxnews.com/',
    'http://www.nbcnews.com/'
]  # place your list of website urls, e.g., http://jmu.edu

json_list = []
for website in website_list:
    url_str = 'https://graph.facebook.com/' + website  # create the url for facebook graph api
    response = request.urlopen(url_str)  # read the response into computer
    html_str = response.read().decode("utf-8")  # convert the response into string
    json_data = json.loads(html_str)  # convert the string into json
    json_list.append(json_data)
pprint(json_list)

book = xlwt.Workbook()
sheet_test = book.add_sheet('keys')
sheet_test.write(0, 0, 'Title')
sheet_test.write(0, 1, 'ID')
sheet_test.write(0, 2, 'Share Count')
sheet_test.write(0, 3, 'Comment Count')
for i in range(len(json_list)):
    sheet_test.write(i + 1, 0, json_list[i]['og_object']['title'])
    sheet_test.write(i + 1, 1, json_list[i]['id'])
    sheet_test.write(i + 1, 2, json_list[i]['share']['share_count'])
    sheet_test.write(i + 1, 3, json_list[i]['share']['comment_count'])
book.save('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
This should give you an Excel document with one row of data per website under the header row.
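For the read-back-and-average half of the assignment (the part left commented out in the question), a minimal sketch using xlrd, assuming the counts were written as numbers in columns 2 and 3:

import xlrd

reading_book = xlrd.open_workbook('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
sheet_read = reading_book.sheet_by_name('keys')

# Row 0 is the header; rows 1..nrows-1 hold one website each.
shares = [sheet_read.cell_value(r, 2) for r in range(1, sheet_read.nrows)]
comments = [sheet_read.cell_value(r, 3) for r in range(1, sheet_read.nrows)]

print('average share count:', sum(shares) / len(shares))
print('average comment count:', sum(comments) / len(comments))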
