I want a logo file to be attached to the Word document every time I run the code.
Ideally the code should look like this:
from docx import Document
# Sketch of the desired workflow: the logo file is opened below, but nothing
# in this snippet places it into the document yet.
document = Document()
# The logo path that is to be attached.
logo = open('logo.eps', 'r')
# Simple heading (level argument 0) meant to come below the logo in the header.
document.add_heading('Underground Heating Oil Tank Search Report', 0)
# Save the file.
document.save('report for xyz.docx')
Is this possible with python-docx, or should I try some other library to do this? If it is possible, please tell me how.
With the following code, you can create a table with two columns: the first cell holds the logo, and the second cell holds the text part of the header.
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Build a page header with a logo on the left and right-aligned text on the
# right by laying both out in a one-row, two-column table.
document = Document()
header = document.sections[0].header
header_table = header.add_table(1, 2, Inches(6))
logo_cell, caption_cell = header_table.rows[0].cells

# A freshly created table cell already contains one empty paragraph. Reuse it
# instead of calling add_paragraph(), which would leave that empty paragraph
# (a blank line) above the logo and above the caption.
logo_paragraph = logo_cell.paragraphs[0]
logo_paragraph.add_run().add_picture('logo.png', width=Inches(1))

caption_paragraph = caption_cell.paragraphs[0]
caption_paragraph.text = 'put your header text here'
caption_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT

document.save('yourdoc.docx')
A simpler way to include logo and a header with some style (Heading 2 Char here):
from docx import Document
from docx.shared import Inches, Pt
# Put a logo and a styled title into the first paragraph of the first
# section's header.
doc = Document()
header_paragraph = doc.sections[0].header.paragraphs[0]

# First run: the picture, one inch wide.
header_paragraph.add_run().add_picture("logo.png", width=Inches(1))

# Second run: the title. The leading tab is there to center-align the text
# (presumably via the header style's tab stops - confirm for your template).
title_run = header_paragraph.add_run('\t' + "My Awesome Header")
title_run.style = "Heading 2 Char"
Related
from xlrd import open_workbook
from docx import Document
from docx.shared import Pt
from docx.shared import RGBColor
document = Document('demo1.docx')

# Change the font color of the 'Normal' style.
document.styles['Normal'].font.color.rgb = RGBColor(255, 0, 255)

# Append three rows to the first table, then three rows to the second one.
for table_index in (0, 1):
    for _ in range(3):
        document.tables[table_index].add_row()

# (table index, row index, cell index, text) for each cell to fill in.
cell_values = (
    (0, 3, 1, "No1"),
    (0, 4, 2, "No2"),
    (1, 3, 1, "No3"),
    (1, 4, 2, "No4"),
)
for table_index, row_index, cell_index, value in cell_values:
    document.tables[table_index].rows[row_index].cells[cell_index].text = value

document.save('result.docx')
The above code is not only making tables[0].rows[4].cells[2] and tables[1].rows[3].cells[1] pink, it is making the entire document pink. See the attached image below.
How can I set only the tables[0].rows[4].cells[2] and tables[1].rows[3].cells[1] text pink?
I am new to Python and have done a small hands-on exercise with the python-docx module.
I have a requirement to read a Word document that contains multiple tables and text.
From this document I have to select a specific table to read; the selection depends on the text written in the line just above the table, and then I have to process the data of that table.
I am able to read the table data by referring to the table by its index, but in this case the table index is unknown and the table can be at any position in the document. The only thing by which I can identify the table is the text written in the line just above it.
Can you please help me achieve this?
I have a solution that uses BeautifulSoup rather than python-docx. What I have done here is traverse the OOXML of the Word (.docx) document.
from bs4 import BeautifulSoup
import zipfile
# Find the table that follows a given paragraph in a .docx file and print the
# text of each of its cells (struck-out runs excluded).
wordoc = input('Enter your file name here or name with path: ')
text1 = 'Enter your text written above the table'
# Compare with all whitespace removed so spacing differences do not matter.
text1 = ''.join(text1.split())

# The body XML is read from the 'word/document.xml' member of the archive;
# the context manager closes the archive even if the read fails.
with zipfile.ZipFile(wordoc) as docx_archive:
    xml_content = docx_archive.read('word/document.xml')

soup = BeautifulSoup(xml_content, 'xml')

# Only the nodes three levels down (children of the children of the top-level
# nodes) are inspected, so paragraphs nested deeper (e.g. inside table cells)
# are never treated as the marker paragraph.
for root in soup.children:
    for body in root.children:
        for tag in body.children:
            # NOTE(review): every other lookup in this script uses the
            # prefixed form ('w:tbl', 'w:tc', 'w:r'); accept both spellings
            # of the paragraph tag in case the parser reports the prefix in
            # the tag name.
            if tag.name not in ('p', 'w:p'):
                continue
            if ''.join(tag.text.split()) != text1:
                continue

            table = tag.find_next_sibling('w:tbl')
            if table is None:
                # The marker text matched but no table follows it.
                continue

            table_contents = []
            for wtc in table.find_all('w:tc'):
                # We want to exclude struck-out text.
                cell_text = ''.join(
                    wr.text
                    for wr in wtc.find_all('w:r')
                    if not wr.find_all('w:strike')
                )
                table_contents.append(cell_text)
            print(table_contents)
Is it possible to exclude the contents of a page's headers and footers when extracting text from a PDF file? These contents are the least important and are almost always redundant.
Note: For extracting the text from the .pdf file, I am using the PyPDF2 package on Python version 3.7.
How can I exclude the contents of the footers and headers in PyPDF2? Any help is appreciated.
The code snippet is as follows:
import PyPDF2
def Read(startPage, endPage):
    """Extract the words of a page range from the PDF and print them.

    Pages ``startPage`` through ``endPage`` (inclusive) are passed as-is to
    ``getPage``. Newline characters in the extracted text are dropped, not
    replaced by spaces, before the text is split on whitespace.

    The resulting list of words is stored in the module-level ``text``
    variable and printed; nothing is returned.

    :param startPage: first page index to read
    :param endPage: last page index to read (inclusive)
    """
    global text
    pdf_path = 'C:\\Users\\Rocky\\Desktop\\req\\req\\0000 - gamma j.pdf'
    page_texts = []
    # The context manager closes the file even if page extraction raises.
    with open(pdf_path, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
        print(num_pages)
        for page_index in range(startPage, endPage + 1):
            page_texts.append(pdfReader.getPage(page_index).extractText())
    # Join once instead of concatenating character by character.
    cleanText = "".join(page_texts).replace("\n", "")
    text = cleanText.split()
    print(text)


Read(1, 1)
As there are no features provided by PyPDF2 officially, I've written a function of my own to exclude the headers and footers in a pdf page which is working fine for my use case. You can add your own Regex patterns in page_format_pattern variable. Here I'm checking only in the first and last elements of my text list.
You can run this function for each page.
def remove_header_footer(self, pdf_extracted_text):
    """Strip a page-number style header and/or footer from one page of text.

    The text is lower-cased and split into lines. The first line is dropped
    if it looks like a page marker ("page 3", "page3", ...) or consists only
    of digits; the last line is dropped under the same rule.

    :param pdf_extracted_text: text extracted from a single PDF page
    :return: the lower-cased text with the detected header/footer lines removed
    """
    import re  # local import keeps the snippet self-contained

    # The literal word "page" followed by a number. The previous pattern used
    # a character class ([page]+), which also matched e.g. the "e2" in
    # "table2" and removed ordinary content lines.
    page_format_pattern = r'\bpage\s*\d+'

    lines = pdf_extracted_text.lower().split("\n")
    # Both edge lines are captured before any removal, so a one-line page
    # is handled without index errors.
    header = lines[0].strip()
    footer = lines[-1].strip()

    if re.search(page_format_pattern, header) or header.isnumeric():
        lines = lines[1:]
    if re.search(page_format_pattern, footer) or footer.isnumeric():
        lines = lines[:-1]

    return "\n".join(lines)
Hope you find this helpful.
At the moment, PyPDF2 does not offer this. It's also unclear how to do it well, as headers and footers are not semantically represented within the PDF.
As a heuristic, you could search for duplicates in the top / bottom of the extracted text of pages. That would likely work well for long documents and not work at all for 1-page documents.
You need to consider that the first few pages might have no header or a different header than the rest. Also, there can be differences between chapters and between even / odd pages.
(side note: I'm the maintainer of PyPDF2 and I think this would be awesome to have)
I'm using Python3 and I have a long text file and I would like to create a new pdf and write the text inside.
I tried using reportlab but it writes only one line.
from reportlab.pdfgen import canvas

# `text` is assumed to hold the contents of the text file (defined elsewhere).
# The whole string is drawn with a single drawString call at (100, 750),
# which is why only one line appears in the PDF.
c = canvas.Canvas("hello.pdf")
c.drawString(100,750, text)
c.save()
I know that I can tell it on which line to write what. But is there a library where I can just give it the text and the margins and it will write the text into the PDF file?
Thanks
EDIT:
Or, instead of that, could I use a library that easily converts a txt file to a PDF file?
Simply drawing your string on the canvas won't do your job.
If it's just raw text and you don't need any formatting such as headings, then you can simply put your text in Flowables (e.g. a Paragraph), and append your Flowables to your story list.
You can adjust the margins according to your use.
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.lib.pagesizes import letter
from xml.sax.saxutils import escape

# Render the contents of a plain-text file into a PDF using platypus, which
# takes care of line wrapping and page breaks.
styles = getSampleStyleSheet()
styleN = styles['Normal']
styleH = styles['Heading1']
story = []
pdf_name = 'your_pdf_file.pdf'
doc = SimpleDocTemplate(
    pdf_name,
    pagesize=letter,
    bottomMargin=.4 * inch,
    topMargin=.6 * inch,
    rightMargin=.8 * inch,
    leftMargin=.8 * inch,
)

with open("your_text_file.txt", "r") as txt_file:
    text_content = txt_file.read()

# Paragraph interprets its text as XML-like markup: a literal '&' or '<' in
# the file would break parsing, and plain newlines are treated as ordinary
# whitespace. Escape the raw text first, then turn each newline into an
# explicit <br/> so the file's line structure survives in the PDF.
paragraph_markup = escape(text_content).replace("\n", "<br/>")
P = Paragraph(paragraph_markup, styleN)
story.append(P)

doc.build(
    story,
)
For more information on Flowables read reportlab-userguide
I want to read the docx document paragraph by paragraph and, if there is a picture (InlineShape), process it together with the text around it. The property Document.inline_shapes gives the list of all inline shapes in the document, but I want to get the one that appears in the current paragraph, if it exists.
An example of code:
from docx import Document
doc = Document("test.docx")
# Walk from the first inline shape down to its blip element; blip.embed is
# used below as the key into the document part's related parts.
blip = doc.inline_shapes[0]._inline.graphic.graphicData.pic.blipFill.blip
rID = blip.embed
document_part = doc.part
image_part = document_part.related_parts[rID]
# Write the image bytes out; the context manager closes the file even if
# the write fails.
with open("test.png", "wb") as fr:
    fr.write(image_part._blob)
(this is how I want to save these pictures)
Assuming your paragraph is par, you can use the following code to find the images in it:
import xml.etree.ElementTree as ET
def hasImage(par):
    """Get the relationship ids of all inline images in a paragraph.

    The paragraph's XML (``par._p.xml``) is parsed and every ``a:blip``
    element found under a ``wp:inline`` element is inspected for an
    ``r:embed`` attribute.

    :param par: a paragraph object from docx
    :return: a list of r:embed values (empty if the paragraph has no
        embedded inline image)
    """
    namespace = {
        'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
        'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    }
    # ElementTree spells namespaced attributes as '{namespace-uri}name'.
    embed_attr = '{{{0}}}embed'.format(namespace['r'])

    root = ET.fromstring(par._p.xml)
    ids = []
    for inline in root.findall('.//wp:inline', namespace):
        for img in inline.findall('.//a:blip', namespace):
            # A blip without an r:embed attribute (e.g. one that only has
            # r:link) is skipped instead of raising KeyError.
            rel_id = img.attrib.get(embed_attr)
            if rel_id is not None:
                ids.append(rel_id)
    return ids