Generate flattened PDF with Python - python

When I print a PDF from any of my source PDFs, the file size drops and removes the text boxes presents in form. In short, it flattens the file.
This is behavior I want to achieve.
The following code to create a PDF using another PDF as a source (the one I want to flatten), it writes the text boxes form as well.
Can I get a PDF without the text boxes, flatten it? Just like Adobe does when I print a PDF as a PDF.
My other code looks something like this minus some things:
import os
import StringIO
from pyPdf import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
directory = os.path.join(os.getcwd(), "source") # dir we are interested in
fif = [f for f in os.listdir(directory) if f[-3:] == 'pdf'] # get the PDFs
for i in fif:
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
can.rotate(-90)
can.save()
packet.seek(0)
new_pdf = PdfFileReader(packet)
fname = os.path.join('source', i)
existing_pdf = PdfFileReader(file(fname, "rb"))
output = PdfFileWriter()
nump = existing_pdf.getNumPages()
page = existing_pdf.getPage(0)
for l in range(nump):
output.addPage(existing_pdf.getPage(l))
page.mergePage(new_pdf.getPage(0))
outputStream = file("out-"+i, "wb")
output.write(outputStream)
outputStream.close()
print fName + " written as", i
Summing up: I have a pdf, I add a text box to it, covering up info and adding new info, and then I print a pdf from that pdf. The text box becomes not editable or moveable any longer. I wanted to automate that process but everything I tried still allowed that text box to be editable.

If installing an OS package is an option, then you could use pdftk with its python wrapper pypdftk like this:
import pypdftk
pypdftk.fill_form('filled.pdf', out_file='flattened.pdf', flatten=True)
You would also need to install the pdftk package, which on Ubuntu could be done like this:
sudo apt-get install pdftk
The pypdftk library can by downloaded from PyPI:
pip install pypdftk
Update: pdftk was briefly removed from Ubuntu in version 18.04, but it seems it is back since 20.04.

A simple but more of a round about way it to covert the pdf to images than to put those image into a pdf.
You'll need pdf2image and PIL
Like So
from pdf2image import convert_from_path
from PIL import Image
images = convert_from_path('temp.pdf')
im1 = images[0]
images.pop(0)
pdf1_filename = "flattened.pdf"
im1.save(pdf1_filename, "PDF" ,resolution=100.0, save_all=True, append_images=images)
Edit:
I created a library to do this called fillpdf
pip install fillpdf
from fillpdf import fillpdfs
fillpdfs.flatten_pdf('input.pdf', 'newflat.pdf')

Per the Adobe Docs, you can change the Bit Position of the Editable Form Fields to 1 to make the field ReadOnly. I provided a full solution here, but it uses Django:
https://stackoverflow.com/a/55301804/8382028
Adobe Docs (page 552):
https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
Use PyPDF2 to fill the fields, then loop through the annotations to change the bit position:
from io import BytesIO
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import BooleanObject, NameObject, NumberObject
# open the pdf
input_stream = open("YourPDF.pdf", "rb")
reader = PdfFileReader(input_stream, strict=False)
if "/AcroForm" in reader.trailer["/Root"]:
reader.trailer["/Root"]["/AcroForm"].update(
{NameObject("/NeedAppearances"): BooleanObject(True)}
)
writer = PdfFileWriter()
writer.set_need_appearances_writer()
if "/AcroForm" in writer._root_object:
# Acro form is form field, set needs appearances to fix printing issues
writer._root_object["/AcroForm"].update(
{NameObject("/NeedAppearances"): BooleanObject(True)}
)
data_dict = dict() # this is a dict of your form values
writer.addPage(reader.getPage(0))
page = writer.getPage(0)
# update form fields
writer.updatePageFormFieldValues(page, data_dict)
for j in range(0, len(page["/Annots"])):
writer_annot = page["/Annots"][j].getObject()
for field in data_dict:
if writer_annot.get("/T") == field:
# make ReadOnly:
writer_annot.update({NameObject("/Ff"): NumberObject(1)})
output_stream = BytesIO()
writer.write(output_stream)
# output_stream is your flattened PDF

A solution that goes for Windows as well, converts many pdf pages and flatens the chackbox values as well. For some reason #ViaTech code did not work in my pc (Windows7 python 3.8)
Followed #ViaTech indications and used extensively #hchillon code from this post
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import BooleanObject, NameObject, IndirectObject, TextStringObject, NumberObject
def set_need_appearances_writer(writer):
try:
catalog = writer._root_object
# get the AcroForm tree and add "/NeedAppearances attribute
if "/AcroForm" not in catalog:
writer._root_object.update({
NameObject("/AcroForm"): IndirectObject(len(writer._objects), 0, writer)})
need_appearances = NameObject("/NeedAppearances")
writer._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
return writer
except Exception as e:
print('set_need_appearances_writer() catch : ', repr(e))
return writer
class PdfFileFiller(object):
def __init__(self, infile):
self.pdf = PdfFileReader(open(infile, "rb"), strict=False)
if "/AcroForm" in self.pdf.trailer["/Root"]:
self.pdf.trailer["/Root"]["/AcroForm"].update(
{NameObject("/NeedAppearances"): BooleanObject(True)})
# newvals and newchecks have keys have to be filled. '' is not accepted
def update_form_values(self, outfile, newvals=None, newchecks=None):
self.pdf2 = MyPdfFileWriter()
trailer = self.pdf.trailer['/Root'].get('/AcroForm', None)
if trailer:
self.pdf2._root_object.update({
NameObject('/AcroForm'): trailer})
set_need_appearances_writer(self.pdf2)
if "/AcroForm" in self.pdf2._root_object:
self.pdf2._root_object["/AcroForm"].update(
{NameObject("/NeedAppearances"): BooleanObject(True)})
for i in range(self.pdf.getNumPages()):
self.pdf2.addPage(self.pdf.getPage(i))
self.pdf2.updatePageFormFieldValues(self.pdf2.getPage(i), newvals)
for j in range(0, len(self.pdf.getPage(i)['/Annots'])):
writer_annot = self.pdf.getPage(i)['/Annots'][j].getObject()
for field in newvals:
writer_annot.update({NameObject("/Ff"): NumberObject(1)})
self.pdf2.updatePageFormCheckboxValues(self.pdf2.getPage(i), newchecks)
with open(outfile, 'wb') as out:
self.pdf2.write(out)
class MyPdfFileWriter(PdfFileWriter):
def __init__(self):
super().__init__()
def updatePageFormCheckboxValues(self, page, fields):
for j in range(0, len(page['/Annots'])):
writer_annot = page['/Annots'][j].getObject()
for field in fields:
writer_annot.update({NameObject("/Ff"): NumberObject(1)})
origin = ## Put input pdf path here
destination = ## Put output pdf path here, even if the file does not exist yet
newchecks = {} # A dict with all checkbox values that need to be changed
newvals = {'':''} # A dict with all entry values that need to be changed
# newvals dict has to be equal to {'':''} in case that no changes are needed
c = PdfFileFiller(origin)
c.update_form_values(outfile=destination, newvals=newvals, newchecks=newchecks)
print('PDF has been created\n')

I had trouble flattening a form that I had entered content into using pdfrw (How to Populate Fillable PDF's with Python) and found that I had to add an additional step using generate_fdf (pdftk flatten loses fillable field data).
os.system('pdftk '+outtemp+' generate_fdf output '+outfdf)
os.system('pdftk '+outtemp+' fill_form '+outfdf+' output '+outpdf)
I came to this solution because I was able to flatten a file just fine using ghostscript's pdf2ps followed by ps2pdf on my Mac, but the quality had low resolution when I ran it on an Amazon Linux instance. I couldn't figure out why that was the case and so moved to the pdftk solution.

Related

Add text to multiple page long PDF using Python

My Situation
Hi, I'm relatively new to coding with Python, and I'm currently working on a "PDF-Writer" which should read in PDF files and then draw a certain String to it plus some other stuff. I'm doing this with canvas and PyPDF2 and it works fine.
My question is referring to this question:
(Add text to Existing PDF using Python)
Others already commented under the post but didn't post any answers there that's why I'm asking this question. I would have commented under the post, but sadly, I don't have the reputation to comment. But I think resolving the issue by asking a new question is also fine.
My problems
My first problem is that I can only write on the first page of the document. I would like to write on all pages of the document.
My second problem is that the first page of the document is also the last page for some reason.
Here my Code:
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.units import cm
file = input("Give path to the file >> ")
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=letter)
# register font
pdfmetrics.registerFont(TTFont('Arial', 'Arial.ttf'))
can.setFont(psfontname= "Arial",size = 10)
# create a new string
can.drawString(93, 54.5, "2022-12-21")
# create a white rectangle
can.setFillColorRGB(254,254,254)
can.rect(92.5,65,2*cm,0.34*cm, fill=1,stroke=0)
# save the pdf
can.save()
# move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(open(file, "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.pages[0]
page.merge_page(new_pdf.pages[0])
output.append_pages_from_reader(existing_pdf)
output.add_page(page)
# finally, write "output" to a real file
output_stream = open("destination3.pdf", "wb")
output.write(output_stream)
output_stream.close()
print("PDF created!")
What I've tried
Well, I added output.append_pages_from_reader(existing_pdf) to the code which led to the problems. Now I'm searching after a properly working Code :)
Well I got the answer, here is an excerpt:
existing_pdf = PdfFileReader(open(filename2, "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing pages
for x in range(existing_pdf.getNumPages()):
page = existing_pdf.pages[x]
page.merge_page(new_pdf.pages[0])
output.add_page(page)
# finally, write "output" to a real file
output_stream = open(outputPath, "wb")
output.write(output_stream)
output_stream.close()
print("PDF created!")
This code does exactly what I was expecting in my question above.

Printing a PDF with Python from variable, instead a file

I would like to print a PDF file with an external printer. However, since I'm about to open, create or transform multiple files in some loop, I would like to print the thing without the need of saving it as a PDF file in every iteration.
Simplified code looks like this:
import PyPDF2
import os
pdf_in = open('tubba.pdf', 'rb')
pdf_reader = PyPDF2.PdfFileReader(pdf_in)
pdf_writer = PyPDF2.PdfFileWriter()
page = pdf_reader.getPage(0)
page.rotateClockwise(90)
# Some other operations done on the page, such as scaling, cropping etc.
pdf_writer.addPage(page)
pdf_out = open('rotated.pdf', 'wb')
pdf_writer.write(pdf_out)
pdf_print = os.startfile('rotated.pdf', 'print')
pdf_out.close()
pdf_in.close()
Is there any way to print "page", or "pdf_writer"?
Best regards
You can just use variables.
Eg.
path = 'C\yourfile.pdf'
os.startfile(path) #just pass the variable here

How to convert whole pdf to text in python

I have to convert whole pdf to text. i have seen at many places converting pdf to text but particular page.
from PyPDF2 import PdfFileReader
import os
def text_extractor(path):
with open(os.path.join(path,file), 'rb') as f:
pdf = PdfFileReader(f)
###Here i can specify page but i need to convert whole pdf without specifying pages###
page = pdf.getPage(0)
text = page.extractText()
print(text)
if __name__ == '__main__':
path="C:\\Users\\AAAA\\Desktop\\BB"
for file in os.listdir(path):
if not file.endswith(".pdf"):
continue
text_extractor(path)
How to convert whole pdf file to text without using getpage()??
You may want to use textract as this answer recommends to get the full document if all you want is the text.
If you want to use PyPDF2 then you can first get the number of pages then iterate over each page such as:
from PyPDF2 import PdfFileReader
import os
def text_extractor(path):
with open(os.path.join(path,file), 'rb') as f:
pdf = PdfFileReader(f)
###Here i can specify page but i need to convert whole pdf without specifying pages###
text = ""
for page_num in range(pdf.getNumPages()):
page = pdf.getPage(page_num)
text += page.extractText()
print(text)
if __name__ == '__main__':
path="C:\\Users\\AAAA\\Desktop\\BB"
for file in os.listdir(path):
if not file.endswith(".pdf"):
continue
text_extractor(path)
Though you may want to remember which page the text came from in which case you could use a list:
page_text = []
for page_num in range(pdf.getNumPages()): # For each page
page = pdf.getPage(page_num) # Get that page's reference
page_text.append(page.extractText()) # Add that page to our array
for page in page_text:
print(page) # print each page
You could use tika to accomplish this task, but the output needs a little cleaning.
from tika import parser
parse_entire_pdf = parser.from_file('mypdf.pdf', xmlContent=True)
parse_entire_pdf = parse_entire_pdf['content']
print (parse_entire_pdf)
This answer uses PyPDF2 and encode('utf-8') to keep the output per page together.
from PyPDF2 import PdfFileReader
def pdf_text_extractor(path):
with open(path, 'rb') as f:
pdf = PdfFileReader(f)
# Get total pdf page number.
totalPageNumber = pdf.numPages
currentPageNumber = 0
while (currentPageNumber < totalPageNumber):
page = pdf.getPage(currentPageNumber)
text = page.extractText()
# The encoding put each page on a single line.
# type is <class 'bytes'>
print(text.encode('utf-8'))
#################################
# This outputs the text to a list,
# but it doesn't keep paragraphs
# together
#################################
# output = text.encode('utf-8')
# split = str(output, 'utf-8').split('\n')
# print (split)
#################################
# Process next page.
currentPageNumber += 1
path = 'mypdf.pdf'
pdf_text_extractor(path)
Try pdfreader. You can extract either plain text or decoded text containing "pdf markdown":
from pdfreader import SimplePDFViewer, PageDoesNotExist
fd = open(you_pdf_file_name, "rb")
viewer = SimplePDFViewer(fd)
plain_text = ""
pdf_markdown = ""
try:
while True:
viewer.render()
pdf_markdown += viewer.canvas.text_content
plain_text += "".join(viewer.canvas.strings)
viewer.next()
except PageDoesNotExist:
pass
PDF is a page-oriented format & therefore you'll need to deal with the concept of pages.
What makes it perhaps even more difficult, you're not guaranteed that the text excerpts you're able to extract are extracted in the same order as they are presented on the page: PDF allows one to say "put this text within a 4x3 box situated 1" from the top, with a 1" left margin.", and then I can put the next set of text somewhere else on the same page.
Your extractText() function simply gets the extracted text blocks in document order, not presentation order.
Tables are notoriously difficult to extract in a common, meaningful way... You see them as tables, PDF sees them as text blocks placed on the page with little or no relationship.
Still, getPage() and extractText() are good starting points & if you have simply formatted pages, they may work fine.
I found out a very simple way to do this.
You have to follow this steps:
Install PyPDF2 :To do this step if you use Anaconda, search for Anaconda Prompt and digit the following command, you need administrator permission to do this.
pip install PyPDF2
If you're not using Anaconda you have to install pip and put its path
to your cmd or terminal.
Python Code: This following code shows how to convert a pdf file very easily:
import PyPDF2
with open("pdf file path here",'rb') as file_obj:
pdf_reader = PyPDF2.PdfFileReader(file_obj)
raw = pdf_reader.getPage(0).extractText()
print(raw)
I just used pdftotext module to get this done easily.
import pdftotext
# Load your PDF
with open("test.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
# creating a text file after iterating through all pages in the pdf
file = open("test.txt", "w")
for page in pdf:
file.write(page)
file.close()
Link: https://github.com/manojitballav/pdf-text

Python vs PDF - What is the best way to edit an existing PDF template? [duplicate]

I need to add some extra text to an existing PDF using Python, what is the best way to go about this and what extra modules will I need to install.
Note: Ideally I would like to be able to run this on both Windows and Linux, but at a push Linux only will do.
Edit: pypdf and ReportLab look good but neither one will allow me to edit an existing PDF, are there any other options?
Example for [Python 2.7]:
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(file("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = file("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Example for Python 3.x:
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(open("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.pages[0]
page.merge_page(new_pdf.pages[0])
output.add_page(page)
# finally, write "output" to a real file
output_stream = open("destination.pdf", "wb")
output.write(output_stream)
output_stream.close()
I know this is an older post, but I spent a long time trying to find a solution. I came across a decent one using only ReportLab and PyPDF so I thought I'd share:
read your PDF using PdfFileReader(), we'll call this input
create a new pdf containing your text to add using ReportLab, save this as a string object
read the string object using PdfFileReader(), we'll call this text
create a new PDF object using PdfFileWriter(), we'll call this output
iterate through input and apply .mergePage(*text*.getPage(0)) for each page you want the text added to, then use output.addPage() to add the modified pages to a new document
This works well for simple text additions. See PyPDF's sample for watermarking a document.
Here is some code to answer the question below:
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
<do something with canvas>
can.save()
packet.seek(0)
input = PdfFileReader(packet)
From here you can merge the pages of the input file with another document.
pdfrw will let you read in pages from an existing PDF and draw them to a reportlab canvas (similar to drawing an image). There are examples for this in the pdfrw examples/rl1 subdirectory on github. Disclaimer: I am the pdfrw author.
cpdf will do the job from the command-line. It isn't python, though (afaik):
cpdf -add-text "Line of text" input.pdf -o output .pdf
Leveraging David Dehghan's answer above, the following works in Python 2.7.13:
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
# create a new PDF with Reportlab
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(290, 720, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader("original.pdf")
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Don't use mergePage, It may not work for some pdfs
You should use mergeRotatedTranslatedPage
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen.canvas import Canvas
page_to_merge = 0 #Refers to the First page of PDF
xcoor = 250 #To be changed according to your pdf
ycoor = 650 #To be changed according to your pdf
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
page_count = input_pdf.getNumPages()
inputpdf_page_to_be_merged = input_pdf.getPage(page_to_merge)
packet = io.BytesIO()
c = Canvas(packet,pagesize=(inputpdf_page_to_be_merged.mediaBox.getWidth(),inputpdf_page_to_be_merged.mediaBox.getHeight()))
c.drawString(xcoor,ycoor,"Hello World")
c.save()
packet.seek(0)
overlay_pdf = PdfFileReader(packet)
overlay = overlay_pdf.getPage(0)
output = PdfFileWriter()
for PAGE in range(page_count):
if PAGE == page_to_merge:
inputpdf_page_to_be_merged.mergeRotatedTranslatedPage(overlay,
inputpdf_page_to_be_merged.get('/Rotate') or 0,
overlay.mediaBox.getWidth()/2, overlay.mediaBox.getWidth()/2)
output.addPage(inputpdf_page_to_be_merged)
else:
Page_in_pdf = input_pdf.getPage(PAGE)
output.addPage(Page_in_pdf)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
If you're on Windows, this might work:
PDF Creator Pilot
There's also a whitepaper of a PDF creation and editing framework in Python. It's a little dated, but maybe can give you some useful info:
Using Python as PDF Editing and Processing Framework
You may have better luck breaking the problem down into converting PDF into an editable format, writing your changes, then converting it back into PDF. I don't know of a library that lets you directly edit PDF but there are plenty of converters between DOC and PDF for example.

Add text to Existing PDF using Python

I need to add some extra text to an existing PDF using Python, what is the best way to go about this and what extra modules will I need to install.
Note: Ideally I would like to be able to run this on both Windows and Linux, but at a push Linux only will do.
Edit: pypdf and ReportLab look good but neither one will allow me to edit an existing PDF, are there any other options?
Example for [Python 2.7]:
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(file("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = file("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Example for Python 3.x:
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(10, 100, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
# create a new PDF with Reportlab
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader(open("original.pdf", "rb"))
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.pages[0]
page.merge_page(new_pdf.pages[0])
output.add_page(page)
# finally, write "output" to a real file
output_stream = open("destination.pdf", "wb")
output.write(output_stream)
output_stream.close()
I know this is an older post, but I spent a long time trying to find a solution. I came across a decent one using only ReportLab and PyPDF so I thought I'd share:
read your PDF using PdfFileReader(), we'll call this input
create a new pdf containing your text to add using ReportLab, save this as a string object
read the string object using PdfFileReader(), we'll call this text
create a new PDF object using PdfFileWriter(), we'll call this output
iterate through input and apply .mergePage(*text*.getPage(0)) for each page you want the text added to, then use output.addPage() to add the modified pages to a new document
This works well for simple text additions. See PyPDF's sample for watermarking a document.
Here is some code to answer the question below:
packet = StringIO.StringIO()
can = canvas.Canvas(packet, pagesize=letter)
<do something with canvas>
can.save()
packet.seek(0)
input = PdfFileReader(packet)
From here you can merge the pages of the input file with another document.
pdfrw will let you read in pages from an existing PDF and draw them to a reportlab canvas (similar to drawing an image). There are examples for this in the pdfrw examples/rl1 subdirectory on github. Disclaimer: I am the pdfrw author.
cpdf will do the job from the command-line. It isn't python, though (afaik):
cpdf -add-text "Line of text" input.pdf -o output .pdf
Leveraging David Dehghan's answer above, the following works in Python 2.7.13:
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
packet = StringIO.StringIO()
# create a new PDF with Reportlab
can = canvas.Canvas(packet, pagesize=letter)
can.drawString(290, 720, "Hello world")
can.save()
#move to the beginning of the StringIO buffer
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read your existing PDF
existing_pdf = PdfFileReader("original.pdf")
output = PdfFileWriter()
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(0)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# finally, write "output" to a real file
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
Don't use mergePage, It may not work for some pdfs
You should use mergeRotatedTranslatedPage
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen.canvas import Canvas
page_to_merge = 0 #Refers to the First page of PDF
xcoor = 250 #To be changed according to your pdf
ycoor = 650 #To be changed according to your pdf
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
page_count = input_pdf.getNumPages()
inputpdf_page_to_be_merged = input_pdf.getPage(page_to_merge)
packet = io.BytesIO()
c = Canvas(packet,pagesize=(inputpdf_page_to_be_merged.mediaBox.getWidth(),inputpdf_page_to_be_merged.mediaBox.getHeight()))
c.drawString(xcoor,ycoor,"Hello World")
c.save()
packet.seek(0)
overlay_pdf = PdfFileReader(packet)
overlay = overlay_pdf.getPage(0)
output = PdfFileWriter()
for PAGE in range(page_count):
if PAGE == page_to_merge:
inputpdf_page_to_be_merged.mergeRotatedTranslatedPage(overlay,
inputpdf_page_to_be_merged.get('/Rotate') or 0,
overlay.mediaBox.getWidth()/2, overlay.mediaBox.getWidth()/2)
output.addPage(inputpdf_page_to_be_merged)
else:
Page_in_pdf = input_pdf.getPage(PAGE)
output.addPage(Page_in_pdf)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
If you're on Windows, this might work:
PDF Creator Pilot
There's also a whitepaper of a PDF creation and editing framework in Python. It's a little dated, but maybe can give you some useful info:
Using Python as PDF Editing and Processing Framework
You may have better luck breaking the problem down into converting PDF into an editable format, writing your changes, then converting it back into PDF. I don't know of a library that lets you directly edit PDF but there are plenty of converters between DOC and PDF for example.

Categories