Applying PDF watermark via a for loop - python

I am working through the book 'Automate the boring stuff with Python' and I am trying to run the code to watermark a .pdf on all pages but the watermark only appears on the first page.
So the problem must either be in the loop or in the writing. Can anyone help me figure it out? Thank you
Running Python 3.5.0 on a windows 7 machine.
Code below:
import PyPDF2
# Watermark ONLY the first page of the minutes and copy the remaining pages
# unchanged -- this first-page-only behaviour is what the question is about.
# The original closed the input file only after pdfWriter.write(), so the
# write is kept inside the `with` block that owns the input handles.
with open('meetingminutes.pdf', 'rb') as minutesFile, \
        open('watermark.pdf', 'rb') as watermarkFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)

    # Stamp the watermark's first page onto the first page of the minutes.
    minutesFirstPage = pdfReader.getPage(0)
    minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(minutesFirstPage)

    # Starts at 1: page 0 was already added above with the watermark merged.
    for pageNum in range(1, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        pdfWriter.addPage(pageObj)

    with open('watermarkedCover.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)

import PyPDF2
# Merge the first page of the watermark PDF onto every page of the template
# PDF and write the result to water_marked.pdf.
# Both inputs are opened with `with` so their handles are closed even if
# merging or writing fails; the write happens while they are still open.
with open('yourfilw.pdf', 'rb') as template_file, \
        open('watermarkfile.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()

    # The same watermark page is reused for every page, so fetch it once.
    watermark_page = watermark.getPage(0)

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(watermark_page)
        output.addPage(page)

    with open('water_marked.pdf', 'wb') as out_file:
        output.write(out_file)

The fact that "the watermark only appears on the first page" isn't a bug, it's exactly what this code was designed to do. I see no attempt to modify the code to add the watermark to every page. Be honest about the situation and what effort you've put in to change it, even if "None".
Here's my rework of the code to watermark every page:
import PyPDF2
# Watermark every page of meetingminutes.pdf with the first page of
# watermark.pdf and save the result as watermarkedCover.pdf.
# `with` replaces the explicit close() calls so all three files are closed
# even when an exception interrupts the merge or the write.
with open('watermark.pdf', 'rb') as watermarkFile, \
        open('meetingminutes.pdf', 'rb') as minutesFile:
    pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    pdfWriter = PyPDF2.PdfFileWriter()

    # Range starts at 0 so that the first page is watermarked as well.
    for pageNum in range(pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        pageObj.mergePage(pdfWatermarkReader.getPage(0))
        pdfWriter.addPage(pageObj)

    with open('watermarkedCover.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)
Try it out and see if it does what you want.

Related

How to turn a large PDF's pages to images efficiently? [duplicate]

I would like to take a multi-page pdf file and create separate pdf files per page.
I have downloaded reportlab and have browsed the documentation, but it seems aimed at pdf generation. I haven't yet seen anything about processing PDF files themselves.
Is there an easy way to do this in python?
from PyPDF2 import PdfWriter, PdfReader
# Write each page of document.pdf to its own file, document-page<i>.pdf,
# where <i> is the zero-based page index.
# The input is opened with `with` so its handle is closed when the loop ends.
with open("document.pdf", "rb") as inputStream:
    inputpdf = PdfReader(inputStream)
    for i, page in enumerate(inputpdf.pages):
        # A fresh writer per page keeps every output file single-page.
        output = PdfWriter()
        output.add_page(page)
        with open("document-page%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
etc.
None of the answers here showed how to split a PDF into two parts that together contain all of its pages, so I am adding my solution in case somebody is looking for the same thing:
from PyPDF2 import PdfFileWriter, PdfFileReader
def split_pdf_to_two(filename, page_number):
    """Split a PDF into ``part1.pdf`` and ``part2.pdf`` at *page_number*.

    ``part1.pdf`` receives the pages with zero-based index below
    *page_number*; ``part2.pdf`` receives the remaining pages.

    Args:
        filename: Path of the PDF to split.
        page_number: Zero-based index of the first page that goes into
            ``part2.pdf``; must be smaller than the page count.

    Returns:
        None. If *page_number* is not smaller than the page count, an error
        message is printed and no output file is written.
    """
    # Keep the source open until both parts are written, then close it.
    with open(filename, "rb") as source:
        pdf_reader = PdfFileReader(source)
        total_pages = pdf_reader.getNumPages()

        # Explicit check instead of `assert`, which is stripped under -O.
        if page_number >= total_pages:
            print("Error: The PDF you are cutting has less pages than you want to cut!")
            return

        pdf_writer1 = PdfFileWriter()
        pdf_writer2 = PdfFileWriter()

        for page in range(page_number):
            pdf_writer1.addPage(pdf_reader.getPage(page))

        for page in range(page_number, total_pages):
            pdf_writer2.addPage(pdf_reader.getPage(page))

        with open("part1.pdf", 'wb') as file1:
            pdf_writer1.write(file1)

        with open("part2.pdf", 'wb') as file2:
            pdf_writer2.write(file2)
The PyPDF2 package gives you the ability to split up a single PDF into multiple ones.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_splitter(path):
    """Write every page of the PDF at *path* to its own single-page file.

    The flat snippet used ``path`` and ``fname`` without defining them; they
    are now the function parameter and the input's base name without its
    extension. Output files are named ``<fname>_page_<n>.pdf`` with *n*
    starting at 1, and are created relative to the current directory.

    Args:
        path: Path of the PDF to split.
    """
    fname = os.path.splitext(os.path.basename(path))[0]

    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        # One writer per page so each output holds exactly one page.
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))

        # page is zero-based; the file name uses a one-based page number.
        output_filename = '{}_page_{}.pdf'.format(fname, page+1)

        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)

        print('Created: {}'.format(output_filename))
Source: https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/
I know that the code is not related to python, however i felt like posting this piece of R code which is simple, flexible and works amazingly. The PDFtools package in R is amazing in splitting merging PDFs at ease.
# Load pdftools, which provides pdf_subset().
library(pdftools) #Rpackage
# Copy pages 1 through 51 of the input PDF into a new file, subset.pdf.
pdf_subset('D:\\file\\20.02.20\\22 GT 2017.pdf',
pages = 1:51, output = "subset.pdf")
The earlier answers with PyPDF2 for splitting pdfs are not working anymore with the latest version update. The authors recommend using pypdf instead and this version of PyPDF2==3.0.1 will be the last version of PyPDF2. The function needs to be modified as follows:
import os
from PyPDF2 import PdfReader, PdfWriter
def split_pdfs(input_file_path):
    """Split a PDF into single-page PDFs stored under ``outputs/``.

    Each page is written to ``outputs/<stem>_<i>.pdf``, where *stem* is the
    input's file name without directory or extension and *i* is the
    zero-based page index.

    Args:
        input_file_path: Path of the PDF to split.

    Returns:
        List of the output file paths, in page order.
    """
    out_paths = []

    # Use the bare file name: slicing the full path with [:-4] kept any
    # directory components (pointing into sub-directories of outputs/ that
    # do not exist) and assumed a four-character extension.
    stem = os.path.splitext(os.path.basename(input_file_path))[0]

    with open(input_file_path, "rb") as source:
        inputpdf = PdfReader(source)

        # exist_ok avoids the check-then-create race of exists()/makedirs().
        os.makedirs("outputs", exist_ok=True)

        for i, page in enumerate(inputpdf.pages):
            output = PdfWriter()
            output.add_page(page)

            out_file_path = f"outputs/{stem}_{i}.pdf"
            with open(out_file_path, "wb") as output_stream:
                output.write(output_stream)

            out_paths.append(out_file_path)

    return out_paths
Note: The same function will work with pypdf as well. Import PdfReader and PdfWriter from pypdf rather than PyPDF2.
import fitz
# Save every page of source.pdf as its own file, page-<number>.pdf, where
# <number> is the page's `number` attribute.
# Both documents are used as context managers so they are closed even if
# copying or saving a page fails.
with fitz.open("source.pdf") as src:
    for page in src:
        with fitz.open() as tar:  # output PDF for 1 page
            # copy over current page
            tar.insert_pdf(src, from_page=page.number, to_page=page.number)
            tar.save(f"page-{page.number}.pdf")

Brand new looking for direction on PDF Page remover

I have been looking for a basic PDF page remover. What I am looking for is something that will let me put in the page numbers I want removed from the file I select. Still really new I am reading a few books to learn, but wanting to use this to make my job a bit easier as the current program is very stupid where I have to make sure to remember to subtract a number from the page given due to how PDF is.
I have two basic code snippets that I would love some critical review of, so that I can hopefully refine them into something that is easy to use. I thank anyone for their time and help; I am loving how Python reads and feels.
from PyPDF2 import PdfFileWriter, PdfFileReader
# Copy example.pdf to pages_deleted.pdf, leaving out the listed pages.
# Indices are zero-based as used by getPage(): 4 removes the fifth page.
pages_delete = [4]

# PdfFileReader's second positional parameter is `strict`, not a file mode,
# so the stray 'rb' argument is dropped; the path alone is sufficient.
reader = PdfFileReader('example.pdf')
output = PdfFileWriter()

for i in range(reader.getNumPages()):
    if i not in pages_delete:
        output.addPage(reader.getPage(i))

with open('pages_deleted.pdf', 'wb') as f:
    output.write(f)
this is the second one
import PyPDF2

# Zero-based indices of the pages to leave out of the edited copy.
# NOTE(review): placeholder value -- the original snippet did not define
# which pages to remove; replace with the pages you actually want dropped.
pagesToRemove = [4]

# PDFs are binary: read with 'rb' and write with 'wb'.
with open('*file_name*', 'rb') as pdf1File:
    pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
    pdfWriter = PyPDF2.PdfFileWriter()

    # A fresh writer holds no pages, so there is nothing to remove from it;
    # build the result by adding only the pages that should be kept.
    for pageNum in range(pdf1Reader.numPages):
        if pageNum not in pagesToRemove:
            pageObj = pdf1Reader.getPage(pageNum)
            pdfWriter.addPage(pageObj)

    with open('*filename*-Edit', 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)

Converting to pdf using soffice adds blank page

I'm trying to convert an .ods-file to pdf using soffice & python:
import os
import subprocess
def ods_to_pdf(ods_filename):
    """Convert an .ods file to PDF by running soffice in headless mode.

    *ods_filename* is joined onto the current working directory before being
    handed to soffice. Because ``check=True`` is passed, ``subprocess.run``
    raises ``CalledProcessError`` when soffice exits with a non-zero status.
    """
    soffice_binary = "path/to/soffice"
    source_path = os.path.join(os.getcwd(), ods_filename)
    command = [soffice_binary, "--headless", "--convert-to", "pdf", source_path]
    subprocess.run(command, check=True)
It works fine, but the resulting pdf has a blank page (sometimes two) at the end. Does anyone know how I can prevent this behaviour? The code runs in a Docker container with Ubuntu 18.04 as base image. LibreOffice version: 7.1.0 (I've also tried 6.1.6.3, same result).
I could not find out how to prevent LibreOffice from adding blank pages, but fixed the problem by removing the blank pages after converting:
import PyPDF2
# Copy file.pdf to output.pdf, keeping only pages that contain text.
# Both files are opened with `with` so the output is flushed and closed
# (the original never closed it) and the input handle is released.
with open("file.pdf", "rb") as source_stream:
    reader = PyPDF2.PdfFileReader(source_stream)
    output = PyPDF2.PdfFileWriter()

    number_of_pages = reader.getNumPages()
    for current_page_number in range(number_of_pages):
        page = reader.getPage(current_page_number)
        # A page whose extracted text is empty is treated as blank.
        # NOTE(review): a page holding only images presumably also has no
        # extractable text and would be dropped -- confirm for your files.
        if page.extractText() != "":
            output.addPage(page)

    with open("output.pdf", "wb") as output_stream:
        output.write(output_stream)

Assistance with refining saving output from Python script to text file

I need a little help figuring out why this isn't working as expected.
The following code opens each PDF file, extracts the text, and should save an individual text file named after each PDF file; however, it is not producing any output. Please help. The code is as follows:
import PyPDF2
import os
import glob
# For every file in `directory`, extract the text of all pages with PyPDF2
# and save it to a text file named after the PDF.
directory = 'C:/LIVE/2017/'

for pdf_name in os.listdir(directory):
    with open(os.path.join(directory, pdf_name), 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
        # Collect every page first; reopening the output in "w" mode once per
        # page would leave only the last page's text in the file.
        page_texts = [
            pdfReader.getPage(page_number).extractText()
            for page_number in range(pdfReader.getNumPages())
        ]

    # Output name: the last space-separated token of the PDF's file name,
    # without its extension.
    bcn = pdf_name.rsplit(' ', 1)[-1]
    bcNum = os.path.splitext(bcn)[0]

    # Write str, not bytes: a text-mode file rejects the bytes produced by
    # .encode('utf-8') on Python 3. The encoding is set on the file instead.
    with open(bcNum, "w", encoding="utf-8") as text_file:
        text_file.write("\n".join(page_texts))
Does it extract strings from the PDF to begin with? I've tried using PyPDF2 before and noticed it often has trouble getting text from PDFs if they're not formatted exactly right. I've had much more success using the module Tika.
from tika import parser
def read_pdf(pdf):
    """Parse *pdf* with Tika and return the value stored under 'content'."""
    parsed = parser.from_file(pdf)
    return parsed['content']
# Collect the Tika-extracted text of every file found in `directory`.
text_list = []
for file in os.listdir(directory):
    # The original passed an undefined name `pdf`. os.listdir() yields bare
    # file names, so build the full path of the current file instead.
    raw_content = read_pdf(os.path.join(directory, file))
    text_list.append(raw_content)

split a multi-page pdf file into multiple pdf files with python?

I would like to take a multi-page pdf file and create separate pdf files per page.
I have downloaded reportlab and have browsed the documentation, but it seems aimed at pdf generation. I haven't yet seen anything about processing PDF files themselves.
Is there an easy way to do this in python?
from PyPDF2 import PdfWriter, PdfReader
# Write each page of document.pdf to its own file, document-page<i>.pdf,
# where <i> is the zero-based page index.
# The input is opened with `with` so its handle is closed when the loop ends.
with open("document.pdf", "rb") as inputStream:
    inputpdf = PdfReader(inputStream)
    for i, page in enumerate(inputpdf.pages):
        # A fresh writer per page keeps every output file single-page.
        output = PdfWriter()
        output.add_page(page)
        with open("document-page%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
etc.
None of the answers here showed how to split a PDF into two parts that together contain all of its pages, so I am adding my solution in case somebody is looking for the same thing:
from PyPDF2 import PdfFileWriter, PdfFileReader
def split_pdf_to_two(filename, page_number):
    """Split a PDF into ``part1.pdf`` and ``part2.pdf`` at *page_number*.

    ``part1.pdf`` receives the pages with zero-based index below
    *page_number*; ``part2.pdf`` receives the remaining pages.

    Args:
        filename: Path of the PDF to split.
        page_number: Zero-based index of the first page that goes into
            ``part2.pdf``; must be smaller than the page count.

    Returns:
        None. If *page_number* is not smaller than the page count, an error
        message is printed and no output file is written.
    """
    # Keep the source open until both parts are written, then close it.
    with open(filename, "rb") as source:
        pdf_reader = PdfFileReader(source)
        total_pages = pdf_reader.getNumPages()

        # Explicit check instead of `assert`, which is stripped under -O.
        if page_number >= total_pages:
            print("Error: The PDF you are cutting has less pages than you want to cut!")
            return

        pdf_writer1 = PdfFileWriter()
        pdf_writer2 = PdfFileWriter()

        for page in range(page_number):
            pdf_writer1.addPage(pdf_reader.getPage(page))

        for page in range(page_number, total_pages):
            pdf_writer2.addPage(pdf_reader.getPage(page))

        with open("part1.pdf", 'wb') as file1:
            pdf_writer1.write(file1)

        with open("part2.pdf", 'wb') as file2:
            pdf_writer2.write(file2)
The PyPDF2 package gives you the ability to split up a single PDF into multiple ones.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_splitter(path):
    """Write every page of the PDF at *path* to its own single-page file.

    The flat snippet used ``path`` and ``fname`` without defining them; they
    are now the function parameter and the input's base name without its
    extension. Output files are named ``<fname>_page_<n>.pdf`` with *n*
    starting at 1, and are created relative to the current directory.

    Args:
        path: Path of the PDF to split.
    """
    fname = os.path.splitext(os.path.basename(path))[0]

    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        # One writer per page so each output holds exactly one page.
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))

        # page is zero-based; the file name uses a one-based page number.
        output_filename = '{}_page_{}.pdf'.format(fname, page+1)

        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)

        print('Created: {}'.format(output_filename))
Source: https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/
The earlier answers with PyPDF2 for splitting pdfs are not working anymore with the latest version update. The authors recommend using pypdf instead and this version of PyPDF2==3.0.1 will be the last version of PyPDF2. The function needs to be modified as follows:
import os
from PyPDF2 import PdfReader, PdfWriter
def split_pdfs(input_file_path):
    """Split a PDF into single-page PDFs stored under ``outputs/``.

    Each page is written to ``outputs/<stem>_<i>.pdf``, where *stem* is the
    input's file name without directory or extension and *i* is the
    zero-based page index.

    Args:
        input_file_path: Path of the PDF to split.

    Returns:
        List of the output file paths, in page order.
    """
    out_paths = []

    # Use the bare file name: slicing the full path with [:-4] kept any
    # directory components (pointing into sub-directories of outputs/ that
    # do not exist) and assumed a four-character extension.
    stem = os.path.splitext(os.path.basename(input_file_path))[0]

    with open(input_file_path, "rb") as source:
        inputpdf = PdfReader(source)

        # exist_ok avoids the check-then-create race of exists()/makedirs().
        os.makedirs("outputs", exist_ok=True)

        for i, page in enumerate(inputpdf.pages):
            output = PdfWriter()
            output.add_page(page)

            out_file_path = f"outputs/{stem}_{i}.pdf"
            with open(out_file_path, "wb") as output_stream:
                output.write(output_stream)

            out_paths.append(out_file_path)

    return out_paths
Note: The same function will work with pypdf as well. Import PdfReader and PdfWriter from pypdf rather than PyPDF2.
I know that the code is not related to python, however i felt like posting this piece of R code which is simple, flexible and works amazingly. The PDFtools package in R is amazing in splitting merging PDFs at ease.
# Load pdftools, which provides pdf_subset().
library(pdftools) #Rpackage
# Copy pages 1 through 51 of the input PDF into a new file, subset.pdf.
pdf_subset('D:\\file\\20.02.20\\22 GT 2017.pdf',
pages = 1:51, output = "subset.pdf")
Updated solution for the latest release of PyPDF2 (3.0.0), which also splits out a range of pages.
from PyPDF2 import PdfReader, PdfWriter
# Extract an inclusive, one-based range of pages from a PDF into a new file
# whose name is the input path followed by the page range.
file_name = r'c:\temp\junk.pdf'
pages = (121, 130)
first_page, last_page = pages

reader = PdfReader(file_name)
writer = PdfWriter()

# Number pages from 1 so they can be compared with `pages` directly.
for page_num, page in enumerate(reader.pages, start=1):
    if first_page <= page_num <= last_page:
        writer.add_page(page)

with open(f'{file_name}_page_{pages[0]}-{pages[1]}.pdf', 'wb') as out:
    writer.write(out)
import fitz
# Save every page of source.pdf as its own file, page-<number>.pdf, where
# <number> is the page's `number` attribute.
# Both documents are used as context managers so they are closed even if
# copying or saving a page fails.
with fitz.open("source.pdf") as src:
    for page in src:
        with fitz.open() as tar:  # output PDF for 1 page
            # copy over current page
            tar.insert_pdf(src, from_page=page.number, to_page=page.number)
            tar.save(f"page-{page.number}.pdf")

Categories