I am trying to split a PDF file by finding a key word of text and then grabbing that page the key word is on and the following 4 pages after, so total of 5 pages, and splitting them from that original PDF and putting them into their own PDF so the new PDF will have those 5 pages only, then loop through again find that key text again because its repeated further down the original PDF X amount of times, grabbing that page plus the 4 after and putting into its own PDF.
Example: key word is found on page 7 the first loop so need page 7 and also pages 8-11 and put those 5 pages 7-11 into a pdf file,
On the next loop the key word is found on page 12, so I need page 12 and pages 13-16; pages 12-16 are split into their own PDF. At this point two separate PDFs have been created.
The code below finds the key word and puts that page into its own PDF file, but it only handles that one page; I am not sure how to include the range of following pages.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# Question code: scan every page and write each page that contains the
# keyword to its own PDF.
# NOTE(review): indentation was lost in this listing; the nesting described
# in these comments is inferred from the statements themselves -- confirm
# against the original post.
path = "example.pdf"
fname = os.path.basename(path)
reader = PdfFileReader(path)
for page_number in range(reader.getNumPages()):
# A fresh writer that receives only the current page, so a file written from
# it holds exactly one page (the limitation the question asks about).
writer = PdfFileWriter()
writer.addPage(reader.getPage(page_number))
text = reader.getPage(page_number).extractText()
# Newlines are removed before the keyword search.
text_stripped = text.replace("\n", "")
print(text_stripped)
# str.find returns -1 when the keyword is absent.
if text_stripped.find("Disregarded Branch") != (-1):
# The output name carries the 1-based page number.
output_filename = f"{fname}_page_{page_number + 1}.pdf"
with open(output_filename, "wb") as out:
writer.write(out)
print(f"Created: {output_filename}")
disclaimer: I am the author of borb, the library used in this answer.
I think your question comes down to 2 common functionalities:
find the location of a given piece of text
merge/split/extract pages from a PDF
For the first part, there is a good tutorial in the examples repo.
You can find it here. I'll repeat one of the examples here for completeness.
import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
def main():
    """Load ``output.pdf`` and print the text found on its first page."""
    # The extraction listener is passed to the loader and queried afterwards.
    extractor: SimpleTextExtraction = SimpleTextExtraction()

    document: typing.Optional[Document] = None
    with open("output.pdf", "rb") as pdf_in:
        document = PDF.loads(pdf_in, [extractor])

    # Stop here if loading did not produce a Document.
    assert document is not None

    # Index 0 is the first page.
    print(extractor.get_text_for_page(0))


if __name__ == "__main__":
    main()
This example extracts all the text from page 0 of the PDF. Of course, you could simply iterate over all pages and check whether a given page contains the keyword you're looking for.
For the second part, you can find a good example in the examples repository. This is the link. This example (and subsequent example) takes you through the basics of frankensteining a PDF from various sources.
The example I copy/paste here will show you how to build a PDF by alternatively picking a page from input document 1, and input document 2.
import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
import typing
from decimal import Decimal
from borb.pdf.document.document import Document
from borb.pdf.page.page import Page
from borb.pdf.pdf import PDF
def main():
    """Build ``output_003.pdf`` by alternating pages of two input PDFs.

    Positions 0, 2, 4, ... are taken from ``output_001.pdf`` and positions
    1, 3, 5, ... from ``output_002.pdf``; ten pages are copied in total.
    """
    # First input document.
    first: typing.Optional[Document] = Document()
    with open("output_001.pdf", "rb") as src:
        first = PDF.loads(src)

    # Second input document.
    second: typing.Optional[Document] = Document()
    with open("output_002.pdf", "rb") as src:
        second = PDF.loads(src)

    # Assemble the output, choosing the source by the parity of the index.
    merged: Document = Document()
    for index in range(10):
        source_doc = first if index % 2 == 0 else second
        picked: typing.Optional[Page] = source_doc.get_page(index)
        merged.append_page(picked)

    # Persist the assembled document.
    with open("output_003.pdf", "wb") as dst:
        PDF.dumps(dst, merged)


if __name__ == "__main__":
    main()
You've almost got it!
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def create_4page_pdf(base_pdf_path, start, following_pages=4):
    """Write the page at ``start`` plus the pages after it to a new PDF.

    The previous loop copied four pages in total, one short of "the matching
    page plus the four pages after it" (five pages) that the task asks for.

    Args:
        base_pdf_path: Path of the PDF to copy pages from.
        start: Zero-based index of the first page to copy.
        following_pages: Number of pages after ``start`` to include.
            Defaults to 4, i.e. five pages in total.

    The range is clipped at the end of the document, so a match close to the
    last page produces a shorter file. The output file is named
    ``<basename>_page_<start + 1>.pdf`` and is created relative to the
    current working directory.
    """
    reader = PdfFileReader(base_pdf_path)
    writer = PdfFileWriter()

    # One past the last index to copy, never beyond the document.
    stop = min(start + following_pages + 1, len(reader.pages))
    for index in range(start, stop):
        writer.addPage(reader.pages[index])

    fname = os.path.basename(base_pdf_path)
    output_filename = f"{fname}_page_{start + 1}.pdf"
    with open(output_filename, "wb") as out:
        writer.write(out)
    print(f"Created: {output_filename}")
def main(base_pdf_path="example.pdf"):
    """Create a split-out PDF for every page containing "Disregarded Branch".

    Args:
        base_pdf_path: Path of the PDF to scan. The previous body overwrote
            this argument with a hard-coded "example.pdf", so callers could
            never select another file; the argument is now honoured.
    """
    reader = PdfFileReader(base_pdf_path)
    for page_number, page in enumerate(reader.pages):
        text = page.extractText()
        # Newlines are removed before the keyword search.
        text_stripped = text.replace("\n", "")
        print(text_stripped)
        if "Disregarded Branch" in text_stripped:
            create_4page_pdf(base_pdf_path, page_number)
Related
I've found some guides online on how to make a PDF searchable if it was scanned. However, I'm currently struggling with figuring out how to do it for a multipage PDF.
My code takes multipaged PDFs, converts each page into a JPG, runs OCR on each page and then converts it into a PDF. However, only the last page is returned.
import pytesseract
from pdf2image import convert_from_path
pytesseract.pytesseract.tesseract_cmd = 'directory'
TESSDATA_PREFIX = 'directory'
tessdata_dir_config = '--tessdata-dir directory'
# Path of the pdf
PDF_file = r"pdf directory"
# Question code: render each PDF page to a JPEG, OCR every JPEG and write the
# OCR output to "out_text.pdf".
# NOTE(review): indentation was lost in this listing; loop membership in the
# notes below is inferred -- confirm against the original post.
def pdf_text():
# Store all the pages of the PDF in a variable (second argument is 500)
pages = convert_from_path(PDF_file, 500)
image_counter = 1
for page in pages:
# Declare file names
filename = "page_"+str(image_counter)+".jpg"
# Save the image of the page in system
page.save(filename, 'JPEG')
# Increment the counter to update filename
image_counter = image_counter + 1
# Variable to get count of total number of pages
filelimit = image_counter-1
outfile = "out_text.pdf"
# Open the file in append mode so that all contents of all images are added to the same file
# NOTE(review): this is a text-mode handle; it is rebound below without ever
# being written to or closed.
f = open(outfile, "a")
# Iterate from 1 to total number of pages
for i in range(1, filelimit + 1):
filename = "page_"+str(i)+".jpg"
# Run OCR on the page image; the return value is written out as bytes below
result = pytesseract.image_to_pdf_or_hocr(filename, lang="eng", config=tessdata_dir_config)
# NOTE(review): mode "w+b" truncates the file every time it is opened. If this
# line runs inside the loop, each page overwrites the previous one and only the
# last page's output remains in the file.
f = open(outfile, "w+b")
f.write(bytearray(result))
f.close()
pdf_text()
How can I run this for all pages and output one merged PDF?
I can't run it, but I think the whole problem is that you use open(..., 'w+b') inside the loop - this removes the previous content, so in the end you write only the last page.
You should use already opened file open(outfile, "a") and close it after loop.
# --- before loop ---
# Binary append mode, because the OCR result is written as raw bytes.
# The with-block closes the file after the loop, also when OCR raises.
with open(outfile, "ab") as f:
    # --- loop ---
    for i in range(1, filelimit+1):
        filename = f"page_{i}.jpg"
        result = pytesseract.image_to_pdf_or_hocr(filename, lang="eng", config=tessdata_dir_config)
        f.write(bytearray(result))
# --- after loop ---
# (the file has been closed by the with-block)
BTW:
But there is other problem - image_to_pdf_or_hocr creates full PDF - with special headers and maybe footers - and appending two results can't create correct PDF. You would have to use special modules to merge pdfs. Like Merge PDF files
Something similar to
# --- before loop ---
from PyPDF2 import PdfFileMerger
import io

merger = PdfFileMerger()

# --- loop ---
for i in range(1, filelimit + 1):
    filename = "page_"+str(i)+".jpg"
    # OCR one page image; the result is handed to the merger as bytes.
    result = pytesseract.image_to_pdf_or_hocr(filename, lang="eng", config=tessdata_dir_config)
    # Wrap the bytes in a file-like object so no temporary file is needed.
    merger.append(io.BytesIO(result))

# --- after loop ---
merger.write(outfile)
merger.close()
There are a number of potential issues here and without being able to debug it's hard to say what is the root cause.
Are the JPGs being successfully created, and as separate files as is expected?
I would suspect that pages = convert_from_path(PDF_file, 500) is not returning as expected - have you manually verified they are being created as expected?
I'd like to use PyMuPDF to split a PDF so that each resulting file is named after a bookmark and contains only that bookmark's page.
I have successfully created my files, for example 4 PDF files for a 4-page source PDF... but each of those PDFs contains a seemingly random number of pages instead of a single page.
import sys, fitz
# Question code: write one PDF per bookmark (table-of-contents entry) of the
# source document.
# NOTE(review): indentation was lost in this listing; block membership in the
# notes below is inferred -- confirm against the original post.
file = '/home/ilyes/Bulletins_Originaux.pdf'
# Not referenced again in the visible code.
bookmark = ''
try:
doc = fitz.open(file)
toc = doc.getToC(simple = True)
except Exception as e:
# The error is only printed; if the try block failed, `toc` was never
# assigned and the loop below raises NameError.
print(e)
for i in range(len(toc)):
# Entry fields used here: [1] is the title, [2] the page number.
documentPdfCible=toc[i][1]
# "/" is replaced so the title can be used as part of a file name.
documentPdfCibleSansSlash=documentPdfCible.replace("/","-")
numeroPage=toc[i][2]
# First and last page of the range passed to insertPDF below.
pagedebut=numeroPage
pagefin=numeroPage + 1
print (pagedebut)
print (pagefin)
# NOTE(review): this opens the full source file again, so the pages inserted
# below are added to a document that already holds every page.
doc2 = fitz.open(file)
# NOTE(review): the range spans two page numbers (pagedebut and pagedebut + 1);
# presumably both bounds are inclusive and zero-based while the TOC page
# number is one-based -- TODO confirm against the PyMuPDF documentation.
doc2.insertPDF(doc, from_page = pagedebut, to_page = pagefin, start_at = 0)
doc2.save('/home/ilyes/' + documentPdfCibleSansSlash + ".pdf")
# NOTE(review): no parentheses -- this only references the method, it does
# not call it.
doc2.close
Could you tell me what's wrong ?
Maybe because I always use "doc2" in the loop?
Thanks you,
Abou Ilyès
Seems weird, that you open the same document twice.
You open your pdf file at doc = fitz.open(file) and again at doc2 = fitz.open(file).
Then you insert pages into the same file by doc2.insertPDF(doc, from_page = pagedebut, to_page = pagefin, start_at = 0).
Of course the doc files toc will get messed up completely by "randomly" inserting pages.
I recommend to replace doc2 = fitz.open(file) with doc2 = fitz.open()
This will create an empty "in memory" pdf (see the documentation), in which you can then insert the pages you need from doc. Then save this as a new pdf by its bookmark title by running
doc2.save('/home/ilyes/' + documentPdfCibleSansSlash + ".pdf")
I have a list of pdf files and I need to highlight specific text on each page of these files and save a snapshot for each of the text instances.
So far I am able to highlight the text and save the entire page of a PDF file as a snapshot. But I want to find the position of the highlighted text and take a zoomed-in snapshot, which will be more detailed than the full-page snapshot.
I'm pretty sure there must be a solution to this problem. I am new to Python and hence I am not able to find it. I would be really grateful if someone can help me out with this.
I have tried using PyPDF2, Pymupdf libraries but I couldn't figure out the solution. I also tried highlighting by providing coordinates which works but couldn't find a way to get these coordinates as output.
[![Sample snapshot from the code][1]][1]
#import PyPDF2
import os
import fitz
from wand.image import Image
import csv
#import re
#from pdf2image import convert_from_path
# Question code: for every row of upload1.csv, highlight a search text on one
# page of a PDF, save the annotated PDF and save a full-page JPEG of that page.
# NOTE(review): indentation was lost in this listing; block membership in the
# notes below is inferred -- confirm against the original post.
check = r'C:\Users\Pradyumna.M\Desktop\Pradyumna\Automation\Intel Bytes\Create Source Docs\Sample Check 8 Apr 2019'
dir1 = check + '\\Source Docs\\'
dir2 = check + '\\Output\\'
# NOTE: the name `dir` shadows the builtin of the same name.
dir = [dir1, dir2]
# Create both output folders, tolerating folders that already exist.
for x in dir:
try:
os.mkdir(x)
except FileExistsError:
print("Directory ", x, " already exists")
### READ PDF FILE
with open('upload1.csv', newline='') as myfile:
reader = csv.reader(myfile)
for row in reader:
# The row is joined and split again with the same "; " separator.
rowarray = '; '.join(row)
src = rowarray.split("; ")
# Columns used below: src[0] names the JPEG, src[3] is the search text,
# src[4] the PDF base name and src[5] the page number.
file = check + '\\' + src[4] + '.pdf'
print(file)
#pdfFileObj = open(file,'rb')
#pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
#print("Total number of pages: " + str(pdfReader.numPages))
doc = fitz.open(file)
print(src[5])
# The range holds exactly one index, int(src[5]) - 1.
for i in range(int(src[5])-1, int(src[5])):
i = int(i)
page = doc[i]
print("Processing page: " + str(i))
text = src[3]
#SEARCH TEXT
print("Searching: " + text)
text_instances = page.searchFor(text)
# One highlight annotation per hit.
for inst in text_instances:
highlight = page.addHighlightAnnot(inst)
file1 = check + '\\Output\\' + src[4] + '_output.pdf'
print(file1)
doc.save(file1, garbage=4, deflate=True, clean=True)
### Screenshot
# The saved PDF is reopened with wand at resolution=150 and the image at
# index i of its sequence is saved as the JPEG.
with(Image(filename=file1, resolution=150)) as source:
images = source.sequence
newfilename = check + "\\Source Docs\\" + src[0] + '.jpeg'
Image(images[i]).save(filename=newfilename)
print("Screenshot of " + src[0] + " saved")
"couldn't find a way to get these coordinates as output"
- you can get the coordinates out by doing this:
# Printing each hit shows the coordinates of the text that was found.
for rect in text_instances:
    print(rect)
inst are fitz.Rect objects which contain the top left and bottom right coordinates of the piece of text that was found. All the information is available in the docs.
I managed to highlight points and also save a cropped region using the following snippet of code. I am using python 3.7.1 and my output for fitz.version is ('1.14.13', '1.14.0', '20190407064320').
import fitz
# Highlight every occurrence of a search term and save a zoomed crop of the
# page around each hit as a PNG file.
doc = fitz.open("foo.pdf")
inst_counter = 0  # running hit number across the whole document
for pi in range(doc.pageCount):
    page = doc[pi]
    text = "hello"
    page_box = page.rect
    # Vertical padding around a hit: 5% of the page height.
    five_percent_height = (page_box.br.y - page_box.tl.y)*0.05
    for inst in page.searchFor(text):
        inst_counter += 1
        page.addHighlightAnnot(inst)
        # The crop spans the full page width; vertically it is the hit plus
        # the padding, clamped to the page.
        top = max(page_box.tl.y, inst.tl.y - five_percent_height)
        bottom = min(page_box.br.y, inst.br.y + five_percent_height)
        hl_clip = fitz.Rect(
            fitz.Point(page_box.tl.x, top),
            fitz.Point(page_box.br.x, bottom),
        )
        # Matrix(2, 2) is the zoom applied when rendering the crop.
        pix = page.getPixmap(matrix=fitz.Matrix(2, 2), clip=hl_clip)
        pix.writePNG(f"pg{pi}-hl{inst_counter}.png")
doc.close()
I tested this on a sample pdf that i peppered with "hello":
Some of the outputs from the script:
I composed the solution out of the following pages of the documentation:
Tutorial page to get introduced into the library
page.searchFor to figure out the return type of the searchFor method
fitz.Rect to understand what the returned objects from page.searchFor are
Collection of Recipes page (called faq in the URL) to figure out how to crop and save part of a pdf page
I am trying to copy elements of a doc from one doc file to other. The text part is easy, the images is where it gets tricky.
Attaching an image to explain the structure of the doc: Just some text and 1 image.
from docx import Document
import io
# Question code: copy every body element of the source document into a new,
# empty document and save the result.
# NOTE(review): only body elements are appended here; nothing in this snippet
# copies the image data those elements refer to, which matches the blank
# picture the question reports.
doc = Document('/Users/neha/Desktop/testing.docx')
new_doc = Document()
for elem in doc.element.body:
new_doc.element.body.append(elem)
new_doc.save('/Users/neha/Desktop/out.docx')
This gets me the whole structure of the doc in the new_doc but the image is still blank. Image below:
Good thing is I have the blank image in the right place so I thought of getting the byte level data from the previous image and insert it in the new doc. Here is how I extended the above code:
from docx import Document
import io
# Question code, second attempt: after copying the body elements, try to
# transplant the first picture's bytes into the new document.
doc = Document('/Users/neha/Desktop/testing.docx')
new_doc = Document()
for elem in doc.element.body:
new_doc.element.body.append(elem)
# Take the first inline picture of the source and follow its relationship id
# to the part that holds the image.
im = doc.inline_shapes[0]
blip = im._inline.graphic.graphicData.pic.blipFill.blip
rId = blip.embed
doc_part = doc.part
image_part = doc_part.related_parts[rId]
# NOTE: the name `bytes` shadows the builtin from here on.
bytes = image_part._blob #Here I get the byte level data for the image
# Same lookup in the new document, then overwrite the blob of whatever part
# that relationship id resolves to there.
# NOTE(review): presumably the relationship must already exist in new_doc for
# this lookup to succeed -- TODO confirm.
im2 = new_doc.inline_shapes[0]
blip2 = im2._inline.graphic.graphicData.pic.blipFill.blip
rId2 = blip2.embed
document_part2 = new_doc.part
document_part2.related_parts[rId2]._blob = bytes
new_doc.save('/Users/neha/Desktop/out.docx')
But the image still shows empty in the new_doc. What should I do from here?
I figured out a solution a couple of days back. However the text loses formatting using this way, but the images are correctly placed.
So the idea is, for para in paras for the source doc, if there is text, I write it to dest doc. And if there is an inline image present, I add a unique identifier at that place in the dest doc (refer here to see how these identifiers work, and contexts in docxtpl). These identifiers and docxtpl proved to be particularly useful here. And then using those unique identifiers I create a 'context' (as shown below) which is basically a map mapping the unique identifier to its particular InlineImage, and finally I render this context..
Below is my code (Apologies for the unnecessary indentation, I copied it directly from my text editor, and shift+tab doesn't work here :P)
from docxtpl import DocxTemplate, InlineImage
import Document
import io
import xml.etree.ElementTree as ET
# Copy text and inline images from a source document into a docxtpl template:
# images are dumped to files, each paragraph containing an image gets a
# "{{ img_N }}" placeholder, and the placeholders are rendered at the end.
# NOTE(review): `source_path`, `dest_path` and `self` are not defined in the
# visible code; the snippet looks like an excerpt from a method -- confirm.
# NOTE(review): indentation was lost in this listing; block membership in the
# notes below is inferred.
dest = DocxTemplate()
source = Document(source_path)
# Maps placeholder name -> InlineImage, filled in the paragraph loop below.
context = {}
ims = [im for im in source.inline_shapes]
im_addresses = []
im_streams = []
count = 0
# Pass 1: write the bytes of every inline image to a file and remember the
# file name in document order.
for im in ims:
blip = im._inline.graphic.graphicData.pic.blipFill.blip
rId = blip.embed
doc_part = source.part
image_part = doc_part.related_parts[rId]
byte_data = image_part._blob
# The streams are collected but not read again in the visible code.
image_stream = io.BytesIO(byte_data)
im_streams.append(image_stream)
# The two literals join to "img__", so names look like "img__0.jpeg".
image_name = self.img_path+"img_"+"_"+str(count)+".jpeg"
with open(image_name, "wb") as fh:
fh.write(byte_data)
# Redundant if this line sits inside the with-block, which closes the file.
fh.close()
im_addresses.append(image_name)
count += 1
# Pass 2: rebuild the paragraphs; only the plain text of each paragraph is
# copied into a single new run.
paras = source.paragraphs
im_idx = 0
for para in paras:
p = dest.add_paragraph()
r = p.add_run()
if(para.text):
r.add_text(para.text)
# Re-parse the paragraph XML to look for inline drawings.
root = ET.fromstring(para._p.xml)
namespace = {'wp':"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}
inlines = root.findall('.//wp:inline',namespace)
# One placeholder per paragraph, however many inline drawings it contains.
if(len(inlines) > 0):
uid = "img_"+str(im_idx)
r.add_text("{{ " + uid + " }}")
context[uid] = InlineImage(dest,im_addresses[im_idx])
im_idx += 1
# Rendering errors are printed and the document is saved regardless.
try:
dest.render(context)
except Exception as e:
print(e)
dest.save(dest_path)
PS: If a paragraph has two images, this code will prove to be sub-optimal.. One will have to make some change in the following:
# Branch taken for a paragraph with at least one inline image: it adds one
# placeholder and one context entry, then advances the image index by one.
if(len(inlines) > 0):
uid = "img_"+str(im_idx)
r.add_text("{{ " + uid + " }}")
context[uid] = InlineImage(dest,im_addresses[im_idx])
im_idx += 1
You will have to add a for loop inside the if statement as well. I didn't need one, since my images were usually big enough that they always ended up in separate paragraphs. Just a side note for anyone who may need it.
Cheers!
You could try:
Extracting the images from the first document by unzipping the .docx file (per How can I search a word in a Word 2007 .docx file?)
Save those images to the file system (as foo.png, for instance)
Generate the new .docx file with Python and add the .png file using document.add_picture('foo.png').
This problem is solved by this package https://docxtpl.readthedocs.io/en/latest/
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination
def get_outlines(pdf_filepath: str) -> List[Destination]:
    """Return the outline (bookmark) entries read from *pdf_filepath*."""
    with open(pdf_filepath, "rb") as stream:
        return PdfFileReader(stream).getOutlines()
print(get_outlines("PDF-export-example.pdf"))
pyPdf.pdf.Destination has many properties, but I can't find any referring page number of that bookmark. How can I get the page number of the bookmarks?
For example outlines[1].page.idnum returns a number which is approximately 3 times bigger than referenced page number in PDF document, which I assume references some object smaller then page, as running .page.idnum on whole PDF document outline returns array of numbers which is not even linearly correlated with "real" page number destinations in PDF document and it's roughly multiple by ~ 3
Update: This question is same as this: split a pdf based on outline although I don't understand what author did in his self answer there. Seems too complicated to me to be usable
As #theta pointed out "split a pdf based on outline" has the code required to extract page numbers. If you feel this is complicated I copied part of the code which maps page ids to page numbers and made it a function. Here is a working example that prints page number of bookmark o[0]:
from PyPDF2 import PdfFileReader
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of every node in the page tree to a page index.

    The page tree is walked depth-first, starting at the ``/Pages`` entry of
    the document root unless ``pages`` is supplied.

    Args:
        pdf: Reader exposing ``trailer``; only consulted when ``pages`` is
            None.
        pages: Page-tree node to start from; None means the tree root.
        _result: Mapping to fill in. Internal: shared by the recursive calls.
        _num_pages: List with one entry per leaf page seen so far, so its
            length is the next zero-based page index. Internal.

    Returns:
        Dict of ``idnum`` -> zero-based page index. Intermediate ``/Pages``
        nodes are recorded too, with the index of the next leaf page at the
        moment they are visited.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Top-level call: always start counting from zero.
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # A start node was supplied without a counter; previously this case
        # failed on len(None) below.
        _num_pages = []

    node_type = pages["/Type"]
    if node_type == "/Pages":
        for kid in pages["/Kids"]:
            # Recorded before descending, so a leaf gets its own index.
            _result[kid.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, kid.getObject(), _result, _num_pages)
    elif node_type == "/Page":
        _num_pages.append(1)
    return _result
# main
# The file is opened in a with-block so the handle is closed once the page
# number has been looked up; previously it was never closed.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(p)
    o = p.getOutlines()
    # The map holds zero-based indices; +1 gives the page number to print.
    pg_num = pg_id_num_map[o[0].page.idnum] + 1
print(pg_num)
probably too late for #theta but might help others :) btw my first post on stackoverflow so excuse me if I did not follow the usual format
To extend this further:
If you are looking to get the exact location on the page for a bookmark this will make your job easier:
from PyPDF2 import PdfFileReader
import PyPDF2 as pyPdf
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map the object id of every node in the page tree to a page index.

    The page tree is walked depth-first, starting at the ``/Pages`` entry of
    the document root unless ``pages`` is supplied.

    Args:
        pdf: Reader exposing ``trailer``; only consulted when ``pages`` is
            None.
        pages: Page-tree node to start from; None means the tree root.
        _result: Mapping to fill in. Internal: shared by the recursive calls.
        _num_pages: List with one entry per leaf page seen so far, so its
            length is the next zero-based page index. Internal.

    Returns:
        Dict of ``idnum`` -> zero-based page index. Intermediate ``/Pages``
        nodes are recorded too, with the index of the next leaf page at the
        moment they are visited.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Top-level call: always start counting from zero.
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    elif _num_pages is None:
        # A start node was supplied without a counter; previously this case
        # failed on len(None) below.
        _num_pages = []

    node_type = pages["/Type"]
    if node_type == "/Pages":
        for kid in pages["/Kids"]:
            # Recorded before descending, so a leaf gets its own index.
            _result[kid.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, kid.getObject(), _result, _num_pages)
    elif node_type == "/Page":
        _num_pages.append(1)
    return _result
def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, position and page number for every bookmark.

    Args:
        outlines: A single bookmark, or an arbitrarily nested list of
            bookmarks.
        pg_id_num_map: Mapping of page object id -> zero-based page index.
        result: Dict to add entries to; a new one is created when None.

    Returns:
        ``result``, keyed by the first whitespace-separated word of each
        bookmark title (the whole title when it contains no words). Each
        value is a dict with ``title``, ``top``, ``left`` and the 1-based
        ``page``. ``top`` and ``left`` are None for a bookmark that has no
        ``/Top`` or ``/Left`` entry; previously such a bookmark raised
        KeyError. Bookmarks sharing the same key overwrite each other.
    """
    if result is None:
        result = {}
    if isinstance(outlines, list):
        for entry in outlines:
            result = outlines_pg_zoom_info(entry, pg_id_num_map, result)
    elif isinstance(outlines, pyPdf.pdf.Destination):
        title = outlines['/Title']
        words = title.split()
        # Fall back to the raw title so an empty title cannot raise IndexError.
        key = words[0] if words else title
        result[key] = dict(
            title=title,
            top=outlines['/Top'] if '/Top' in outlines else None,
            left=outlines['/Left'] if '/Left' in outlines else None,
            page=pg_id_num_map[outlines.page.idnum] + 1,
        )
    return result
# main
pdf_name = 'document.pdf'
# The file is opened in a with-block so the handle is closed once the
# bookmark data has been collected; previously it was never closed.
with open(pdf_name, 'rb') as f:
    pdf = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(pdf)
    outlines = pdf.getOutlines()
    bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
print(bookmarks_info)
Note: My bookmarks are section numbers (ex: 1.1 Introduction) and I am mapping the bookmark info to the section number. If your bookmarks are different modify this part of the code:
# Branch that stores a single bookmark: the dictionary key is
# title.split()[0], i.e. the first whitespace-separated word of the title.
elif type(outlines) == pyPdf.pdf.Destination:
title = outlines['/Title']
result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))
Manage bookmarks recursively with vjayky and Giulio D suggestion
PyPDF2 >= v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print every bookmark title together with its 1-based page number.

    Nested bookmark lists are printed with four more spaces of indentation
    per nesting level.

    Args:
        pdf: Reader providing ``getOutlines()`` and
            ``getDestinationPageNumber()``.
    """
    def review_and_print_bookmarks(bookmarks, lvl=0):
        for b in bookmarks:
            # isinstance instead of an exact type comparison, so list
            # subclasses are treated as nested levels as well.
            if isinstance(b, list):
                review_and_print_bookmarks(b, lvl + 4)
                continue
            pg_num = pdf.getDestinationPageNumber(b) + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

    review_and_print_bookmarks(pdf.getOutlines())
# The bookmarks are printed while the file is still open for the reader.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
PyPDF2 < v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print every bookmark title together with its 1-based page number.

    The page number is resolved through a lookup table built from the object
    id of every page. Nested bookmark lists are printed with four more spaces
    of indentation per nesting level.

    Args:
        pdf: Reader providing ``getNumPages()``, ``getPage()`` and
            ``getOutlines()``.
    """
    # Map page ids to zero-based page numbers.
    pg_id_to_num = {
        pdf.getPage(pg_num).indirectRef.idnum: pg_num
        for pg_num in range(pdf.getNumPages())
    }

    def review_and_print_bookmarks(bookmarks, lvl=0):
        for b in bookmarks:
            # isinstance instead of an exact type comparison, so list
            # subclasses are treated as nested levels as well.
            if isinstance(b, list):
                review_and_print_bookmarks(b, lvl + 4)
                continue
            pg_num = pg_id_to_num[b.page.idnum] + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * lvl, b.title, pg_num))

    review_and_print_bookmarks(pdf.getOutlines())
# The bookmarks are printed while the file is still open for the reader.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
In 2019, for ones who are interested in a faster way, it's possible to use:
from PyPDF2 import PdfFileReader
def printPageNumberFrom(filename):
    """Print the 1-based page number of every bookmark in *filename*.

    Nested bookmark levels, which appear as nested lists in the outline, are
    walked as well. Previously such a sub-list was passed straight to
    ``getDestinationPageNumber`` as if it were a bookmark.

    Args:
        filename: Path of the PDF file to read.
    """
    def walk(bookmarks):
        for b in bookmarks:
            if isinstance(b, list):
                walk(b)
            else:
                print(pdf.getDestinationPageNumber(b) + 1)  # page count starts from 0

    with open(filename, "rb") as f:
        pdf = PdfFileReader(f)
        walk(pdf.getOutlines())
I'm not sure but according to the docs for pypdf.Destination the page number for the bookmark is just Destination.page .