crop pdf and add them side by side using python - python

from PyPDF2 import PdfWriter, PdfReader
# Crop two regions out of the first page of the card and save each crop as
# its own single-page PDF (result1.pdf and result2.pdf).
reader = PdfReader('my-aadhar-card.pdf')

# First region -> result1.pdf
writer = PdfWriter()
page = reader.pages[0]
page.cropbox.lower_left = (32, 102)
page.cropbox.upper_right = (290, 264)
writer.add_page(page)
with open('result1.pdf', 'wb') as fp:
    writer.write(fp)

# Second region -> result2.pdf
# The box is given as lower-left / upper-right so that the smaller x (302)
# is the left edge and the larger x (560) is the right edge. Both corners
# are reassigned, so nothing from the first crop carries over.
# NOTE(review): reader.pages[0] may be the very same object as `page`;
# result1.pdf has already been written, so re-cropping it here is harmless.
writer2 = PdfWriter()
page2 = reader.pages[0]
page2.cropbox.lower_left = (302, 102)
page2.cropbox.upper_right = (560, 264)
# Add the page that was just cropped for the second file.
writer2.add_page(page2)
with open('result2.pdf', 'wb') as fp2:
    writer2.write(fp2)
Help: I am cropping two regions out of an Aadhaar card PDF downloaded from UIDAI. I need to combine result1.pdf and result2.pdf side by side on a single page, like the image below. When I merge them, the result has two pages, one below the other.
Thanks in advance.

Related

How to make a watermark transparent in pypdf?

I'm trying to add a slightly transparent watermark to a bunch of PDFs. The problem is that my watermark only shows up on pages without a solid background (i.e. if a page has a solid color or a solid white background, the watermark doesn't show). My watermark is a PDF file. This is my code:
from pathlib import Path
from typing import Union, Literal, List
import pypdf
from pypdf import PdfWriter, PdfReader, PageObject
def watermark(
    content_pdf: Path,
    stamp_pdf: Path,
    pdf_result: Path,
    page_indices: Union[Literal["ALL"], List[int]] = "ALL",
):
    """Write ``content_pdf`` to ``pdf_result`` with the stamp merged in first.

    For each selected page a blank page is created from the stamp document,
    the first page of ``stamp_pdf`` is merged into it, and then the content
    page is merged in afterwards. The new page gets the content page's
    media box.

    Args:
        content_pdf: PDF whose pages are to be watermarked.
        stamp_pdf: PDF whose first page is used as the watermark.
        pdf_result: File the resulting PDF is written to.
        page_indices: Indices of the pages to process, or ``"ALL"`` for
            every page of ``content_pdf``.
    """
    source = PdfReader(content_pdf)
    if page_indices == "ALL":
        page_indices = list(range(len(source.pages)))

    out = PdfWriter()
    stamp_reader = PdfReader(stamp_pdf)
    for idx in page_indices:
        # Stamp goes onto the fresh page first, content is merged after it.
        merged = PageObject.create_blank_page(stamp_reader)
        merged.merge_page(stamp_reader.pages[0])

        original = source.pages[idx]
        original_box = original.mediabox
        merged.merge_page(original)
        merged.mediabox = original_box
        out.add_page(merged)

    with open(pdf_result, "wb") as fp:
        out.write(fp)
# Example invocation; replace the placeholder paths with real files.
# The paths must be Path objects (or strings), and pdf_result must name a
# file, because watermark() opens it for writing.
watermark(
    content_pdf=Path("path/to/pdf.pdf"),
    stamp_pdf=Path("path/to/watermark.pdf"),
    pdf_result=Path("path/to/output/result.pdf"),
)
This is the pdf I get as an output. Essentially, the top pdf (where the watermark is in the background) is the result I want, but when there's a solid color in the original pdf the watermark doesn't show.
I have also tried adding the watermark as a stamp (see code below), but naturally the watermark is added on top of the pdf, blocking some of the text.
from pathlib import Path
from typing import Union, Literal, List
from pypdf import PdfWriter, PdfReader
def stamp(
    content_pdf: Path,
    stamp_pdf: Path,
    pdf_result: Path,
    page_indices: Union[Literal["ALL"], List[int]] = "ALL",
):
    """Merge the first page of ``stamp_pdf`` into pages of ``content_pdf``.

    Each selected content page has the stamp page merged into it, keeps its
    original media box, and is appended to the output written to
    ``pdf_result``.

    Args:
        content_pdf: PDF whose pages receive the stamp.
        stamp_pdf: PDF whose first page is used as the stamp.
        pdf_result: File the resulting PDF is written to.
        page_indices: Indices of the pages to process, or ``"ALL"`` for
            every page of ``content_pdf``.
    """
    stamp_page = PdfReader(stamp_pdf).pages[0]

    out = PdfWriter()
    source = PdfReader(content_pdf)
    if page_indices == "ALL":
        page_indices = list(range(len(source.pages)))

    for idx in page_indices:
        target = source.pages[idx]
        # Remember the box before merging so it can be restored afterwards.
        original_box = target.mediabox
        target.merge_page(stamp_page)
        target.mediabox = original_box
        out.add_page(target)

    with open(pdf_result, "wb") as fp:
        out.write(fp)
# Example invocation of the stamp() variant defined above; replace the
# placeholder paths with real files. pdf_result must name a file, because
# stamp() opens it for writing.
stamp(
    content_pdf=Path("path/to/pdf.pdf"),
    stamp_pdf=Path("path/to/watermark.pdf"),
    pdf_result=Path("path/to/output/result.pdf"),
)
So my question is, is there a way to modify one of these code snippets so that I get the output I want?

PDF range split

I am trying to split a PDF file by searching for a keyword. For every page that contains the keyword, I want to take that page plus the 4 pages after it (5 pages in total) and save them as their own PDF. The keyword is repeated further down the original PDF an unknown number of times, so the loop should keep going: find the next occurrence, take that page plus the 4 after it, and write them to another PDF.
Example: in the first iteration the keyword is found on page 7, so I need page 7 and pages 8-11, i.e. pages 7-11 go into one PDF file.
In the next iteration the keyword is found on page 12, so I need page 12 and pages 13-16, i.e. pages 12-16 go into their own PDF. At this point two separate PDFs have been created.
The code below finds the keyword and writes the matching page to its own PDF file, but it only handles that single page; I am not sure how to include the following pages.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# Scan every page of example.pdf for the key phrase and write each matching
# page (that single page only) to its own PDF next to the script.
path = "example.pdf"
fname = os.path.basename(path)
reader = PdfFileReader(path)
for page_number in range(reader.getNumPages()):
    page = reader.getPage(page_number)
    writer = PdfFileWriter()
    writer.addPage(page)
    # Line breaks are removed so the phrase is found even if it wraps.
    text = page.extractText()
    text_stripped = text.replace("\n", "")
    print(text_stripped)
    if "Disregarded Branch" not in text_stripped:
        continue
    output_filename = f"{fname}_page_{page_number + 1}.pdf"
    with open(output_filename, "wb") as out:
        writer.write(out)
    print(f"Created: {output_filename}")
disclaimer: I am the author of borb, the library used in this answer.
I think your question comes down to 2 common functionalities:
find the location of a given piece of text
merge/split/extract pages from a PDF
For the first part, there is a good tutorial in the examples repo.
You can find it here. I'll repeat one of the examples here for completeness.
import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
def main():
    """Load ``output.pdf`` and print the text found on its first page."""
    # The extractor is handed to the loader and queried once loading is done.
    extractor: SimpleTextExtraction = SimpleTextExtraction()
    doc: typing.Optional[Document] = None
    with open("output.pdf", "rb") as in_file_handle:
        doc = PDF.loads(in_file_handle, [extractor])

    # make sure a Document was actually read
    assert doc is not None

    # text of the first Page (index 0)
    print(extractor.get_text_for_page(0))


if __name__ == "__main__":
    main()
This example extracts all the text from page 0 of the PDF. Of course, you could simply iterate over all pages and check whether a given page contains the keyword you're looking for.
For the second part, you can find a good example in the examples repository. This is the link. This example (and subsequent example) takes you through the basics of frankensteining a PDF from various sources.
The example I copy/paste here will show you how to build a PDF by alternatively picking a page from input document 1, and input document 2.
import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
import typing
from decimal import Decimal
from borb.pdf.document.document import Document
from borb.pdf.page.page import Page
from borb.pdf.pdf import PDF
def main():
    """Build ``output_003.pdf`` by alternating pages of two input PDFs.

    Pages with an even index are taken from ``output_001.pdf`` and pages
    with an odd index from ``output_002.pdf``, for indices 0 through 9.
    """
    # load the first input
    doc_001: typing.Optional[Document] = Document()
    with open("output_001.pdf", "rb") as pdf_file_handle:
        doc_001 = PDF.loads(pdf_file_handle)

    # load the second input
    doc_002: typing.Optional[Document] = Document()
    with open("output_002.pdf", "rb") as pdf_file_handle:
        doc_002 = PDF.loads(pdf_file_handle)

    # assemble the new document, switching source on every page
    d: Document = Document()
    for i in range(0, 10):
        source = doc_001 if i % 2 == 0 else doc_002
        p: typing.Optional[Page] = source.get_page(i)
        d.append_page(p)

    # store the result
    with open("output_003.pdf", "wb") as pdf_file_handle:
        PDF.dumps(pdf_file_handle, d)


if __name__ == "__main__":
    main()
You've almost got it!
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def create_4page_pdf(base_pdf_path, start, following_pages=4):
    """Write the page at ``start`` and the pages after it to a new PDF.

    The output file is named ``<basename>_page_<start + 1>.pdf`` and is
    created in the current working directory.

    Args:
        base_pdf_path: Path of the PDF to copy pages from.
        start: Zero-based index of the first page to copy (the page on
            which the keyword was found).
        following_pages: How many pages after ``start`` to copy as well.
            The default of 4 gives 5 pages in total, as the question asks.
    """
    reader = PdfFileReader(base_pdf_path)
    writer = PdfFileWriter()
    # +1 so the keyword page itself is included in addition to the pages
    # that follow it; min() stops cleanly at the end of the document.
    stop = min(start + following_pages + 1, len(reader.pages))
    for index in range(start, stop):
        writer.addPage(reader.pages[index])
    fname = os.path.basename(base_pdf_path)
    output_filename = f"{fname}_page_{start + 1}.pdf"
    with open(output_filename, "wb") as out:
        writer.write(out)
    print(f"Created: {output_filename}")
def main(base_pdf_path="example.pdf"):
    """Split ``base_pdf_path`` at every page containing the key phrase.

    Each page's text is printed; whenever it contains
    ``"Disregarded Branch"``, ``create_4page_pdf`` is called with that
    page's index.

    Args:
        base_pdf_path: Path of the PDF to scan. It is used as given and is
            no longer overwritten by a hard-coded file name.
    """
    reader = PdfFileReader(base_pdf_path)
    for page_number, page in enumerate(reader.pages):
        # Line breaks are removed so the phrase is found even if it wraps.
        text_stripped = page.extractText().replace("\n", "")
        print(text_stripped)
        if "Disregarded Branch" in text_stripped:
            create_4page_pdf(base_pdf_path, page_number)


# Run the split when executed as a script; without this, main() was
# defined but never called.
if __name__ == "__main__":
    main()

Splitting PDF pages mid-way and re-combining it?

I don't have enough reputation right now to answer a question I found: how to use Python to split PDF pages in half and recombine them for further processing.
#!/usr/bin/env python
'''
Chops each page in half, e.g. if a source were
created in booklet form, you could extract individual
pages, and re-combines it
'''
from PyPDF2 import PdfFileWriter,PdfFileReader,PdfFileMerger
def _crop_pages(src_path, dst_path, lower_left, upper_right):
    """Copy every page of ``src_path`` to ``dst_path`` with a new crop box.

    Args:
        src_path: PDF to read pages from.
        dst_path: PDF file to write the cropped pages to.
        lower_left: ``(x, y)`` of the crop box's lower-left corner.
        upper_right: ``(x, y)`` of the crop box's upper-right corner.
    """
    with open(src_path, "rb") as in_f:
        reader = PdfFileReader(in_f)
        writer = PdfFileWriter()
        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            page.cropBox.lowerLeft = lower_left
            page.cropBox.upperRight = upper_right
            writer.addPage(page)
        # Written while the source file is still open.
        # NOTE(review): presumably the reader pulls page data from the
        # stream lazily, so the source must stay open until the write.
        with open(dst_path, "wb") as out_f:
            writer.write(out_f)


#split left
_crop_pages("docu.pdf", "left.pdf", (60, 50), (305, 700))
#split right
_crop_pages("docu.pdf", "right.pdf", (300, 50), (540, 700))
#combine splitted files
# Both inputs are opened in a with-block so the handles are closed once
# out.pdf has been written.
with open("left.pdf", "rb") as left_f, open("right.pdf", "rb") as right_f:
    input1 = PdfFileReader(left_f)
    input2 = PdfFileReader(right_f)
    output = PdfFileWriter()
    # Interleave: left half of page i, then right half of page i.
    for i in range(input1.getNumPages()):
        output.addPage(input1.getPage(i))
        output.addPage(input2.getPage(i))
    with open("out.pdf", "wb") as out_f:
        output.write(out_f)
Note: The cropping parameters are specific to your PDF, so please check them before running the program.
Further: You can now use the resulting document to extract text easily, without the columns being merged into each other and messing up the extraction.

I have converted a pdf file to csv using anaconda python3 But the converted csv file is not in a readable form how to make it readable?

# importing required modules
import PyPDF2
# creating a pdf file object
# NOTE(review): `path` is not defined anywhere in this snippet, and the file
# opened here is never closed.
pdfFileObj = open(path, 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
# creating a page object (first page only)
pageObj = pdfReader.getPage(0)
# extracting text from page; the text is printed but not stored
print(pageObj.extractText())
# NOTE(review): the DataFrame is built from the raw binary file object, not
# from the text extracted above — presumably this is why the CSV written
# below is unreadable. `pd` is also not imported in this snippet.
df = pd.DataFrame(pdfFileObj)
print (df)
df.to_csv('output.csv')
I have converted a PDF file to CSV using Anaconda Python 3, but the converted CSV file is not in a readable form. How can I make the CSV readable?
I tested your method and I couldn't find a way to correct the CSV output. I usually do it this way:
import csv
import os
from miner_text_generator import extract_text_by_page
def export_as_csv(pdf_path, csv_path):
    """Write one CSV row per PDF page to ``csv_path``.

    For every page yielded by ``extract_text_by_page(pdf_path)`` the first
    100 characters of its text are split on whitespace and the resulting
    words become the cells of one row.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV file to create (overwritten if present).
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise blank rows can appear on platforms
    # that translate newlines.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(
            page[0:100].split() for page in extract_text_by_page(pdf_path)
        )


if __name__ == '__main__':
    pdf_path = '<your path to the file>.pdf'
    csv_path = '<path to the output>.csv'
    export_as_csv(pdf_path, csv_path)

How to split/crop a pdf along the middle using pyPdf

I have a PDF that looks like this, and I'd like to crop all the text out, almost right down the middle of the page. I found this script that does something similar:
def splitHorizontal():
    """Copy in.pdf to out.pdf with fixed trim and crop boxes on every page.

    Written for Python 2 and the legacy pyPdf package: it uses print
    statements and the ``file()`` builtin. The box coordinates are
    hard-coded.
    """
    from pyPdf import PdfFileWriter, PdfFileReader
    # NOTE(review): the input file object is never closed explicitly.
    input1 = PdfFileReader(file("in.pdf", "rb"))
    output = PdfFileWriter()
    numPages = input1.getNumPages()
    print "document has %s pages." % numPages
    for i in range(numPages):
        page = input1.getPage(i)
        # Show the upper-right corner of the media box for this page.
        print page.mediaBox.getUpperRight_x(), page.mediaBox.getUpperRight_y()
        # Trim box: (25, 25) to (225, 225).
        page.trimBox.lowerLeft = (25, 25)
        page.trimBox.upperRight = (225, 225)
        # Crop box: (50, 50) to (200, 200), i.e. 25 units inside the trim box.
        page.cropBox.lowerLeft = (50, 50)
        page.cropBox.upperRight = (200, 200)
        output.addPage(page)
    outputStream = file("out.pdf", "wb")
    output.write(outputStream)
    outputStream.close()
However these crop dimensions are tuned to that specific example.
Can anyone show me how to find the correct crop dimensions.
I originally got the script from here --> Cropping pages of a .pdf file.
I read more into what the author had said, finally realizing that he had said:
The resulting document has a trim box that is 200x200 points and starts at 25,25 points inside the media box. The crop box is 25 points inside the trim box.
meaning
page.cropBox.upperRight = (200, 200)
must control the ultimate margins, i therefore adjusted the statement to
page.cropBox.upperLeft = (290, 792)
To mirror the cropping onto the other side and make sure the cropping holds the full vertical value
Chops each page in half, e.g. if a source were
created in booklet form, and then re-combines it
for further processing eg. text extraction
Importing required libraries
from PyPDF2 import PdfFileWriter,PdfFileReader,PdfFileMerger
Splitting Left Part
# Crop every page of docu.pdf to its left part and save the pages as left.pdf.
with open("docu.pdf", "rb") as in_f:
    input1 = PdfFileReader(in_f)
    output = PdfFileWriter()
    numPages = input1.getNumPages()
    for page in map(input1.getPage, range(numPages)):
        page.cropBox.lowerLeft = (60, 50)
        page.cropBox.upperRight = (305, 700)
        output.addPage(page)
    # The output is written while docu.pdf is still open.
    with open("left.pdf", "wb") as out_f:
        output.write(out_f)
Splitting right part :
# Crop every page of docu.pdf to its right part and save the pages as right.pdf.
with open("docu.pdf", "rb") as in_f:
    input1 = PdfFileReader(in_f)
    output = PdfFileWriter()
    numPages = input1.getNumPages()
    for page in map(input1.getPage, range(numPages)):
        page.cropBox.lowerLeft = (300, 50)
        page.cropBox.upperRight = (540, 700)
        output.addPage(page)
    # The output is written while docu.pdf is still open.
    with open("right.pdf", "wb") as out_f:
        output.write(out_f)
Combining left with right (two columns to two pages)
# Interleave the pages of left.pdf and right.pdf into out.pdf: left half of
# page i, then right half of page i. Both inputs are opened in a with-block
# so their handles are closed once out.pdf has been written.
with open("left.pdf", "rb") as left_f, open("right.pdf", "rb") as right_f:
    input1 = PdfFileReader(left_f)
    input2 = PdfFileReader(right_f)
    output = PdfFileWriter()
    numPages = input1.getNumPages()
    for i in range(numPages):
        l = input1.getPage(i)
        output.addPage(l)
        r = input2.getPage(i)
        output.addPage(r)
    with open("out.pdf", "wb") as out_f:
        output.write(out_f)
I faced the same challenge and I wrote this (require PyPDF2):
https://gist.github.com/kintaro1981/eb6cfc6f40a7fb39744f5ae630d58fd5
# -*- coding: utf-8 -*-
"""
This script splits each pdf page in half vertically and merge them in a single PDF in order.
It asks how many pages do you want to skip to avoid splitting a book cover or a specific number of initial pages.
Usage:
python cutpdfpages.py <filename.pdf> <newfilename.pdf>
"""
import sys
import copy
from PyPDF2 import PdfWriter, PdfReader
# Input and output paths come from the command line (see usage above).
file = str(sys.argv[1])
newfile = str(sys.argv[2])
with open(file, "rb") as pdf1:
    pdf = PdfReader(pdf1)
    output = PdfWriter()
    # len(pdf.pages) is the page count in the PdfReader API; the legacy
    # getNumPages() call belongs to the old PdfFileReader naming.
    numpages = len(pdf.pages)
    page2skip = int(input('Insert how many pages do you want to skip: '))
    for i in range(page2skip, numpages):
        page = pdf.pages[i]
        # NOTE(review): the shallow copy is taken before either media box is
        # touched; presumably this keeps the two halves from sharing one box
        # object — confirm against PyPDF2 before reordering these lines.
        pagebis = copy.copy(page)
        # Split at the true horizontal midpoint so pages whose media box
        # does not start at x=0 are still cut in half.
        mid_x = (page.mediabox.left + page.mediabox.right) / 2
        # `page` keeps everything right of the midpoint...
        page.mediabox.upper_left = (mid_x, page.mediabox.top)
        output.add_page(page)
        # ...and `pagebis` keeps everything left of it.
        pagebis.mediabox.upper_right = (mid_x, pagebis.mediabox.top)
        output.add_page(pagebis)
    # Written while the source file is still open.
    with open(newfile, "wb") as newpdf:
        output.write(newpdf)

Categories