PDF data stream with python - python

Context: My code fetches a set of coordinates from some png documents and later on performs some redaction in certain fields (it uses these coordinates for drawing rectangles in certain areas).
I want my final output to be a pdf with each redacted image as page. I can achieve this with fpdf package with no problem.
However, I intend to send this pdf file as email (base64 encoded) attachment. Is there any way to get the base64 string from fpdf output?
On top of that, can I use image binary string in fpdf image method?
See the redact_pdf method below (I placed some comments there to be more clear)
Code:
class Redaction:
def __init__(self,png_image_list,df_coordinates):
self.png_image_list = png_image_list
self.df_coordinates = df_coordinates
def _redact_images(self):
redacted_images_bin = []
for page_num,page_data in enumerate(self.png_image_list):
im_page = Image.open(io.BytesIO(page_data))
draw = ImageDraw.Draw(im_page)
df_filtered = self.df_coordinates[self.df_coordinates['page_number'] == page_num+1]
for index, row in df_filtered.iterrows():
x0 = row['x0'] * im_page.size[0]
y0 = row['y0'] * im_page.size[1]
x1 = row['x1'] * im_page.size[0]
y1 = row['y1'] * im_page.size[1]
x2 = row['x2'] * im_page.size[0]
y2 = row['y2'] * im_page.size[1]
x3 = row['x3'] * im_page.size[0]
y3 = row['y3'] * im_page.size[1]
coords = [x0,y0,x1,y1,x2,y2,x3,y3]
draw.polygon(coords,outline='blue',fill='yellow')
redacted_images_bin.append(im_page)
return redacted_images_bin
def redacted_pdf(self):
redacted_images = self._redact_images()
pdf = FPDF()
pdf.set_auto_page_break(0)
for index,img_redacted in enumerate(redacted_images):
img_redacted.save(f"image_{index}.png")
pdf.add_page()
pdf.image(f"image_{index}.png",w=210,h=297)
os.remove(f"image_{index}.png") # I would like to avoid file handling!
pdf.output("doc.pdf","F") # I would like to avoid file handling!
#return pdf #this is what I want, to return the pdf as base64 or binary

In documentation I found that you can get PDF as string using
pdf_string = pdf.output(dest='S')
so you can use standard module base64
import fpdf
import base64

pdf = fpdf.FPDF()
# ... add some elements ...

# With dest='S' PyFPDF returns the document as a `str` in which every
# character stands for one raw PDF byte, so it has to be encoded as latin-1.
# Encoding as utf-8 would turn each byte above 0x7F into two bytes and
# corrupt the compressed streams inside the PDF.
pdf_string = pdf.output(dest='S')
pdf_bytes = pdf_string.encode('latin-1')

# base64 output is pure ASCII, so it can be decoded to text safely.
base64_bytes = base64.b64encode(pdf_bytes)
base64_string = base64_bytes.decode('ascii')
print(base64_string)
Result:
JVBERi0xLjMKMyAwIG9iago8PC9UeXBlIC9QYWdlCi9QYXJlbnQgMSAwIFIKL1Jlc291cmNlcyAyIDAgUgovQ29udGVudHMgNCAwIFI+PgplbmRvYmoKNCAwIG9iago8PC9GaWx0ZXIgL0ZsYXRlRGVjb2RlIC9MZW5ndGggMTk+PgpzdHJlYW0KeMKcM1LDsMOiMsOQMzVXKMOnAgALw7wCEgplbmRzdHJlYW0KZW5kb2JqCjEgMCBvYmoKPDwvVHlwZSAvUGFnZXMKL0tpZHMgWzMgMCBSIF0KL0NvdW50IDEKL01lZGlhQm94IFswIDAgNTk1LjI4IDg0MS44OV0KPj4KZW5kb2JqCjIgMCBvYmoKPDwKL1Byb2NTZXQgWy9QREYgL1RleHQgL0ltYWdlQiAvSW1hZ2VDIC9JbWFnZUldCi9Gb250IDw8Cj4+Ci9YT2JqZWN0IDw8Cj4+Cj4+CmVuZG9iago1IDAgb2JqCjw8Ci9Qcm9kdWNlciAoUHlGUERGIDEuNy4yIGh0dHA6Ly9weWZwZGYuZ29vZ2xlY29kZS5jb20vKQovQ3JlYXRpb25EYXRlIChEOjIwMjIwMjE3MjExMDE3KQo+PgplbmRvYmoKNiAwIG9iago8PAovVHlwZSAvQ2F0YWxvZwovUGFnZXMgMSAwIFIKL09wZW5BY3Rpb24gWzMgMCBSIC9GaXRIIG51bGxdCi9QYWdlTGF5b3V0IC9PbmVDb2x1bW4KPj4KZW5kb2JqCnhyZWYKMCA3CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDE3NSAwMDAwMCBuIAowMDAwMDAwMjYyIDAwMDAwIG4gCjAwMDAwMDAwMDkgMDAwMDAgbiAKMDAwMDAwMDA4NyAwMDAwMCBuIAowMDAwMDAwMzU2IDAwMDAwIG4gCjAwMDAwMDA0NjUgMDAwMDAgbiAKdHJhaWxlcgo8PAovU2l6ZSA3Ci9Sb290IDYgMCBSCi9JbmZvIDUgMCBSCj4+CnN0YXJ0eHJlZgo1NjgKJSVFT0YK
As for image(): it needs a filename (or URL); it can't work with a string or io.BytesIO().
If necessary, you could take the source code and try to change it yourself.
There is even request on GitHub: Support for StringIO objects as images
EDIT:
I found that there is fork fpdf2 which can use pillow.Image in image() - see fpdf2 Image
And in source code I found image() can also work with io.BytesIO()
Example code for fpdf2 (output() gives bytes instead of string)
import fpdf
import base64
from PIL import Image
import io

#print(fpdf.__version__)

pdf = fpdf.FPDF()
pdf.add_page()

# image from a local file name
pdf.image('lenna.png')

# image from a URL
pdf.image('https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png')

# image from an open binary file (closed again as soon as it was embedded)
with open('lenna.png', 'rb') as f:
    pdf.image(f)

# image from a pillow Image
with Image.open('lenna.png') as img:
    pdf.image(img)

# image from an in-memory buffer
with open('lenna.png', 'rb') as f:
    b = io.BytesIO(f.read())
pdf.image(b)

# save in file
pdf.output('output.pdf')

# get as bytes
pdf_bytes = pdf.output()
#print(pdf_bytes)

base64_bytes = base64.b64encode(pdf_bytes)
base64_string = base64_bytes.decode('utf-8')
print(base64_string)
Wikipedia: Lenna [image]
Test for writing in fpdf2
import fpdf

pdf = fpdf.FPDF()
pdf.add_page()
pdf.image('https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png')

# --- test 1 --- let fpdf2 write the file itself
pdf.output('output-test-1.pdf')

# --- test 2 --- get the bytes and write them with a context manager
pdf_bytes = pdf.output()
with open('output-test-2.pdf', 'wb') as f:  # it will close automatically
    f.write(pdf_bytes)

# --- test 3 --- get the bytes and write them with an explicit close()
pdf_bytes = pdf.output()
f = open('output-test-3.pdf', 'wb')
f.write(pdf_bytes)
f.close()  # don't forget to close when you write

Related

Zoom and crop a pdf document using PyMuPDF fitz and saving as pdf

I am trying to crop a PDF inside an AWS Lambda function and save the file. Ideally I just want to zoom in, as otherwise the OCR package does not recognize some of the fonts. The rectangle I am using just seems to shift the margins instead of actually cropping or zooming in.
Thanks!
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO
OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]
def lambda_handler(event, context):
if event:
file_obj = event["Records"][0]
bucketname = str(file_obj["s3"]["bucket"]["name"])
filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))
doc = fitz.open()
s3 = boto3.resource('s3')
obj = s3.Object(bucketname, filename)
fs = obj.get()['Body'].read()
pdf=fitz.open("pdf", stream=BytesIO(fs))
rect=fitz.Rect(50, 50, 545, 792)
page = pdf[0]
page1 = doc.new_page(width = rect.width, # new page with ...
height = rect.height)
page1.show_pdf_page(rect, pdf, 0)
new_bytes = doc.write()
bucketname1='modified'
s3.Bucket(bucketname1).put_object(Key=filename, Body=new_bytes)

How to encrypt images with python

Preface: this is required for a class, I know ECB should not be used.
I am trying to encrypt images using AES and then display the images
Steps needed:
Read the image,
Convert to byte object,
Pad the bytes,
Encrypt the bytes,
Convert back to image object,
Save as image file
This is my code right now:
from PIL import Image
from Crypto.Cipher import AES
from Crypto import Random
img = Image.open("photo.jpg")
img.tobytes()
key = '0123456789abcdef'
mode = AES.MODE_ECB
encryptor = AES.new(key, mode)
img.frombytes("RGB")
At this point I am stuck. I am getting a "not enough image data" error on the line img.frombytes("RGB"), and I am also stuck on the step that pads the bytes.
So I needed a way to transfer files in the form of images (don't ask me why), if you just want to transfer text you can maybe create a txt file.
This is probably not exactly the best solution to the question as you probably want a way to hide data inside an existing image but I would like anyway to share the code in case it will help someone sometime somewhere.
Basically this will create an image with a size dependent on the file size and will put a sequence of 3 bytes in one pixel (RGB)
So I wrote a small folder2ImageEncoder.py
(it will encrypt all the data that is located in a folder named "files" by default)
from PIL import Image
from pathlib import Path
encryptedImagesFolder = 'encryptedImagesFolder'
Path(f"./{encryptedImagesFolder}").mkdir(parents=True, exist_ok=True)
newLine = b'\new\n\rL'
def encode_data_to_image(data: bytes, imagePath: str):
    """Pack *data* into the pixels of a square RGB PNG.

    The bytes get an end marker appended, are wrapped together with
    *imagePath* in the text form of a dict, and are then stored three bytes
    per pixel. The PNG is saved inside the encrypted-images folder under a
    name derived from *imagePath*.
    """
    payload = str(
        {
            'path': imagePath,
            'data': data + b'FINISH_OF_DATA'
        }
    ).encode()

    # Smallest square (side x side pixels) that is guaranteed to hold the
    # payload at three bytes per pixel.
    side = int((len(payload) / 3) ** 0.5) + 1
    print(side, len(payload))
    img = Image.new('RGB', (side, side))

    # Zero-pad so the payload splits evenly into RGB triples.
    remainder = len(payload) % 3
    if remainder:
        payload += b'\x00' * (3 - remainder)

    pixels = [tuple(payload[k:k + 3]) for k in range(0, len(payload), 3)]
    # Fill the rest of the square with black pixels.
    pixels.extend([(0, 0, 0)] * (side ** 2 - len(pixels)))

    img.putdata(pixels)
    imagePath = imagePath.replace('\\', '_')
    img.save(f'./{encryptedImagesFolder}/{imagePath}.png')
    # img.show()
def encode_folder(folder: Path):
    """Encode every file below *folder* into an image, recursing into sub-folders."""
    for entry in folder.iterdir():
        if entry.is_dir():
            encode_folder(folder=entry)
            continue
        with open(str(entry), 'rb') as handle:
            raw_lines = handle.readlines()
        # The lines are glued together with the custom separator so that the
        # decoder can split them apart again.
        encode_data_to_image(
            data=newLine.join(raw_lines),
            imagePath=str(entry))
if __name__ == '__main__':
# ./files is the name of the folder you want to encrypt
encode_folder(folder=Path(r'./files'))
and a Image2FilesDecoder.py
this will iterate through the encrypted images folder and will retrieve its former form (run this in another folder with the encrypted images folder so it won't override the original files folder)
from PIL import Image
from pathlib import Path
newLine = b'\new\n\rL'
def decode_encrypted_images(folder: Path):
    """Recreate the original files from every image found in *folder*.

    Each image is expected to hold the text form of a dict with the keys
    'path' and 'data', stored three bytes per pixel and followed by zero
    padding. The file is written back to the stored 'path', creating parent
    directories as needed.

    Raises:
        ValueError / SyntaxError: if an image does not contain a valid
            literal dict (for example a picture not produced by the encoder).
    """
    # Imported locally so the snippet stays self-contained.
    import ast

    # Length of the b'FINISH_OF_DATA' marker the encoder appends to the data.
    end_marker_length = 14

    for pic in folder.iterdir():
        with Image.open(str(pic)) as img:
            totalData = []
            for pixel in img.getdata():
                totalData.extend(pixel)
        rawBytes = bytes(totalData)

        # Everything after the last '}' is padding, not payload.
        payload = rawBytes[:rawBytes.rfind(b'}') + 1].decode()

        # literal_eval only accepts Python literals, so a crafted image cannot
        # run arbitrary code the way eval() would allow. The previous fallback
        # (bytes.replace with a str argument) could never succeed and has been
        # removed; a malformed image now raises the real parsing error.
        decryptedData = ast.literal_eval(payload)
        decryptedData['data'] = decryptedData['data'][:-end_marker_length]

        filePathObj = Path(decryptedData['path'])
        Path(filePathObj.parent).mkdir(
            parents=True, exist_ok=True)
        writeBytes = decryptedData['data'].split(newLine)
        with open(str(filePathObj), 'wb') as wb:
            wb.writelines(writeBytes)
if __name__ == '__main__':
decode_encrypted_images(folder=Path(
r".\encryptedImagesFolder"))

Wand convert pdf to jpeg and storing pages in file-like objects

I am trying to convert a PDF to JPEGs using wand, but I run into a problem when I iterate over the SingleImages in image.sequence and save each image separately. I am saving each image on AWS, with database references, using Django.
image_pdf = Image(blob=pdf_blob)
image_jpeg = image_pdf.convert('jpeg')
for img in image_jpeg.sequence:
memory_file = SimpleUploadedFile(
"{}.jpeg".format(img.page_number),
page.container.make_blob())
spam = Spam.objects.create(
page_image=memory_file,
caption="Spam")
This doesn't work: page.container refers to the parent Image class, so the first page is written over and over again. How do I get the second frame/page for saving?
Actually, you can get per-file blobs:
for img in image_jpeg.sequence:
img_page = Image(image=img)
Then you can work with each img_page variable like a full-fledged image: change format, resize, save, etc.
It seems you cannot get per file blobs without messing with c_types. So this is my solution
from path import path  # wrapper for os.path
import re
import shutil
import tempfile

image_pdf = Image(blob=pdf_blob)
image_jpeg = image_pdf.convert('jpeg')
temp_dir = path(tempfile.mkdtemp())
try:
    # set base file name (join); wand's save() needs the path as `filename=`
    image_jpeg.save(filename=temp_dir / 'pdf_title.jpeg')
    images = temp_dir.files()
    # Order the written page files by the number embedded in the file name.
    sorted_images = sorted(
        images,
        key=lambda img_path: int(re.search(r'\d+', img_path.name).group())
    )
    for img in sorted_images:
        with open(img, 'rb') as img_fd:
            memory_file = SimpleUploadedFile(
                img.name,
                img_fd.read()
            )
            spam = Spam.objects.create(
                page_image=memory_file,
                caption="Spam Spam",
            )
finally:
    # tempfile has no rmtree(); shutil.rmtree removes the directory and its
    # contents, and the finally clause runs it even if a page fails.
    shutil.rmtree(temp_dir)
Not as clean as doing it all in memory, but it gets it done.

Convert plain text to PDF in Python

For my project, I get a plain text file (report.txt) from another program. It is all formatted in plain text. If you open it in Notepad, it looks nice (as much as a plain text file can). When I open the file in Word and show the paragraph marks, I see the dots for spaces and the backwards P for paragraphs.
I need to convert this file to PDF and add some other PDF pages to make one final PDF. All this happens in Python.
I am having trouble converting the report.txt to pdf. I have ReportLab, and am able to read the file and make a few changes (like change the text to Courier), but the spacing gets lost. When the file gets read, it appears to strip any extra spaces.
Questions:
a) is there an easier way to convert the report.txt to pdf?
b) If not, is there a way to keep my spaces when I read the file?
c) Or is there a parameter I'm missing from my paragraph style that will keep the original look?
Here's my code:
# ------------------------------------
# Styles
# ------------------------------------
styleSheet = getSampleStyleSheet()
mystyle = ParagraphStyle(name='normal',fontName='Courier',
fontSize=10,
alignment=TA_JUSTIFY,
leading=1.2*12,
parent=styleSheet['Normal'])
#=====================================================================================
model_report = 'report.txt'
# Create document for writing to pdf
doc = SimpleDocTemplate(str(pdfPath), \
rightMargin=40, leftMargin=40, \
topMargin=40, bottomMargin=25, \
pageSize=A4)
doc.pagesize = portrait(A4)
# Container for 'Flowable' objects
elements = []
# Open the model report
infile = file(model_report).read()
report_paragraphs = infile.split("\n")
for para in report_paragraphs:
para1 = '<font face="Courier" >%s</font>' % para
elements.append(Paragraph(para1, style=mystyle))
doc.build(elements)
I've created a small helper function to convert a multi-line text to a PDF file in a "report look" by using a monospaced font. Too long lines are wrapped at spaces so that it will fit the page width:
import textwrap
from fpdf import FPDF
def text_to_pdf(text, filename):
    """Write multi-line *text* to the PDF file *filename* in 10 pt Courier.

    Lines that are too long for the page are wrapped at spaces; empty input
    lines are kept as blank lines. Pages break automatically.
    """
    a4_width_mm = 210
    pt_to_mm = 0.35
    fontsize_pt = 10
    fontsize_mm = fontsize_pt * pt_to_mm
    margin_bottom_mm = 10
    character_width_mm = 7 * pt_to_mm
    # textwrap needs a whole number of columns: with a float width it raises
    # TypeError as soon as it has to break a word longer than the line.
    width_text = int(a4_width_mm / character_width_mm)

    pdf = FPDF(orientation='P', unit='mm', format='A4')
    pdf.set_auto_page_break(True, margin=margin_bottom_mm)
    pdf.add_page()
    pdf.set_font(family='Courier', size=fontsize_pt)

    for line in text.split('\n'):
        lines = textwrap.wrap(line, width_text)

        # wrap() returns an empty list for a blank line; keep the blank line.
        if not lines:
            pdf.ln()

        for wrap in lines:
            pdf.cell(0, fontsize_mm, wrap, ln=1)

    pdf.output(filename, 'F')
This is how you would use this function to convert a text file to a PDF file:
input_filename = 'test.txt'
output_filename = 'output.pdf'

# The context manager closes the file even if reading fails, and the handle
# name no longer shadows the old `file` builtin.
with open(input_filename) as text_file:
    text = text_file.read()

text_to_pdf(text, output_filename)
ReportLab is the usual recommendation-- as you can see from the "Related" questions on the right side of this page.
Have you tried creating text with just StyleSheet['Normal']? I.e., if you get proper-looking output with the following, the problem is somehow with your style.
Paragraph(para1, style=StyleSheet['Normal'])
For converting text or text file into pdf, module fpdf shall be installed using pip install fpdf in command-line Interface.
run the below code and you will find the pdf file in folder-
from fpdf import FPDF

pdf = FPDF()

# Add a page
pdf.add_page()

# set style and size of font
# that you want in the pdf
pdf.set_font("Arial", size = 15)

# open the text file in read mode (closed automatically by the with block)
with open("path where text file is stored\\File_name.txt", "r") as f:
    # insert the texts in pdf, one cell per line; the trailing newline is
    # dropped because ln=1 already moves to the next line
    for x in f:
        pdf.cell(50, 5, txt = x.rstrip("\n"), ln = 1, align = 'C')

# save the pdf with name .pdf
pdf.output("path where you want to store pdf file\\File_name.pdf")
reference: https://www.geeksforgeeks.org/convert-text-and-text-file-to-pdf-using-python/
I had similar issue. I solved with this code:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from PIL import Image
# .....
# ..... some extra code unimportant for this issue....
# ....
# here it is
with open("tafAlternos.txt", "r") as ptr:  # text file I need to convert
    lineas = ptr.readlines()

# drawString/showPage are methods of a Canvas object, not of the `canvas`
# module, so a Canvas has to be created first.
# NOTE(review): `letter` is used because it is imported above - confirm the
# intended page size.
pdf_canvas = canvas.Canvas("output_file.pdf", pagesize=letter)

LINES_PER_PAGE = 60  # I'm gonna write every 60 lines because I need it like that
for inicio in range(0, len(lineas), LINES_PER_PAGE):
    i = 750  # y position of the first line on each page
    for linea in lineas[inicio:inicio + LINES_PER_PAGE]:
        pdf_canvas.drawString(15, i, linea.strip())
        i -= 12
    pdf_canvas.showPage()

# Without save() no PDF file is written.
pdf_canvas.save()
Pdf looks exactly same as original text file
You can create a canvas with pdf_canvas = canvas.Canvas('output_file.pdf') and generate the PDF with pdf_canvas.save().

Place image over PDF

How can I place an image over an existing PDF file at a specific coordinate location? The PDF represents a drawing sheet with one page. The image will be scaled. I'm checking ReportLab but can't find the answer. Thanks.
It's been 5 years; I think these answers need some TLC. Here is a complete solution.
The following is tested with Python 2.7
Install dependencies
pip install reportlab
pip install pypdf2
Do the magic
from reportlab.pdfgen import canvas
from PyPDF2 import PdfFileWriter, PdfFileReader

# Create the watermark from an image
c = canvas.Canvas('watermark.pdf')
# Draw the image at x, y. I positioned the x,y to be where i like here
c.drawImage('test.png', 15, 720)
# Add some custom text for good measure
c.drawString(15, 720,"Hello World")
c.save()

# Both input files stay open until the output is written, and are closed
# afterwards by the with block.
with open("watermark.pdf", "rb") as watermark_stream, \
        open("test2.pdf", "rb") as input_stream:
    # Get the watermark file you just created
    watermark = PdfFileReader(watermark_stream)
    # Get our files ready
    output_file = PdfFileWriter()
    input_file = PdfFileReader(input_stream)
    # Number of pages in input document
    page_count = input_file.getNumPages()
    # Go through all the input file pages to add a watermark to them
    for page_number in range(page_count):
        # print() with a single argument works on both Python 2.7 and 3;
        # page_number is 0-based, so show it 1-based.
        print("Watermarking page {} of {}".format(page_number + 1, page_count))
        # merge the watermark with the page
        input_page = input_file.getPage(page_number)
        input_page.mergePage(watermark.getPage(0))
        # add page from input file to output document
        output_file.addPage(input_page)
    # finally, write "output" to document-output.pdf
    with open("document-output.pdf", "wb") as outputStream:
        output_file.write(outputStream)
References:
pypdf project page:
https://pypi.org/project/pypdf/
Reportlab docs:
http://www.reportlab.com/apis/reportlab/2.4/pdfgen.html
Reportlab complete user guide:
https://www.reportlab.com/docs/reportlab-userguide.pdf
https://pypi.org/project/pypdf/:
from pypdf import PdfWriter, PdfReader
writer = PdfWriter()
reader = PdfReader("document1.pdf")
watermark = PdfReader("watermark.pdf")
page = reader.pages[0]
page.merge_page(watermark.pages[0])
writer.add_page(page)
# finally, write the results to disk
with open("document-output.pdf", "wb") as fp:
writer.write(fp)
I think it's like watermark, see the documentation for more information
I combined ReportLab and pypdf to insert an image directly without having to generate the PDF up front:
from pyPdf import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from StringIO import StringIO

# Using ReportLab to insert image into PDF
imgTemp = StringIO()
imgDoc = canvas.Canvas(imgTemp)

# Draw image on Canvas and save PDF in buffer
imgPath = "path/to/img.png"
imgDoc.drawImage(imgPath, 399, 760, 160, 160)    ## at (399,760) with size 160x160
imgDoc.save()

# Use PyPDF to merge the image-PDF into the template.
# The template stays open until the result has been written.
with open("document.pdf", "rb") as template_stream:
    page = PdfFileReader(template_stream).getPage(0)
    overlay = PdfFileReader(StringIO(imgTemp.getvalue())).getPage(0)
    page.mergePage(overlay)

    # Save the result. A PDF is binary data, so the file must be opened in
    # "wb" mode; text mode "w" can corrupt it on Windows.
    output = PdfFileWriter()
    output.addPage(page)
    with open("output.pdf", "wb") as output_stream:
        output.write(output_stream)
Thx to the previous answers. My way with python3.4
# -*- coding: utf-8 -*-
from io import BytesIO
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
def gen_pdf():
    """Build output.pdf with one A4 page per slide image (1.jpg ... 66.jpg)."""
    # there are 66 slides (1.jpg, 2.jpg, 3.jpg...)
    path = 'slades/{0}.jpg'
    pdf = PdfFileWriter()

    for num in range(1, 67):  # for each slide
        # Using ReportLab Canvas to insert image into PDF
        imgTemp = BytesIO()
        imgDoc = canvas.Canvas(imgTemp, pagesize=A4)
        # Draw image on Canvas and save PDF in buffer
        imgDoc.drawImage(path.format(num), -25, -45)
        # x, y - start position
        # in my case -25, -45 needed
        imgDoc.save()
        # Use PyPDF to merge the image-PDF into the template
        pdf.addPage(PdfFileReader(BytesIO(imgTemp.getvalue())).getPage(0))

    # The with block flushes and closes the output file.
    with open("output.pdf", "wb") as output_stream:
        pdf.write(output_stream)
if __name__ == '__main__':
gen_pdf()
This is quite easy to do with PyMuPDF without merging two PDFs:
import fitz

src_pdf_filename = 'source.pdf'
dst_pdf_filename = 'destination.pdf'
img_filename = 'barcode.jpg'

# http://pymupdf.readthedocs.io/en/latest/rect/
# Set position and size according to your needs
img_rect = fitz.Rect(100, 100, 120, 120)

document = fitz.open(src_pdf_filename)

# We'll put image on first page only but you could put it elsewhere
page = document[0]
# insert_image is the current name; the camelCase insertImage is a
# deprecated alias that newer PyMuPDF releases no longer provide by default.
page.insert_image(img_rect, filename=img_filename)

# See http://pymupdf.readthedocs.io/en/latest/document/#Document.save and
# http://pymupdf.readthedocs.io/en/latest/document/#Document.saveIncr for
# additional parameters, especially if you want to overwrite existing PDF
# instead of writing new PDF
document.save(dst_pdf_filename)

document.close()
This is what worked for me
from PyPDF2 import PdfFileWriter, PdfFileReader
def watermarks(temp, watermar, new_file):
    """Stamp the first page of a watermark PDF onto every page of a template.

    Args:
        temp: path of the PDF whose pages receive the watermark.
        watermar: path of the PDF whose first page is used as the watermark.
        new_file: path the watermarked PDF is written to.
    """
    # The writer was never created in the original, which raised NameError.
    output = PdfFileWriter()

    # Both inputs stay open until the output has been written.
    with open(temp, 'rb') as template_file, open(watermar, 'rb') as watermark_file:
        template = PdfFileReader(template_file)
        watermark = PdfFileReader(watermark_file).getPage(0)

        # range() works on Python 2 and 3; xrange exists only on Python 2.
        for i in range(template.getNumPages()):
            page = template.getPage(i)
            page.mergePage(watermark)
            output.addPage(page)

        with open(new_file, 'wb') as f:
            output.write(f)

Categories