Save PDF file as images with same quality as original PDF - python

I want to save each page of a pdf file as a single image file:
import fitz
# Write every page of file.pdf out twice: once as PNG and once as JPEG.
# get_pixmap() is called without arguments here, so no resolution is requested.
doc = fitz.open('file.pdf')
for i, page in enumerate(doc):
    pix = page.get_pixmap()
    pix.save(f'page-{i}.png')
    # The dpi keyword is only passed to the JPEG writer.
    pix.pil_save(f'page-{i}.jpg', optimize = False, dpi = (1500, 1500))
The images are in worse quality than in the original pdf file, no matter which resolution I choose. How can I save them with the same or a similar quality?

This only needs one small change: pass the dpi option to get_pixmap() so that each page is rendered at a higher resolution.
import fitz
# Render every page at an explicit resolution and save it as PNG and JPEG.
doc = fitz.open('file.pdf')
resolution_parameter = 300  # rendering resolution, in dots per inch
for i, page in enumerate(doc):
    pix = page.get_pixmap(dpi = resolution_parameter)
    pix.save(f'page-{i}.png')
    # Tag the JPEG with the resolution the page was actually rendered at, so
    # the stored density metadata agrees with the pixel data.
    pix.pil_save(
        f'page-{i}.jpg',
        optimize = False,
        dpi = (resolution_parameter, resolution_parameter),
    )

Related

Python, JSignature, and ReportLab

I'm looking to write a signature to PDF. I'm using JSignature and Reportlab. My code works successfully for writing the data to a file and the database. I just cannot figure out how to write the signature to the canvas. Has anyone passed the signature into the canvas successfully?
Thank you in advance.
Here's a look at my code:
pdf.py
import io
from django.core.files.base import ContentFile
from reportlab.lib.units import inch
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
def create_pdf(parent):
    """Build a one-page PDF for a participant and store it on the record.

    The page holds a bold title, the participant's name and a
    ``Signature:`` label. The finished PDF is wrapped in a Django
    ``ContentFile``, assigned to the record's ``pdf`` attribute, and the
    record is saved.

    Args:
        parent: Object providing ``first_name``, ``middle_initial``,
            ``last_name``, a ``pdf`` attribute and a ``save()`` method.
    """
    # NOTE(review): the original body read an undefined name ``participant``.
    # It is bound to the argument here on the assumption that the caller
    # passes the participant record itself -- confirm against the caller.
    participant = parent

    # In-memory buffer that the canvas uses as its "file".
    buffer = io.BytesIO()
    p = canvas.Canvas(buffer)

    # Start the text at the top left of the page.
    textobject = p.beginText()
    textobject.setTextOrigin(inch, 11*inch)

    # Title, followed by an empty line.
    textobject.setFont("Helvetica-Bold", 18)
    textobject.textLine("My Document")
    textobject.textLine("")

    # Body text for page 1.
    textobject.setFont("Helvetica", 12)
    p_name = f'Name: {participant.first_name} {participant.middle_initial} {participant.last_name}'
    textobject.textLine(p_name)
    sig = 'Signature:'
    textobject.textLine(sig)
    # TODO: draw the signature image here. The original snippet only carried
    # a bare "----insert signature here----" marker, which is not valid Python.

    # Write the text to the canvas and finish the page and the document.
    p.drawText(textobject)
    p.showPage()
    p.save()

    # getvalue() returns the whole buffer regardless of the stream position.
    pdf_data = buffer.getvalue()

    # Wrap the bytes in a Django file object, name it, and store it.
    file_data = ContentFile(pdf_data)
    file_data.name = f'{participant.last_name}.pdf'
    participant.pdf = file_data
    participant.save()
Model:
class Participant(models.Model):
    """Person record holding name parts, a captured signature and a PDF file."""
    # Name parts; only first_name is declared without blank=True.
    first_name = models.CharField(max_length=50)
    middle_initial = models.CharField(max_length=50, blank=True)
    last_name = models.CharField(max_length=50, blank=True)
    # Signature captured through the JSignature field type.
    signature = JSignatureField()
    # Optional file; presumably the PDF produced by create_pdf() -- confirm.
    pdf = models.FileField(blank=True, null=True)
For those interested, here is how I got this working. The primary issue was that the image came out completely black when it was pulled into the PDF. Here's what is required:
In your View:
Use the Jsignature draw_signature function and get the image:
# Turn the captured signature data into an image object (it is saved as PNG in the next step).
rsr_image = draw_signature(signature)
save the signature as a PNG and then store
# Save the signature as PNG to prevent darkening, then store it on the model.
rsr_file_name = str(new_parent.id)+'_rsr.png'  # file name derived from the record id
buffer = BytesIO()  # in-memory target for the encoded PNG
rsr_image.save(buffer, 'PNG')
# Hand the in-memory PNG to the record's rsr_image field under the name built above.
new_parent.rsr_image.save(rsr_file_name, File(buffer))
Create the following function, in order to…
Open the image, create a new background for the image, and save it.
def get_jpeg_image(new_parent):
    """Flatten the stored signature PNG onto white and save it as a JPEG.

    JPEG has no transparency, so the signature is pasted onto a white
    ``RGB`` background of the same size, using its own alpha band as the
    mask.

    Args:
        new_parent: Object providing ``rsr_image`` (the stored PNG) and ``id``.

    Returns:
        The file name of the JPEG that was written, ``"<id>.jpg"``, relative
        to the current working directory.
    """
    with Image.open(new_parent.rsr_image) as source:
        # Normalise to RGBA so an alpha band always exists; a PNG without
        # one cannot serve as a paste mask and would raise ValueError.
        png_image = source.convert("RGBA")

    # New image with the same size and a white (255, 255, 255) background.
    bg = Image.new("RGB", png_image.size, (255, 255, 255))
    # Paste the signature pixels, masked by the signature's alpha band.
    bg.paste(png_image, mask=png_image)

    file_name_jpeg = str(new_parent.id)+'.jpg'
    bg.save(file_name_jpeg)
    return file_name_jpeg
Reference that function inside your create PDF function to convert the PNG to JPG
# Convert the stored signature to a white-backed JPEG; the helper returns the file name it wrote.
jpeg_image = get_jpeg_image(participant)
Hope this helps someone.

Export to image not pdf in python PyPDF2 package

I have the following code that crops part of pdf file then save the output as PDF
from PyPDF2 import PdfFileWriter, PdfFileReader
# Adjust the trim and crop boxes of the first page of Sample.pdf and write
# that page to Output.pdf. The output is written while the input file is
# still open.
with open("Sample.pdf", "rb") as in_f:
    input1 = PdfFileReader(in_f)
    output = PdfFileWriter()

    numPages = input1.getNumPages()
    print("Document Has %s Pages." % numPages)

    # Only the first page (index 0) is processed.
    i = 0
    page = input1.getPage(i)
    print(page.mediaBox.getUpperRight_x(), page.mediaBox.getUpperRight_y())

    # Box corners are (x, y) pairs.
    page.trimBox.lowerLeft = (280, 280)
    page.trimBox.upperRight = (220, 200)
    page.cropBox.lowerLeft = (100, 720)
    page.cropBox.upperRight = (220, 800)
    output.addPage(page)

    with open("Output.pdf", "wb") as out_f:
        output.write(out_f)
How can I save as an image not as PDF?
I found this code but the output is not at high quality. How can I improve the quality of the image output?
import fitz
# Render the first page of Output.pdf without any scaling argument and write it to disk.
output = "Output.jpg"
pdffile = "Output.pdf"
doc = fitz.open(pdffile)
page = doc.loadPage(0)
pix = page.getPixmap()
# NOTE(review): writePNG is handed a file name that ends in ".jpg".
pix.writePNG(output)
Hi there. You could use the pdf2image library to achieve this.
Add the following code at the end of your script:
from pdf2image import convert_from_path
# Convert Output.pdf to a list of images and save each one as a numbered JPEG.
images = convert_from_path('Output.pdf')
for i, page_image in enumerate(images):
    page_image.save('Output' + str(i) + '.jpg', 'JPEG')
Then, if you wish, you can use the os library to delete the intermediate PDF with the following code, so that you don't have to delete it yourself.
import os
# Delete the intermediate PDF from the current working directory.
os.remove("Output.pdf")
This solves the problem but I welcome any advanced ideas and improvements
import fitz
# Render page 0 of Output.pdf through a scaling matrix for a sharper image.
pdffile = "Output.pdf"
doc = fitz.open(pdffile)
zoom = 2  # zoom factor, applied to both axes
mat = fitz.Matrix(zoom, zoom)
page = doc.loadPage(0)
pix = page.getPixmap(matrix = mat)
# writePNG emits PNG data, so the file gets a matching extension instead of
# the misleading ".jpg" used before.
output = "Output.png"
pix.writePNG(output)

Python Image extraction sequence from pdf

I was trying to extract images from a pdf using PyMuPDF (fitz). My pdf has multiple images in a single page. I am maintaining a proper sequence number while saving my images. I saw that the images being extracted don't follow a proper sequence. Sometimes it is starting to extract from the bottom, sometimes from the top and so on. Is there a way to modify my code so that the extraction follow a proper sequence?
Given below is the code I am using :
import fitz
from PIL import Image
# Save every image referenced by every page as "<page>-<image>.jpg", with
# both numbers starting at 1.
filename = "document.pdf"
doc = fitz.open(filename)
for i in range(len(doc)):
    # Derive the page number from the loop index. The previous code reset a
    # counter to 1 at the top of every iteration, so every page produced the
    # same "1-N.jpg" names and later pages overwrote earlier ones.
    p_no = i + 1
    img_num = 0
    for img in doc.getPageImageList(i):
        xref = img[0]  # first entry of each image record is the xref
        pix = fitz.Pixmap(doc, xref)
        img_num += 1
        target = "%s-%s.jpg" % (str(p_no), str(img_num))
        if pix.n - pix.alpha < 4:
            # Fewer than four colour components: write the pixmap as it is.
            pix.writeImage(target)
        else:
            # Otherwise convert to RGB before writing.
            pix1 = fitz.Pixmap(fitz.csRGB, pix)
            pix1.writeImage(target)
            pix1 = None
        pix = None
Given below is a sample page of the pdf
I have the same problem I've used the following code:
import fitz
import io
from PIL import Image
# Extract every embedded image of every page and save it as
# "image<page>_<index>.<ext>", numbering pages and images from 1.
file = "file_path"
pdf_file = fitz.open(file)
for page_index in range(len(pdf_file)):
    # get the page itself
    page = pdf_file[page_index]
    # fetch the image list once and reuse it for both the report and the loop
    image_list = page.getImageList()
    # printing number of images found in this page
    if image_list:
        print(f"[+] Found {len(image_list)} images in page {page_index}")
    else:
        print("[!] No images found on the given pdf page", page_index)
    for image_index, img in enumerate(image_list, start=1):
        print(img)
        print(image_index)
        # get the XREF of the image
        xref = img[0]
        # extract the image bytes
        base_image = pdf_file.extractImage(xref)
        image_bytes = base_image["image"]
        # get the image extension
        image_ext = base_image["ext"]
        # load it to PIL
        image = Image.open(io.BytesIO(image_bytes))
        # save it to local disk; the with-block closes the output handle,
        # which the previous inline open(...) call never did
        with open(f"image{page_index+1}_{image_index}.{image_ext}", "wb") as out_file:
            image.save(out_file)
The most likely fix is to take the entries of the image list (the 'img' values) and sort them before saving.
I'd love to hear any further suggestions, or whether you have found a better idea or solution.

Change color scheme when extracting an image from PDF in Python

I am trying to read an image from a pdf following this post:
Extract images from PDF without resampling, in python?
So far I have managed to get the image file out of the PDF, but it uses a CMYK color scheme and the picture comes out messed up.
My code is the following:
import PyPDF2
import struct
# Dump the raw data of DCT-encoded image XObjects on page index 4 to image.jpg.
pdf_filename = 'document.pdf'
# Both files are opened in with-blocks so they are closed even when a lookup
# below raises.
with open(pdf_filename, 'rb') as pdf_file:
    cond_scan_reader = PyPDF2.PdfFileReader(pdf_file)
    page = cond_scan_reader.getPage(4)
    xObject = page['/Resources']['/XObject'].getObject()
    for obj in xObject:
        entry = xObject[obj]
        print(entry)
        if entry['/Subtype'] == '/Image':
            if entry['/Filter'] == '/DCTDecode':
                data = entry._data
                # Every matching image is written to the same file name, so
                # only the last one survives.
                with open("image" + ".jpg", "wb") as img:
                    img.write(data)
The point is that when I save, the colors are all weird, I believe it's because of the colorScheme. I have the following in the console:
{'/Type': '/XObject', '/Subtype': '/Image', '/Width': 1122, '/Height': 502, '/Interpolate': <PyPDF2.generic.BooleanObject object at 0x1061574a8>, '/ColorSpace': '/DeviceCMYK', '/BitsPerComponent': 8, '/Filter': '/DCTDecode'}
As you can see, the ColorSpace is CMYK, and I believe that's why the colors of the image are weird.
That's the image I have:
This is the original image (it is inside a pdf file):
Can anyone help me?
Thanks in advance.
Israel
A CMYK-mode JPEG image embedded in a PDF must be inverted.
However, PIL does not support inverting CMYK-mode images,
so I solved it using numpy.
The full source is at the link below.
https://github.com/Gaia3D/pdfImageExtractor/blob/master/extrectImage.py
# Invert every byte of the image (255 - value), rebuild an image with the
# same mode and size, and save it as a JPEG.
imgData = np.frombuffer(img.tobytes(), dtype='B')
invData = np.full_like(imgData, 255) - imgData
img = Image.frombytes(img.mode, img.size, invData.tobytes())
img.save(outFileName + ".jpg")

Image drawn to reportlab pdf bigger than pdf paper size

I'm writing a program which takes all the pictures in a given folder and aggregates them into a PDF. The problem I have is that when the images are drawn, they are bigger than the page and are oddly rotated to the left. I've searched everywhere and haven't found anything, even in the ReportLab documentation.
Here's the code:
import os
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm
from StringIO import StringIO
def main():
    """Overlay each image found by image_search() onto page 0 of sample.pdf.

    For every image a one-page "watermark<N>.pdf" is created with ReportLab,
    merged onto the first page of sample.pdf with PyPDF2, and the merged page
    is added to the writer that ends up in output.pdf.

    NOTE(review): the indentation of this snippet was lost; the nesting below
    (in particular whether output.write() sits inside the loop) is a
    reconstruction -- confirm. The code also uses the Python 2 builtin file().
    """
    images = image_search()
    output = PdfFileWriter()
    for image in images:
        Image_file = Image.open(image) # opened only to read the pixel dimensions
        width, height = Image_file.size
        im_width = 1 * cm
        # Use ReportLab to place the image in its own one-page PDF,
        # named after the image's position in the list
        watermark_str = "watermark" + str(images.index(image)) + '.pdf'
        imgDoc = canvas.Canvas(watermark_str)
        # Aspect ratio (height / width) of the source image
        aspect = height / float(width)
        ## Draw the image at the origin (0, 0), im_width wide, height scaled by the aspect ratio
        imgDoc.drawImage(image, 0,0, width = im_width, height = (im_width * aspect))
        imgDoc.showPage()
        imgDoc.save()
        # Get the watermark file just created (this handle is never closed)
        watermark = PdfFileReader(open(watermark_str, "rb"))
        # Re-open sample.pdf and take its first page (this handle is never closed)
        pdf1File = open('sample.pdf', 'rb')
        page = PdfFileReader(pdf1File).getPage(0)
        page.mergePage(watermark.getPage(0))
        # Collect the merged page
        output.addPage(page)
    # Write all collected pages to output.pdf
    output.write(file("output.pdf","wb"))
def image_search():
    """Return the names of the image files in the current directory.

    An entry counts as an image when its name ends in ``.jpg``, ``.jpeg`` or
    ``.png``. The comparison ignores case, so ``.JPEG`` or ``.Png`` match as
    well as the spellings that were previously hard-coded.

    Returns:
        list[str]: Matching entry names, in the order ``os.listdir`` yields them.
    """
    image_ext = ('.jpg', '.jpeg', '.png')
    # str.endswith accepts a tuple, so one call covers every extension.
    return [doc for doc in os.listdir(os.curdir) if doc.lower().endswith(image_ext)]
# Invoke main() at module level.
main()
I also tried scaling and specifying the aspect ratio using the im_width variable, which gave the same output.
After a little bit of confusion about your goal, I figured out that you want to make a PDF overview of the images in the current folder. To do so we actually don't need PyPDF2, as ReportLab offers everything we need for this.
See the code below with the comments as guidelines:
def main():
    """Create ``overview.pdf`` with one A4 page per image in the current folder.

    Each image returned by ``image_search()`` is scaled, keeping its aspect
    ratio, to the largest size that fits inside the page, and is anchored to
    the top-left corner of its own page.
    """
    # Imported locally because the page-size constant is not among the
    # imports the surrounding script shows.
    from reportlab.lib.pagesizes import A4

    output_file_loc = "overview.pdf"
    imgDoc = canvas.Canvas(output_file_loc)
    imgDoc.setPageSize(A4)
    document_width, document_height = A4

    for image in image_search():
        # Open the image only to read its pixel dimensions, then close it.
        with Image.open(image) as image_file:
            image_width, image_height = image_file.size

        # Fit inside the page in both directions. Scaling by width alone let
        # tall images run past the bottom edge of the page.
        scale = min(document_width / float(image_width),
                    document_height / float(image_height))
        print_width = image_width * scale
        print_height = image_height * scale

        # ReportLab's origin (0, 0) is the bottom-left corner, so the image's
        # lower edge is placed print_height below the top of the page.
        imgDoc.drawImage(image, 0, document_height - print_height,
                         width=print_width, height=print_height)
        # Start a new page for the next image.
        imgDoc.showPage()

    # Save the document.
    imgDoc.save()

Categories