How to add a relative file path inside a pdf using pypdf - python

Context
I have a pdf with links.
I want to replace all the external links with local files in the same folder.
Is there a way to do that with pypdf, or with Python in general?
e.g.
outputStream = open("destination.pdf", "wb")
key = '/Annots'
uri = '/URI'
ank = '/A'
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
cwd = os.getcwd()
for x in range(existing_pdf.getNumPages()):
page = existing_pdf.getPage(x)
page_object = page.getObject()
if key in page_object:
ann = page_object[key]
for a in ann:
u = a.getObject()
if uri in u[ank]:
test = u[ank][uri]
test1 = u[ank].keys()
u[TextStringObject(ank)][TextStringObject(uri)] = TextStringObject(f"file:./foo1.pdf")
output.addPage(page)
# finally, write "output" to a real file
output.write(outputStream)
outputStream.close()
The above does not work, i.e. foo1.pdf is not linked properly.
If I use an absolute URI such as "file:///{CWD}/foo1.pdf" instead, it works.
Is there a way to use only a relative path?

After reading through the pdf structure and documentation I was able to write the following and it works as expected.
# Replace every external URI link with a /Launch action that points at a
# local file through a relative path.
# Relies on names set up earlier in the script: existing_pdf (reader),
# output (writer), outputStream (open binary file) and the key / uri / ank
# name strings ('/Annots', '/URI', '/A').
for page_number in range(existing_pdf.getNumPages()):
    page = existing_pdf.getPage(page_number)
    page_object = page.getObject()
    if key in page_object:
        for annotation_ref in page_object[key]:
            annotation = annotation_ref.getObject()
            # Annotations without an action dictionary (/A), or whose action
            # is not a URI action, are left untouched instead of raising
            # KeyError.
            if ank not in annotation or uri not in annotation[ank]:
                continue
            action = annotation[ank]
            # Turn the URI action into a Launch action: drop /URI, then set
            # the action type (/S), the target file (/F) and /NewWindow.
            del action[NameObject(uri)]
            # Dictionary keys and the action type are PDF names, so they are
            # built with NameObject; only the path itself is a text string.
            action[NameObject('/F')] = TextStringObject("./sheets/sheet1.pdf")
            action[NameObject('/S')] = NameObject("/Launch")
            # A real boolean: any non-empty string (even "false") is truthy.
            action[NameObject('/NewWindow')] = BooleanObject(True)
    output.addPage(page)
# finally, write "output" to a real file
output.write(outputStream)
outputStream.close()

Related

How to know the cause for assertion errors Python?

I am writing a script that adds custom properties to PDF files using PdfMerger() from PyPDF2. It works fine for almost all the files, but for a few of them an error is raised from some function inside PdfMerger. I don't understand what exactly is causing this error or how to fix it. Here is the entire program, since I am not sure a snippet alone would be helpful.
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfMerger
df = pd.read_excel('S:\\USERS\\VINTON\\F001A - Item Master (Stock and Cost)- 270001.xlsx')
folder_path = "U:\\BMP" pdf_files = [os.path.splitext(f)[0] for f in os.listdir(folder_path) if f.endswith('.pdf')]
for EachFile in pdf_files:
search_value = EachFile
print(EachFile)
search_result = df[df['Item Number 02'] == search_value]
# Find the corresponding value in the "Name" column of the same w
if not search_result.empty:
print("Found in JDE")
Revision = search_result['Rev'].values[0]
Description = search_result['Item Description 01'].values[0]
FileName = "U:\\BMP\\" + search_value + ".pdf"
# Get the file from BMP Folder
file_in = open(FileName, 'rb')
pdf_reader = PdfReader(file_in)
if pdf_reader.is_encrypted:
print("Encrypted")
continue
metadata = pdf_reader.metadata
# Adding entire existing file to the new file created
pdf_merger = PdfMerger()
pdf_merger.append(file_in)
pdf_merger.add_metadata({
'/Revision': Revision,
'/Description': Description
})
file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value ".pdf", 'wb')
pdf_merger.write(file_out)
file_in.close()
file_out.close()
print("All Done!!")
I cannot figure out how to get past these assertion errors, because the traceback shows them occurring several layers below the high-level calls I make.
There is a "+" sign missing before ".pdf" in this line:
file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value ".pdf", 'wb')
try this:
# Backslashes are doubled so the trailing one does not escape the closing
# quote; the "+" before ".pdf" joins the extension onto the file name.
file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value + ".pdf", 'wb')
hope it works
Wrap the reading and merging of each PDF file in try/except so that the exception message is reported when a file fails. Surfacing errors like this is good practice whenever you work with files or memory during development.
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfMerger
# Copy every PDF in U:\BMP whose file name matches an item number in the
# spreadsheet into the BMP-Rev folder, adding /Revision and /Description
# metadata taken from the matching spreadsheet row.
df = pd.read_excel('S:\\USERS\\VINTON\\F001A - Item Master (Stock and Cost)- 270001.xlsx')
folder_path = "U:\\BMP"
pdf_files = [os.path.splitext(f)[0] for f in os.listdir(folder_path) if f.endswith('.pdf')]
for EachFile in pdf_files:
    search_value = EachFile
    print(EachFile)
    search_result = df[df['Item Number 02'] == search_value]
    # Files with no matching row in the spreadsheet are skipped.
    if search_result.empty:
        continue
    print("Found in JDE")
    # str(): spreadsheet cells may come back as numeric types, while the
    # metadata values are written as text.
    # NOTE(review): an empty cell would become the text "nan" - confirm
    # whether such rows should be skipped instead.
    Revision = str(search_result['Rev'].values[0])
    Description = str(search_result['Item Description 01'].values[0])
    FileName = "U:\\BMP\\" + search_value + ".pdf"
    OutName = "S:\\USERS\\VINTON\\BMP-Rev\\" + search_value + ".pdf"
    try:
        # "with" closes the input on every path, including the "continue"
        # for encrypted files and any exception raised while merging.
        with open(FileName, 'rb') as file_in:
            pdf_reader = PdfReader(file_in)
            if pdf_reader.is_encrypted:
                print("Encrypted")
                continue
            # Adding entire existing file to the new file created
            pdf_merger = PdfMerger()
            pdf_merger.append(file_in)
            pdf_merger.add_metadata({
                '/Revision': Revision,
                '/Description': Description
            })
            # The output is opened only once the merge has succeeded, so a
            # failing input does not leave an empty file behind.
            with open(OutName, 'wb') as file_out:
                pdf_merger.write(file_out)
    except Exception as e:
        # Report which file failed and move on; writing with a merger that
        # was never (or only partly) built would fail again or write junk.
        print("Failed to process " + FileName + ": " + str(e))
        continue
print("All Done!!")

Delete all pdf files in a folder using python

I am trying to convert all pdf files to .jpg files and then remove them from the directory. I am able to convert all pdf's to jpg's but when I try to delete them, I get the error "The process is being used by another person".
Could you please help me?
Below is the code
The script below will convert all PDFs to JPEGs and store them in the same location.
for fn in files:
doc = fitz.open(pdffile)
page = doc.loadPage(0) # number of page
pix = page.getPixmap()
fn1 = fn.replace('.pdf', '.jpg')
output = fn1
pix.writePNG(output)
os.remove(fn) # one file at a time.
path = 'D:/python_ml/Machine Learning/New folder/Invoice/'
i = 0
for file in os.listdir(path):
path_to_zip_file = os.path.join(path, folder)
if file.endswith('.pdf'):
os.remove(file)
i += 1
As #K J noted in their comment, most probably the problem is with files not being closed, and indeed your code misses closing the doc object(s).
(Based on the line fitz.open(pdffile), I guess you use the pymupdf library.)
The problematic fragment:
doc = fitz.open(pdffile)
page = doc.loadPage(0) # number of page
pix = page.getPixmap()
fn1 = fn.replace('.pdf', '.jpg')
output = fn1
pix.writePNG(output)
...should be adjusted, e.g., in the following way:
# Opening the document in a "with" block closes it as soon as the block
# ends, so the PDF is no longer held open when it is removed afterwards.
with fitz.open(pdffile) as doc:
    page = doc.loadPage(0)  # number of page
    pix = page.getPixmap()
    output = fn.replace('.pdf', '.jpg')
    pix.writePNG(output)
(Side note: the fn1 variable seems to be completely redundant, so I got rid of it. Also, shouldn't pdffile be replaced with fn? What is pdffile, actually?)

How to extract only the first page of each PDF file in a directory that has multiple files?

I have created two directories named input and output. The input directory has more than one PDF file, and each file has multiple pages. I am trying to take the first page of every PDF file and save it in the output directory.
Below is the code I am trying:
import os
from PyPDF2 import PdfFileWriter, PdfFileReader
in_path = "D:/data/input/"
out_path = "D:/data/output/"
output = PdfFileWriter()
pages_to_keep = [0]
in_files = (f for f in os.listdir(in_path) if os.path.isfile(f) and f.endswith('.pdf'))
for file in in_files:
po = open(file, 'rb')
rd = PdfFileReader(po, strict=False)
for i in pages_to_keep:
page = rd.getPage(i)
output.addPage(page)
with open(out_path+str(file), 'wb') as f:
output.write(f):
The problem is: when I execute the script, the first output file has 1 page, the second has 2 pages, and the third has 3 pages. But I need only the first page from each PDF file.
How can I solve this?
You need to reset output for each file:
# Write the selected pages of each input PDF to its own output file.
# Relies on in_files, pages_to_keep and out_path defined earlier.
for file in in_files:
    output = PdfFileWriter()  # clear output
    # The source stays open until the output is written; presumably the
    # writer still reads page data from it at write time - TODO confirm.
    # "with" then closes it for every file.
    with open(file, 'rb') as po:
        rd = PdfFileReader(po, strict=False)
        for i in pages_to_keep:
            page = rd.getPage(i)
            output.addPage(page)
        with open(out_path + str(file), 'wb') as f:
            output.write(f)

for each *.pdf in a directory select specific pages and generate a new pdf for each

Just learning Python and trying to produce code that re-saves each pdf in a folder with only specific pages (same page numbers for each pdf e.g. the 1st and 3rd page) into a new folder.
Tried to follow some examples and came up with this
from PyPDF4 import PdfFileReader, PdfFileWriter
from os import listdir
input_dir = 'C:/.../update/'
output_dir = 'C:/.../update_output/'
for x in listdir(input_dir):
pdf_in=open(input_dir + x, 'rb')
pdf_reader=PdfFileReader(pdf_in)
output=PdfFileWriter()
pages_to_keep = [0, 2]
for i in pages_to_keep:
p = pdf_in.getPage(i)
output.addPage(p)
with open(pdf_in+x, 'wb') as f:
output.write(f)
Error AttributeError: '_io.BufferedReader' object has no attribute 'getPage'
You need to operate on the PdfFileReader instead of on the open file.
You also need to write the output to something other than the open file.
Try instead:
# Re-save each PDF from input_dir into output_dir with only selected pages.
for x in listdir(input_dir):
    # listdir returns every directory entry; only PDFs can be parsed.
    if not x.lower().endswith('.pdf'):
        continue
    with open(input_dir + x, 'rb') as pdf_in:
        pdf_reader = PdfFileReader(pdf_in)
        output = PdfFileWriter()
        pages_to_keep = [0, 2]
        for i in pages_to_keep:
            # Shorter documents simply contribute the pages they have
            # instead of aborting the whole batch with an IndexError.
            if i >= pdf_reader.getNumPages():
                continue
            # Pages come from the PdfFileReader, not from the open file.
            p = pdf_reader.getPage(i)
            output.addPage(p)
        # Write under output_dir: opening the input path (or the same file
        # name in the current directory) for writing could overwrite the
        # file that is still being read.
        with open(output_dir + x, 'wb') as f:
            output.write(f)

How do I fix this file_tracker that reads/writes using JSON dictionaries?

I am trying to write a script that tracks for changes made in directories/files set to multiple file paths created by an installer. I found Thomas Sileo's DirTools project on git, modified it, but am now running into some issues when writing/reading from JSON:
1) First, I believe that I am writing to JSON incorrectly and am finding that my create_state() function is only writing the last path I need.
2) If I get it working, I am unable to read/parse the file like I was before. I usually get ValueError: Extra data errors
Code below:
import os import json import getpass
files = [] subdirs = []
USER = getpass.getuser()
pathMac = ['/Applications/',
'/Users/' + USER + '/Documents/' ]
def create_dir_index(path):
    """Index every file and sub-directory found below ``path``.

    The tree is traversed with ``os.walk``. Returns a dict with two keys,
    ``files`` and ``subdirs``, each a list of paths relative to ``path``.
    Both lists are sorted so that two runs over an unchanged tree produce
    the same index regardless of the order the filesystem lists entries in.
    A ``path`` that does not exist yields two empty lists, because
    ``os.walk`` produces nothing for it.
    """
    files = []
    subdirs = []
    for root, dirs, filenames in os.walk(path):
        subdirs.extend(
            os.path.relpath(os.path.join(root, subdir), path) for subdir in dirs
        )
        files.extend(
            os.path.relpath(os.path.join(root, f), path) for f in filenames
        )
    return dict(files=sorted(files), subdirs=sorted(subdirs))
def create_state(): for count in xrange(len(pathMac)):
dir_state = create_dir_index(pathMac[count])
out_file = open("Manifest.json", "w")
json.dump(dir_state, out_file)
out_file.close()
def compare_states(dir_base, dir_cmp):
    '''
    Return the differences between two directory indexes.

    Both arguments are dicts with ``files`` and ``subdirs`` lists, as
    produced by create_dir_index. ``dir_base`` is treated as the newer
    state and ``dir_cmp`` as the older one:

    - ``created`` / ``created_dirs``: entries only present in ``dir_base``
    - ``deleted`` / ``deleted_dirs``: entries only present in ``dir_cmp``

    Each list is sorted so the result does not depend on set iteration
    order.
    '''
    base_files, cmp_files = set(dir_base['files']), set(dir_cmp['files'])
    base_dirs, cmp_dirs = set(dir_base['subdirs']), set(dir_cmp['subdirs'])
    data = {}
    data['deleted'] = sorted(cmp_files - base_files)
    data['created'] = sorted(base_files - cmp_files)
    data['deleted_dirs'] = sorted(cmp_dirs - base_dirs)
    data['created_dirs'] = sorted(base_dirs - cmp_dirs)
    return data
if __name__ == '__main__':
response = raw_input("Would you like to Compare or Create? ")
if response == "Create":
# CREATE MANIFEST json file
create_state()
print "Manifest file created."
elif response == "Compare":
# create the CURRENT state of all indexes in pathMac and write to json file
for count in xrange(len(pathMac)):
dir_state = create_dir_index(pathMac[count])
out_file = open("CurrentState.json", "w")
json.dump(dir_state, out_file)
out_file.close()
# Open and Load the contents from the file into dictionaries
manifest = json.load(open("Manifest.json", "r"))
current = json.load(open("CurrentState.json", "r"))
print compare_states(current, manifest)

Categories