Python: Numbering Pages in a PDF using PyPDF2 and io - python

I am trying to retroactively add page numbers to an existing PDF file, but I don't understand how this works.
I copied the code together from here and here.
I keep running into a problem that I can't seem to fix on my own, probably because I don't understand what is happening, even after reading the PyPDF2 documentation.
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
# Render a single-page overlay PDF in memory; it carries the label text.
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize=A4)
label = "Page" + str(15)  # just a random test number
can.drawString(10, 100, label)
can.save()

# Rewind so the reader parses the overlay from its first byte.
packet.seek(0)
watermark = PdfFileReader(packet)
watermark_page = watermark.getPage(0)

pdf = PdfFileReader('in.pdf')
pdf_writer = PdfFileWriter()

# Stamp the same overlay page onto every page of the input document.
for page_index in range(pdf.getNumPages()):
    pdf_page = pdf.getPage(page_index)
    pdf_page.mergePage(watermark_page)
    pdf_writer.addPage(pdf_page)

with open('out.pdf', 'wb') as fh:
    pdf_writer.write(fh)
This works fine. However, I would like to give every page a different number. So I changed the for loop to this:
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
# Attempt to stamp a distinct "Page N" label on every page of in.pdf and
# write the result to out.pdf.
# NOTE(review): this buffer is created once, outside the loop, and is then
# shared by every iteration below.
packet = io.BytesIO()
pdf = PdfFileReader('in.pdf')
pdf_writer = PdfFileWriter()
for page in range(pdf.getNumPages()):
# A new canvas is opened on the same, already used buffer each time round.
can = canvas.Canvas(packet, pagesize=A4)
can.drawString(10, 200, "Page " + str(page) )
can.save()
packet.seek(0)
# The reported PdfReadError ("Could not find xref table at specified
# location") is raised by this constructor call.
# NOTE(review): presumably from the second iteration on the buffer still
# holds bytes of the previous overlay, so the new PDF does not start at
# offset 0 -- creating the BytesIO inside the loop would likely avoid this;
# confirm.
watermark = PdfFileReader(packet)
watermark_page = watermark.getPage(0)
pdf_page = pdf.getPage(page)
pdf_page.mergePage(watermark_page)
pdf_writer.addPage(pdf_page)
with open('out.pdf', 'wb') as fh:
pdf_writer.write(fh)
This does not work.
I get:
Traceback (most recent call last):
File "<ipython-input-44-c6a76740be9f>", line 1, in <module>
runfile('//DIR/pdftest.py', wdir='//DIR')
File "C:\Program Files (x86)\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Program Files (x86)\Anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "//DIR/pdftest.py", line 55, in <module>
watermark = PdfFileReader(packet)
File "C:\Program Files (x86)\Anaconda\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Program Files (x86)\Anaconda\lib\site-packages\PyPDF2\pdf.py", line 1901, in read
raise utils.PdfReadError("Could not find xref table at specified location")
PdfReadError: Could not find xref table at specified location
A bit of help understanding as well as fixing this would be greatly appreciated.
Thank you!

Related

OSError: [Errno 36] File name too long using Streamlit

I have been trying to use StreamLit for a little website project I have going on and I keep getting this error:
OSError: [Errno 36] File name too long
Here is the code:
# Let the user upload files, build an index from them and answer one question.
uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)
for uploaded_file in uploaded_files:
bytes_data = uploaded_file.read()
# s holds the decoded *content* of the uploaded file, not a file name.
s = bytes_data.decode("UTF-8")
SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
# The file content is handed to the loader here; the traceback shows the
# loader treating its argument as a directory (input_dir.iterdir()), which is
# where "File name too long" comes from.
loader = SimpleDirectoryReader(s)
documents = loader.load_data()
index = GPTSimpleVectorIndex(documents)
index.save_to_disk('index.json')
question = st.text_input("What do you want me to do with the file uploaded?")
response = index.query(question)
st.write(response)
Here is the full error:
File "/home/appuser/venv/lib/python3.9/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 565, in _run_script
exec(code, module.__dict__)
File "/app/indextest/streamlit_app.py", line 17, in <module>
loader = SimpleDirectoryReader(s)
File ".modules/file.py", line 75, in __init__
self.input_files = self._add_files(self.input_dir)
File ".modules/file.py", line 81, in _add_files
input_files = sorted(input_dir.iterdir())
File "/usr/local/lib/python3.9/pathlib.py", line 1160, in iterdir
for name in self._accessor.listdir(self):
You are passing the entire content of the file as an argument to SimpleDirectoryReader. However, as you can see in the source code of the SimpleDirectoryReader class, the first argument should be a path to a directory:
input_dir (str): Path to the directory.
You should store the uploaded_files inside a folder, and then give this folder as an argument to SimpleDirectoryReader like this:
import os
save_folder = "files"
# exist_ok avoids the check-then-create race of os.path.exists() + makedirs().
os.makedirs(save_folder, exist_ok=True)

uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)
for uploaded_file in uploaded_files:
    # Keep only the final path component so an upload name containing
    # directory parts cannot end up outside save_folder.
    target_path = os.path.join(save_folder, os.path.basename(uploaded_file.name))
    # save the file as raw bytes: decoding to UTF-8 first fails on binary
    # uploads, and the loader reads the files back from disk anyway
    with open(target_path, "wb") as f:
        f.write(uploaded_file.read())

# Then, once you have stored the files, load the SimpleDirectoryReader
SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
loader = SimpleDirectoryReader(save_folder)
documents = loader.load_data()

Python : Add watermark/background in all pages PDF

I just want to add/merge a background into all pages of a PDF, but calling getPage(i) on the input file gives me an error. Only getPage(0) runs without error, but it creates a duplicate copy of the first page throughout the document while keeping the original number of pages.
here is my code
from typing import BinaryIO
import os
import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger
from tkinter.filedialog import askopenfilename
from fpdf import FPDF
# Raw string: in a regular literal "\a" is an escape sequence (BEL), so the
# path would not name the template file. The literal is also kept on one line.
WATERMARK_PATH = r'F:\abc\abc\PDF Templates\Report First - Potrait.pdf'

input_file = askopenfilename()
if not input_file:
    # askopenfilename() returns an empty string when the dialog is cancelled.
    raise SystemExit("No input file selected.")

pdf = PdfFileReader(input_file)
output = PdfFileWriter()

# The template stays open until the output has been written, because the
# merged pages still refer to the reader that was built on this file handle.
with open(WATERMARK_PATH, 'rb') as watermark_file:
    watermark = PyPDF2.PdfFileReader(watermark_file)
    # The same template page is merged into every input page, so fetch it once.
    watermarks = watermark.getPage(0)
    for i in range(pdf.getNumPages()):
        page = pdf.getPage(i)
        # NOTE(review): the reported KeyError: '/Resources' is raised inside
        # mergePage() while it reads the resources of `page` (the input
        # page); this cleanup does not address that -- it depends on the
        # input PDF.
        page.mergePage(watermarks)
        output.addPage(page)

    # Write next to the input file, with "_FP" appended to its base name.
    with open(input_file.rsplit(".", 1)[0] + '_FP.pdf', "wb") as merged_file:
        output.write(merged_file)
I get the following error:
Traceback (most recent call last): File "C:\Users\Gaurav\Desktop\PFD
python\abc\PFD python\test2.py", line 23, in
page.mergePage(watermarks) File "C:\Users\Gaurav\AppData\Local\Programs\Python\Python310\lib\site-packages\PyPDF2\pdf.py",
line 2417, in mergePage
self._mergePage(page2) File "C:\Users\Gaurav\AppData\Local\Programs\Python\Python310\lib\site-packages\PyPDF2\pdf.py",
line 2426, in _mergePage
originalResources = self[PG.RESOURCES].getObject() File "C:\Users\Gaurav\AppData\Local\Programs\Python\Python310\lib\site-packages\PyPDF2\generic.py",
line 539, in getitem
return dict.getitem(self, key).getObject() KeyError: '/Resources'

How to download ms word docx file in python with raw data from http url

If the following URL is opened in a browser, the .docx file is downloaded. I want to automate this download with Python.
https://hudoc.echr.coe.int/app/conversion/docx/?library=ECHR&id=001-176931&filename=CASE OF NDIDI v. THE UNITED KINGDOM.docx&logEvent=False
I have tried the following:
from docx import Document
import requests
import json
from bs4 import BeautifulSoup
# Download the .docx from the conversion endpoint and try to open it with python-docx.
dwnurl = 'https://hudoc.echr.coe.int/app/conversion/docx/?library=ECHR&id=001-176931&filename=CASE%20OF%20NDIDI%20v.%20THE%20UNITED%20KINGDOM.docx&logEvent=False'
doc = requests.get(dwnurl)
print(doc.content) #printing the document like b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\x00!\xfb\x16\x01\x16\x02\x00\x00\xec\x0c\x00\x00\x13\x00\xc4\x01[Content_Types].xml \xa2\xc0\
print(doc.raw) #printing the document like <urllib3.response.HTTPResponse object at 0x063D8BD0>
# NOTE(review): doc.content is a bytes object. The traceback below ends in
# ZipFile calling .seek() on it, so Document() presumably needs a path or a
# file-like object (e.g. io.BytesIO(doc.content)) -- confirm against the
# python-docx documentation.
document = Document(doc.content)
document.save('test.docx')
# The traceback below is raised by the Document(doc.content) call above;
# document.save() is never reached.
Traceback (most recent call last):
File "scraping_hudoc.py", line 40, in <module>
document = Document(doc.content)
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\site-packages\docx\api.py", line 25, in Document
document_part = Package.open(docx).main_document_part
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\site-packages\docx\opc\package.py", line 116, in open
pkg_reader = PackageReader.from_file(pkg_file)
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\site-packages\docx\opc\pkgreader.py", line 32, in from_file
phys_reader = PhysPkgReader(pkg_file)
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\site-packages\docx\opc\phys_pkg.py", line 101, in __init__
self._zipf = ZipFile(pkg_file, 'r')
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\zipfile.py", line 1108, in __init__
self._RealGetContents()
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\zipfile.py", line 1171, in _RealGetContents
endrec = _EndRecData(fp)
File "C:\Users\204387\AppData\Local\Programs\Python\Python36-32\lib\zipfile.py", line 241, in _EndRecData
fpin.seek(0, 2)
AttributeError: 'bytes' object has no attribute 'seek'
I managed to save the MS Word .docx file with this:
import requests
def save_link(book_link, book_name):
    """Download ``book_link`` and write the response body to ``book_name``.

    The body is streamed in 2 MB chunks, so the document is never held in
    memory as a whole.

    Args:
        book_link: URL of the file to download.
        book_name: Path of the local file to create (overwritten if present).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is not saved under the document's name.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(book_link, stream=True) as the_book:
        the_book.raise_for_status()
        with open(book_name, 'wb') as f:
            for chunk in the_book.iter_content(1024 * 1024 * 2):  # 2 MB chunks
                f.write(chunk)
save_link("https://hudoc.echr.coe.int/app/conversion/docx/?library=ECHR&id=001-176931&filename=CASE%20OF%20NDIDI%20v.%20THE%20UNITED%20KINGDOM.docx&logEvent=False","CASE OF NDIDI v. THE UNITED KINGDOM.docx")

PyPDF2 write doesn't work on some PDF files (Python 3.5.1)

First of all I am using Python 3.5.1 (32 bit version)
I wrote the following program to add a pagenumber on all pages of my pdf files using PyPDF2 and reportlab:
#import modules
from os import listdir
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from os import makedirs

# Default coordinates passed to drawString() for the page number
X_value = 460
Y_value = 820

# Make a list of all PDF files in the current directory. Testing the real
# suffix (instead of the three characters after every dot) keeps names such
# as "scan.pdf.bak" out and never lists the same file twice.
PDFlist = [name for name in listdir() if name.lower().endswith('.pdf')]

# Give the horizontal position for the page number (ENTER = use default value of 460)
User = input('Give horizontal position page number (ENTER = default 460): ')
if User != "":
    X_value = int(User)

# Give the vertical position for the page number (ENTER = use default value of 820)
User = input('Give vertical position page number (ENTER = default 820): ')
if User != "":
    Y_value = int(User)

# The results are written below "Output/"; make sure that directory exists.
makedirs("Output", exist_ok=True)

for filename in PDFlist:
    # The source file stays open until the numbered copy has been written,
    # because the merged pages still refer to the reader built on it.
    with open(filename, "rb") as source:
        # read the PDF
        existing_pdf = PdfFileReader(source)
        print("File: "+filename)
        # count the number of pages
        number_of_pages = existing_pdf.getNumPages()
        print("Number of pages detected:"+str(number_of_pages))
        output = PdfFileWriter()
        for k in range(number_of_pages):
            # one fresh in-memory buffer per page for the overlay PDF
            packet = io.BytesIO()
            # create a new PDF with Reportlab
            can = canvas.Canvas(packet, pagesize=A4)
            Pagenumber = " Page "+str(k+1)+"/"+str(number_of_pages)
            # we first make a white rectangle to cover any existing text in the pdf
            can.setFillColorRGB(1, 1, 1)
            can.setStrokeColorRGB(1, 1, 1)
            can.rect(X_value-10, Y_value-5, 120, 20, fill=1)
            # set the font and size
            can.setFont("Helvetica", 14)
            # choose color of page numbers (red)
            can.setFillColorRGB(1, 0, 0)
            can.drawString(X_value, Y_value, Pagenumber)
            can.save()
            print(Pagenumber)
            # move to the beginning of the BytesIO buffer
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            # add the "watermark" (which is the new pdf) on the existing page
            page = existing_pdf.getPage(k)
            page.mergePage(new_pdf.getPage(0))
            output.addPage(page)
        # finally, write "output" to a real file
        ResultPDF = "Output/"+filename
        with open(ResultPDF, "wb") as outputStream:
            output.write(outputStream)
This program works fine for quite a number of PDF files (albeit that warnings are sometimes generated like 'PdfReadWarning: Superfluous whitespace found in object header b'16' b'0' [pdf.py:1666]' but the resulting output file is okay to me).
However, the program just doesn't work on some PDF files although these files are perfectly readable and editable with my Adobe Acrobat. I have the impression the error pops up mostly on PDF files that were scanned but not on all of them (I also numbered scanned PDF files that didn't generate any error).
I am getting the following error message (the first 8 lines are the result of my own print commands):
File: Scanned file.pdf
Number of pages detected:6
Page 1/6
Page 2/6
Page 3/6
Page 4/6
Page 5/6
Page 6/6
PdfReadWarning: Object 25 1 not defined. [pdf.py:1629]
Traceback (most recent call last):
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\Sourcecode\PDFPager.py", line 83, in <module>
output.write(outputStream)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 482, in write
self._sweepIndirectReferences(externalReferenceMap, self._root)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, data[i])
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, data[i])
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 577, in _sweepIndirectReferences
newobj = data.pdf.getObject(data)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 1631, in getObject
raise utils.PdfReadError("Could not find object.")
PyPDF2.utils.PdfReadError: Could not find object.
Apparently the pages are merged with the PDF created by reportlab (see the lines up to "Page 6/6"), but in the end PyPDF2 cannot generate an output PDF file (I get an unreadable output file of 0 bytes).
Can somebody shed some light on how to resolve this? I searched the internet but couldn't really find an answer.
Make the following changes in PyPDF2's pdf.py.
On line 1633 of pdf.py, uncomment the `if self.strict:` check, so that the lines read:
if self.strict:
raise utils.PdfReadError("Could not find object.")
And on line 501 of pdf.py, wrap the write in a try/except block:
try:
obj.writeToStream(stream, key)
stream.write(b_("\nendobj\n"))
except:
pass
Cheers.
Passing `strict=False` got things working for me.
from PyPDF2 import PdfFileMerger
pdfs = [r'file 1.pdf', r'file 2.pdf']
# strict=False makes PyPDF2 tolerate the malformed objects instead of raising.
merger = PdfFileMerger(strict=False)
try:
    for pdf in pdfs:
        merger.append(pdf)
    merger.write(r"thanks mate.pdf")
finally:
    # Release the input files the merger opened, even if a step above fails.
    merger.close()
Here is my solution. Try to write the file into a dummy ByteIO stream to check whether it is broken.
try:
reader = PdfFileReader(input_file)
print("Opening '{}', pages={}".format(file_path, reader.getNumPages()))
# Try to write it into an dummy ByteIO stream to check whether pdf is broken
writer = PdfFileWriter()
writer.addPage(reader.getPage(0))
writer.write(io.BytesIO())
except PdfReadError:
print("Error reading '{}".format(file_path))
continue
I just encountered the same error with PyPDF2. It is a problem related to the PDF's version.
I switched to the pikepdf package and the issue went away.
You can find documentation here

Python script to create blank docx file fails on non-Latin character input

I wrote a script that creates a blank .docx file on the basis of another file, taking only the filename of that file and adding _EN to the end of it.
It works fine on all files, except if there are non-Latin characters in the filename. I've tried adding .encode('UTF-8') to line 7, but it doesn't work.
# -*- coding: utf-8 -*-
import sys
import re
from docx import Document
from docx.shared import Pt
docname = str((sys.argv[1]))  # source file name, taken from the first command-line argument
chopped = re.sub(r"(^.*)\..*", r"\1", docname)  # greedy match: drops everything from the last dot on
newname = (chopped + '_EN.docx')  # name of the blank document to create
document = Document()
paragraph = document.add_paragraph()  # a single empty paragraph
paragraph_format = paragraph.paragraph_format
paragraph_format.space_before = Pt(0)  # no spacing above the paragraph
paragraph_format.space_after = Pt(0)  # no spacing below the paragraph
style = document.styles['Normal']
font = style.font
font.name = 'Times New Roman'
font.size = Pt(10.5)
document.save(newname)  # the IOError in the traceback below is raised from this call
As you can see, I've included the encoding line at the beginning.
Edit: thanks to advice from one of the commenters, I can elicit the error:
C:\Users\marca\OneDrive\programming\python\apps> python doc_creator.py 04.メガほむら変身後.pm
Traceback (most recent call last):
File "doc_creator.py", line 18, in <module>
document.save(newname)
File "C:\Python27\lib\site-packages\docx\document.py", line 142, in save
self._part.save(path_or_stream)
File "C:\Python27\lib\site-packages\docx\parts\document.py", line 128, in save
self.package.save(path_or_stream)
File "C:\Python27\lib\site-packages\docx\opc\package.py", line 160, in save
PackageWriter.write(pkg_file, self.rels, self.parts)
File "C:\Python27\lib\site-packages\docx\opc\pkgwriter.py", line 32, in write
phys_writer = PhysPkgWriter(pkg_file)
File "C:\Python27\lib\site-packages\docx\opc\phys_pkg.py", line 141, in __init__
self._zipf = ZipFile(pkg_file, 'w', compression=ZIP_DEFLATED)
File "C:\Python27\lib\zipfile.py", line 756, in __init__
self.fp = open(file, modeDict[mode])
IOError: [Errno 22] invalid mode ('wb') or filename: '04.????????_EN.docx'
(Interestingly, the filename I copied in appears as squares with question marks in them in Powershell, but when I copied it out of Powershell and pasted it back in here, it was correctly encoded, indicating that Powershell at least is reading the characters correctly, even if it can't display them.)

Categories