How to replace a word in a PDF with Python

I want to replace a word in a PDF, but when I try to do that it always returns the same PDF. Here is my code block. Currently I am using PyPDF2, but if there is any suggestion I can switch to another library. What is the missing part in my code?
with open(file_path, 'rb') as file:
    pdf_reader = PdfFileReader(file)

    # Encrypt the word in the PDF content
    encrypted_word = self.cipher.encrypt(word_to_encrypt_bytes)
    encrypted_word_b64 = base64.b64encode(encrypted_word)

    # Write the encrypted PDF content to a new PDF file
    pdf_writer = PdfFileWriter()
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        page_content = page.extractText()
        page_content_b = page_content.encode('utf-8')
        page_content_b = page_content_b.replace(word_to_encrypt.encode(), encrypted_word_b64)
        page_content = page_content_b.decode('utf-8')
        pdf_writer.addPage(page)

    output_path = os.path.join(file_dir, file_name_without_ext + '_encryptedm' + ext)
    with open(output_path, 'wb') as output_file:
        pdf_writer.write(output_file)
I want to replace a word in my PDF.

It looks like you are only replacing the word in the extracted text, but not actually updating the PDF page content. To do this, you can use the setContentStreams method of the page object to replace the content stream with the updated content.
Here's an updated code block that should work:
import os
import base64
from PyPDF2 import PdfFileReader, PdfFileWriter

with open(file_path, 'rb') as file:
    pdf_reader = PdfFileReader(file)

    # Encrypt the word in the PDF content
    encrypted_word = self.cipher.encrypt(word_to_encrypt_bytes)
    encrypted_word_b64 = base64.b64encode(encrypted_word)

    # Write the encrypted PDF content to a new PDF file
    pdf_writer = PdfFileWriter()
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        page_content = page.extractText()
        page_content_b = page_content.encode('utf-8')
        updated_content_b = page_content_b.replace(word_to_encrypt.encode(), encrypted_word_b64)
        page_content = updated_content_b.decode('utf-8')
        page_content_streams = [b"q\n"] + page.getContents().split(b"q\n")[1:]
        updated_content_streams = [b"q\n"] + updated_content_b.split(b"q\n")[1:]
        page.setContentStreams(updated_content_streams)
        pdf_writer.addPage(page)

    output_path = os.path.join(file_dir, file_name_without_ext + '_encryptedm' + ext)
    with open(output_path, 'wb') as output_file:
        pdf_writer.write(output_file)
In this updated code, we first extract the page content as text, replace the word, and then convert it back to bytes. We then get the existing content streams of the page using the getContents method, split them on the q operator (which saves the current graphics state), and prepend a q operator to the updated content streams (since the first graphics state is not included in the extracted content). Finally, we set the updated content streams using the setContentStreams method of the page object and add the updated page to the PDF writer.
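Since the question allows switching libraries: editing content streams by string replacement is fragile, because PDF text is often stored with font-specific encodings and split operators. A commonly used alternative for visible word replacement is PyMuPDF's redaction annotations. A minimal sketch, assuming the target word is extractable as plain text and the replacement fits in the same area (word_to_replace and replacement are placeholders):

import fitz  # PyMuPDF

word_to_replace = "secret"   # placeholder for the word to replace
replacement = "ENCRYPTED"    # placeholder for the replacement text

doc = fitz.open(file_path)
for page in doc:
    # find every occurrence of the word on this page
    for rect in page.search_for(word_to_replace):
        # cover the old text and draw the replacement in its place
        page.add_redact_annot(rect, text=replacement, fontname="helv", fontsize=11)
    page.apply_redactions()
doc.save(output_path)
doc.close()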

Related

Writing changed text to a new PDF

I'm trying to open abc.pdf, find google.com and replace it with input words. The text changes, but I can't write the new text to output.pdf; it stays the same as abc.pdf. How can I solve this?
import PyPDF2
import fitz
from PyPDF2 import PdfReader, PdfWriter
import requests

# # Replace with the URL of the PDF you want to download
# pdf_url = input("Enter the URL of the pdf file to download: ")
#
# # Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
#
# # Download the PDF file
# response = requests.get(pdf_url)
# with open("abc.pdf", "wb") as f:
#     f.write(response.content)

with open('abc.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    writer = PyPDF2.PdfWriter()
    # read all pages of the pdf file
    for page in range(len(reader.pages)):
        text = reader.pages[page].extract_text()
        print(text)
        # find the string being searched for
        if "google.com" in text:
            # replace the string
            text = text.replace("google.com", new_link)
            # rewrite the pdf file
            print(text)
        writer.add_page(reader.pages[page])
    with open('output.pdf', 'wb') as output:
        writer.write(output)

file.close()
output.close()
I also tried with fitz, but when I search in the links the URI has to start with http://, so I couldn't change a link written as just google.com.
import fitz
import requests

# Replace with the URL of the PDF you want to download
pdf_url = input("Enter the URL of the pdf file to download: ")
# Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
old_link = input("Enter the link you want to replace ")

# Download the PDF file
response = requests.get(pdf_url)
with open("file.pdf", "wb") as f:
    f.write(response.content)

# Open the PDF and modify the links
pdf_doc = fitz.open("file.pdf")
for page in pdf_doc:
    for link in page.links():
        print(link)
        if "uri" in link and link["uri"] == old_link:
            print("Found one")
            link["uri"] = new_link

# Save the modified PDF to the desktop
pdf_doc.save("test2.pdf")
pdf_doc.close()
And another:
import PyPDF2
import fitz
from PyPDF2 import PdfReader, PdfWriter
import requests

# # Replace with the URL of the PDF you want to download
# pdf_url = input("Enter the URL of the pdf file to download: ")
#
# # Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
#
# # Download the PDF file
# response = requests.get(pdf_url)
# with open("abc.pdf", "wb") as f:
#     f.write(response.content)

# Open the original PDF file
# with open('abc.pdf', 'rb') as file:
doc = fitz.open('abc.pdf')
print(doc)
p = fitz.Point(50, 72)  # start point of 1st line
for page in doc:
    print(page)
    text = page.get_text()
    text = text.replace("google.com", new_link).encode("utf8")
    rc = page.insert_text(p,                # bottom-left of 1st char
                          text,             # the text (honors '\n')
                          fontname="helv",  # the default font
                          fontsize=11,      # the default font size
                          rotate=0,         # also available: 90, 180, 270
                          )
    # print(text)
    # page.set_text(text)
    # doc.insert_pdf(text, to_page=0)
doc.save("output.pdf")
doc.close()
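For what it's worth, one reason the first two attempts leave the file unchanged is that extract_text() / get_text() only return copies of the text, and mutating the dict returned by page.links() is never written back into the document. If the goal is to rewrite the link annotations themselves, PyMuPDF can persist the change with Page.update_link(); a minimal sketch (both URLs are placeholders, and link URIs are stored as full addresses such as http://...):

import fitz  # PyMuPDF

old_link = "http://google.com"   # placeholder: stored URIs include the scheme
new_link = "http://example.com"  # placeholder replacement

doc = fitz.open("abc.pdf")
for page in doc:
    for link in page.get_links():      # each link dict carries an "xref" key
        uri = link.get("uri", "")
        if old_link in uri:
            link["uri"] = uri.replace(old_link, new_link)
            page.update_link(link)     # write the modified annotation back
doc.save("output.pdf")
doc.close()

This only changes the clickable link targets, not the visible text drawn on the page.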

Download a PDF from a URL, edit it and render it in Django

I need to download a PDF from Azure Storage, edit the file (extract a specific page) and render it from a Django view. I have this:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with io.BytesIO() as out:
            pdfWriter.write(out)
But I can't manage to render the PDF from the Django view, and I don't want to use open because I had issues in production doing that.
EDIT1:
This did work for me but NOT in production:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('media/test.pdf', 'wb') as f:
            pdfWriter.write(f)
            f.close()
        return FileResponse(open('media/test.pdf', 'rb'), content_type='application/pdf')
EDIT2:
This works, but I had to change the /media path to another one; I am not sure if it is the best solution yet:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('test/test.pdf', 'wb') as f:
            pdfWriter.write(f)
            f.close()
        return FileResponse(open('test/test.pdf', 'rb'), content_type='application/pdf')
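Since the whole pipeline already lives in memory, another option is to skip the filesystem entirely and hand FileResponse an in-memory buffer. A minimal sketch along the lines of the view above (URL, keyword and filename are placeholders):

import io
import re
import urllib.request
from django.http import FileResponse
from PyPDF2 import PdfFileReader, PdfFileWriter

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # placeholder URL
        data = urllib.request.urlopen(url).read()
        reader = PdfFileReader(io.BytesIO(data))
        page = 0
        for i in range(reader.getNumPages()):
            if re.search('foo', reader.getPage(i).extractText()):  # placeholder keyword
                page = i
                break
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(page))
        buffer = io.BytesIO()
        writer.write(buffer)
        buffer.seek(0)  # rewind so FileResponse streams from the start
        return FileResponse(buffer, filename='page.pdf', content_type='application/pdf')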

Python: How can I save the content of several HTML files under filenames taken from the <title> tag?

I have Python code that works fine for parsing some data in HTML files. At the end of the code I must save the HTML files by title tag. For example, I have these 3 HTML files with 3 title tags:
<title>My name is Prince</title>
<title>I love Madonna</title>
<title>Cars and Candies</title>
Each of them must be saved like this:
my-name-is-prince.html
I-love-madonna.html
cars-and-candies.html
So, I already have some save solutions in Python, but I don't know how to name the files by tag.
try:
    title = re.search('<title.+/title>', html)[0]
    title_content = re.search('>(.+)<', title)[1]
except:
    pass

with open("my-words.html", "w") as some_file_handle:
    some_file_handle.write(finalString)
OR
with open('page_323.txt', 'w') as f:
    f.write(result.text)
OR
with open("somefilename.txt", "w") as some_file_handle:
    for line in data:
        some_file_handle.write(line + "\n")
P.S. I have 500 files. The Python code must find the title tag in each HTML file and save each of them as a new HTML file.
Update
Are you looking for this:
# html = """<title>My name is Prince</title>"""
>>> re.search(r'<title>(?P<title>.+)</title>', html).groups('title')[0] \
.replace(' ', '-').lower()
'my-name-is-prince'
Old answer
If you have already extracted the title from the HTML, you can do:
title = 'My name is Prince'
filename = f"{title.lower().replace(' ', '-')}.html"
with open(filename, "w") as some_file_handle:
    some_file_handle.write(finalString)
If you want to use beautifulsoup, check this:
soup = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')
new_filename = title.get_text()
new_filename = new_filename.lower()
words = re.findall(r'\w+', new_filename)
new_filename = '-'.join(words)
new_filename = new_filename + '.html'
print(new_filename)
see the complete code here:
https://neculaifantanaru.com/en/python-google-translate-beautifulsoup-library-save-title-tag-as-link.html
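Putting the pieces together for all 500 files, a minimal sketch that walks a folder of HTML files with BeautifulSoup (the folder name and UTF-8 encoding are assumptions) could look like this:

import os
import re
from bs4 import BeautifulSoup

source_dir = "html_files"  # assumed folder containing the ~500 input files

for name in os.listdir(source_dir):
    if not name.endswith(".html"):
        continue
    with open(os.path.join(source_dir, name), encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.get_text() if soup.title else "untitled"
    # build the slug: lowercase words joined by hyphens
    slug = "-".join(re.findall(r"\w+", title.lower())) or "untitled"
    with open(os.path.join(source_dir, slug + ".html"), "w", encoding="utf-8") as out:
        out.write(html)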

How to read from PDF and save as CSV, using Python?

There is this URL https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copy & paste from the internet!) the following code to save all the PDFs which are inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")
link_list = []
for link in result:
    href = link.get("href")
    link_list.append(href)
pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[len(temp_list)-1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)
target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))
savepath_list

for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename

ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
    print(text)
Now I need to read those PDFs and extract the lines below:
From page 1: the line which starts with "Information & Communication".
From page 2: the lines which start with "Book-building Period" and "Offering Price".
Then save them in one Excel or CSV file.
Sadly I have reached my coding skill limit and can't move any further. I converted the PDF to text, but …
Please advise me how to do this.
I would recommend installing our new package, pdftextract, which preserves the PDF layout as faithfully as possible when extracting the text, and then using some regex to extract the keywords.
Here's a working code snippet tested on 2 pdf files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', "b.pdf"]
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file: str, keywords: list):
    """Extract the text from the pdf file, then get the wanted keyword information."""
    # extracting the text from the pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # getting the keyword information
    for keyword in keywords:
        # search for the keyword
        pattern = "{} (.+)\r".format(keyword)  # extracting the wanted info
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted space and characters
            row.append(m)
    return row

def main(files: list, fname: str, headers: list):
    """Extract the wanted info from a bunch of pdf files and save them as a csv file."""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
    print("[DONE]", "wrote {} rows to {}.".format(i, fname))

main(pdf_files, "stocks.csv", keywords)
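If you would rather stay with pdfplumber, which the question already uses, the same idea can be sketched like this (keyword list and file names copied from above; the regex assumes each keyword and its value sit on one line):

import csv
import re
import pdfplumber

pdf_files = ["a.pdf", "b.pdf"]  # placeholder file names, as above
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_row(path, keywords):
    # read the first two pages, where the wanted lines appear
    with pdfplumber.open(path) as pdf:
        text = "\n".join((page.extract_text() or "") for page in pdf.pages[:2])
    row = []
    for keyword in keywords:
        m = re.search(re.escape(keyword) + r"\s*(.+)", text, flags=re.I)
        row.append(m.group(1).strip() if m else "")
    return row

with open("stocks.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(keywords)
    for path in pdf_files:
        writer.writerow(extract_row(path, keywords))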

How to merge two pages from a PDF file into one page

I have a PDF which contains a total of 6 pages of images. I want to merge pages 1 and 2 into a single PDF, and so on for pages 3 to 6.
I split all 6 pages of the PDF into individual PDFs.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter

def pdf_splitter(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output_filename = '{}_page_{}.pdf'.format(fname, page+1)
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        print('Created: {}'.format(output_filename))

if __name__ == '__main__':
    path = r'D:\Tasks\Samples\fw9.pdf'
    pdf_splitter(path)
I want to know how to merge pages 1 and 2 of fw9 into a single PDF file that contains only one page, with one half being page 1 of the fw9 PDF and the other half page 2. I have to do this for all 6 pages: pages 1-2 as one single-page PDF with both on the same page, pages 3-4 as another, and so on. Kindly help if anyone has any idea how to do this.
The PyPDF2 library also has a PdfFileMerger object that should do exactly what you want.
As in the example here, you can just create a PdfFileMerger, read two pages and put them into one single file.
I changed your script slightly so that it also creates files with pages 0-1, 2-3, 4-5, etc. (of course page 0 is the first page, since Python numbering starts from 0).
import os
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger

def pdf_splitter(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)
    input_paths = []
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output_filename = '{}_page_{}.pdf'.format(fname, page+1)
        input_paths.append(output_filename)
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        print('Created: {}'.format(output_filename))
        # every 2 pages!
        # Change the two if you need every other number of pages!
        if page % 2 == 1:
            pdf_merger = PdfFileMerger()  # create the PdfFileMerger
            for path in input_paths:
                pdf_merger.append(path)  # read the single pages
            # we call it pages_N-1_N, so the first would be pages_0_1!
            output_path = '{}_pages_{}_{}.pdf'.format(fname, page-1, page)
            with open(output_path, 'wb') as fileobj:
                pdf_merger.write(fileobj)  # write the two-page pdf
            input_paths = []

if __name__ == '__main__':
    path = r'D:\Tasks\Samples\fw9.pdf'
    pdf_splitter(path)
Is this what you wanted?
This will first create a single PDF for each page and then combine them two by two. Creating the single PDFs could also be skipped, but I was not sure whether you want them or not.
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2 import PageObject

# Open the files that have to be merged
pdf1File = open('document.pdf', 'rb')

# Read the files that you have opened
pdf1Reader = PdfFileReader(pdf1File)

# Make a list of all pages
pages = []
for pageNum in range(pdf1Reader.numPages):
    pageObj = pdf1Reader.getPage(pageNum)
    pages.append(pageObj)

# Calculate width and height for the final output page
width = pages[0].mediaBox.getWidth() * 6
height = pages[0].mediaBox.getHeight() + 100

# Create a blank page to merge all pages into one page
merged_page = PageObject.createBlankPage(None, width, height)

# Loop through all pages and merge / add them to the blank page
x = 0
for page in pages:
    merged_page.mergeScaledTranslatedPage(page, 1, x, 10)
    x = float(x) + float(page.mediaBox.getWidth())

# Create the final file with one page
writer = PdfFileWriter()
writer.addPage(merged_page)
with open('out.pdf', 'wb') as f:
    writer.write(f)
I wanted to merge 6 pages per output page, so I used 6 as the multiplier for the page width.
This is the answer for how to merge two pages onto one page, side by side:
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2 import PageObject

# Open the files that have to be merged
pdf1File = open('1.pdf', 'rb')

# Read the files that you have opened
pdf1Reader = PdfFileReader(pdf1File)

# Make a list of all pages
pages = []
for pageNum in range(pdf1Reader.numPages):
    pageObj = pdf1Reader.getPage(pageNum)
    pages.append(pageObj)

# Calculate width and height for the final output page
width = pages[1].mediaBox.getWidth() * 2
height = pages[1].mediaBox.getHeight()

writer = PdfFileWriter()

# Loop through all pages and merge / add them to blank pages, two at a time
y = 0
merged_page = PageObject.createBlankPage(None, width, height)
for page in range(len(pages)):
    y += 1
    if y % 2 != 0:
        merged_page.mergePage(pages[page])
        x = float(pages[page+1].mediaBox.getWidth())
        merged_page.mergeScaledTranslatedPage(pages[page+1], 1, x, 0)
    if y % 2 == 0:
        writer.addPage(merged_page)
        merged_page = PageObject.createBlankPage(None, width, height)
        y = 0

# Create the final file
with open('out.pdf', 'wb') as f:
    writer.write(f)
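All three snippets above use the old PyPDF2 1.x names (PdfFileReader, getPage, mediaBox, ...), which the current pypdf package has renamed. A rough equivalent of the pairwise 2-up merge written against modern pypdf (a sketch; names such as add_blank_page, merge_transformed_page and Transformation follow pypdf 3.x and may need adjusting to your installed version):

from pypdf import PdfReader, PdfWriter, Transformation

reader = PdfReader("fw9.pdf")
writer = PdfWriter()

# walk the pages two at a time; a trailing odd page would be skipped here
for i in range(0, len(reader.pages) - 1, 2):
    left, right = reader.pages[i], reader.pages[i + 1]
    width = float(left.mediabox.width) + float(right.mediabox.width)
    height = max(float(left.mediabox.height), float(right.mediabox.height))
    combined = writer.add_blank_page(width=width, height=height)
    combined.merge_transformed_page(left, Transformation().translate(tx=0, ty=0))
    combined.merge_transformed_page(right, Transformation().translate(tx=float(left.mediabox.width), ty=0))

with open("out_2up.pdf", "wb") as f:
    writer.write(f)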
