I'm trying to open abc.pdf and find google.com and replace with input words. text changing but, I can write the new text to output.pdf it stay same with abc.pdf how can i solve this ?
import PyPDF2
import fitz
from PyPDF2 import PdfReader, PdfWriter
import requests
# # Replace with the URL of the PDF you want to download
# pdf_url = input("Enter the URL of the pdf file to download: ")
#
# # Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
#
# # Download the PDF file
# response = requests.get(pdf_url)
# with open("abc.pdf", "wb") as f:
# f.write(response.content)
with open('abc.pdf', 'rb') as file:
reader = PyPDF2.PdfReader(file)
writer = PyPDF2.PdfWriter()
# pdf dosyasının tüm sayfalarını oku
for page in range(len(reader.pages)):
text = reader.pages[page].extract_text()
print(text)
# aranacak stringi bul
if "google.com" in text:
# stringi değiştir
text = text.replace("google.com", new_link)
# pdf dosyasını yeniden yaz
print(text)
writer.add_page(reader.pages[page])
with open('output.pdf', 'wb') as output:
writer.write(output)
file.close()
output.close()
I am also try with fitz, when I serach in links it has to be http:// so I couldnt change link like that google.com
import fitz
import requests
# Replace with the URL of the PDF you want to download
pdf_url = input("Enter the URL of the pdf file to download: ")
# Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
old_link = input("Enter the link you want to replace ")
# Download the PDF file
response = requests.get(pdf_url)
with open("file.pdf", "wb") as f:
f.write(response.content)
# Open the PDF and modify the links
pdf_doc = fitz.open("file.pdf")
for page in pdf_doc:
for link in page.links():
print(link)
if "uri" in link and link["uri"] == old_link:
print("Found one")
link["uri"] = new_link
# Save the modified PDF to the desktop
pdf_doc.save("test2.pdf")
pdf_doc.close()
And another :
import PyPDF2
import fitz
from PyPDF2 import PdfReader, PdfWriter
import requests
# # Replace with the URL of the PDF you want to download
# pdf_url = input("Enter the URL of the pdf file to download: ")
#
# # Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
#
# # Download the PDF file
# response = requests.get(pdf_url)
# with open("abc.pdf", "wb") as f:
# f.write(response.content)
# Open the original PDF file
# with open('abc.pdf', 'rb') as file:
doc = fitz.open('abc.pdf')
print(doc)
p = fitz.Point(50, 72) # start point of 1st line
for page in doc:
print(page)
text = page.get_text()
text = text.replace("google.com", new_link).encode("utf8")
rc = page.insert_text(p, # bottom-left of 1st char
text, # the text (honors '\n')
fontname="helv", # the default font
fontsize=11, # the default font size
rotate=0, # also available: 90, 180, 270
) # print(text)
# page.set_text(text)
# doc.insert_pdf(text,to_page=0)
doc.save("output.pdf")
doc.close()
Related
i want to replace a word in a pdf but when i try to do that it always returns me same pdf. Here is my code block. Currentyle i am using pypdf2 but if is there any suggestion i can switch it. What is the missing part at my code?
with open(file_path, 'rb') as file:
pdf_reader = PdfFileReader(file)
# Encrypt the word in the PDF content
encrypted_word = self.cipher.encrypt(word_to_encrypt_bytes)
encrypted_word_b64 = base64.b64encode(encrypted_word)
# Write the encrypted PDF content to a new PDF file
pdf_writer = PdfFileWriter()
for i in range(pdf_reader.getNumPages()):
page = pdf_reader.getPage(i)
page_content = page.extractText()
page_content_b = page_content.encode('utf-8')
page_content_b = page_content_b.replace(word_to_encrypt.encode(), encrypted_word_b64)
page_content = page_content_b.decode('utf-8')
pdf_writer.addPage(page)
output_path = os.path.join(file_dir, file_name_without_ext + '_encryptedm' + ext)
with open(output_path, 'wb') as output_file:
pdf_writer.write(output_file)
I want to place a word in my pdf.
It looks like you are only replacing the word in the extracted text, but not actually updating the PDF page content. To do this, you can use the setContentStreams method of the page object to replace the content stream with the updated content.
Here's an updated code block that should work:
from PyPDF2 import PdfFileReader, PdfFileWriter
import base64
with open(file_path, 'rb') as file:
pdf_reader = PdfFileReader(file)
# Encrypt the word in the PDF content
encrypted_word = self.cipher.encrypt(word_to_encrypt_bytes)
encrypted_word_b64 = base64.b64encode(encrypted_word)
# Write the encrypted PDF content to a new PDF file
pdf_writer = PdfFileWriter()
for i in range(pdf_reader.getNumPages()):
page = pdf_reader.getPage(i)
page_content = page.extractText()
page_content_b = page_content.encode('utf-8')
updated_content_b = page_content_b.replace(word_to_encrypt.encode(), encrypted_word_b64)
page_content = updated_content_b.decode('utf-8')
page_content_streams = [b"q\n"] + page.getContents().split(b"q\n")[1:]
updated_content_streams = [b"q\n"] + updated_content_b.split(b"q\n")[1:]
page.setContentStreams(updated_content_streams)
pdf_writer.addPage(page)
output_path = os.path.join(file_dir, file_name_without_ext + '_encryptedm' + ext)
with open(output_path, 'wb') as output_file:
pdf_writer.write(output_file)
In this updated code, we first extract the page content as text, replace the word, and then convert it back to bytes. We then get the existing content streams of the page using the getContents method, split them on the q operator (which marks the beginning of a new graphics state), and prepend a q operator to the updated content streams (since the first graphics state is not included in the extracted content). Finally, we set the updated content streams using the setContentStreams method of the page object, and add the updated page to the PDF writer.
I am trying to webscrape this website. To do so, I wrote the following code which works nicely:
from bs4 import BeautifulSoup
import pandas as pd
import requests
payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url= 'https://www.bis.org/doclist/cbspeeches.htm'
headers= {
"content-type": "application/x-www-form-urlencoded",
"X-Requested-With": "XMLHttpRequest"
}
req=requests.post(url,headers=headers,data=payload)
soup = BeautifulSoup(req.content, "lxml")
data=[]
for card in soup.select('.documentList tbody tr'):
r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content)
data.append({
'date': card.select_one('.item_date').get_text(strip=True),
'title': card.select_one('.title a').get_text(strip=True),
'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
'url': f"https://www.bis.org{card.a.get('href')}",
'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
})
pd.DataFrame(data)
However, if you for example open the first link of the page, there is a pdf in it. I would like to add to my dataframe - whenever there is a pdf in the link - the content of the pdf.
To do so, I have been looking around and I tried the following only on the first pdf of the first link:
import io
from PyPDF2 import PdfFileReader
def info(pdf_path):
response = requests.get(pdf_path)
with io.BytesIO(response.content) as f:
pdf = PdfFileReader(f)
information = pdf.getDocumentInfo()
number_of_pages = pdf.getNumPages()
txt = f"""
Information about {pdf_path}:
Author: {information.author}
Creator: {information.creator}
Producer: {information.producer}
Subject: {information.subject}
Title: {information.title}
Number of pages: {number_of_pages}
"""
print(txt)
return information
info('https://www.bis.org/review/r220708e.pdf')
However, it just gets the info (which I already have from the previous code), while it is missing the text. Ideally, I would like it to be part of the same code as above. I got stuck here.
Can anyone help me with this?
Thanks!
You need to return it, e.g. as a tuple :
return txt, information
If you want the text inside the pdf:
text = ""
for page in pdf.pages:
text += page.extract_text() + "\n"
I'll allow you the pleasure of adapting this to your requests, sync scraping fashion (really not hard):
from PyPDF2 import PdfReader
...
async def get_full_content(url):
async with AsyncClient(headers=headers, timeout=60.0, follow_redirects=True) as client:
if url[-3:] == 'pdf':
r = await client.get(url)
with open(f'{url.split("/")[-1]}', 'wb') as f:
f.write(r.content)
reader = PdfReader(f'{url.split("/")[-1]}')
pdf_text = ''
number_of_pages = len(reader.pages)
for x in range(number_of_pages):
page = reader.pages[x]
text = page.extract_text()
pdf_text = pdf_text + text
And then you do something with the pdf_text extracted from .pdf (saving it into a db, reading it with pandas, nlp-ing it with Transformers/torch, etc).
Edit: one more thing: do a pip install -U pypdf2 as the package was recently updated (a few hours ago), just to make sure you're up to date.
Edit 2: A copy/pastable example, for a single .pdf file:
from PyPDF2 import PdfReader
import requests
url = 'https://www.bis.org/review/r220708e.pdf'
r = requests.get(url)
with open(f'{url.split("/")[-1]}', 'wb') as f:
f.write(r.content)
reader = PdfReader(f'{url.split("/")[-1]}')
pdf_text = ''
number_of_pages = len(reader.pages)
for x in range(number_of_pages):
page = reader.pages[x]
text = page.extract_text()
pdf_text = pdf_text + text
print(pdf_text)
I need to download a PDF form Azure Storage, edit the file (extract an specific page) and render it from a Django view, I have this:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io
def download_desprendible_user(request):
if request.method == 'POST':
url = 'https://example.blob.core.windows.net/mypdf.pdf' # The url where I take the pdf file
file = urllib.request.urlopen(url)
id = 'foo' # Key word I want to find within the pdf to extract that page
reader = PdfFileReader(io.BytesIO(file.read()))
for i in range(0, reader.getNumPages()):
content = ""
content += reader.getPage(i).extractText() + "\n"
ResSearch = re.search(id, content)
if ResSearch is not None:
page = i
break
pdfWriter = PdfFileWriter()
pdfWriter.addPage(reader.getPage(page))
with io.BytesIO() as out:
pdfWriter.write(out)
But I can't achieve to render the pdf from the Django view, I don't want to use open because I had issue in production by doing this.
EDIT1:
This did work for me but NOT in production:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io
def download_desprendible_user(request):
if request.method == 'POST':
url = 'https://example.blob.core.windows.net/mypdf.pdf' # The url where I take the pdf file
file = urllib.request.urlopen(url)
id = 'foo' # Key word I want to find within the pdf to extract that page
reader = PdfFileReader(io.BytesIO(file.read()))
for i in range(0, reader.getNumPages()):
content = ""
content += reader.getPage(i).extractText() + "\n"
ResSearch = re.search(id, content)
if ResSearch is not None:
page = i
break
pdfWriter = PdfFileWriter()
pdfWriter.addPage(reader.getPage(page))
with open('media/test.pdf', 'wb') as f:
pdfWriter.write(f)
f.close()
return FileResponse(open('media/test.pdf', 'rb'), content_type='application/pdf')
EDIT2:
This works but had to change the /media path for another one, not sure if is the best solution yet:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io
def download_desprendible_user(request):
if request.method == 'POST':
url = 'https://example.blob.core.windows.net/mypdf.pdf' # The url where I take the pdf file
file = urllib.request.urlopen(url)
id = 'foo' # Key word I want to find within the pdf to extract that page
reader = PdfFileReader(io.BytesIO(file.read()))
for i in range(0, reader.getNumPages()):
content = ""
content += reader.getPage(i).extractText() + "\n"
ResSearch = re.search(id, content)
if ResSearch is not None:
page = i
break
pdfWriter = PdfFileWriter()
pdfWriter.addPage(reader.getPage(page))
with open('test/test.pdf', 'wb') as f:
pdfWriter.write(f)
f.close()
return FileResponse(open('test/test.pdf', 'rb'), content_type='application/pdf')
There is this URL https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote(copy&paste from internet!) the following code to save all the pdfs which are inside the table in a folder
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin
url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent':'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")
link_list =[]
for link in result:
href = link.get("href")
link_list.append(href)
pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)
abs_pdf_list = []
for relative in pdf_list:
temp_url = urljoin(url, relative)
abs_pdf_list.append(temp_url)
filename_list = []
for target in abs_pdf_list:
temp_list = target.split("/")
filename_list.append(temp_list[len(temp_list)-1])
newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
os.makedirs(newpath)
target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
savepath_list.append(os.path.join(target_dir, filename))
savepath_list
for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
print(pdflink)
urllib.request.urlretrieve(pdflink, savepath)
time.sleep(2)
import pdfplumber
import re
def download_file(url):
local_filename = url.split('/')[-1]
with requests.get(url) as r:
with open(local_filename, 'wb') as f:
f.write(r.content)
return local_filename
ap_url = abs_pdf_list[0]
ap = download_file(ap_url)
with pdfplumber.open(ap) as pdf:
page1 = pdf.pages[0]
page2 = pdf.pages[1]
text = page1.extract_text()
print(text)
Now I need to read those pdfs and extract the below lines,
From page1
line which start with "Information & Communication"
From page2
lines which start with
"Book-building Period"
"Offering Price"
and save them in one Excel or CSV file
Sadly I reached to my coding skill limit and can’t move any further .I convert the pdf to text,but …
Please advice me how to do this
I would recommend installing our new package, pdftextract, that conserves the pdf layout as best as possible to extract text, then using some regex to extract the keywords.
Here's a working code snippet tested on 2 pdf files from your link:
import re
import csv
from pdftextract import XPdf
pdf_files = ['a.pdf', "b.pdf"]
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]
def extract_infos(file:str, keywords:list):
"""extract the text from the pdf file then get the wanted keywords information"""
# extracting the text from pdf while keeping the original layout
pdf = XPdf(file)
txt = pdf.to_text(keep_layout=True)
row = []
# getting the keywords information
for keyword in keywords:
# search for the keyword
pattern = "{} (.+)\r".format(keyword) # extracting the wanted info
regex = re.compile(pattern, flags=re.I| re.M)
m = regex.search(txt)
if m is not None:
m = m.groups()[0].strip(' /\r') # strip unwanted space and characters
row.append(m)
return row
def main(files:list, fname:str, headers:list):
"""extract the wanted info from a bunch of pdf files and save them as csv file"""
with open(fname, "w") as wf:
writer = csv.writer(wf)
writer.writerow(headers)
for i, file in enumerate(files, start=1):
row = extract_infos(file, headers)
writer.writerow(row)
print("[DONE]", "writed {} rows to {}.".format(i, fname))
main(pdf_files, "stocks.csv", keywords)
I am having trouble with the Python Docx Library, I have scraped images from a website and I want to add them to docx but I cannot add the images to docx directly, I keep getting an error:
File "C:\Python27\lib\site-packages\docx\image\image.py", line 46, in
from_file
with open(path, 'rb') as f: IOError: [Errno 22] invalid mode ('rb') or filename:
'http://upsats.com/Content/Product/img/Product/Thumb/PCB2x8-.jpg'
This is my code:
import urllib
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
import os
document = Document()
document.add_heading("Megatronics Items Full Search", 0)
FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
'Robotics-and-Copters', 'Breakout-Boards', 'RC-Wireless-communication', 'GSM,-GPS,-RFID,-Wifi',
'Advance-Development-boards-and-starter-Kits', 'Sensors-and-IMU', 'Solenoid-valves,-Relays,--Switches',
'Motors,-drivers,-wheels', 'Microcontrollers-and-Educational-items', 'Arduino-Shields',
'Connectivity-Interfaces', 'Power-supplies,-Batteries-and-Chargers', 'Programmers-and-debuggers',
'LCD,-LED,-Cameras', 'Discrete-components-IC', 'Science-Education-and-DIY', 'Consumer-Electronics-and-tools',
'Mechanical-parts', '3D-Printing-and-CNC-machines', 'ATS', 'UPS', 'Internal-Battries-UPS',
'External-Battries-UPS']
urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]
for n in FullPage:
URL = urlp1 + n
page = urllib.urlopen(URL)
bsObj = BeautifulSoup(page, "lxml")
panel = bsObj.findAll("div", {"class": "panel"})
for div in panel:
titleList = div.find('div', attrs={'class': 'panel-heading'})
imageList = div.find('div', attrs={'class': 'pro-image'})
descList = div.find('div', attrs={'class': 'pro-desc'})
r = requests.get("http://upsats.com/", stream=True)
data = r.text
for link in imageList.find_all('img'):
image = link.get("src")
image_name = os.path.split(image)[1]
r2 = requests.get(image)
with open(image_name, "wb") as f:
f.write(r2.content)
print(titleList.get_text(separator=u' '))
print(imageList.get_text(separator=u''))
print(descList.get_text(separator=u' '))
document.add_heading("%s \n" % titleList.get_text(separator=u' '))
document.add_picture(image, width=Inches(1.5))
document.add_paragraph("%s \n" % descList.get_text(separator=u' '))
document.save('megapy.docx')
Not all of it but just the main part. Now, I am having problems copying the pictures that I downloaded, I want to copy it to docx. I do not know how to add the picture. How do I convert it? I think I have to format it but how do I do that?
All I know is the problem lies within this code:
document.add_picture(image, width=Inches(1.0))
How do I make this image show up in docx from the URL? What am I missing?
Update
I did a test with 10 images and I got a docx. When loading many I had an error at one place and I overwrote that by adding a try, except (see below). The resulting megapy.docx got 165 MB big and took about 10 minutes to create.
with open(image_name, "wb") as f:
f.write(r2.content)
To:
image = io.BytesIO(r2.content)
And added:
try:
document.add_picture(image, width=Inches(1.5))
except:
pass
Use io library to create file-like ojects.
Example that works on python2&3:
import requests
import io
from docx import Document
from docx.shared import Inches
url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Usain_Bolt_Rio_100m_final_2016k.jpg/200px-Usain_Bolt_Rio_100m_final_2016k.jpg'
response = requests.get(url, stream=True)
image = io.BytesIO(response.content)
document = Document()
document.add_picture(image, width=Inches(1.25))
document.save('demo.docx')