Download a PDF from URL, edit it and render it in Django - python

I need to download a PDF from Azure Storage, edit the file (extract a specific page) and render it from a Django view. I have this:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The URL I take the PDF file from
        file = urllib.request.urlopen(url)
        id = 'foo'  # Keyword I want to find within the PDF to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with io.BytesIO() as out:
            pdfWriter.write(out)
But I can't manage to render the PDF from the Django view. I don't want to use open() on a file path because I had issues in production doing that.
EDIT1:
This worked for me, but NOT in production:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
from django.http import FileResponse
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The URL I take the PDF file from
        file = urllib.request.urlopen(url)
        id = 'foo'  # Keyword I want to find within the PDF to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('media/test.pdf', 'wb') as f:
            pdfWriter.write(f)
        return FileResponse(open('media/test.pdf', 'rb'), content_type='application/pdf')
EDIT2:
This works, but I had to change the /media path to another one; not sure if it is the best solution yet:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
from django.http import FileResponse
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The URL I take the PDF file from
        file = urllib.request.urlopen(url)
        id = 'foo'  # Keyword I want to find within the PDF to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('test/test.pdf', 'wb') as f:
            pdfWriter.write(f)
        return FileResponse(open('test/test.pdf', 'rb'), content_type='application/pdf')
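A possible alternative that avoids the filesystem (and hence the production path problems) entirely: since the one-page PDF is already produced in memory, the writer's output can go into an io.BytesIO buffer and that buffer can be handed straight to FileResponse, which accepts any file-like object. A minimal sketch, assuming the same PyPDF2 1.x API used above:

import io
import re
import urllib.request
from django.http import FileResponse
from PyPDF2 import PdfFileReader, PdfFileWriter

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'
        reader = PdfFileReader(io.BytesIO(urllib.request.urlopen(url).read()))
        page = 0
        for i in range(reader.getNumPages()):
            if re.search('foo', reader.getPage(i).extractText()) is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        buffer = io.BytesIO()  # in-memory file instead of media/test.pdf
        pdfWriter.write(buffer)
        buffer.seek(0)  # rewind so FileResponse reads from the start
        return FileResponse(buffer, content_type='application/pdf')

Because nothing is written to disk, there is no shared path for concurrent requests to fight over.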

Related

How to replace a word in a PDF with Python

I want to replace a word in a PDF, but when I try to do that it always returns the same PDF. Here is my code block. Currently I am using PyPDF2, but I can switch if there is any suggestion. What is the missing part in my code?
with open(file_path, 'rb') as file:
    pdf_reader = PdfFileReader(file)
    # Encrypt the word in the PDF content
    encrypted_word = self.cipher.encrypt(word_to_encrypt_bytes)
    encrypted_word_b64 = base64.b64encode(encrypted_word)
    # Write the encrypted PDF content to a new PDF file
    pdf_writer = PdfFileWriter()
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        page_content = page.extractText()
        page_content_b = page_content.encode('utf-8')
        page_content_b = page_content_b.replace(word_to_encrypt.encode(), encrypted_word_b64)
        page_content = page_content_b.decode('utf-8')
        pdf_writer.addPage(page)
    output_path = os.path.join(file_dir, file_name_without_ext + '_encryptedm' + ext)
    with open(output_path, 'wb') as output_file:
        pdf_writer.write(output_file)
I want to replace a word in my PDF.
It looks like you are only replacing the word in the extracted text, but extractText() returns a copy; the page's actual content stream is never updated, so the written PDF is unchanged. To change the page you have to modify the raw bytes of its content stream and write the modified bytes back into the stream object before adding the page to the writer.
Here's an updated code block that should work (it assumes the word appears as one contiguous literal string inside the content stream):
import base64
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject

with open(file_path, 'rb') as file:
    pdf_reader = PdfFileReader(file)
    # Encrypt the word that will replace the original
    encrypted_word = self.cipher.encrypt(word_to_encrypt_bytes)
    encrypted_word_b64 = base64.b64encode(encrypted_word)
    # Write the modified PDF content to a new PDF file
    pdf_writer = PdfFileWriter()
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        contents = page.getContents()  # the resolved /Contents entry
        if isinstance(contents, (DecodedStreamObject, EncodedStreamObject)):
            streams = [contents]
        else:
            # /Contents may also be an array of stream references
            streams = [obj.getObject() for obj in contents]
        for stream in streams:
            data = stream.getData()  # decoded (uncompressed) stream bytes
            # Replace the word inside the content stream itself, not in a
            # text copy returned by extractText()
            data = data.replace(word_to_encrypt.encode(), encrypted_word_b64)
            if stream.decodedSelf is not None:
                # Encoded stream: update its cached decoded copy, since
                # EncodedStreamObject does not support setData() directly
                stream.decodedSelf.setData(data)
            else:
                stream.setData(data)
        pdf_writer.addPage(page)
    output_path = os.path.join(file_dir, file_name_without_ext + '_encryptedm' + ext)
    with open(output_path, 'wb') as output_file:
        pdf_writer.write(output_file)
In this updated code, we replace the word directly in the raw bytes of each page's content stream instead of in the text extracted from it, then write the modified bytes back into the stream object before adding the page to the writer. Two caveats: a plain byte replace only matches if the word is stored as one contiguous literal string, and PDF generators often split text across several Tj/TJ operators, in which case nothing will be found; and the replacement is rendered with the original font and positioning operators, so a much longer base64 string may overflow its spot on the page.

Python only reads the first PDF file

I am using the following code for test purposes. However, it only reads the first PDF in the directory. My PDF files are named test, test1 and test2, but it only reads test. I can see the list of all PDFs in the directory, but the read function only reads the first one. I am not sure about my line of code with +file_name, whether it should be like that or something else.
I would appreciate some help. Below is my code for reference:
#date
from datetime import *
import PyPDF2
import os
import re

today_date = datetime.today()
print('Today is:', today_date)

#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)

#read all files in directory
load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
read_pdf = PyPDF2.PdfFileReader(load_pdf)
page_count = read_pdf.getNumPages()
first_page = read_pdf.getPage(0)
page_content = first_page.extractText()
page_content = page_content.replace('\n', '')
print(page_content)
You have to indent the code so that it is executed on each loop iteration, like this:
#date
from datetime import *
import PyPDF2
import os
import re

today_date = datetime.today()
print('Today is:', today_date)

#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)
    #read each file in the directory
    load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
    read_pdf = PyPDF2.PdfFileReader(load_pdf)
    page_count = read_pdf.getNumPages()
    first_page = read_pdf.getPage(0)
    page_content = first_page.extractText()
    page_content = page_content.replace('\n', '')
    print(page_content)
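One more thing worth noting: os.listdir() returns every entry in the directory, not just PDFs, so the indented loop will also try to open any non-PDF file it finds. A minimal defensive sketch (same directory path as above; the .pdf filter and os.path.join are the only changes):

import os
import PyPDF2

folder = r"C:\Nikhar\Work\Quantum\Work"
for file_name in os.listdir(folder):
    if not file_name.lower().endswith(".pdf"):
        continue  # skip non-PDF entries
    # os.path.join avoids hand-building the path with string concatenation
    with open(os.path.join(folder, file_name), "rb") as load_pdf:
        read_pdf = PyPDF2.PdfFileReader(load_pdf)
        page_content = read_pdf.getPage(0).extractText().replace('\n', '')
        print(file_name, page_content)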

Data not being written to file

I am writing a Python program that uses a Scrapy spider to crawl a website and look for PDF files. Once a PDF file is found, it downloads the file to a folder, then opens and reads the PDF to grab info from it (author, title, number of pages, and whether the document is tagged or not). This data is written to a CSV file.
Everything seems to work fine, but when I open the CSV file there is not nearly as much data in it as I expected. For example, I will crawl a site and find 40 PDF files, but the CSV has entries for only 14 PDFs.
pdfspider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from ..info import isTagged
from ..get_metadata import get_data

class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    #need domain to name the folder the pdfs will be put into
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()

    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
            metaData = get_data(full_path)
            is_tagged = isTagged(full_path)
            row = [url, metaData[0], metaData[1], metaData[2], is_tagged]
            self.add_to_csv(row)
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, "a")
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
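Two small file-handling details in this spider could bite, though they are probably not the main bug: create_csv opens with mode 'x', which raises FileExistsError if the CSV already exists from a previous run, and the csv module documentation recommends opening files with newline='' to avoid blank rows on Windows. A hedged tweak of add_to_csv under those assumptions:

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        # newline='' is what the csv module expects; mode 'a' appends and
        # creates the file if it is missing, so reruns are not an error
        with open(filename, 'a', newline='') as f:
            csv.writer(f).writerow(row)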
info.py
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfdevice import TagExtractor
from pdfminer3.pdfpage import PDFPage
from io import BytesIO

def isTagged(path, password=''):
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    try:
        try:
            device = TagExtractor(rsrcmgr, retstr, codec='utf-8')
        except:
            print('Not utf-8.')
            try:
                device = TagExtractor(rsrcmgr, retstr, codec='ascii')
            except:
                print('Not ascii.')
    except Exception as ex:
        print(ex)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 100
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    contents = retstr.getvalue().decode()
    fp.close()
    device.close()
    retstr.close()
    # check if common proprietary Acrobat tags are in the response
    tags = ["<b'Part'", "</b'Sect'", "</b'Art'", "<b'Content'", "<b'Artifact'"]
    for tag in tags:
        if tag in contents:
            return 'tagged'
    return 'not tagged'
get_metadata.py
from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
from pdfminer.high_level import extract_pages

def get_data(file):
    fp = open(file, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    author = doc.info[0]["Author"]
    title = doc.info[0]["Title"]
    numPages = len(list(extract_pages(file)))
    return title, author, numPages
Can anyone see what I am doing wrong? This is my first Python program, so I am pretty green. I am running Python 3.9.7.
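One hypothesis worth checking: save_pdf may be dying partway through for some PDFs. doc.info[0]["Author"] in get_data raises KeyError whenever a PDF has no Author or Title metadata entry, and an exception anywhere after the download means the CSV row is never written even though the PDF file is already on disk, which would explain downloaded PDFs outnumbering CSV rows. A hedged sketch of a more forgiving get_data (same imports as get_metadata.py above):

def get_data(file):
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        info = doc.info[0] if doc.info else {}
        # .get() returns a default instead of raising KeyError when the
        # metadata entry is missing
        author = info.get("Author", "unknown")
        title = info.get("Title", "unknown")
    numPages = len(list(extract_pages(file)))
    return title, author, numPages

Scrapy also logs callback exceptions, so comparing the count of error tracebacks in the crawl log against the missing rows would confirm or rule this out.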

Check if a row exists in a CSV in Python

I'm new to Python. I'm scraping a website and storing the results in a CSV file, but I can't verify whether an entry already exists in the CSV file. I imagine the correct approach would be to loop through the lines to check whether the title already exists. Does anyone have any idea how to solve this? See the code below:
from pathlib import Path
import time
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

DRIVER_PATH = str(Path('geckodriver').resolve())

def write_csv(ads):
    filename = 'results.csv'
    with open(filename, 'a+') as f:
        fields = ['title', 'url']
        writer = csv.DictWriter(f, fieldnames=fields)
        existing_lines = csv.reader(f)
        for ad in ads:
            if ad['title'] not in existing_lines:
                print(existing_lines)
                writer.writerow(ad)
                print('success')
            else:
                print('fail')

def get_html(url):
    browser = webdriver.Firefox(executable_path=DRIVER_PATH)
    browser.get(url)
    return browser.page_source

def scrapde_data(card):
    try:
        h2 = card.h2
    except:
        title = ''
        url = ''
    else:
        title = h2.text.strip()
        try:
            url = card.find('a').get('href')
        except:
            url = ''
    data = {'title': title, 'url': url}
    return data

def main():
    while True:
        url = '#'
        html = get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        cards = soup.find_all('div', {"class": "produto--comprar"})
        ads_data = []
        for card in cards:
            data = scrapde_data(card)
            ads_data.append(data)
        write_csv(ads_data)
        time.sleep(5)

if __name__ == '__main__':
    main()
Help me please? :(
Try the following code:
def write_csv(ads):
    filename = 'results.csv'
    with open(filename, 'a+', newline='') as f:
        fields = ['title', 'url']
        writer = csv.DictWriter(f, fieldnames=fields)
        #moving file pointer to the start of the file
        f.seek(0)
        existing_lines = csv.reader(f)
        #finding whether the file has any lines
        count = 0
        for line in existing_lines:
            count += 1
            break
        #if file is not empty
        if count > 0:
            for ad in ads:
                flag = 0
                #moving file pointer to the start of the file
                f.seek(0)
                #checking if ad['title'] is present in the first column of the csv file
                for line in existing_lines:
                    if ad['title'] in line[0]:
                        flag = 1
                #if ad['title'] is not found
                if flag == 0:
                    writer.writerow(ad)
        #if file is empty, write the dictionary contents into the csv
        else:
            for ad in ads:
                writer.writerow(ad)
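A possible simplification: rescanning the file once per ad is O(n·m), and the `in line[0]` test matches substrings rather than whole titles. A variant that loads the existing titles into a set first makes one pass over the file and compares whole fields. A minimal sketch, assuming the same results.csv layout with the title in the first column:

import csv
import os

def write_csv(ads):
    filename = 'results.csv'
    # collect titles already present (empty set if the file doesn't exist yet)
    existing_titles = set()
    if os.path.exists(filename):
        with open(filename, newline='') as f:
            existing_titles = {row[0] for row in csv.reader(f) if row}
    with open(filename, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'url'])
        for ad in ads:
            if ad['title'] not in existing_titles:
                writer.writerow(ad)
                existing_titles.add(ad['title'])  # dedupe within this batch too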

How to read from PDF and save as CSV, using Python?

There is this URL: https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copied and pasted from the internet!) the following code to save all the PDFs that are inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")
link_list = []
for link in result:
    href = link.get("href")
    link_list.append(href)
pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[len(temp_list)-1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))
savepath_list

for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename

ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
print(text)
Now I need to read those PDFs and extract the lines below:
From page 1, the line which starts with "Information & Communication".
From page 2, the lines which start with "Book-building Period" and "Offering Price".
Then save them in one Excel or CSV file.
Sadly I have reached the limit of my coding skill and can't move any further. I converted the PDF to text, but …
Please advise me how to do this.
I would recommend installing our new package, pdftextract, which preserves the PDF layout as well as possible when extracting text, and then using some regex to extract the keywords.
Here's a working code snippet tested on 2 PDF files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', 'b.pdf']
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file: str, keywords: list):
    """Extract the text from the pdf file, then get the wanted keyword information."""
    # extract the text from the pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # get the keyword information
    for keyword in keywords:
        # search for the keyword and capture the rest of its line
        pattern = "{} (.+)\r".format(keyword)
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted spaces and characters
            row.append(m)
    return row

def main(files: list, fname: str, headers: list):
    """Extract the wanted info from a bunch of pdf files and save it as a csv file."""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
        print("[DONE]", "wrote {} rows to {}.".format(i, fname))

main(pdf_files, "stocks.csv", keywords)
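Since the asker already has pdfplumber working, the same keyword scan can also be done without any extra package. A minimal sketch under that assumption (pdfplumber's extract_text() keeps one text line per layout line, so the same line-oriented regex idea applies; the file names are placeholders):

import csv
import re
import pdfplumber

pdf_files = ['a.pdf', 'b.pdf']  # placeholder file names
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file, keywords):
    # join the text of the first two pages, one layout line per text line
    with pdfplumber.open(file) as pdf:
        txt = "\n".join((page.extract_text() or "") for page in pdf.pages[:2])
    row = []
    for keyword in keywords:
        # capture the rest of the line after the keyword, if it is found
        m = re.search(re.escape(keyword) + r"\s*(.+)", txt)
        row.append(m.group(1).strip() if m else "")
    return row

with open("stocks.csv", "w", newline="") as wf:
    writer = csv.writer(wf)
    writer.writerow(keywords)
    for file in pdf_files:
        writer.writerow(extract_infos(file, keywords))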
