Python PDF: how to add a bookmark URL instead of a page number

I'm using python 3.6 and PyPDF2 to create bookmarks in a pdf.
Instead of adding a bookmark to a page within the PDF, I want to add a URL (e.g. https://stackoverflow.com) as a bookmark.
Something like this?
output.addBookmark('TEST', 'https://stackoverflow.com', parent=None)
I don't think PyPDF2 supports something like this or does it? Is there another library that can support this?
from PyPDF2 import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
input = PdfFileReader(open('test.pdf', 'rb'))
output.addPage(input.getPage(0))
output.addBookmark('TEST', 0, parent=None) # add bookmark
outputStream = open('output.pdf', 'wb')
output.write(outputStream)
outputStream.close()

I might be a bit late but hear me out. I had to solve the same problem and since there is literally 0 information on this anywhere, I've investigated the issue and have managed to find the answer. It's not my most efficient/nicest code but definitely one of my proudest since nobody before did this.
The problem one faces is that there is no single PDF library for Python that solves all PDF-related problems. Each tackles problems differently, and some can do things that others can't. For this purpose, I had to use two libraries for the two functions below: PyPDF2 is here to add the bookmarks to the PDF, and pdfrw is here to alter those bookmarks to have the action of opening a URL.
In short, we create a new pdf with the added bookmarks, and then another new one with the changed bookmark actions that point to a url.
One thing to mention is that, for some reason (for me at least), PyPDF2 adds all bookmarks in a way that if you have multiple ones, they all become child elements of the previous bookmark. This is why we have the while loop: we collect all the bookmarks into a list with it, and then we can select the one we want.
If you already have bookmarks and don't add them with PyPDF2, it might be enough to just loop over the metaObjects dictionary and get the values which contain the /Title key, making the code significantly smaller. I've added this part as a comment.
Here is an example of how to use the code below:
inputPdf = r"C:\......\first.pdf"
bookmarkedPdf = r"C:\......\second.pdf"
pdfWithWeblink = r"C:\......\final.pdf"
bookmarks = [
    {"Title": "The Phantom Menace", "Page": 5},
    {"Title": "Attack of the Clones", "Page": 10},
    {"Title": "Revenge Of The Sith", "Page": 13},
    {"Title": "A New Hope", "Page": 18},
    {"Title": "The Empire Strikes Back", "Page": 26},
    {"Title": "Return of the Jedi", "Page": 32}
]
AddBookmarks(inputPdf, bookmarkedPdf, bookmarks)
AddWebLinkToBookmark(bookmarkedPdf, pdfWithWeblink, "Revenge Of The Sith", "https://stackoverflow.com")
The code:
from PyPDF2 import PdfFileWriter, PdfFileReader
import pdfrw
def AddBookmarks(inputPdfPath: str, outputPdfPath: str, headers: list) -> None:
    """ Adds bookmarks to a PDF. """
    output = PdfFileWriter()
    input = PdfFileReader(open(inputPdfPath, 'rb'))
    for i in range(input.getNumPages()):
        output.addPage(input.getPage(i))
        for header in headers:
            if header["Page"] - 1 == i:
                output.addBookmark(header["Title"], header["Page"] - 1, parent=None)
    output.setPageMode("/UseOutlines")
    outputStream = open(outputPdfPath, 'wb')
    output.write(outputStream)
    outputStream.close()
def AddWebLinkToBookmark(inputPdfPath: str, outputPdfPath: str, bookmarkTitle: str, url: str) -> None:
    """ Changes the bookmark action to opening a web url. """
    # Reading the Pdf with pdfrw and collecting its meta objects. The bookmarks are among these.
    pdf = pdfrw.PdfReader(inputPdfPath, decompress=True)
    metaObjects = pdf.indirect_objects
    # If you did not add the bookmarks with PyPDF2 previously, use this part for getting the bookmarkToChange variable:
    # bookmarkToChange = None
    # for _, annotation in metaObjects.items():
    #     if '/Title' in annotation:
    #         if annotation["/Title"] == f"({bookmarkTitle})".replace(" ", "\\040"):
    #             bookmarkToChange = annotation
    # if bookmarkToChange is None:
    #     print(f"There is no bookmark called '{bookmarkTitle}' in this pdf.")
    #     return
    try:
        # Selecting the first, top parent bookmark.
        bookmark = [annotation for _, annotation in metaObjects.items() if '/Title' in annotation][0]
    except IndexError:
        print("There are no bookmarks in this pdf.")
        return
    # Each bookmark is the child of the previous bookmark. They can be accessed from the parent with the '/Next' key.
    bookmarkAnnotations = [bookmark]
    while "/Next" in bookmark:
        if "/Title" not in bookmark["/Next"]:
            break
        bookmark = bookmark["/Next"]
        bookmarkAnnotations.append(bookmark)
    try:
        # Selecting the bookmark we want to add the url to.
        bookmarkToChange = [annotation for annotation in bookmarkAnnotations if annotation["/Title"] == f"({bookmarkTitle})".replace(" ", "\\040")][0]
    except IndexError:
        print(f"There is no bookmark called '{bookmarkTitle}' in this pdf.")
        return
    # Changing the internal PDF commands to point to a url instead of a page.
    bookmarkToChange.A.D = None  # Deletes the page information the 'Go to page' action is pointing to.
    bookmarkToChange.A.S = pdfrw.PdfName("URI")  # Changes the 'Go to page' action to an 'Open a web link' action.
    bookmarkToChange.A.URI = pdfrw.objects.pdfstring.PdfString(f"({url})")  # Specifies the url for the 'Open a web link' action.
    # Saving the end result into a new file.
    pdfrw.PdfWriter().write(outputPdfPath, pdf)
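If you want to sanity-check the result, here is a small follow-up sketch reusing the same pdfrw calls as above and the pdfWithWeblink path from the usage example. It re-opens the output and prints each bookmark's action type and URI (it assumes every bookmark carries an /A action dictionary, as the ones created above do):
import pdfrw

# Re-open the final pdf and inspect every bookmark's action dictionary.
pdf = pdfrw.PdfReader(pdfWithWeblink, decompress=True)
for _, annotation in pdf.indirect_objects.items():
    if '/Title' in annotation and annotation.A is not None:
        # For the changed bookmark, /S should be /URI and /URI should hold the link.
        print(annotation['/Title'], annotation.A.S, annotation.A.URI)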

Related

Python PDF Parser - Engineering Drawing

I am trying to write a Python script to parse through a PDF file using PyPDF2. The only thing is, my PDF file isn't your traditional document; it's an engineering drawing.
Anyway, I need the code to parse through the text that is written in the bottom right corner, as well as a red stamp that has text written on it. The drawing will look something like this: [image of the drawing]
I tried to write some basic code to just parse it and extract the data, but it's not working.
import PyPDF2
# creating a pdf file object
pdfFileObj = open('example.pdf', 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
# creating a page object
pageObj = pdfReader.getPage(0)
# extracting text from page
print(pageObj.extractText())
# closing the pdf file object
pdfFileObj.close()
Anyone have any recommendations?
Late to the party...
Nonetheless, we developed a commercial product to do exactly that: Werk24. It has a simple Python client: pip install werk24
With this, your task becomes very simple. You can read the Title Block with a simple command. Imagine you want to obtain the Designation:
from werk24 import Hook, W24AskTitleBlock
from werk24.models.techread import W24TechreadMessage
from werk24.utils import w24_read_sync
from . import get_drawing_bytes  # define your own

def recv_title_block(message: W24TechreadMessage) -> None:
    """ Print the Designation

    NOTE: Other fields like Drawing ID, Material etc. are
    also available.
    """
    print(message.payload_dict.get('designation'))

if __name__ == "__main__":
    # submit the request to Werk24
    w24_read_sync(
        get_drawing_bytes(),
        [Hook(
            ask=W24AskTitleBlock(),
            function=recv_title_block
        )])
For the drawing that you provided, the response will be:
"designation": {
"captions": [
{
"language": "eng",
"text": "Descr"
}
],
"values": [
{
"language": "eng",
"test": "Shaft",
}
]
}
NOTE: Your file is very blurry, so I created the response manually; the API requires a minimal resolution of 180 dpi (it also works with TIF and DXF files).

Read form fields in a PDF created by Adobe LiveCycle Designer

How to get the fields from this PDF file? It is a dynamic PDF created by Adobe LiveCycle Designer. If you open the link in a web browser, you will probably see a single page starting from 'Please wait...' If you download the file and open it via Adobe Reader (5.0 or higher), you should see all 8 pages.
So, when reading via PyPDF2, you get an empty dictionary, because it renders the file as a single page like the one you see via a web browser.
def print_fields(path):
    from PyPDF2 import PdfFileReader
    reader = PdfFileReader(str(path))
    fields = reader.getFields()
    print(fields)
You can use the Java-dependent library tika to read the contents of all 8 pages. However, the results are messy and I am avoiding the Java dependency.
def read_via_tika(path):
    from tika import parser
    raw = parser.from_file(str(path))
    content = raw['content']
    print(content)
So, basically, I can manually do Edit -> Form Options -> Export Data… in Adobe Acrobat DC to get a nice XML. Similarly, I need to get the nice form fields and their values via Python.
Thanks to this awesome answer, I managed to retrieve the fields using pdfminer.six.
Navigate through Catalog > AcroForm > XFA, then pdfminer.pdftypes.resolve1 the object right after the b'datasets' element in the list.
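For reference, here is a minimal sketch of that navigation with pdfminer.six; the file name 'form.pdf' is a placeholder, and it assumes the XFA entry is the usual list of alternating packet names and stream references:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

# 'form.pdf' is a placeholder for the LiveCycle PDF.
with open('form.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # Catalog > AcroForm > XFA: a list alternating packet names and stream refs.
    xfa = resolve1(resolve1(doc.catalog['AcroForm'])['XFA'])
    for name, ref in zip(xfa[::2], xfa[1::2]):
        if name == b'datasets':
            # The stream right after b'datasets' holds the form data as XML.
            print(resolve1(ref).get_data().decode('utf-8'))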
In my case, the following code worked (source: ankur garg)
import PyPDF2 as pypdf

def findInDict(needle, haystack):
    for key in haystack.keys():
        try:
            value = haystack[key]
        except:
            continue
        if key == needle:
            return value
        if isinstance(value, dict):
            x = findInDict(needle, value)
            if x is not None:
                return x

pdfobject = open('CTRX_filled.pdf', 'rb')
pdf = pypdf.PdfFileReader(pdfobject)
xfa = findInDict('/XFA', pdf.resolvedObjects)
xml = xfa[7].getObject().getData()
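Once you have the raw XML bytes, here is a short follow-up sketch (assuming the datasets packet is well-formed XML) that walks it with the standard library and prints the leaf elements, i.e. the field names and their values:
import xml.etree.ElementTree as ET

# 'xml' holds the raw bytes of the datasets packet retrieved above.
root = ET.fromstring(xml)
for element in root.iter():
    if len(element) == 0:  # leaf elements carry the field values
        print(element.tag, element.text)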

Extracting CSV from Export Button

I apologize for not being able to specifically give out the URL I'm dealing with. I'm trying to extract some data from a certain site, but it's not organized well enough. However, they do have an "Export to CSV" button, and the code for that element is ...
<input type="submit" name="ctl00$ContentPlaceHolder1$ExportValueCSVButton" value="Export to Value CSV" id="ContentPlaceHolder1_ExportValueCSVButton" class="smallbutton">
In this type of situation, what's the best way to go about grabbing that data when there is no specific URL to the CSV? I'm using Mechanize and BS4.
If you're able to click a button that downloads the data as a CSV, it sounds like you might be able to wget that link, save the data on your machine, and work with it there. I'm not sure if that's what you're getting at here though; any more details you can offer?
You should try Selenium. Selenium is a suite of tools to automate web browsers across many platforms, and it can do a lot of things, including clicking buttons.
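For example, a minimal sketch with Selenium 4; the page URL is a placeholder, while the button id comes from the HTML in the question:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com/report-page")  # placeholder URL
# Click the export button, using the id from the question's HTML.
driver.find_element(By.ID, "ContentPlaceHolder1_ExportValueCSVButton").click()
# The browser saves the CSV to its default download directory.
driver.quit()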
Well, you need SOME starting URL to feed br.open() to even start the process.
It appears that you have an aspnetForm type control there and the below code MAY serve as a bit of a starting point, even though it does not work as-is (it's a work in progress...:-).
You'll need to look at the headers and parameters via the network tab of your browser dev tools to see them.
br.open("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName=" + letter + "&FirstName=&City=&FilerID=")
soup = BS(br.response().read())
table = soup.find("table", {"id": "ctl00_ContentPlaceHolder1_Results"})  # Need to add error check here...
if table is None:  # No lobbyist with last name starting with 'X' :-)
    continue
records = table.find_all('tr')  # List of all results for this letter

for form in br.forms():
    print "Form name:", form.name
    print form

for row in records:
    rec_print = ""
    span = row.find_all('span', 'lblentry', 'value')
    for sname in span:
        if ',' in sname.get_text():  # They actually have a field named 'comma'!!
            continue
        rec_print = rec_print + sname.get_text() + ","  # Create comma-delimited output
    print rec_print[:-1]  # Strip final comma
    lnk = row.find('a', 'lblentrylink')
    if lnk is None:  # For some reason, first record is blank.
        continue
    print "Lnk: ", lnk
    newlnk = lnk['id']
    print "NEWLNK: ", newlnk
    newstr = lnk['href']
    newctl = newstr[+25:-5]  # Matching placeholder (strip javascript....)
    br.select_form('aspnetForm')  # Tried (nr=0) also...
    print "NEWCTL: ", newctl
    br['__EVENTTARGET'] = newctl  # The field name must be a string.
    response = br.submit(name=newlnk).read()
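As an alternative to mechanize, here is a hedged sketch of the same idea with requests and BeautifulSoup. ASP.NET pages trigger the export through a form POST, so you can replay that POST yourself by collecting the hidden state fields (__VIEWSTATE, __EVENTVALIDATION, etc.) and including the export button's name/value pair; the page URL below is a placeholder, while the button name comes from the question's HTML:
import requests
from bs4 import BeautifulSoup

PAGE_URL = "https://example.com/report-page"  # placeholder URL

session = requests.Session()
soup = BeautifulSoup(session.get(PAGE_URL).text, "html.parser")

# Collect every hidden input (__VIEWSTATE, __EVENTVALIDATION, etc.).
payload = {tag["name"]: tag.get("value", "")
           for tag in soup.find_all("input", type="hidden")}
# Include the submit button's name/value pair, taken from the question's HTML.
payload["ctl00$ContentPlaceHolder1$ExportValueCSVButton"] = "Export to Value CSV"

response = session.post(PAGE_URL, data=payload)
with open("export.csv", "wb") as f:
    f.write(response.content)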

How to check / uncheck checkboxes in a PDF with Python (preferably PyPDF2)?

I have the code below
from PyPDF2 import PdfFileReader, PdfFileWriter
d = {
    "Name": "James",
    " Date": "1/1/2016",
    "City": "Wilmo",
    "County": "United States"
}
reader = PdfFileReader("medicareRRF.pdf")
inFields = reader.getFields()
watermark = PdfFileReader("justSign.pdf")
writer = PdfFileWriter()
page = reader.getPage(0)
page.mergePage(watermark.getPage(0))
writer.addPage(page)
written_page = writer.getPage(0)
writer.updatePageFormFieldValues(written_page, d)
This correctly fills in the PDF with the dictionary (d), but how can I check and uncheck boxes on the PDF? Here is the getFields() info for one of the boxes:
u'Are you ok': {'/FT': '/Btn','/Kids': [IndirectObject(36, 0),
IndirectObject(38, 0)],'/T': u'Are you ok','/V': '/No'}
I tried adding {'Are you ok' : '/Yes'} and several other similar ways, but nothing worked.
I came across the same issue, looked in several places, and was disappointed that I couldn't find the answer. After a few frustrating hours looking at my code, the PyPDF2 code, and the Adobe PDF 1.7 spec, I finally figured it out. If you debug into updatePageFormFieldValues, you'll see that it uses only TextStringObjects. Checkboxes are not text fields; even the /V values are not text fields, which seemed counterintuitive, at least to me. Debugging into that function showed me that checkboxes are instead NameObjects, so I created my own function to handle them.
I create two dicts: one with only text values that I pass to the built-in updatePageFormFieldValues function, and a second with only checkbox values. I also set the /AS to ensure visibility (see the PDF spec). My function looks like this:
from PyPDF2.generic import NameObject

def updateCheckboxValues(page, fields):
    for j in range(0, len(page['/Annots'])):
        writer_annot = page['/Annots'][j].getObject()
        for field in fields:
            if writer_annot.get('/T') == field:
                writer_annot.update({
                    NameObject("/V"): NameObject(fields[field]),
                    NameObject("/AS"): NameObject(fields[field])
                })
However, as far as I can tell, whether you use /1, /On, or /Yes depends on how the form was defined or perhaps what the PDF reader is looking for. For me, /1 worked.
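To find out which state name a given form expects, you can read the keys of each checkbox's normal appearance dictionary (/AP -> /N): every key other than /Off is a valid 'checked' state. Here is a minimal sketch with the same PyPDF2-era API ('form.pdf' is a placeholder); note that depending on the form, the /FT key may live on a parent field rather than on the widget itself:
from PyPDF2 import PdfFileReader

reader = PdfFileReader("form.pdf")  # placeholder file name
page = reader.getPage(0)
for annot_ref in page['/Annots']:
    annot = annot_ref.getObject()
    # /Btn fields are buttons; checkboxes keep their states in /AP -> /N.
    if annot.get('/FT') == '/Btn' and '/AP' in annot:
        states = list(annot['/AP']['/N'].keys())  # e.g. ['/Off', '/1'] or ['/Off', '/Yes']
        print(annot.get('/T'), states)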
I would like to add on to the answer by @rpsip.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import NameObject

reader = PdfReader(r"form2.pdf")  # where you read the pdf in the same directory
writer = PdfWriter()
page = reader.pages[0]  # read page 1 of your pdf
fields = reader.get_fields()
print(fields)  # this is to identify if you can see the form fills in that page
writer.add_page(page)  # this line is necessary otherwise the pdf will be corrupted

for i in range(len(page["/Annots"])):  # in order to access the "Annots" key
    print(page["/Annots"][i].get_object())  # to find out which of the form fills are checkboxes or text fills
    # Filter for checkboxes, specifically the one I want, i.e. "Check Box3".
    if (page["/Annots"][i].get_object())['/FT'] == "/Btn" and (page["/Annots"][i].get_object())['/T'] == 'Check Box3':
        print(page["/Annots"][i].get_object())  # further check that I got what I wanted as per the filter
        writer_annot = page["/Annots"][i].get_object()
        writer_annot.update(
            {
                # NameObject is only for checkboxes; try "/Yes", "/1" or "/On" to see which works
                NameObject("/V"): NameObject("/Yes"),
                NameObject("/AS"): NameObject("/Yes"),
            }
        )

with open("filled-out.pdf", "wb") as output_stream:
    writer.write(output_stream)  # save the ticked pdf file as another file named "filled-out.pdf"
Hope I helped.

getting data from webpage with select menus with python and beautifulsoup

I am trying to collect data from a webpage which has a bunch of select lists I need to fetch data from. Here is the page: http://www.asusparts.eu/partfinder/Asus/All In One/E Series/
And this is what I have so far:
import glob, string
from bs4 import BeautifulSoup
import urllib2, csv

for file in glob.glob("http://www.asusparts.eu/partfinder/*"):

##-page to show all selections for the E-series-##
selected_list = 'http://www.asusparts.eu/partfinder/Asus/All In One/E Series/'
##-
page = urllib2.urlopen(selected_list)
soup = BeautifulSoup(page)

##-page which shows results after selecting one option-##
url = 'http://www.asusparts.eu/partfinder/Asus/All In One/E Series/ET10B'

##-identify the id of select list which contains the E-series-##
select = soup.find('select', id="myselectListModel")
option_tags = select.findAll('option')
##-omit first item in list as isn't part of the option-##
option_tags = option_tags[1:]
for option in option_tags:
    open(url + option['value'])

html = urllib2.urlopen("http://www.asusparts.eu/partfinder/")
soup = BeautifulSoup(html)
all = soup.find('div', id="accordion")
I am not sure if I am going about this the right way, as all the select menus make it confusing. Basically I need to grab all the data from the selected results, such as images, price, description, etc. They are all contained within one div tag which contains all the results, which is named 'accordion', so would this still gather all the data? Or would I need to dig deeper to search through the tags inside this div? Also I would have preferred to search by id rather than class, as I could fetch all the data in one go. How would I do this from what I have above? Thanks. Also, I am unsure about the glob function, whether I am using it correctly or not.
EDIT
Here is my edited code; it returns no errors, however I am not sure if it returns all the models for the E-series.
import string, urllib2, urllib, csv, urlparse
from bs4 import BeautifulSoup

##-page which shows results after selecting one option-##
url = 'http://www.asusparts.eu/partfinder/Asus/All In One/E Series/ET10B'
base_url = 'http://www.asusparts.eu/' + url
print base_url

##-page to show all selections for the E-series-##
selected_list = urllib.quote(base_url + '/Asus/All In One/E Series/ET10B')
print urllib.quote(base_url + '/Asus/All In One/E Series/ET10B')
#selected_list = 'http://www.asusparts.eu/partfinder/Asus/All In One/E Series/ET10B'
##-
page = urllib2.urlopen('http://www.asusparts.eu/partfinder/Asus/All%20In%20One/E%20Series')
soup = BeautifulSoup(page)
print soup

##-identify the id of select list which contains the E-series-##
select = soup.find('select', id="myselectListModel")
option_tags = select.findAll('option')
print option_tags
##-omit first item in list as isn't part of the option-##
option_tags = option_tags[1:]
print option_tags

for option in option_tags:
    url + option['redirectvalue']
    print " " + url + option['redirectvalue']
First of all, I'd like to point out a couple of problems in the code you posted. First, the glob module is not typically used for making HTTP requests. It is useful for iterating through a subset of files on a specified path; you can read more about it in its docs.
The second issue is that in the line:
for file in glob.glob("http://www.asusparts.eu/partfinder/*"):
you have an indentation error, because there is no indented code that follows. This will raise an error and prevent the rest of the code from being executed.
Another problem is that you are using some of python's "reserved" names for your variables. You should never use words such as all or file for variable names.
Finally when you are looping through option_tags:
for option in option_tags:
    open(url + option['value'])
The open statement will try and open a local file whose path is url + option['value']. This will likely raise an error, as I doubt you'll have a file at that location. In addition, you should be aware that you aren't doing anything with this open file.
Okay, so enough with the critique. I've taken a look at the asus page and I think I have an idea of what you want to accomplish. From what I understand, you want to scrape a list of parts (images, text, price, etc..) for each computer model on the asus page. Each model has its list of parts located at a unique URL (for example: http://www.asusparts.eu/partfinder/Asus/Desktop/B%20Series/BM2220). This means that you need to be able to create this unique URL for each model. To make matters more complicated, each parts category is loaded dynamically, so for example the parts for the "Cooling" section are not loaded until you click on the link for "Cooling". This means we have a two part problem: 1) Get all of the valid (brand, type, family, model) combinations and 2) Figure out how to load all the parts for a given model.
I was kind of bored and decided to write up a simple program that will take care of most of the heavy lifting. It isn't the most elegant thing out there, but it'll get the job done. Step 1) is accomplished in get_model_information(). Step 2) is taken care of in parse_models() but is a little less obvious. Taking a look at the asus website, whenever you click on a parts subsection the JavaScript function getProductsBasedOnCategoryID() is run, which makes an ajax call to a formatted PRODUCT_URL (see below). The response is some JSON information that is used to populate the section you clicked on.
import urllib2
import json
import urlparse
from bs4 import BeautifulSoup

BASE_URL = 'http://www.asusparts.eu/partfinder/'
PRODUCTS_URL = 'http://json.zandparts.com/api/category/GetCategories/'\
               '44/EUR/{model}/{family}/{accessory}/{brand}/null/'
ACCESSORIES = ['Cable', 'Cooling', 'Cover', 'HDD', 'Keyboard', 'Memory',
               'Miscellaneous', 'Mouse', 'ODD', 'PS', 'Screw']


def get_options(url, select_id):
    """
    Gets all the options from a select element.
    """
    r = urllib2.urlopen(url)
    soup = BeautifulSoup(r)
    select = soup.find('select', id=select_id)
    try:
        options = [option for option in select.strings]
    except AttributeError:
        print url, select_id, select
        raise
    return options[1:]  # The first option is the menu text


def get_model_information():
    """
    Finds all the models for each family, all the families and models for each
    type, and all the types, families, and models for each brand.

    These are all added as tuples (brand, type, family, model) to the list
    models.
    """
    model_info = []

    print "Getting brands"
    brand_options = get_options(BASE_URL, 'mySelectList')

    for brand in brand_options:
        print "Getting types for {0}".format(brand)
        # brand = brand.replace(' ', '%20')  # URL encode spaces
        brand_url = urlparse.urljoin(BASE_URL, brand.replace(' ', '%20'))
        types = get_options(brand_url, 'mySelectListType')

        for _type in types:
            print "Getting families for {0}->{1}".format(brand, _type)
            bt = '{0}/{1}'.format(brand, _type)
            type_url = urlparse.urljoin(BASE_URL, bt.replace(' ', '%20'))
            families = get_options(type_url, 'myselectListFamily')

            for family in families:
                print "Getting models for {0}->{1}->{2}".format(brand,
                                                                _type, family)
                btf = '{0}/{1}'.format(bt, family)
                fam_url = urlparse.urljoin(BASE_URL, btf.replace(' ', '%20'))
                models = get_options(fam_url, 'myselectListModel')

                model_info.extend((brand, _type, family, m) for m in models)

    return model_info


def parse_models(model_information):
    """
    Get all the information for each accessory type for every
    (brand, type, family, model). accessory_info will be the python formatted
    json results. You can parse, filter, and save this information or use
    it however suits your needs.
    """
    for brand, _type, family, model in model_information:
        for accessory in ACCESSORIES:
            r = urllib2.urlopen(PRODUCTS_URL.format(model=model, family=family,
                                                    accessory=accessory,
                                                    brand=brand))
            accessory_info = json.load(r)
            # Do something with accessory_info
            # ...


def main():
    models = get_model_information()
    parse_models(models)


if __name__ == '__main__':
    main()
Finally, one side note: in my own projects I have dropped urllib2 in favor of the requests library. I personally think it provides much more functionality and has better semantics, but you can use whatever you would like.
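For reference, here is a minimal sketch of get_options rewritten with requests (everything else stays the same; install it with pip install requests):
import requests
from bs4 import BeautifulSoup

def get_options(url, select_id):
    """
    Gets all the options from a select element, using requests.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    select = soup.find('select', id=select_id)
    options = [option for option in select.strings]
    return options[1:]  # The first option is the menu text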
