Field name extraction from a pdf form

Field name extraction from a pdf form - python

I am able to fetch the field names for most of the pdf files using pdfminer but i am not able to fetch fields of uniform Residential Loan Application. is it something restricted in that form.
though it contains multiple fields in the form, it is showing only one field.
This is the output:
topmostSubform[0].
code:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.psparser import PSLiteral, PSKeyword
from pdfminer.utils import decode_text
data = {}
def decode_value(value):
# decode PSLiteral, PSKeyword
if isinstance(value, (PSLiteral, PSKeyword)):
value = value.name
# decode bytes
if isinstance(value, bytes):
value = decode_text(value)
return value
with open('URLA_2019_Borrower_v28.pdf', 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
res = resolve1(doc.catalog)
if 'AcroForm' not in res:
raise ValueError("No AcroForm Found")
fields = resolve1(doc.catalog['AcroForm'])['Fields'] # may need further resolving
for f in fields:
field = resolve1(f)
name, values = field.get('T'), field.get('V')
# decode name
name = decode_text(name)
# resolve indirect obj
values = resolve1(values)
# decode value(s)
if isinstance(values, list):
values = [decode_value(v) for v in values]
else:
values = decode_value(values)
data.update({name: values})
print(name, values)

Related

Json2csv converter for Python 3

I'm using the Json2csv code to convert the Yelp dataset to csv files (Available here: https://github.com/Yelp/dataset-examples/blob/master/json_to_csv_converter.py)
This code was originally used with Python 2 but I'm using Python 3; I've made some changes so it's now working with Python 3 except that I'm getting b' preceding the strings (which indicates that it is a byte sequence).
I've added encoding='utf-8' to convert it to string but my csv file still shows the b''
Example: business_id
b'7KPBkxAOEtb3QeIL9PEErg'
What do I need to change to make it write strings instead of bytes?
Thanks
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.
For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import csv
import json
def read_and_write_file(json_file_path, csv_file_path, column_names):
"""Read in the json dataset file and write it out to a csv file, given the column names."""
with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
csv_file = csv.writer(fout)
csv_file.writerow(list(column_names))
with open(json_file_path,encoding='utf-8') as fin:
for line in fin:
line_contents = json.loads(line)
csv_file.writerow(get_row(line_contents, column_names))
def get_superset_of_column_names_from_file(json_file_path):
"""Read in the json dataset file and return the superset of column names."""
column_names = set()
with open(json_file_path, encoding='utf-8') as fin:
for line in fin:
line_contents = json.loads(line)
column_names.update(
set(get_column_names(line_contents).keys())
)
return column_names
def get_column_names(line_contents, parent_key=''):
"""Return a list of flattened key names given a dict.
Example:
line_contents = {
'a': {
'b': 2,
'c': 3,
},
}
will return: ['a.b', 'a.c']
These will be the column names for the eventual csv file.
"""
column_names = []
for k, v in line_contents.items():
column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
if isinstance(v, collections.MutableMapping):
column_names.extend(
get_column_names(v, column_name).items()
)
else:
column_names.append((column_name, v))
return dict(column_names)
def get_nested_value(d, key):
"""Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.
Example:
d = {
'a': {
'b': 2,
'c': 3,
},
}
key = 'a.b'
will return: 2
"""
if '.' not in key:
if key not in d:
return None
return d[key]
base_key, sub_key = key.split('.', 1)
if base_key not in d:
return None
sub_dict = d[base_key]
return get_nested_value(sub_dict, sub_key)
def get_row(line_contents, column_names):
"""Return a csv compatible row given column names and a dict."""
row = []
for column_name in column_names:
line_value = get_nested_value(
line_contents,
column_name,
)
if isinstance(line_value, str):
row.append('{0}'.format(line_value.encode('utf-8')))
elif line_value is not None:
row.append('{0}'.format(line_value))
else:
row.append('')
return row
if __name__ == '__main__':
"""Convert a yelp dataset file from json to csv."""
parser = argparse.ArgumentParser(
description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
)
parser.add_argument(
'json_file',
type=str,
help='The json file to convert.',
)
args = parser.parse_args()
json_file = args.json_file
csv_file = '{0}.csv'.format(json_file.split('.json')[0])
column_names = get_superset_of_column_names_from_file(json_file)
read_and_write_file(json_file, csv_file, column_names)

Just a guess:
if isinstance(line_value, str):
row.append('{0}'.format(line_value.encode('utf-8')))
If the value is str you don't need to encode it in Python 3 - all strings in Python 3 are unicode. You probably should check if the value is an instance of bytes instead.
if isinstance(line_value, bytes):
row.append('{0}'.format(line_value.decode('utf-8')))
[update]
No, that line is checking if it is string versus number... so str is correct – Luluperam
Are you sure? Lets say line_value is the string "foo":
line_value = 'foo'
Now try this:
>>> row = []
>>> if isinstance(line_value, str):
... row.append('{0}'.format(line_value.encode('utf-8')))
>>> print(row)
["b'foo'"]
That is the source of your bytes literal in the CSV file. Now lets try the version I so kindly suggested before dismissing it:
>>> line_value = b'foo'
>>> row = []
>>> if isinstance(line_value, bytes):
... row.append('{0}'.format(line_value.decode('utf-8')))
>>> print(row)
['foo']

Python, ignore files with no Exif data

I am trying to do a mass extraction of gps exif data, my code below:
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
def get_exif_data(image):
exif_data = {}
info = image._getexif()
if info:
for tag, value in info.items():
decoded = TAGS.get(tag, tag)
if decoded == "GPSInfo":
gps_data = {}
for t in value:
sub_decoded = GPSTAGS.get(t, t)
gps_data[sub_decoded] = value[t]
exif_data[decoded] = gps_data
else:
exif_data[decoded] = value
return exif_data
def _get_if_exist(data, key):
if key in data:
return data[key]
else:
pass
def get_lat_lon(exif_data):
gps_info = exif_data["GPSInfo"]
lat = None
lon = None
if "GPSInfo" in exif_data:
gps_info = exif_data["GPSInfo"]
gps_latitude = _get_if_exist(gps_info, "GPSLatitude")
gps_latitude_ref = _get_if_exist(gps_info, "GPSLatitudeRef")
gps_longitude = _get_if_exist(gps_info, "GPSLongitude")
gps_longitude_ref = _get_if_exist(gps_info, "GPSLongitudeRef")
if gps_latitude and gps_latitude_ref and gps_longitude and gps_longitude_ref:
lat = _convert_to_degrees(gps_latitude)
if gps_latitude_ref != "N":
lat = 0 - lat
lon = _convert_to_degrees(gps_longitude)
if gps_longitude_ref != "E":
lon = 0 - lon
return lat, lon
Code source
Which is run like:
if __name__ == "__main__":
image = Image.open("photo directory")
exif_data = get_exif_data(image)
print(get_lat_lon(exif_data)
This works fine for one photo, so I've used glob to iterate over all photos in a file:
import glob
file_names = []
for name in glob.glob(photo directory):
file_names.append(name)
for item in file_names:
if __name__ == "__main__":
image = Image.open(item)
exif_data = get_exif_data(image)
print(get_lat_lon(exif_data))
else:
pass
Which works fine, as long as every photo in the file is a) an image and b) has gps data. I have tried adding a pass in the _get_if_exist function as well as my file iteration, however, neither same to have had any impact and I'm still receiving KeyError: 'GPSInfo'
Any ideas on how I can ignore photos with no data or different file types?

A possible approach would be writing a small helper function that first checks, if the file is actually an image file and as a second step checks if the image contains EXIF data.
def is_metadata_image(filename):
try:
image = Image.open(filename)
return 'exif' in image.info
except OSError:
return False
I found that PIL does not work every time with .png files that do contain EXIF information when using _getexif(). So instead I check for the key exif in the info dictionary of an image.

I've tried this source code.
Simply you need to remove
gps_info = exif_data["GPSInfo"]
from the first line of get_lat_lon(exif_data) function, it works well for me.

Understanding how to quote XML string in order to serialize using Python's ElementTree

My specs:
Python 3.4.3
Windows 7
IDE is Jupyter Notebooks
What I have referenced:
how-to-properly-escape-single-and-double-quotes
python-escaping-strings-for-use-in-xml
escaping-characters-in-a-xml-file-with-python
Here is the data and script, respectively, below (I have tried variations on serializing Column 'E' using both Sax and ElementTree):
Data
A,B,C,D,E,F,G,H,I,J
"3","8","1","<Request TransactionID="3" RequestType="FOO"><InstitutionISO /><CallID>23</CallID><MemberID>12</MemberID><MemberPassword /><RequestData><AccountNumber>2</AccountNumber><AccountSuffix>85</AccountSuffix><AccountType>S</AccountType><MPIAcctType>Checking</MPIAcctType><TransactionCount>10</TransactionCount></RequestData></Request>","<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>","1967-12-25 22:18:13.471000","2005-12-25 22:18:13.768000","2","70","0"
Script
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os.path
import sys
import csv
from io import StringIO
import xml.etree.cElementTree as ElementTree
from xml.etree.ElementTree import XMLParser
import xml
import xml.sax
from xml.sax import ContentHandler
class MyHandler(xml.sax.handler.ContentHandler):
def __init__(self):
self._charBuffer = []
self._result = []
def _getCharacterData(self):
data = ''.join(self._charBuffer).strip()
self._charBuffer = []
return data.strip() #remove strip() if whitespace is important
def parse(self, f):
xml.sax.parse(f, self)
return self._result
def characters(self, data):
self._charBuffer.append(data)
def startElement(self, name, attrs):
if name == 'Response':
self._result.append({})
def endElement(self, name):
if not name == 'Response': self._result[-1][name] = self._getCharacterData()
def read_data(path):
with open(path, 'rU', encoding='utf-8') as data:
reader = csv.DictReader(data, delimiter =',', quotechar="'", skipinitialspace=True)
for row in reader:
yield row
if __name__ == "__main__":
empty = ''
Response = 'sample.csv'
for idx, row in enumerate(read_data(Response)):
if idx > 10: break
data = row['E']
print(data) # The before
data = data[1:-1]
data = ""'{}'"".format(data)
print(data) # Sanity check
# data = '<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>'
try:
root = ElementTree.XML(data)
# print(root)
except StopIteration:
raise
pass
# xmlstring = StringIO(data)
# print(xmlstring)
# Handler = MyHandler().parse(xmlstring)
Specifically, due to the quoting in the CSV file (which is beyond my control), I have had to resort to slicing the string (line 51) and then formatting it (line 52).
However the print out from the above attempt is as follows:
"<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000'
<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000
File "<string>", line unknown
ParseError: no element found: line 1, column 69
Interestingly - if I assign the variable "data" (as in line 54) I receive this:
File "<ipython-input-80-7357c9272b92>", line 56
data = '<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>'
^
SyntaxError: invalid token
I seek feedback and information on how to address utilizing the most Pythonic means to do so. Ideally, is there a method that can leverage ElementTree. Thank you, in advance, for your feedback and guidance.

It seems that You have badly formatted (well, badly quoted) csv data.
If csv file is beyond Your control I suggest not using csv reader to read them,
instead - if You can rely on each field being properly quoted - split them yourself.
with open(Response, 'rU', encoding='utf-8') as data:
separated = data.read().split('","')
try:
x = ElementTree.XML(separated[3])
print(x)
xml.etree.ElementTree.dump(x)
y = ElementTree.XML(separated[4])
xml.etree.ElementTree.dump(y)
except Exception as e:
print(e)
outputs
<Element 'Request' at 0xb6d973b0>
<Request RequestType="FOO" TransactionID="3"><InstitutionISO /><CallID>23</CallID><MemberID>12</MemberID><MemberPassword /><RequestData><AccountNumber>2</AccountNumber><AccountSuffix>85</AccountSuffix><AccountType>S</AccountType><MPIAcctType>Checking</MPIAcctType><TransactionCount>10</TransactionCount></RequestData></Request>
<Response RequestType="HoldInquiry" TransactionID="2"><ShareList>0000',0001,0070,</ShareList></Response>

Streaming a CSV file in Django

I am attempting to stream a csv file as an attachment download. The CSV files are getting to be 4MB in size or more, and I need a way for the user to actively download the files without waiting for all of the data to be created and committed to memory first.
I first used my own file wrapper based on Django's FileWrapper class. That failed. Then I saw a method here for using a generator to stream the response:
How to stream an HttpResponse with Django
When I raise an error within the generator, I can see that I am creating the proper data with the get_row_data() function, but when I try to return the response it comes back empty. I've also disabled the Django GZipMiddleware. Does anyone know what I'm doing wrong?
Edit: The issue I was having was with the ConditionalGetMiddleware. I had to replace it, the code is in an answer below.
Here is the view:
from django.views.decorators.http import condition
#condition(etag_func=None)
def csv_view(request, app_label, model_name):
""" Based on the filters in the query, return a csv file for the given model """
#Get the model
model = models.get_model(app_label, model_name)
#if there are filters in the query
if request.method == 'GET':
#if the query is not empty
if request.META['QUERY_STRING'] != None:
keyword_arg_dict = {}
for key, value in request.GET.items():
#get the query filters
keyword_arg_dict[str(key)] = str(value)
#generate a list of row objects, based on the filters
objects_list = model.objects.filter(**keyword_arg_dict)
else:
#get all the model's objects
objects_list = model.objects.all()
else:
#get all the model's objects
objects_list = model.objects.all()
#create the reponse object with a csv mimetype
response = HttpResponse(
stream_response_generator(model, objects_list),
mimetype='text/plain',
)
response['Content-Disposition'] = "attachment; filename=foo.csv"
return response
Here is the generator I use to stream the response:
def stream_response_generator(model, objects_list):
"""Streaming function to return data iteratively """
for row_item in objects_list:
yield get_row_data(model, row_item)
time.sleep(1)
And here is how I create the csv row data:
def get_row_data(model, row):
"""Get a row of csv data from an object"""
#Create a temporary csv handle
csv_handle = cStringIO.StringIO()
#create the csv output object
csv_output = csv.writer(csv_handle)
value_list = []
for field in model._meta.fields:
#if the field is a related field (ForeignKey, ManyToMany, OneToOne)
if isinstance(field, RelatedField):
#get the related model from the field object
related_model = field.rel.to
for key in row.__dict__.keys():
#find the field in the row that matches the related field
if key.startswith(field.name):
#Get the unicode version of the row in the related model, based on the id
try:
entry = related_model.objects.get(
id__exact=int(row.__dict__[key]),
)
except:
pass
else:
value = entry.__unicode__().encode("utf-8")
break
#if it isn't a related field
else:
#get the value of the field
if isinstance(row.__dict__[field.name], basestring):
value = row.__dict__[field.name].encode("utf-8")
else:
value = row.__dict__[field.name]
value_list.append(value)
#add the row of csv values to the csv file
csv_output.writerow(value_list)
#Return the string value of the csv output
return csv_handle.getvalue()

Here's some simple code that'll stream a CSV; you can probably go from this to whatever you need to do:
import cStringIO as StringIO
import csv
def csv(request):
def data():
for i in xrange(10):
csvfile = StringIO.StringIO()
csvwriter = csv.writer(csvfile)
csvwriter.writerow([i,"a","b","c"])
yield csvfile.getvalue()
response = HttpResponse(data(), mimetype="text/csv")
response["Content-Disposition"] = "attachment; filename=test.csv"
return response
This simply writes each row to an in-memory file, reads the row and yields it.
This version is more efficient for generating bulk data, but be sure to understand the above before using it:
import cStringIO as StringIO
import csv
def csv(request):
csvfile = StringIO.StringIO()
csvwriter = csv.writer(csvfile)
def read_and_flush():
csvfile.seek(0)
data = csvfile.read()
csvfile.seek(0)
csvfile.truncate()
return data
def data():
for i in xrange(10):
csvwriter.writerow([i,"a","b","c"])
data = read_and_flush()
yield data
response = HttpResponse(data(), mimetype="text/csv")
response["Content-Disposition"] = "attachment; filename=test.csv"
return response

The middleware issue has been solved as of Django 1.5 and a StreamingHttpResponse has been introduced. The following should do:
import cStringIO as StringIO
import csv
def csv_view(request):
...
# Assume `rows` is an iterator or lists
def stream():
buffer_ = StringIO.StringIO()
writer = csv.writer(buffer_)
for row in rows:
writer.writerow(row)
buffer_.seek(0)
data = buffer_.read()
buffer_.seek(0)
buffer_.truncate()
yield data
response = StreamingHttpResponse(
stream(), content_type='text/csv'
)
disposition = "attachment; filename=file.csv"
response['Content-Disposition'] = disposition
return response
There's some documentation on how to output csv from Django but it doesn't take advantage of the StreamingHttpResponse so I went ahead and opened a ticket in order to track it.

The problem I was having was with the ConditionalGetMiddleware. I saw django-piston come up with a replacement middleware for the ConditionalGetMiddleware that allows streaming:
from django.middleware.http import ConditionalGetMiddleware
def compat_middleware_factory(klass):
"""
Class wrapper that only executes `process_response`
if `streaming` is not set on the `HttpResponse` object.
Django has a bad habbit of looking at the content,
which will prematurely exhaust the data source if we're
using generators or buffers.
"""
class compatwrapper(klass):
def process_response(self, req, resp):
if not hasattr(resp, 'streaming'):
return klass.process_response(self, req, resp)
return resp
return compatwrapper
ConditionalMiddlewareCompatProxy = compat_middleware_factory(ConditionalGetMiddleware)
So then you will replace ConditionalGetMiddleware with your ConditionalMiddlewareCompatProxy middleware, and in your view (borrowed code from a clever answer to this question):
def csv_view(request):
def data():
for i in xrange(10):
csvfile = StringIO.StringIO()
csvwriter = csv.writer(csvfile)
csvwriter.writerow([i,"a","b","c"])
yield csvfile.getvalue()
#create the reponse object with a csv mimetype
response = HttpResponse(
data(),
mimetype='text/csv',
)
#Set the response as an attachment with a filename
response['Content-Disposition'] = "attachment; filename=test.csv"
response.streaming = True
return response

How to extract PDF fields from a filled out form in Python?

I'm trying to use Python to processes some PDF forms that were filled out and signed using Adobe Acrobat Reader.
I've tried:
The pdfminer demo: it didn't dump any of the filled out data.
pyPdf: it maxed a core for 2 minutes when I tried to load the file with PdfFileReader(f) and I just gave up and killed it.
Jython and PDFBox: got that working great but the startup time is excessive, I'll just write an external utility in straight Java if that's my only option.
I can keep hunting for libraries and trying them but I'm hoping someone already has an efficient solution for this.
Update: Based on Steven's answer I looked into pdfminer and it did the trick nicely.
from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1, PDFObjRef
def load_form(filename):
"""Load pdf form contents into a nested list of name/value tuples"""
with open(filename, 'rb') as file:
parser = PDFParser(file)
doc = PDFDocument(parser)
return [load_fields(resolve1(f)) for f in
resolve1(doc.catalog['AcroForm'])['Fields']]
def load_fields(field):
"""Recursively load form fields"""
form = field.get('Kids', None)
if form:
return [load_fields(resolve1(f)) for f in form]
else:
# Some field types, like signatures, need extra resolving
return (field.get('T').decode('utf-16'), resolve1(field.get('V')))
def parse_cli():
"""Load command line arguments"""
parser = ArgumentParser(description='Dump the form contents of a PDF.')
parser.add_argument('file', metavar='pdf_form',
help='PDF Form to dump the contents of')
parser.add_argument('-o', '--out', help='Write output to file',
default=None, metavar='FILE')
parser.add_argument('-p', '--pickle', action='store_true', default=False,
help='Format output for python consumption')
return parser.parse_args()
def main():
args = parse_cli()
form = load_form(args.file)
if args.out:
with open(args.out, 'w') as outfile:
if args.pickle:
pickle.dump(form, outfile)
else:
pp = pprint.PrettyPrinter(indent=2)
file.write(pp.pformat(form))
else:
if args.pickle:
print(pickle.dumps(form))
else:
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(form)
if __name__ == '__main__':
main()

You should be able to do it with pdfminer, but it will require some delving into the internals of pdfminer and some knowledge about the pdf format (wrt forms of course, but also about pdf's internal structures like "dictionaries" and "indirect objects").
This example might help you on your way (I think it will work only on simple cases, with no nested fields etc...)
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
filename = sys.argv[1]
fp = open(filename, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog['AcroForm'])['Fields']
for i in fields:
field = resolve1(i)
name, value = field.get('T'), field.get('V')
print '{0}: {1}'.format(name, value)
EDIT: forgot to mention: if you need to provide a password, pass it to doc.initialize()

Python 3.6+:
pip install PyPDF2
# -*- coding: utf-8 -*-
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader
def _getFields(obj, tree=None, retval=None, fileobj=None):
"""
Extracts field data if this PDF contains interactive form fields.
The *tree* and *retval* parameters are for recursive use.
:param fileobj: A file object (usually a text file) to write
a report to on all interactive form fields found.
:return: A dictionary where each key is a field name, and each
value is a :class:`Field<PyPDF2.generic.Field>` object. By
default, the mapping name is used for keys.
:rtype: dict, or ``None`` if form data could not be located.
"""
fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
'/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
if retval is None:
retval = OrderedDict()
catalog = obj.trailer["/Root"]
# get the AcroForm tree
if "/AcroForm" in catalog:
tree = catalog["/AcroForm"]
else:
return None
if tree is None:
return retval
obj._checkKids(tree, retval, fileobj)
for attr in fieldAttributes:
if attr in tree:
# Tree is a field
obj._buildField(tree, retval, fileobj, fieldAttributes)
break
if "/Fields" in tree:
fields = tree["/Fields"]
for f in fields:
field = f.getObject()
obj._buildField(field, retval, fileobj, fieldAttributes)
return retval
def get_form_fields(infile):
infile = PdfFileReader(open(infile, 'rb'))
fields = _getFields(infile)
return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
if __name__ == '__main__':
from pprint import pprint
pdf_file_name = 'FormExample.pdf'
pprint(get_form_fields(pdf_file_name))

The Python PyPDF2 package (successor to pyPdf) is very convenient:
import PyPDF2
f = PyPDF2.PdfReader('form.pdf')
ff = f.get_fields()
Then ff is a dict that contains all the relevant form information.

Quick and dirty 2-minute job; just use PDFminer to convert PDF to xml and then grab all of the fields.
from xml.etree import ElementTree
from pprint import pprint
import os
def main():
print "Calling PDFDUMP.py"
os.system("dumppdf.py -a FILE.pdf > out.xml")
# Preprocess the file to eliminate bad XML.
print "Screening the file"
o = open("output.xml","w") #open for append
for line in open("out.xml"):
line = line.replace("&#", "Invalid_XML") #some bad data in xml for formatting info.
o.write(line)
o.close()
print "Opening XML output"
tree = ElementTree.parse('output.xml')
lastnode = ""
lastnode2 = ""
list = {}
entry = {}
for node in tree.iter(): # Run through the tree..
# Check if New node
if node.tag == "key" and node.text == "T":
lastnode = node.tag + node.text
elif lastnode == "keyT":
for child in node.iter():
entry["ID"] = child.text
lastnode = ""
if node.tag == "key" and node.text == "V":
lastnode2 = node.tag + node.text
elif lastnode2 == "keyV":
for child in node.iter():
if child.tag == "string":
if entry.has_key("ID"):
entry["Value"] = child.text
list[entry["ID"]] = entry["Value"]
entry = {}
lastnode2 = ""
pprint(list)
if __name__ == '__main__':
main()
It isn't pretty, just a simple proof of concept. I need to implement it for a system I'm working on so I will be cleaning it up, but I thought I would post it in case anyone finds it useful.

Update for latest version of pdf miner (change import and parser/doc setup in first function)
from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.pdftypes import PDFObjRef
def load_form(filename):
"""Load pdf form contents into a nested list of name/value tuples"""
with open(filename, 'rb') as file:
parser = PDFParser(file)
doc = PDFDocument(parser)
parser.set_document(doc)
#doc.set_parser(parser)
doc.initialize()
return [load_fields(resolve1(f)) for f in
resolve1(doc.catalog['AcroForm'])['Fields']]
def load_fields(field):
"""Recursively load form fields"""
form = field.get('Kids', None)
if form:
return [load_fields(resolve1(f)) for f in form]
else:
# Some field types, like signatures, need extra resolving
return (field.get('T').decode('utf-8'), resolve1(field.get('V')))
def parse_cli():
"""Load command line arguments"""
parser = ArgumentParser(description='Dump the form contents of a PDF.')
parser.add_argument('file', metavar='pdf_form',
help='PDF Form to dump the contents of')
parser.add_argument('-o', '--out', help='Write output to file',
default=None, metavar='FILE')
parser.add_argument('-p', '--pickle', action='store_true', default=False,
help='Format output for python consumption')
return parser.parse_args()
def main():
args = parse_cli()
form = load_form(args.file)
if args.out:
with open(args.out, 'w') as outfile:
if args.pickle:
pickle.dump(form, outfile)
else:
pp = pprint.PrettyPrinter(indent=2)
file.write(pp.pformat(form))
else:
if args.pickle:
print pickle.dumps(form)
else:
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(form)
if __name__ == '__main__':
main()

I created a library to do this:
pip install fillpdf
from fillpdf import fillpdfs
fillpdfs.get_form_fields("ex.pdf")
Credit to dvska's answer, for basis of library code.

There is a typo on these lines:
file.write(pp.pformat(form))
Should be:
outfile.write(pp.pformat(form))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Field name extraction from a pdf form - python

Related

Json2csv converter for Python 3

Python, ignore files with no Exif data

Understanding how to quote XML string in order to serialize using Python's ElementTree

Streaming a CSV file in Django

How to extract PDF fields from a filled out form in Python?

Categories

Resources