I have been going round in circles trying to get this to work and for the life of me I can not. I would greatly appricate it if someone could help me out with this. I am trying to create a python application that can scan a folder of documents and any sub folders for PDF files. It will then scan through all of the individual documents and look for a specific phrase. Once it has found this phrase it will add it to a .txt file with the document name and page number of the doucment. Once it is compelte the .txt file will be created and allow the user to see a report on which documents have got this phrase in them and where it is located.
I am using Python and PDFminer.six and Tkinter
So far my code is as follows.
`
import tkinter as tk
from tkinter import filedialog
import os
import re
from tqdm import tqdm
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
root = tk.Tk()
root.withdraw()
def extract_text_from_pdf(pdf_path):
resource_manager = PDFResourceManager()
device = PDFPageAggregator(resource_manager)
interpreter = PDFPageInterpreter(resource_manager, device)
laparams = LAParams()
with open(pdf_path, 'rb') as fh:
for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
interpreter.process_page(page)
layout = device.get_result()
for element in layout:
if element.get_text() and (element.get_text()).strip():
yield element.get_text(), element.bbox
# Function to scan through a folder and its subfolders
def scan_folder(folder_path, phrase):
# Create a dictionary to store the results
results = {}
# Iterate through the selected folder and its subfolders
for root, dirs, files in os.walk(folder_path):
for file in tqdm(files):
file_path = os.path.join(root, file)
# Check if the file is a pdf file
if file.endswith(".pdf"):
matches = []
for text, bbox, page_id in extract_text_from_pdf(file_path):
# Search for the specified phrase
match = re.search(phrase, text)
if match:matches.append({'word': phrase, 'location': bbox, 'page': page_id+1})
results[file_path] = matches
# prompt the user to select a location to save the text file
file_path = filedialog.asksaveasfilename(defaultextension=".txt",
initialfile="results.txt",
initialdir=folder_path)
# write the results to the selected text file
with open(file_path, 'w') as f:
for key, value in results.items():
for match in value:
f.write(key + " : " + match['word'] + " found at " + str(match['location']) + " on page " + str(match['page']) + '\n')
return results
# Example usage
folder_path = filedialog.askdirectory(initialdir = "C:/", title = "Select folder")
phrase = input("Enter the phrase you want to search for: ")
results = scan_folder(folder_path, phrase)`
But I keep running into problem after problem, the latest one being this
Exception has occurred: AttributeError 'LTCurve' object has no attribute 'get_text' File "C:\Users\Edward Baker\OneDrive - Folley Electrical\Desktop\Document scanning project\PDF_Scanner_New.py", line 31, in extract_text_from_pdf if element.get_text() and (element.get_text()).strip(): ^^^^^^^^^^^^^^^^ File "C:\Users\Edward Baker\OneDrive - Folley Electrical\Desktop\Document scanning project\PDF_Scanner_New.py", line 47, in scan_folder for text, bbox, page_id in extract_text_from_pdf(file_path): File "C:\Users\Edward Baker\OneDrive - Folley Electrical\Desktop\Document scanning project\PDF_Scanner_New.py", line 66, in <module> results = scan_folder(folder_path, phrase) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'LTCurve' object has no attribute 'get_text'
I would love some help on this little side project
Related
I am trying to extract the following information from all PDF files within a folder, the PDF files are CV's: Email Address, First Name, Last Name for a work project.
I have successfully managed to extract Email Addresses using this code:
from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import subprocess
from subprocess import call
import os
import re
working_directory = './folder'
file_list = [] # define file_list to save all dxf files
email_list = {} # define file_list to save all dxf files
for subdir, dirs, files in os.walk(working_directory):
for file in files:
if file.endswith('.pdf'):
file_list.append(file)
for input_file in file_list:
pagenums = set()
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open('./folder/' + input_file, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close()
match = re.search(r'[\w\.-]+#[a-z0-9\.-]+', text)
try:
email = match.group(0)
except AttributeError:
email = match
if email is None:
pass
else:
email_list.update({input_file: email})
print(email_list[input_file])
email_list
But have trouble extracting First and Last Names, any help would be appreciated!
You can find email information because there is logic behind it
match = re.search(r'[\w\.-]+#[a-z0-9\.-]+', text)
But also you have to figure out a logic to find out first and last names of your PDF files.
Maybe an specific field after Dear, for example
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
filepath = open(fname, 'rb')
for page in PDFPage.get_pages(filepath, pagenums):
interpreter.process_page(page)
filepath.close()
converter.close()
text = output.getvalue()
output.close
return text
def convertMultiple(pdfDir, txtDir):
if pdfDir == "": pdfDir = os.getcwd() + "\\" #if no pdfDir passed in
for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
fileExtension = pdf.split(".")[-1]
if fileExtension == "pdf":
pdfFilename = pdfDir + pdf
text = convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
#textFile.close
pdfDir = (r"FK_EPPS")
txtDir = (r"FK_txt")
convertMultiple(pdfDir, txtDir)
I tried to convert multiple pdf files called FK_EPPS into txt files and write it in different folder called FK_txt. But it says that there is no such files or directory. I put the folder exactly in those path. I try find the solution but still there is an error. Can you help me why this is happen?
/usr/local/lib/python2.7/dist-packages/pdfminer/__init__.py:20: UserWarning: On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For more information see https://github.com/pdfminer/pdfminer.six/issues/194
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
Traceback (most recent call last):
File "/home/a1-re/Documents/pdftotext/1.py", line 44, in <module>
convertMultiple(pdfDir, txtDir)
File "/home/a1-re/Documents/pdftotext/1.py", line 36, in convertMultiple
text = convert(pdfFilename) #get string of text content of pdf
File "/home/a1-re/Documents/pdftotext/1.py", line 21, in convert
filepath = file(fname, 'rb')
IOError: [Errno 2] No such file or directory: 'pdf1831150030.pdf'
(There is no way the traceback that you show is correct. With your sample input, the error should have contained FK_EPPS at the start.)
You forget that a path and filename must be separated from each other with the appropriate separator for your OS.
You could immediately have seen this if you had printed out the value of fname at the start of that convert function. You make the same mistake for the text output filename, but that would be harder to notice because it would not yield an error, but only create a wrong filename.
I am using python 3. My code uses pdfminer to convert pdf to text. I want to get the output of these files in a new folder. Currently it's coming in the existing folder from which it does the conversion to .txt using pdfminer. How do I redirect the output to a different folder. I want the output in a folder called "D:\extracted_text" Code till now:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import glob
import os
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
savepath = 'D:/extracted_text/'
outfile = os.path.splitext(fname)[0] + '.txt'
comp_name = os.path.join(savepath,outfile)
print(outfile)
with open(comp_name, 'w', encoding = 'utf-8') as pdf_file:
pdf_file.write(text)
return text
directory = glob.glob(r'D:\files\*.pdf')
for myfiles in directory:
convert(myfiles)
I am currently using the class provided in the answer here:
How to extract text and text coordinates from a pdf file?
The class provided is very helpful in that I can get the position of every text box in a PDF. The class given also inserts a '_' every time there is a new line within the textbox.
I was wondering whether there was some way to get the position of each line of text within the textbox as well?
Found it: The solution is to recurse even when there is a TextBox, until a textline is found. The class below should provide the x and y coordinates of every line of text on a pdf when the parsepdf method is called.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
class pdfPositionHandling:
def parse_obj(self, lt_objs):
# loop over the object list
for obj in lt_objs:
if isinstance(obj, pdfminer.layout.LTTextLine):
print "%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', '_'))
# if it's a textbox, also recurse
if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
self.parse_obj(obj._objs)
# if it's a container, recurse
elif isinstance(obj, pdfminer.layout.LTFigure):
self.parse_obj(obj._objs)
def parsepdf(self, filename, startpage, endpage):
# Open a PDF file.
fp = open(filename, 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
i = 0
# loop over all pages in the document
for page in PDFPage.create_pages(document):
if i >= startpage and i <= endpage:
# read the page into a layout object
interpreter.process_page(page)
layout = device.get_result()
# extract text from this object
self.parse_obj(layout._objs)
i += 1
I'm looking to extract texts from PDFs for a data-mining task.
The PDFs I'm looking at contain multiple reports, each report has its own first level entry in the documents table of contents. Also, there is a written table of contents at the beginning of the PDF, which contains page numbers for each report ("from page - to page").
I'm looking for a way to either:
Split the PDF into the individual reports, in order to dump each of those into a .txt file.
Dump each section of the PDF into a .txt directly.
So far, I have been able to dump to entire file into a .txt using PDFminer (python), as follows:
# Not all imports are needed for this task
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def myparse(data):
fp = file(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
#fp.close()
#device.close()
str = retstr.getvalue()
#retstr.close()
return str
t1 = myparse("part2.pdf")
text_file = open("part2.txt", "w")
text_file.write(t1)
text_file.close()
Also, this returns the entire structure of the table of contents:
# Open a PDF document.
fp = open('solar.pdf', 'rb')
parser = PDFParser(fp)
password = ""
document = PDFDocument(parser, password)
# Get the outlines of the document.
outlines = document.get_outlines()
for (level,title,dest,a,se) in outlines:
print (level, title, a)
Any idea how to go ahead from here? Any tools using python, R or bash would be easiest to use for me personally, but as long as it enables batch splitting based on the first outline level of the document, any solution would be great.
Thank you,
Matthias
I've found a straightforward solution for this using sejda-console:
from subprocess import call
import os
pdfname = "example.pdf"
outdir = "C:\\out\\%s" % pdfname
if not os.path.exists(outdir):
os.makedirs(outdir)
sejda = 'C:\\sejda\\bin\\sejda-console.bat'
call = sejda
call += ' splitbybookmarks'
call += ' --bookmarkLevel 1'
call += ' -f "%s"' % pdfname
call += ' -o "%s"' % outdir
print '\n', call
subprocess.call(call)
print "PDFs have been written to out-directory"
Abviously this requires the sejda programme: http://www.sejda.org/