I've found code online which allows to convert several pdf files to text files, using the pdfminer module in Python. I tried to expand the code for several pdf files which I've saved in a directory, but the code results in an error.
My code so far:
import nltk
import re
import glob
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = file(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
with open('D:\Reports\*.txt', 'w') as pdf_file:
pdf_file.write(text)
return text
directory = glob.glob('D:\Reports\*.pdf')
for myfiles in directory:
convert(myfiles)
The error message:
Traceback (most recent call last):
File "F:/Text mining/pdfminer for several files", line 40, in <module>
convert(myfiles)
File "F:/Text mining/pdfminer for several files", line 32, in convert
with open('D:\Reports\*.txt', 'w') as pdf_file:
IOError: [Errno 22] invalid mode ('w') or filename: 'D:\\Reports\\*.txt'
The error stems from attempting to write the contents of the text variable to a file that is named 'D:\Reports\*.txt'. The wildcard * is not allowed in a filename (ref).
If you want to save the file to a text file with the same name, you could replace your writing functionality with:
outfile = os.path.splitext(os.path.abspath(fname))[0] + '.txt'
with open(outfile, 'wb') as pdf_file:
pdf_file.write(text)
Do not forget to import os if you want to process paths in an OS agnostic way.
probably you should just change:
with open('D:\Reports\*.txt', 'w') as pdf_file:
pdf_file.write(text)
to
with open(fname, 'w') as pdf_file:
pdf_file.write(text)
but I do not have python2.7-3.4 on my machine available to verify
Related
I am trying to extract the following information from all PDF files within a folder, the PDF files are CV's: Email Address, First Name, Last Name for a work project.
I have successfully managed to extract Email Addresses using this code:
from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import subprocess
from subprocess import call
import os
import re
working_directory = './folder'
file_list = [] # define file_list to save all dxf files
email_list = {} # define file_list to save all dxf files
for subdir, dirs, files in os.walk(working_directory):
for file in files:
if file.endswith('.pdf'):
file_list.append(file)
for input_file in file_list:
pagenums = set()
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open('./folder/' + input_file, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close()
match = re.search(r'[\w\.-]+#[a-z0-9\.-]+', text)
try:
email = match.group(0)
except AttributeError:
email = match
if email is None:
pass
else:
email_list.update({input_file: email})
print(email_list[input_file])
email_list
But have trouble extracting First and Last Names, any help would be appreciated!
You can find email information because there is logic behind it
match = re.search(r'[\w\.-]+#[a-z0-9\.-]+', text)
But also you have to figure out a logic to find out first and last names of your PDF files.
Maybe an specific field after Dear, for example
Is there any code snippet that will work? I have tried this for converting pdf to html
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import os
import contextlib
import tempfile
rsrcmgr = PDFResourceManager()
laparams = LAParams()
converter = HTMLConverter if format == 'html' else TextConverter
out_file = "A:\folder"
in_file = "A:\folder\pyhtml.html"
pdf_filename = 'insurance.pdf'
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
PDFPage.get_pages(rsrcmgr, device, in_file, pagenos=[1], maxpages=1)
with contextlib.closing(tempfile.NamedTemporaryFile(mode='r', suffix='.xml')) as xmlin:
cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (
pdf_filename, xmlin.name.rpartition('.')[0])
os.system(cmd + " >/dev/null 2>&1")
result = xmlin.read().decode('utf-8')
when i run the above code it gives me following erroe
Traceback (most recent call last):
File "a:\folder\new - Copy.py", line 14, in <module>
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
AttributeError: 'str' object has no attribute 'write'
AttributeError: 'str' object has no attribute 'write'
If there is attempt of .write that means you should provide write-able file-handle rather than str, you might use with open... which will take care of closing file for you as follow, replace
in_file = "A:\folder\pyhtml.html"
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
using
in_file = "A:\folder\pyhtml.html"
with open(in_file, "w") as out_file:
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
If you want to know more about open read Built-in Functions docs
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
filepath = open(fname, 'rb')
for page in PDFPage.get_pages(filepath, pagenums):
interpreter.process_page(page)
filepath.close()
converter.close()
text = output.getvalue()
output.close
return text
def convertMultiple(pdfDir, txtDir):
if pdfDir == "": pdfDir = os.getcwd() + "\\" #if no pdfDir passed in
for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
fileExtension = pdf.split(".")[-1]
if fileExtension == "pdf":
pdfFilename = pdfDir + pdf
text = convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
#textFile.close
pdfDir = (r"FK_EPPS")
txtDir = (r"FK_txt")
convertMultiple(pdfDir, txtDir)
I tried to convert multiple pdf files called FK_EPPS into txt files and write it in different folder called FK_txt. But it says that there is no such files or directory. I put the folder exactly in those path. I try find the solution but still there is an error. Can you help me why this is happen?
/usr/local/lib/python2.7/dist-packages/pdfminer/__init__.py:20: UserWarning: On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For more information see https://github.com/pdfminer/pdfminer.six/issues/194
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
Traceback (most recent call last):
File "/home/a1-re/Documents/pdftotext/1.py", line 44, in <module>
convertMultiple(pdfDir, txtDir)
File "/home/a1-re/Documents/pdftotext/1.py", line 36, in convertMultiple
text = convert(pdfFilename) #get string of text content of pdf
File "/home/a1-re/Documents/pdftotext/1.py", line 21, in convert
filepath = file(fname, 'rb')
IOError: [Errno 2] No such file or directory: 'pdf1831150030.pdf'
(There is no way the traceback that you show is correct. With your sample input, the error should have contained FK_EPPS at the start.)
You forget that a path and filename must be separated from each other with the appropriate separator for your OS.
You could immediately have seen this if you had printed out the value of fname at the start of that convert function. You make the same mistake for the text output filename, but that would be harder to notice because it would not yield an error, but only create a wrong filename.
I am using python 3. My code uses pdfminer to convert pdf to text. I want to get the output of these files in a new folder. Currently it's coming in the existing folder from which it does the conversion to .txt using pdfminer. How do I redirect the output to a different folder. I want the output in a folder called "D:\extracted_text" Code till now:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import glob
import os
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
savepath = 'D:/extracted_text/'
outfile = os.path.splitext(fname)[0] + '.txt'
comp_name = os.path.join(savepath,outfile)
print(outfile)
with open(comp_name, 'w', encoding = 'utf-8') as pdf_file:
pdf_file.write(text)
return text
directory = glob.glob(r'D:\files\*.pdf')
for myfiles in directory:
convert(myfiles)
I'm looking to extract texts from PDFs for a data-mining task.
The PDFs I'm looking at contain multiple reports, each report has its own first level entry in the documents table of contents. Also, there is a written table of contents at the beginning of the PDF, which contains page numbers for each report ("from page - to page").
I'm looking for a way to either:
Split the PDF into the individual reports, in order to dump each of those into a .txt file.
Dump each section of the PDF into a .txt directly.
So far, I have been able to dump to entire file into a .txt using PDFminer (python), as follows:
# Not all imports are needed for this task
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def myparse(data):
fp = file(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
#fp.close()
#device.close()
str = retstr.getvalue()
#retstr.close()
return str
t1 = myparse("part2.pdf")
text_file = open("part2.txt", "w")
text_file.write(t1)
text_file.close()
Also, this returns the entire structure of the table of contents:
# Open a PDF document.
fp = open('solar.pdf', 'rb')
parser = PDFParser(fp)
password = ""
document = PDFDocument(parser, password)
# Get the outlines of the document.
outlines = document.get_outlines()
for (level,title,dest,a,se) in outlines:
print (level, title, a)
Any idea how to go ahead from here? Any tools using python, R or bash would be easiest to use for me personally, but as long as it enables batch splitting based on the first outline level of the document, any solution would be great.
Thank you,
Matthias
I've found a straightforward solution for this using sejda-console:
from subprocess import call
import os
pdfname = "example.pdf"
outdir = "C:\\out\\%s" % pdfname
if not os.path.exists(outdir):
os.makedirs(outdir)
sejda = 'C:\\sejda\\bin\\sejda-console.bat'
call = sejda
call += ' splitbybookmarks'
call += ' --bookmarkLevel 1'
call += ' -f "%s"' % pdfname
call += ' -o "%s"' % outdir
print '\n', call
subprocess.call(call)
print "PDFs have been written to out-directory"
Abviously this requires the sejda programme: http://www.sejda.org/