How to convert pdf to HTML using python pdfminer? - python

Is there any code snippet that will work? I have tried this for converting pdf to html
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import os
import contextlib
import tempfile
# Question snippet, kept exactly as posted: the traceback quoted after it points at
# the `device = converter(...)` statement below.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
# NOTE(review): `format` is not assigned anywhere in this snippet; unless it is
# defined elsewhere, the name refers to the built-in function.
converter = HTMLConverter if format == 'html' else TextConverter
# NOTE(review): these are non-raw string literals, so "\f" is a form-feed escape.
out_file = "A:\folder"
in_file = "A:\folder\pyhtml.html"
pdf_filename = 'insurance.pdf'
# out_file is a str at this point, not an open file object.
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
# NOTE(review): the other snippets on this page call
# PDFPage.get_pages(<open file>, <page numbers>, ...); the arguments here differ,
# and the return value is not used - confirm against the pdfminer documentation.
PDFPage.get_pages(rsrcmgr, device, in_file, pagenos=[1], maxpages=1)
# NOTE(review): the body of this `with` statement is not indented in this paste.
with contextlib.closing(tempfile.NamedTemporaryFile(mode='r', suffix='.xml')) as xmlin:
# Command line for the external `pdftohtml` tool; the second "%s" is the temporary
# file's name with everything from its last "." removed.
cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (
pdf_filename, xmlin.name.rpartition('.')[0])
# The command's stdout and stderr are redirected to /dev/null.
os.system(cmd + " >/dev/null 2>&1")
# NOTE(review): the temporary file is opened in text mode ('r'); confirm that the
# value returned by read() supports .decode() on the Python version in use.
result = xmlin.read().decode('utf-8')
When I run the above code, it gives me the following error:
Traceback (most recent call last):
File "a:\folder\new - Copy.py", line 14, in <module>
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
AttributeError: 'str' object has no attribute 'write'

AttributeError: 'str' object has no attribute 'write'
The attempt to call .write means that you should provide a writable file handle rather than a str. You might use with open(...), which will take care of closing the file for you, as follows. Replace
in_file = "A:\folder\pyhtml.html"
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
using
# Raw string: in a normal string literal the "\f" in this path is a form-feed escape.
in_file = r"A:\folder\pyhtml.html"
# Binary mode, because the converter encodes its output when a codec is given.
# Keep the page-processing calls inside this block so the file is still open.
with open(in_file, "wb") as out_file:
    device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
If you want to know more about open, read the Built-in Functions docs.

Related

Redirect output of a function to another folder in python

I am using python 3. My code uses pdfminer to convert pdf to text. I want to get the output of these files in a new folder. Currently it's coming in the existing folder from which it does the conversion to .txt using pdfminer. How do I redirect the output to a different folder. I want the output in a folder called "D:\extracted_text" Code till now:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import glob
import os
def convert(fname, pages=None, savepath='D:/extracted_text/'):
    """Extract the text of a PDF and save it as a ``.txt`` file in *savepath*.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers; a falsy value is passed
            to ``PDFPage.get_pages`` as an empty set.
        savepath: Folder that receives the text file; created if missing.

    Returns:
        The extracted text.
    """
    pagenums = set(pages) if pages else set()

    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        # The with block closes the PDF even if a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        converter.close()
        text = output.getvalue()

    # fname may carry its own drive/folder. os.path.join() discards savepath
    # when its second argument is absolute, so keep only the file name.
    outfile = os.path.splitext(os.path.basename(fname))[0] + '.txt'
    comp_name = os.path.join(savepath, outfile)
    os.makedirs(savepath, exist_ok=True)
    print(outfile)
    with open(comp_name, 'w', encoding='utf-8') as pdf_file:
        pdf_file.write(text)
    return text
# Run the conversion for every PDF matched by the glob pattern.
directory = glob.glob(r'D:\files\*.pdf')
for pdf_path in directory:
    convert(pdf_path)

convert several files with pdfminer

I've found code online which allows to convert several pdf files to text files, using the pdfminer module in Python. I tried to expand the code for several pdf files which I've saved in a directory, but the code results in an error.
My code so far:
import nltk
import re
import glob
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
    """Extract the text of the PDF at *fname*, write it to a file and return it.

    pages: optional iterable of page numbers; a falsy value becomes an empty set.
    """
    pagenums = set(pages) if pages else set()

    buf = StringIO()
    resources = PDFResourceManager()
    device = TextConverter(resources, buf, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, device)

    pdf = file(fname, 'rb')
    for pdf_page in PDFPage.get_pages(pdf, pagenums):
        page_interpreter.process_page(pdf_page)
    pdf.close()
    device.close()

    text = buf.getvalue()
    buf.close  # attribute access only, exactly as posted (the method is not called)
    # Target path kept exactly as posted: this is the statement named in the
    # traceback quoted after the snippet.
    with open('D:\Reports\*.txt', 'w') as pdf_file:
        pdf_file.write(text)
    return text


# Run the conversion for every PDF matched by the glob pattern.
directory = glob.glob('D:\Reports\*.pdf')
for myfiles in directory:
    convert(myfiles)
The error message:
Traceback (most recent call last):
File "F:/Text mining/pdfminer for several files", line 40, in <module>
convert(myfiles)
File "F:/Text mining/pdfminer for several files", line 32, in convert
with open('D:\Reports\*.txt', 'w') as pdf_file:
IOError: [Errno 22] invalid mode ('w') or filename: 'D:\\Reports\\*.txt'
The error stems from attempting to write the contents of the text variable to a file that is named 'D:\Reports\*.txt'. The wildcard * is not allowed in a filename (ref).
If you want to save the file to a text file with the same name, you could replace your writing functionality with:
# Same folder and base name as the input PDF, with a .txt extension.
txt_base = os.path.splitext(os.path.abspath(fname))[0]
with open(txt_base + '.txt', 'wb') as pdf_file:
    pdf_file.write(text)
Do not forget to import os if you want to process paths in an OS agnostic way.
probably you should just change:
with open('D:\Reports\*.txt', 'w') as pdf_file:
pdf_file.write(text)
to
import os

# fname is the path of the source PDF; opening it with 'w' would overwrite the
# PDF itself, so write to a sibling file with a .txt extension instead.
with open(os.path.splitext(fname)[0] + '.txt', 'w') as pdf_file:
    pdf_file.write(text)
but I do not have python2.7-3.4 on my machine available to verify

Path not printing string values

I recently found this really handy library for pdf conversion. I am trying to convert a pdf to string values in order to parse the data and convert it to a csv file. I want to automate this for the future, so I cannot use Tabula.
I am calling some modules in order to convert pdf to string.
The part for string conversion is not working. (pdf2string.py)
Here is part for the pdf conversion to string.
I get no error. Success. But, there is no output.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import csv
import sys
def convert_pdf_to_html(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0 #is for all
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
print str
# Script entry point: acts only when exactly one command-line argument is given.
if __name__ == '__main__' and len(sys.argv) == 2:
    path = sys.argv[1]
    convert_pdf_to_html(path)
This is my bash.
python pdf2string.py example.pdf
Script is pdf2string.py and path is example.pdf.
I am also new to high-level logic in python.
Edit: you are returning before printing - remove return str, or remove print str and use the advice below.
You're not printing the output of convert_pdf_to_html(), or saving it somewhere.
print convert_pdf_to_html(path)

ClassFormatError: Invalid method Code length 85551 in class file pdfminer/glyphlist$py

I am running an acceptance test from Command Line, which internally calls pdfminer python script method for conversion of Pdf into Text. I have provided the PDF2TextLibrary which has the code to convert Pdf into text using pdfminer library.
But when I run the test, I get the error:
ClassFormatError: Invalid method Code length 85551 in class file pdfminer/glyphlist$py
I don't think you need to have a class if you are using only one function. You can save code and make it easier to read:
pdf2text.py
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and return it.

    Pages are rendered by a TextConverter (codec 'utf-8') into an in-memory
    buffer; the buffer's contents are the return value.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the PDF even if a page fails to process.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        device.close()
        textstr = retstr.getvalue()
    finally:
        # Release the buffer on both the success and the error path.
        retstr.close()
    return textstr
And this is how I use it:
*** Settings ***
Library pdf2text
*** Test Cases ***
pdfconvert
${pdftext}= Convert Pdf To txt <path_to_pdf>
>
The issue was resolved by dividing the file into smaller chunks. The reason was that the Java implementation has a limit of 64KB for a class file, and in my case the class was evaluating to a size of 446KB.

Convert PDF to Text - Keep rows of table - Python

I have tables in pdf documents that I want to convert to text. I found the following code which converts the pdf to text. However, when it converts, it does not keep the data in the correct rows. It places everything in one long line of string. Is there any way to preserve rows in a table when converting to text from PDF using Python?
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from cStringIO import StringIO
def convert_pdf(path):
    """Convert the PDF at *path* to text with process_pdf and return the result."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the PDF even if the conversion raises.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()
        # Named `text` so that the builtin `str` is not shadowed.
        text = retstr.getvalue()
    finally:
        # Release the buffer on both the success and the error path.
        retstr.close()
    return text
Pdfminer comes with text extraction tool called pdf2txt.py, which has the ability to analyze layouts. You can try using that, or study it to see how it works.
A-PDF to Text converts PDFs with tables better than other tools!

Categories