Convert PDF file to .txt python 3 - python

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
filepath = open(fname, 'rb')
for page in PDFPage.get_pages(filepath, pagenums):
interpreter.process_page(page)
filepath.close()
converter.close()
text = output.getvalue()
output.close
return text
def convertMultiple(pdfDir, txtDir):
if pdfDir == "": pdfDir = os.getcwd() + "\\" #if no pdfDir passed in
for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
fileExtension = pdf.split(".")[-1]
if fileExtension == "pdf":
pdfFilename = pdfDir + pdf
text = convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
#textFile.close
pdfDir = (r"FK_EPPS")
txtDir = (r"FK_txt")
convertMultiple(pdfDir, txtDir)
I tried to convert multiple pdf files called FK_EPPS into txt files and write it in different folder called FK_txt. But it says that there is no such files or directory. I put the folder exactly in those path. I try find the solution but still there is an error. Can you help me why this is happen?
/usr/local/lib/python2.7/dist-packages/pdfminer/__init__.py:20: UserWarning: On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For more information see https://github.com/pdfminer/pdfminer.six/issues/194
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
Traceback (most recent call last):
File "/home/a1-re/Documents/pdftotext/1.py", line 44, in <module>
convertMultiple(pdfDir, txtDir)
File "/home/a1-re/Documents/pdftotext/1.py", line 36, in convertMultiple
text = convert(pdfFilename) #get string of text content of pdf
File "/home/a1-re/Documents/pdftotext/1.py", line 21, in convert
filepath = file(fname, 'rb')
IOError: [Errno 2] No such file or directory: 'pdf1831150030.pdf'

(There is no way the traceback that you show is correct. With your sample input, the error should have contained FK_EPPS at the start.)
You forget that a path and filename must be separated from each other with the appropriate separator for your OS.
You could immediately have seen this if you had printed out the value of fname at the start of that convert function. You make the same mistake for the text output filename, but that would be harder to notice because it would not yield an error, but only create a wrong filename.

Related

Extracting email address, first name and last name from multiple PDF files within a folder

I am trying to extract the following information from all PDF files within a folder, the PDF files are CV's: Email Address, First Name, Last Name for a work project.
I have successfully managed to extract Email Addresses using this code:
from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import subprocess
from subprocess import call
import os
import re
working_directory = './folder'
file_list = [] # define file_list to save all dxf files
email_list = {} # define file_list to save all dxf files
for subdir, dirs, files in os.walk(working_directory):
for file in files:
if file.endswith('.pdf'):
file_list.append(file)
for input_file in file_list:
pagenums = set()
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open('./folder/' + input_file, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close()
match = re.search(r'[\w\.-]+#[a-z0-9\.-]+', text)
try:
email = match.group(0)
except AttributeError:
email = match
if email is None:
pass
else:
email_list.update({input_file: email})
print(email_list[input_file])
email_list
But have trouble extracting First and Last Names, any help would be appreciated!
You can find email information because there is logic behind it
match = re.search(r'[\w\.-]+#[a-z0-9\.-]+', text)
But also you have to figure out a logic to find out first and last names of your PDF files.
Maybe an specific field after Dear, for example

Redirect output of a function to another folder in python

I am using python 3. My code uses pdfminer to convert pdf to text. I want to get the output of these files in a new folder. Currently it's coming in the existing folder from which it does the conversion to .txt using pdfminer. How do I redirect the output to a different folder. I want the output in a folder called "D:\extracted_text" Code till now:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import glob
import os
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
savepath = 'D:/extracted_text/'
outfile = os.path.splitext(fname)[0] + '.txt'
comp_name = os.path.join(savepath,outfile)
print(outfile)
with open(comp_name, 'w', encoding = 'utf-8') as pdf_file:
pdf_file.write(text)
return text
directory = glob.glob(r'D:\files\*.pdf')
for myfiles in directory:
convert(myfiles)

how to convert pdf file to text file where the content is arabic

i have a python script that list all the pdf files in a specified directory and convert it to text files.
the system work perfect the problem is when i have ARABIC text the script crash because its unable to search in the PDF file.
i know that pdf are binary but do not know how to read ARABIC and convert it to text
how to fix this error ?
i tried to encode to UTF-8 and decode it but still not working
if i try the code with comment lines at the bottom of the code the result will be converted empty text file.
if i try to Uncomment the lines in order to encode and decode the result will be empty converted text file with this error:
Traceback (most recent call last): File
"C:\Users\test\Downloads\pdf-txt\text maker.py", line 63, in
content.decode('ascii', 'ignore') UnicodeEncodeError: 'ascii' codec can't encode character u'\xa9' in position 50: ordinal not in
range(128)
code:
import os
from os import chdir, getcwd, listdir, path
import codecs
import pyPdf
from time import strftime
def check_path(prompt):
''' (str) -> str
Verifies if the provided absolute path does exist.
'''
abs_path = raw_input(prompt)
while path.exists(abs_path) != True:
print "\nThe specified path does not exist.\n"
abs_path = raw_input(prompt)
return abs_path
print "\n"
folder = check_path("Provide absolute path for the folder: ")
list=[]
directory=folder
for root,dirs,files in os.walk(directory):
for filename in files:
if filename.endswith('.pdf'):
t=os.path.join(directory,filename)
##print(t)
##list.extend(t)
list.append(t)
## print(list)
m=len(list)
print (m)
i=0
while i<=m-1:
path=list[i]
print(path)
head,tail=os.path.split(path)
var="\\"
tail=tail.replace(".pdf",".txt")
name=head+var+tail
content = ""
# Load PDF into pyPDF
pdf = pyPdf.PdfFileReader(file(path, "rb"))
##pdf = pyPdf.PdfFileReader(codecs.open(path, "rb", encoding='UTF-8'))
# Iterate pages
for j in range(0, pdf.getNumPages()):
# Extract text from page and add to content
content += pdf.getPage(j).extractText() + "\n"
print strftime("%H:%M:%S"), " pdf -> txt "
f=open(name,'w')
## all this red line are added by me ##
##f.decode(content.encode('UTF-8'))
##content.encode('utf-8')
##content.decode('ascii', 'ignore')
##content.decode('unicode-escape')
##f.write(content)
f.write(content.encode('UTF-8'))
f.close
i=i+1
5 years later: pypdf has gone through major updates. Extracting Arabic text should work fine the standard way:
from pypdf import PdfReader
reader = PdfReader("arabic.pdf")
full_text = ""
for page in reader.pages:
full_text += page.extract_text() + "\n"
print(full_text)
If some PDF is causing issues, please report a bug. You need to share the pypdf version and the file that causes the issue.
What NOT to do
Don't do anything with encode / decode. It's not necessary.
Don't use PyPDF2 / PyPDF3 / PyPDF4: The community moved to pypdf.

How to read Arabic text from PDF using Python script

I have a code written in Python that reads from PDF files and convert it to text file.
The problem occurred when I tried to read Arabic text from PDF files. I know that the error is in the coding and encoding process but I don't know how to fix it.
The system converts Arabic PDF files but the text file is empty.
and display this error:
Traceback (most recent call last): File
"C:\Users\test\Downloads\pdf-txt\text maker.py", line 68, in
f.write(content) UnicodeEncodeError: 'ascii' codec can't encode character u'\xa9' in position 50: ordinal not in range(128)
Code:
import os
from os import chdir, getcwd, listdir, path
import codecs
import pyPdf
from time import strftime
def check_path(prompt):
''' (str) -> str
Verifies if the provided absolute path does exist.
'''
abs_path = raw_input(prompt)
while path.exists(abs_path) != True:
print "\nThe specified path does not exist.\n"
abs_path = raw_input(prompt)
return abs_path
print "\n"
folder = check_path("Provide absolute path for the folder: ")
list=[]
directory=folder
for root,dirs,files in os.walk(directory):
for filename in files:
if filename.endswith('.pdf'):
t=os.path.join(directory,filename)
list.append(t)
m=len(list)
print (m)
i=0
while i<=m-1:
path=list[i]
print(path)
head,tail=os.path.split(path)
var="\\"
tail=tail.replace(".pdf",".txt")
name=head+var+tail
content = ""
# Load PDF into pyPDF
pdf = pyPdf.PdfFileReader(file(path, "rb"))
# Iterate pages
for j in range(0, pdf.getNumPages()):
# Extract text from page and add to content
content += pdf.getPage(j).extractText() + "\n"
print strftime("%H:%M:%S"), " pdf -> txt "
f=open(name,'w')
content.encode('utf-8')
f.write(content)
f.close
i=i+1
You have a couple of problems:
content.encode('utf-8') doesn't do anything. The return value is the encoded content, but you have to assign it to a variable. Better yet, open the file with an encoding, and write Unicode strings to that file. content appears to be Unicode data.
Example (works for both Python 2 and 3):
import io
f = io.open(name,'w',encoding='utf8')
f.write(content)
If you don't close the file properly, you may see no content because the file is not flushed to disk. You have f.close not f.close(). It's better to use with, which ensures the file is closed when the block exits.
Example:
import io
with io.open(name,'w',encoding='utf8') as f:
f.write(content)
In Python 3, you don't need to import and use io.open but it still works. open is equivalent. Python 2 needs the io.open form.
you can use anthor library called pdfplumber instead of using pypdf or PyPDF2
import arabic_reshaper
from bidi.algorithm import get_display
with pdfplumber.open(r'example.pdf') as pdf:
my_page = pdf.pages[10]
thepages=my_page.extract_text()
reshaped_text = arabic_reshaper.reshape(thepages)
bidi_text = get_display(reshaped_text)
print(bidi_text)

convert several files with pdfminer

I've found code online which allows to convert several pdf files to text files, using the pdfminer module in Python. I tried to expand the code for several pdf files which I've saved in a directory, but the code results in an error.
My code so far:
import nltk
import re
import glob
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = file(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
with open('D:\Reports\*.txt', 'w') as pdf_file:
pdf_file.write(text)
return text
directory = glob.glob('D:\Reports\*.pdf')
for myfiles in directory:
convert(myfiles)
The error message:
Traceback (most recent call last):
File "F:/Text mining/pdfminer for several files", line 40, in <module>
convert(myfiles)
File "F:/Text mining/pdfminer for several files", line 32, in convert
with open('D:\Reports\*.txt', 'w') as pdf_file:
IOError: [Errno 22] invalid mode ('w') or filename: 'D:\\Reports\\*.txt'
The error stems from attempting to write the contents of the text variable to a file that is named 'D:\Reports\*.txt'. The wildcard * is not allowed in a filename (ref).
If you want to save the file to a text file with the same name, you could replace your writing functionality with:
outfile = os.path.splitext(os.path.abspath(fname))[0] + '.txt'
with open(outfile, 'wb') as pdf_file:
pdf_file.write(text)
Do not forget to import os if you want to process paths in an OS agnostic way.
probably you should just change:
with open('D:\Reports\*.txt', 'w') as pdf_file:
pdf_file.write(text)
to
with open(fname, 'w') as pdf_file:
pdf_file.write(text)
but I do not have python2.7-3.4 on my machine available to verify

Categories