Extract text from PDF files in a directory using PyPDF2 - python

I would like to extract the text from the PDFs in one directory into text files in another directory (convert PDF => .txt) with PyPDF2.
I have read the information here : https://automatetheboringstuff.com/chapter13/
But I did not find information on batch converting the files
import PyPDF2
# Open the PDF in binary mode; the file object is handed to the reader below.
pdfFileObj = open('meetingminutes.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# Bare expression: it only displays a value when typed at an interactive prompt.
pdfReader.numPages
# Fetch the page at index 0 and extract its text (the result is not stored).
pageObj = pdfReader.getPage(0)
pageObj.extractText()
I am looking for a solution that converts the PDF files in one directory to .txt files with the same names in another directory.

You can take a look at the following code
import os
import PyPDF2
# Folder that holds the source PDFs (replace the placeholder with a real absolute path).
PDFS_FOLDER = '/absolute/path/of/your/pdf/folder'
# Folder that receives the .txt files; per the placeholder text it must already exist.
TEXTS_FOLDER = '/absolute/path/of/your/txt/folder/which/is/already/created'
def get_all_pdfs(folder_path):
    """
    List the PDF files stored directly inside a folder.

    :param folder_path: absolute folder path of the pdfs
    :return: a sorted list with the file names (not the full paths) of the
        entries whose name ends in ``.pdf``, compared case-insensitively
    """
    # Anything that is not a PDF would make the PDF reader fail later on,
    # so only entries carrying the .pdf extension are returned.
    return sorted(
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith('.pdf')
    )
def create_absolute_path(root_path, file_name):
    """
    Join a folder path and a file name with exactly one '/' between them.

    :param root_path: absolute root path, with or without a trailing '/';
        an empty string is accepted and yields ``file_name`` unchanged
    :param file_name: file name
    :return: absolute path of the file name
    """
    # Testing with endswith() instead of indexing [-1] keeps an empty
    # root_path from raising IndexError.
    if root_path and not root_path.endswith('/'):
        root_path += '/'
    return root_path + file_name
def convert_pdf_to_text(pdf_path):
    """
    Extract the text of every page of a PDF file.

    :param pdf_path: path of the PDF file to read
    :return: bytearray with the UTF-8 encoded text of all pages, in page order
    """
    text_byte_array = bytearray()
    # The context manager closes the PDF once all pages have been read,
    # so converting many files does not accumulate open handles.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(pdf_reader.getNumPages()):
            page_text = pdf_reader.getPage(page_number).extractText()
            text_byte_array.extend(page_text.encode('utf-8'))
    return text_byte_array
def convert_pdf_extension_to_text(pdf_file_name):
    """
    Build the name of the text file that corresponds to a PDF file name.

    :param pdf_file_name: string which contains a pdf file name
    :return: string with the filename but with .txt extension instead of .pdf;
        a name that does not end in .pdf simply gets .txt appended
    """
    # Only the final extension is replaced, so a '.pdf' occurring earlier in
    # the name (e.g. 'my.pdf.v2.pdf') no longer truncates the result, and an
    # upper-case '.PDF' is recognised as well.
    stem, extension = os.path.splitext(pdf_file_name)
    if extension.lower() != '.pdf':
        stem = pdf_file_name
    return "%s.txt" % stem
def save_text_to_folder(text, target_path):
    """
    Write binary content to a file, replacing whatever the file held before.

    :param text: bytes or bytearray to store
    :param target_path: path of the file to write (opened in 'wb' mode)
    :return: None
    """
    with open(target_path, 'wb') as output_file:
        output_file.write(text)
if __name__ == '__main__':
    # Convert every entry returned for PDFS_FOLDER into a text file of the
    # same base name inside TEXTS_FOLDER.
    for source_name in get_all_pdfs(PDFS_FOLDER):
        source_path = create_absolute_path(PDFS_FOLDER, source_name)
        destination_name = convert_pdf_extension_to_text(source_name)
        destination_path = create_absolute_path(TEXTS_FOLDER, destination_name)
        save_text_to_folder(convert_pdf_to_text(source_path), destination_path)

Related

How can I convert a PureWindowsPath to an Iterable?

I'm working on an academy project to encrypt some files. I have managed to encrypt all the files in one folder, but when that folder contains another folder I get errors, so I decided to first list all the files and sub-directories of the folder:
ROOT = r"C:\Users\Practiques\Desktop\archivos"
# Print the full path of every file found anywhere below ROOT.
for folder, _subfolders, file_names in os.walk(ROOT):
    for file_name in file_names:
        print(PurePath(folder, file_name))
With this code I get the paths in that form: C:\Users\XXX\Desktop\archivos\external-content.duckduckgo.com.jpg
C:\Users\XXX\Desktop\archivos\hola.txt
and then when i try to pass to the function 'encrypt', i get this error:
TypeError: 'PureWindowsPath' object is not iterable
The format I need to pass to the function is this: ['C:\Users\XXX\Desktop\archivos\external-content.duckduckgo.com.jpg', 'C:\Users\XXX\Desktop\archivos\hola.txt', etc.]
I think one possible solution is to build a list as I collect all the recursive paths and their files, but I don't know how to do that.
The function encrypt:
def encrypt(items, key):
    """
    Encrypt files in place with Fernet.

    :param items: iterable of file paths; each file is read, encrypted and
        then overwritten with the encrypted bytes
    :param key: key handed to ``Fernet``
    """
    cipher = Fernet(key)
    for file_path in items:
        with open(file_path, 'rb') as source:
            plaintext = source.read()
        # Encrypt before reopening for writing, so the file is only
        # truncated once the encrypted bytes are available.
        ciphertext = cipher.encrypt(plaintext)
        with open(file_path, 'wb') as target:
            target.write(ciphertext)
How i call it:
for path, subdirs, files in os.walk(ROOT):
for name in files:
pure_path = PurePath(path, name)
print (pure_path)
# NOTE: encrypt() loops over its first argument, but a single PurePath is passed here rather than a list of paths.
encrypt(pure_path, key)
You need to use recursion to encrypt the sub-folders' contents:
import os
def recursive_search(path: str) -> "list[str]":
    """Collect every file below a directory, descending into sub-directories.

    :param path: absolute path of the directory to search
    :type path: str
    :return: full paths of all files found under ``path``
    :rtype: list[str]
    :raises RuntimeError: if ``path`` is not a directory
    """
    if not os.path.isdir(path):
        raise RuntimeError(f"'{path}' is not a directory")
    collected = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isdir(candidate):
            # Recurse, then splice the sub-directory's files into the result.
            collected += recursive_search(candidate)
        elif os.path.isfile(candidate):
            collected.append(candidate)
    return collected
# Replace the placeholder with a real directory; recursive_search() raises RuntimeError for anything that is not one.
directory = "YOUR ROOT/TOP-LEVEL DIRECTORY HERE"
print(recursive_search(directory))
Then, you would do:
# Overwrite every file found below `directory` with its Fernet-encrypted content.
cipher = Fernet(key)
for target in recursive_search(directory):
    with open(target, 'rb') as reader:
        plaintext = reader.read()
    ciphertext = cipher.encrypt(plaintext)
    with open(target, 'wb') as writer:
        writer.write(ciphertext)
Edit 1: in regard to skipping over certain file extensions:
import os
def recursive_search(path: str, skip_extensions: "tuple[str, ...]" = (".ini",)) -> "list[str]":
    """Collect every file below a directory, leaving out unwanted extensions.

    :param path: absolute path of the directory to search
    :type path: str
    :param skip_extensions: extensions (with the leading dot, e.g. ``".ini"``)
        of the files to leave out; a single string is accepted as well
    :type skip_extensions: tuple[str, ...]
    :return: a list of all files whose extension is not skipped
    :rtype: list[str]
    :raises RuntimeError: if ``path`` is not a directory
    """
    if not os.path.isdir(path):
        raise RuntimeError(f"'{path}' is not a directory")
    if isinstance(skip_extensions, str):
        # A bare string would turn the membership test below into a
        # substring search, so wrap it into a one-element tuple.
        skip_extensions = (skip_extensions,)
    found_files = []
    for item in os.listdir(path):
        full_path = os.path.join(path, item)
        if os.path.isfile(full_path):
            dot_extension = os.path.splitext(full_path)[1]  # ex.: '.txt'
            if dot_extension in skip_extensions:
                continue  # skip this file and move on to the next item
            found_files.append(full_path)
        elif os.path.isdir(full_path):
            found_files.extend(recursive_search(full_path, skip_extensions))
    return found_files
directory = "/Users/nicholasbarrow/GitHub/com.nicholasrbarrow.cpp"
print(recursive_search(directory))
# Overwrite every file returned by the search with its Fernet-encrypted content.
cipher = Fernet(key)
for target in recursive_search(directory):
    with open(target, 'rb') as reader:
        plaintext = reader.read()
    ciphertext = cipher.encrypt(plaintext)
    with open(target, 'wb') as writer:
        writer.write(ciphertext)

Converting multiple PDF files into txt in Python?

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for filename in os.listdir("C:/117"):
path = os.path.join("C:/117/", filename)
print(path)
# NOTE: the output name is the fixed literal 'file.txt', so no separate text file is created per PDF.
with open('file.txt', 'w', encoding='utf-8') as file:
# NOTE: a new PdfFileReader is constructed for the page count and again for every page fetched.
for page_num in range(PdfFileReader(path).numPages):
print('Page: {0}'.format(page_num))
pageObj = PdfFileReader(path).getPage(page_num)
try:
txt = pageObj.extractText()
# NOTE: the bare except silently skips any page whose extraction raises.
except:
pass
else:
file.write('Page{0}\n'.format(page_num+1))
file.write(txt)
# NOTE(review): 'file' is managed by the with-statement above, so this explicit close() looks redundant.
file.close()
I am converting hundreds of PDF files into txt. However, with this code, all the PDFs are merged into a single txt file. Is there a way to create a separate txt file for each PDF I convert? Thanks
from pathlib import Path
from pypdf import PdfReader
def convert_pdf_to_text(path: Path) -> str:
    """Return the text of every page of the PDF at *path*, each page followed by a newline."""
    return "".join(page.extract_text() + "\n" for page in PdfReader(path).pages)
for path in Path("Documents").glob("**/*.pdf"):
    # The text file sits next to the PDF; only the final suffix is swapped.
    txt_path = path.with_suffix(".txt")
    # Check before extracting, so already-converted PDFs are not parsed again.
    if txt_path.exists():
        print(f"Skip {txt_path} as it already exists")
        continue
    text = convert_pdf_to_text(path)
    # An explicit encoding keeps the write independent of the platform's
    # default, which may be unable to represent some extracted characters.
    with open(txt_path, "wt", encoding="utf-8") as fp:
        fp.write(text)

Sorting PDF reports into proper directories "TypeError: expected str, bytes or os.PathLike object, not list"

I am currently getting an error:
line 24, in <module> pdfFileobj = open(pdfFiles, 'rb') TypeError: expected str, bytes or os.PathLike object, not list"
Trying to automate part of my job.
At my job I am constantly creating new PDF reports for clients.
My goal is to sort all of the PDF reports in my download directory and parse each report for three pieces of data: first name, last name and report type. Then I need to compare that data to the appointments on my shared Outlook calendar to get the date of the client's appointment. Next I need to move the reports to our clients directory on the shared drive, creating a client-specific sub-directory if it does not exist. Lastly, I need to rename the reports in this format: LastnameDD-MM-YY Firstname Report type
import os
import winreg
import PyPDF2 as p2
import glob
def get_download_path():
    """Return the default downloads path for Linux or Windows.

    :return: on Windows, the value stored under the downloads GUID in the
        current user's "Shell Folders" registry key; on every other platform,
        ``~/Downloads`` with ``~`` expanded to the user's home directory
    """
    if os.name != 'nt':
        # The conventional folder name is capitalised; a lower-case
        # 'downloads' does not resolve on case-sensitive file systems.
        return os.path.join(os.path.expanduser('~'), 'Downloads')
    sub_key = r'SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders'
    downloads_guid = '{374DE290-123F-4565-9164-39C4925E467B}'
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER, sub_key) as key:
        return winreg.QueryValueEx(key, downloads_guid)[0]
# Make the downloads folder the working directory so the relative glob below searches it.
os.chdir(get_download_path())
# NOTE: glob.glob() already returns a list, so the extra brackets make pdfFiles a list that contains one list.
pdfFiles = [glob.glob("*.pdf")]
pdfs = []
while pdfFiles:
# NOTE: open() receives the whole pdfFiles list here, which is what raises the TypeError quoted above.
pdfFileobj = open(pdfFiles, 'rb')
pdfReader = p2.PdfFileReader(pdfFileobj)
pdfFiles.pop(-1)
First of all, glob.glob(str) returns a list, so the extra square brackets you added are unnecessary.
Second, open() takes in a str, bytes or os.PathLike object as an argument, not a list.
Change this part:
# Failing version: open() is handed the pdfFiles list rather than a single path.
while pdfFiles:
pdfFileobj = open(pdfFiles, 'rb')
pdfReader = p2.PdfFileReader(pdfFileobj)
pdfFiles.pop(-1)
to:
for pdf_path in pdfFiles:
    # Open one path at a time; the context manager closes each file again
    # instead of leaving a handle open per PDF.
    with open(pdf_path, 'rb') as pdfFileobj:
        pdfReader = p2.PdfFileReader(pdfFileobj)
        # Any further use of pdfReader belongs inside this block, while the file is still open.
(also note that pop() by default removes the -1 index, so you don't have to pass in the argument)
All together:
import os
import winreg
import PyPDF2 as p2
import glob
def get_download_path():
    """Return the default downloads path for Linux or Windows.

    :return: on Windows, the value stored under the downloads GUID in the
        current user's "Shell Folders" registry key; on every other platform,
        ``~/Downloads`` with ``~`` expanded to the user's home directory
    """
    if os.name != 'nt':
        # The conventional folder name is capitalised; a lower-case
        # 'downloads' does not resolve on case-sensitive file systems.
        return os.path.join(os.path.expanduser('~'), 'Downloads')
    sub_key = r'SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders'
    downloads_guid = '{374DE290-123F-4565-9164-39C4925E467B}'
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER, sub_key) as key:
        return winreg.QueryValueEx(key, downloads_guid)[0]
# Work inside the downloads folder so the relative glob pattern searches it.
os.chdir(get_download_path())
pdfFiles = glob.glob("*.pdf")
pdfs = []
for pdf_path in pdfFiles:
    # The context manager closes each PDF again instead of leaking one open handle per file.
    with open(pdf_path, 'rb') as pdfFileobj:
        pdfReader = p2.PdfFileReader(pdfFileobj)
        # Any further use of pdfReader belongs inside this block, while the file is still open.
____________________________ UPDATED FOR LOOPING: ____________________________
import os
import winreg
import PyPDF2 as p2
import glob
from time import sleep
def get_download_path():
    """Return the default downloads path for Linux or Windows.

    :return: on Windows, the value stored under the downloads GUID in the
        current user's "Shell Folders" registry key; on every other platform,
        ``~/Downloads`` with ``~`` expanded to the user's home directory
    """
    if os.name != 'nt':
        # The conventional folder name is capitalised; a lower-case
        # 'downloads' does not resolve on case-sensitive file systems.
        return os.path.join(os.path.expanduser('~'), 'Downloads')
    sub_key = r'SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders'
    downloads_guid = '{374DE290-123F-4565-9164-39C4925E467B}'
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER, sub_key) as key:
        return winreg.QueryValueEx(key, downloads_guid)[0]
# Work inside the downloads folder so the relative glob pattern searches it.
os.chdir(get_download_path())
while True:
    # Re-scan on every pass so PDFs that arrived since the last pass are picked up.
    pdfFiles = glob.glob("*.pdf")
    pdfs = []
    for pdf_path in pdfFiles:
        # The context manager closes each PDF; without it every pass of this
        # endless loop would leave more file handles open.
        with open(pdf_path, 'rb') as pdfFileobj:
            pdfReader = p2.PdfFileReader(pdfFileobj)
            # Any further use of pdfReader belongs inside this block, while the file is still open.
    sleep(300)  # stop the program for 300 seconds

Extracting text with textract from S3 bucket

I am trying to retrieve a .doc file from a s3 bucket and use textract to read its text. In order to do so, I created these two functions:
def process_files(filepath):
    """
    Extract the text of one S3 object, choosing the converter by extension.

    :param filepath: object key appended to ``s3://<bucket_name>/``
    :return: result of ``pdf_to_string`` for ``.pdf``, of ``doc_to_string``
        for ``.doc``, and ``None`` for any other extension
    """
    s3 = s3fs.S3FileSystem()
    filename = 's3://' + bucket_name + '/' + filepath
    extension = os.path.splitext(filename)[1]
    if extension == '.pdf':
        return pdf_to_string(s3, filename)
    if extension == '.doc':
        return doc_to_string(s3, filename)
    return None
# Prints the file name and a listing obtained from s3_file, then returns textract's output for filename.
def doc_to_string(s3_file, filename):
"""
convert an .doc or .docs file into string
"""
print(filename)
print(s3_file.ls('/myname/test_files/*'))
# NOTE(review): the caller passes an 's3://...' URL as filename; the error quoted below suggests textract expects a path it can open directly - confirm.
text = textract.process(filename)
return text
However, I am getting the error:
Is this the right path/to/file/you/want/to/extract.doc
Therefore I changed my code in order to change the path:
# Second attempt: the result of s3_file.ls() is handed to textract instead of filename.
def doc_to_string(s3_file, filename):
"""
convert an .doc or .docs file into string
"""
# NOTE(review): the error quoted below asks for a str, bytes or path-like value, so ls() presumably returns a list here - confirm.
text = textract.process(s3_file.ls('/myname/test_files/*'))
return text
But I get:
Path should be string bytes or os.pathlike

Convert a PDF files to TXT files

I need a last touch from an expert! I want to convert all the PDF files in a directory to txt files. I wrote code that creates empty txt files with the same names as the PDF files, and code that converts a single PDF to txt, but I want to convert all the files in the directory. Please see the code below:
PS : I Already tried with PDFminer, and every other package and it does not work
import pandas as pd
import os
import PyPDF2
###Create empty txt files Named as pdf files ###########
path = '....\\PDF2Text\\PDF\\'
newpath = '....\\PDF2Text\\Text\\'
# Gather every file below `path` whose name contains '.pdf'.
files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path)
    for name in names
    if '.pdf' in name
]
for pdf_path in files:
    # Swap '.pdf' for '.txt' and the `path` prefix for `newpath`.
    extpath = pdf_path.replace('.pdf', '.txt').replace(path, newpath)
    # Opening in "w+" mode creates (or empties) the file; nothing is written to it here.
    open(extpath, "w+").close()
    print(extpath)
##Here we Convert a single pdf file to a txt file providing pdf path and empty txt path #####
import PyPDF2
# Writes the text of every page of pdfFile to a fixed output file, then reads a text file back
# and returns its content with newlines replaced by spaces.
def getPDFFileContentToTXT(pdfFile):
myPDFFile = PyPDF2.PdfFileReader(pdfFile)
with open('....\\PDF2Text\\Text\\blabla.txt', 'w') as pdf_output:
for page in range (myPDFFile.getNumPages()):
data = myPDFFile.getPage(page).extractText()
pdf_output.write(data)
# NOTE(review): this path starts with five dots while the one written above starts with four - confirm both are meant to be the same file.
with open('.....\\PDF2Text\\Text\\blabla.txt', 'r') as myPDFContent:
return myPDFContent.read().replace('\n',' ')
# Example call for a single, hard-coded PDF.
pdfFileContent = getPDFFileContentToTXT('.....\\PDF2Text\\PDF\\blabla.pdf')
import pandas as pd
import os
import PyPDF2
#Create empty txt files Named as pdf files
path = 'C:\\PDF2Text\\PDF\\'
newpath = 'C:\\PDF2Text\\Text\\'

# Collect the PDFs below `path`. Matching on the real extension keeps out
# names that merely contain '.pdf' (for example 'a.pdf.bak').
files = []
for root, _dirs, names in os.walk(path):
    for name in names:
        if name.lower().endswith('.pdf'):
            files.append(os.path.join(root, name))

for pdf_path in files:
    # Mirror the location below `path` under `newpath`, swapping only the
    # final extension; str.replace() would also rewrite a '.pdf' occurring
    # anywhere else in the path.
    relative_stem = os.path.splitext(os.path.relpath(pdf_path, path))[0]
    txtpath = os.path.join(newpath, relative_stem + '.txt')
    # PDFs found in sub-folders need the matching output sub-folder to exist.
    os.makedirs(os.path.dirname(txtpath), exist_ok=True)
    print(pdf_path)
    print(txtpath)
    # Opening the text file for writing creates it, so no separate step is
    # needed to make an empty file first. Both files are closed on exit, and
    # the explicit encoding avoids failures on text the platform default
    # cannot represent.
    with open(pdf_path, 'rb') as pdf_input, \
            open(txtpath, 'w', encoding='utf-8') as pdf_output:
        myPDFFile = PyPDF2.PdfFileReader(pdf_input)
        for page in range(myPDFFile.getNumPages()):
            pdf_output.write(myPDFFile.getPage(page).extractText())
Have you tried Tika? Just do a pip install tika (also need to have Java 7+ installed on your system) and maybe this is the piece of code you want:
import os
from tika import parser
def read_pdf(pdf_file):
    """
    Extract the text of a PDF with Tika.

    pdf_file: path of the PDF handed to ``parser.from_file``
    returns: the extracted text encoded as UTF-8 bytes; empty bytes when
        Tika reports no content
    """
    text = parser.from_file(pdf_file)['content']
    # The content entry can be None (for instance when no text could be
    # extracted); fall back to an empty string instead of failing on .encode().
    return (text or '').encode('utf-8')
def pdf_to_txt(folder_with_pdf, dest_folder):
    """
    Convert every PDF below a folder into a .txt file in another folder.

    folder_with_pdf: path to your pdf's (sub-folders are searched as well)
    dest_folder: path where you want .txt files saved; created when missing

    All outputs land directly in dest_folder, so PDFs from different
    sub-folders that share a base name overwrite one another.
    """
    os.makedirs(dest_folder, exist_ok=True)
    # Match on the real extension: a substring test would also pick up names
    # such as 'notes.pdf.bak'.
    pdf_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder_with_pdf)
        for name in names
        if name.lower().endswith('.pdf')
    ]
    for pdf_path in pdf_files:
        text_file = os.path.splitext(os.path.basename(pdf_path))[0] + '.txt'
        # Extract first, so a failing PDF does not leave an empty .txt behind.
        content = read_pdf(pdf_path)
        with open(os.path.join(dest_folder, text_file), 'wb') as text_f:
            text_f.write(content)
pdf_to_txt('./pdf_folder', './txt_folder') # the .txt files are written into ./txt_folder
Aside: if PDF files in different sub-directories of ./pdf_folder happen to have the same name (but different content), then you will lose one (or more) .txt files.

Categories