PyPDF to read each PDF in a folder

PyPDF to read each PDF in a folder - python

I have the code below which is working. But it only reads one file at a time, which I have to insert in the code. How could I make this code read every PDF in a directory? PDF by PDF.
import PyPDF2
import textract
import re
filename = 'file.pdf'
pdfFileObj = open(filename,'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
count = 0
extractedtext = ""
while count < num_pages:
pageObj = pdfReader.getPage(count)
count +=1
wortlaut += pageObj.extractText()
print(extractedtext)
keyword = re.findall(r'(\d{7})(\-)(\d{2})',extractedtext)
if keyword:
print(keyword)

You can use glob to get a list of PDF files in a directory.
You can also accept a command-line argument for the directory within which to operate.
Call this program with: python3 this_script.py directory_to_read
import PyPDF2
import glob
import os
import re
import sys
dir_to_read = sys.argv[1] # accept a command-line argument with the dir to read
pdf_files = glob.glob(os.path.join(dir_to_read,'*.pdf'))
count = 0
extractedtext = ""
for pdf_file in pdf_files:
print(pdf_file)
pdfFileObj = open(pdf_file,'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
print(num_pages)
while count < num_pages:
pageObj = pdfReader.getPage(count)
count +=1
extractedtext += pageObj.extractText()
print(extractedtext)
keyword = re.findall(r'(\d{7})(\-)(\d{2})',extractedtext)
if keyword:
print(keyword)

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
directory = "C:\\Users\\Docs\\"
fname = os.path.splitext(os.path.basename(pdf_path))[0]
print(fname)
object = PdfFileReader(pdf_path)
NumPages = object.getNumPages()
print(NumPages)
string = "Challenge 1:"
string2 = "Challenge 2:"
res=0
pageList=[]
for txt in range(0,NumPages):
pdf_writer = PdfFileWriter()
pageObject = object.getPage(txt)
Text = pageObject.extractText()
print(Text)
acc_pos = Text.find(string)
print(acc_pos)
Cur_pos = Text.find(string2)
print(Cur_pos)
loanAcctName = Text[acc_pos+12:Cur_pos]
print (loanAcctName)
# pageList.append(txt)
# print(pageList)
ReSearch = re.search(string, Text)
if ReSearch != None:
pageList.append(txt)
elif ReSearch ==None:
pageList.append(txt)
print(pageList)
res = res + 1
pdf_writer.addPage(object.getPage(page_num))
output_filename = '{}_page_{}.pdf'.format(loanAcctName,page + 1)
with open(output_filename, 'wb') as out:
pdf_writer.write(out)
print('Created: {}'.format(output_filename))
out.close()
res = res + 1
if __name__ == '__main__':
pdf_path = r"C:\Users\FY22.pdf"
pdf_splitter(pdf_path)

Separate large pdf document into small pages based on text

New to python scripting. I need to Separate a large PDF into individual pdfs based on the text I have tried.
def getPagebreakList(file_name: str)->list:
import os
from glob import glob
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf_file = PyPDF2.PdfFileReader(file_name)
num_pages = pdf_file.getNumPages()
page_breaks = list()
for i in range(0, num_pages):
Page = file.getPage(i)
Text = PageObj.extractText()
if re.search(r"tool used", Text):
page_breaks.append(i)
return page_breaks
inputpdf = PdfFileReader(open('couch.pdf', "rb"))
num_pages = inputpdf.numPages
page_breaks = getPagebreakList('yourPDF.pdf')
i = 0
while (i < num_pages):
if page_breaks:
page_break = page_breaks.pop(0)
else:
page_break = num_pages
output = PdfFileWriter()
while (i != page_break + 1):
output.addPage(inputpdf.getPage(i))
i = i + 1
with open("couch%s.pdf" % i, "wb") as outputStream:
output.write(outputStream)
I know I'm missing the print statements and I am getting the error
`syntaxError: 'return ' outside function``
Any assistance would be grateful.

Converting multiple PDF files into txt in Python?

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for filename in os.listdir("C:/117"):
path = os.path.join("C:/117/", filename)
print(path)
with open('file.txt', 'w', encoding='utf-8') as file:
for page_num in range(PdfFileReader(path).numPages):
print('Page: {0}'.format(page_num))
pageObj = PdfFileReader(path).getPage(page_num)
try:
txt = pageObj.extractText()
except:
pass
else:
file.write('Page{0}\n'.format(page_num+1))
file.write(txt)
file.close()
I am converting hundreds of pdf files into txt. However, with this code, all the PDFs are merged into a single txt file. Is there a way to create separate txt file for each PDF I convert? Thanks

from pathlib import Path
from pypdf import PdfReader
def convert_pdf_to_text(path: Path) -> str:
text = ""
for page in PdfReader(path).pages:
text += page.extract_text() + "\n"
return text
for path in Path("Documents").glob("**/*.pdf"):
text = convert_pdf_to_text(path)
txt_path = path.parent / (".".join(path.name.split(".")[:-1]) + ".txt")
if txt_path.exists():
print(f"Skip {txt_path} as it already exists")
continue
with open(txt_path, "wt") as fp:
fp.write(text)

How to merge for loop results in one list

I am trying to merge all the values into one list when I run my for loop. However I keep getting to separate brackets in one list.
For example, when I run this code:
import glob
import re
#import PyPDF2
folder_path='/Users/my_path/cb_files'
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern, recursive=True)
#IP Bank
import re
ip = re.compile(r"((?:^|\b)(?:h[tTxX]ps?://)?(?:\d{1,3}\[?\.\]?){3}\d{1,3}(?:\b|$))")
hash_ = re.compile(r"((?:^|\b)[a-fA-F0-9]{32,64}(?:\b|$))")
domain = re.compile(r"((?:^|\b)(?:h[xXtT]{2}ps?:|meows?:)?//(?:[a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDFC\uFDF0-\uFFEF_.\[\]-]+)(?:\[?\.\]?[a-z]+)+(?::\d+)?(?:[/?][^\s]*)?)")
ip_list=[]
for file in folder_contents:
if re.search(r".*(?=pdf$)",file):
#this is pdf
pdfFileObj = open('pdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
read_file=pageObj.extractText()
elif '.' not in file:
continue
else:
read_file = open(file, 'rt', encoding="latin-1").read()
if ip.findall(read_file) or hash_.findall(read_file) or domain.findall(read_file):
ips =ip.findall(read_file)
hashs= hash_.findall(read_file)
domains=domain.findall(read_file)
# print("IPS",', '.join(ips))
ip_list.append(ips)
print(ip_list)
Here is my output:
[['000.000.0.1', '111.111.1.1'], ['222.222.2.2','333.333.3.3']]
So it looks like for each file it loops over, it is putting it in its own list.
I want the output to look like this:
['000.000.0.1', '111.111.1.1','222.222.2.2','333.333.3.3']
Any changes in my code that will produce these results?

Try changing:-
ip_list.append(ips)
to
ip_list.extend(ips)

How to read all pdf files in a directory and convert to text file using tesseract python 3?

How to read all pdf files in a directory and convert to text file using tesseract python 3?
The below code is for reading one pdf file and convert to text file.
But i want to read all pdf files in a directory and convert to text file using tesseract python 3
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
pdf_filename = "pdffile_name.pdf"
txt_filename = "text_file_created.txt"
def tesseract(pdf_filename,txt_filename):
PDF_file = pdf_filename
pages = convert_from_path(PDF_file, 500)
image_counter = 1
for page in pages:
pdf_filename = "page_"+str(image_counter)+".jpg"
page.save(pdf_filename, 'JPEG')
image_counter = image_counter + 1
filelimit = image_counter-1
outfile = txt_filename
f = open(outfile, "a",encoding = "utf-8")
for i in range(1, filelimit + 1):
pdf_filename = "page_"+str(i)+".jpg"
text = str(((pytesseract.image_to_string(Image.open(pdf_filename)))))
text = text.replace('-\n', '')
f.write(text)
f.close()
f1 = open(outfile, "r",encoding = "utf-8")
text_list = f1.readlines()
return text_list
tesseract(pdf_filename,txt_filename)`enter code here`
i have code for reading pdf files in a directory but i dont know to combine this code with above code
def readfiles():
os.chdir(path)
pdfs = []
for file_list in glob.glob("*.pdf"):
print(file_list)
pdfs.append(file_list)
readfiles()

Simply convert the variable pdf_filename to a list using this code snippet:
import glob
pdf_filename = [f for f in glob.glob("your_preferred_path/*.pdf")]
which will get you all the pdf files you want and store it into a list.
Or simply use any of the methods posted here:
How do I list all files of a directory?
Once you do that, you now have a list of pdf files.
Now iterate over the list of pdfs, one at a time, which will give you a list of test files.
You can use it something like this code snippet:
for one_pdf in pdf_filename:
#* your code to convert the files *#
Hope this helps.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

PyPDF to read each PDF in a folder - python

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

Separate large pdf document into small pages based on text

Converting multiple PDF files into txt in Python?

How to merge for loop results in one list

How to read all pdf files in a directory and convert to text file using tesseract python 3?

Categories

Resources