PyPDF to read each PDF in a folder - python

I have the code below which is working. But it only reads one file at a time, which I have to insert in the code. How could I make this code read every PDF in a directory? PDF by PDF.
import PyPDF2
import textract
import re
# Extract the text of every page of one PDF and report ID-like matches.
filename = 'file.pdf'
extractedtext = ""
# `with` guarantees the file handle is closed even if parsing fails.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    count = 0
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count += 1
        # Accumulate into the same variable that is printed and searched below
        # (the original added to an undefined name, raising NameError).
        extractedtext += pageObj.extractText()
print(extractedtext)
# Each match is a tuple of the three groups: 7 digits, "-", 2 digits.
keyword = re.findall(r'(\d{7})(\-)(\d{2})', extractedtext)
if keyword:
    print(keyword)

You can use glob to get a list of PDF files in a directory.
You can also accept a command-line argument for the directory within which to operate.
Call this program with: python3 this_script.py directory_to_read
import PyPDF2
import glob
import os
import re
import sys
# Usage: python3 this_script.py directory_to_read
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit('usage: python3 this_script.py directory_to_read')
dir_to_read = sys.argv[1]  # accept a command-line argument with the dir to read
pdf_files = glob.glob(os.path.join(dir_to_read, '*.pdf'))
for pdf_file in pdf_files:
    print(pdf_file)
    # Reset the accumulated text for every PDF; otherwise the text of earlier
    # files leaks into the search for later ones.
    extractedtext = ""
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
        print(num_pages)
        # range() restarts at page 0 for each file; a counter shared across
        # files would skip the pages of every PDF after the first one.
        for count in range(num_pages):
            pageObj = pdfReader.getPage(count)
            extractedtext += pageObj.extractText()
    print(extractedtext)
    # Each match is a tuple of the three groups: 7 digits, "-", 2 digits.
    keyword = re.findall(r'(\d{7})(\-)(\d{2})', extractedtext)
    if keyword:
        print(keyword)

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
    """Write every page of *pdf_path* to its own single-page PDF file.

    Each output file is named after the text found between the markers
    "Challenge 1:" and "Challenge 2:" on that page, followed by the 1-based
    page number. Output names are relative, so files are created relative to
    the current working directory.

    Args:
        pdf_path: Path of the PDF to split.
    """
    fname = os.path.splitext(os.path.basename(pdf_path))[0]
    print(fname)
    reader = PdfFileReader(pdf_path)  # renamed: the original shadowed builtin `object`
    NumPages = reader.getNumPages()
    print(NumPages)
    string = "Challenge 1:"
    string2 = "Challenge 2:"
    pageList = []
    for txt in range(NumPages):
        # A fresh writer per page keeps each output file to a single page.
        pdf_writer = PdfFileWriter()
        pageObject = reader.getPage(txt)
        Text = pageObject.extractText()
        print(Text)
        acc_pos = Text.find(string)
        print(acc_pos)
        Cur_pos = Text.find(string2)
        print(Cur_pos)
        # Skip past the first marker itself. str.find returns -1 when a marker
        # is missing, in which case this slice is not a meaningful name.
        loanAcctName = Text[acc_pos + len(string):Cur_pos]
        print(loanAcctName)
        # NOTE(review): the original appended the page index in both the
        # "marker found" and "marker not found" branches, so every page is
        # recorded; confirm whether only matching pages were intended.
        pageList.append(txt)
        print(pageList)
        # The original referenced the undefined names `page_num` and `page`
        # here; the loop variable is the page being processed.
        pdf_writer.addPage(pageObject)
        output_filename = '{}_page_{}.pdf'.format(loanAcctName, txt + 1)
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        print('Created: {}'.format(output_filename))
# Split the hard-coded document only when this file is executed as a script.
if __name__ == "__main__":
    pdf_path = r"C:\Users\FY22.pdf"
    pdf_splitter(pdf_path)

Separate large pdf document into small pages based on text

I am new to Python scripting. I need to separate a large PDF into individual PDFs based on the text in its pages. This is what I have tried:
import os
import re
from glob import glob

from PyPDF2 import PdfFileReader, PdfFileWriter


def getPagebreakList(file_name: str) -> list:
    """Return the zero-based indexes of the pages that mention "tool used".

    Args:
        file_name: Path of the PDF to scan.

    Returns:
        Page indexes, in ascending order, whose extracted text matches the
        pattern ``tool used``. Empty when no page matches.
    """
    # Only `PdfFileReader` is imported, so it is called directly; the original
    # went through the unbound module name `PyPDF2`.
    pdf_file = PdfFileReader(file_name)
    num_pages = pdf_file.getNumPages()
    page_breaks = []
    for i in range(num_pages):
        # Read from the reader created above; the original used the undefined
        # names `file` and `PageObj`.
        page_text = pdf_file.getPage(i).extractText()
        if re.search(r"tool used", page_text):
            page_breaks.append(i)
    # Must stay indented inside the function: a dedented `return` is what
    # raises "SyntaxError: 'return' outside function".
    return page_breaks
from PyPDF2 import PdfFileReader, PdfFileWriter

# Split one PDF into several files, ending each output file on a page that
# getPagebreakList() reports as a break.
source_pdf = 'couch.pdf'
with open(source_pdf, "rb") as source_stream:
    inputpdf = PdfFileReader(source_stream)
    num_pages = inputpdf.numPages
    # The break indexes must come from the same document that is being split;
    # the original scanned a different placeholder file.
    page_breaks = getPagebreakList(source_pdf)
    i = 0
    while i < num_pages:
        if page_breaks:
            page_break = page_breaks.pop(0)
        else:
            # No breaks left: the final chunk ends on the last valid index.
            # (Using num_pages here made the loop request a page past the end.)
            page_break = num_pages - 1
        output = PdfFileWriter()
        # Copy pages up to and including the break page.
        while i <= page_break:
            output.addPage(inputpdf.getPage(i))
            i = i + 1
        # Written while the source stream is still open, because the writer
        # reads page content from it. `i` is the number of pages consumed.
        with open("couch%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
I know I'm missing the print statements, and I am getting the error
`SyntaxError: 'return' outside function`
Any assistance would be appreciated.

Converting multiple PDF files into txt in Python?

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# Dump the text of every file in C:/117 page by page.
# NOTE(review): 'file.txt' is a single hard-coded output name shared by every
# input file in the folder.
for filename in os.listdir("C:/117"):
    path = os.path.join("C:/117/", filename)
    print(path)
    with open('file.txt', 'w', encoding='utf-8') as file:
        # Parse the PDF once; the original re-parsed it for the page count and
        # again for every single page.
        reader = PdfFileReader(path)
        for page_num in range(reader.numPages):
            print('Page: {0}'.format(page_num))
            pageObj = reader.getPage(page_num)
            try:
                txt = pageObj.extractText()
            except Exception as exc:
                # Best effort: keep going, but say which page was dropped
                # instead of swallowing the error silently.
                print('Skipping page {0}: {1}'.format(page_num + 1, exc))
                continue
            file.write('Page{0}\n'.format(page_num+1))
            file.write(txt)
    # No explicit close(): the `with` block already closed the file.
I am converting hundreds of pdf files into txt. However, with this code, all the PDFs are merged into a single txt file. Is there a way to create separate txt file for each PDF I convert? Thanks
from pathlib import Path
from pypdf import PdfReader
def convert_pdf_to_text(path: Path) -> str:
    """Return the extracted text of the PDF at *path*.

    The text of each page is followed by a newline, in page order.
    """
    pages = PdfReader(path).pages
    return "".join(page.extract_text() + "\n" for page in pages)
# Convert every PDF below "Documents" into a sibling .txt file.
for path in Path("Documents").glob("**/*.pdf"):
    # Same directory and name, with the ".pdf" extension swapped for ".txt".
    txt_path = path.with_suffix(".txt")
    # Check before extracting, so already-converted PDFs are not parsed again
    # just to have the result thrown away.
    if txt_path.exists():
        print(f"Skip {txt_path} as it already exists")
        continue
    text = convert_pdf_to_text(path)
    # An explicit encoding avoids UnicodeEncodeError on platforms whose
    # default encoding cannot represent the extracted characters.
    with open(txt_path, "wt", encoding="utf-8") as fp:
        fp.write(text)

How to merge for loop results in one list

I am trying to merge all the values into one list when I run my for loop. However, I keep getting separate lists nested inside one list.
For example, when I run this code:
import glob
import re
#import PyPDF2
# Scan every file in a folder for IP addresses, hashes and domains.
folder_path = '/Users/my_path/cb_files'
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern, recursive=True)
#IP Bank
import re
ip = re.compile(r"((?:^|\b)(?:h[tTxX]ps?://)?(?:\d{1,3}\[?\.\]?){3}\d{1,3}(?:\b|$))")
hash_ = re.compile(r"((?:^|\b)[a-fA-F0-9]{32,64}(?:\b|$))")
domain = re.compile(r"((?:^|\b)(?:h[xXtT]{2}ps?:|meows?:)?//(?:[a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDFC\uFDF0-\uFFEF_.\[\]-]+)(?:\[?\.\]?[a-z]+)+(?::\d+)?(?:[/?][^\s]*)?)")
ip_list = []
for file in folder_contents:
    if re.search(r".*(?=pdf$)", file):
        # this is pdf
        # Imported here so the script still runs without PyPDF2 installed as
        # long as the folder contains no PDF.
        import PyPDF2
        # Read the file being visited; the original always opened the
        # hard-coded name 'pdf.pdf' regardless of which PDF matched.
        with open(file, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            pageObj = pdfReader.getPage(0)  # only the first page is scanned
            read_file = pageObj.extractText()
    elif '.' not in file:
        # Skip paths that contain no dot at all.
        continue
    else:
        with open(file, 'rt', encoding="latin-1") as text_file:
            read_file = text_file.read()
    # Run each pattern once and reuse the result for both the test and the
    # collection step.
    ips = ip.findall(read_file)
    hashs = hash_.findall(read_file)
    domains = domain.findall(read_file)
    if ips or hashs or domains:
        # print("IPS",', '.join(ips))
        # append() stores one sub-list per file, giving a list of lists.
        ip_list.append(ips)
print(ip_list)
Here is my output:
[['000.000.0.1', '111.111.1.1'], ['222.222.2.2','333.333.3.3']]
So it looks like for each file it loops over, it is putting it in its own list.
I want the output to look like this:
['000.000.0.1', '111.111.1.1','222.222.2.2','333.333.3.3']
Any changes in my code that will produce these results?
Try changing:
ip_list.append(ips)
to
ip_list.extend(ips)

How to read all pdf files in a directory and convert to text file using tesseract python 3?

How to read all pdf files in a directory and convert to text file using tesseract python 3?
The code below reads one PDF file and converts it to a text file.
But I want to read all the PDF files in a directory and convert each of them to a text file using Tesseract with Python 3.
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
pdf_filename = "pdffile_name.pdf"
txt_filename = "text_file_created.txt"


def tesseract(pdf_filename, txt_filename):
    """OCR every page of a PDF and append the recognised text to a text file.

    Each page is rendered to an image, saved as ``page_<n>.jpg`` (relative to
    the current working directory) and passed to pytesseract.

    Args:
        pdf_filename: Path of the PDF to convert.
        txt_filename: Path of the text file to append to. It is opened in
            append mode, so existing content is kept.

    Returns:
        The lines of *txt_filename* after the new text has been written
        (including any content the file already had).
    """
    # Second positional argument of convert_from_path is the render DPI.
    pages = convert_from_path(pdf_filename, 500)
    with open(txt_filename, "a", encoding="utf-8") as out_file:
        for page_number, page in enumerate(pages, start=1):
            # A separate name for the image keeps the `pdf_filename`
            # parameter intact (the original overwrote it).
            image_name = "page_" + str(page_number) + ".jpg"
            page.save(image_name, 'JPEG')
            with Image.open(image_name) as page_image:
                text = str(pytesseract.image_to_string(page_image))
            # Re-join words that were hyphenated across a line break.
            text = text.replace('-\n', '')
            out_file.write(text)
    # Re-open for reading only after the writer is closed, so everything
    # written above is flushed to disk first.
    with open(txt_filename, "r", encoding="utf-8") as in_file:
        return in_file.readlines()


tesseract(pdf_filename, txt_filename)
I have code for reading the PDF files in a directory, but I don't know how to combine it with the code above:
def readfiles(directory=None):
    """Change into a directory and return the names of the PDF files in it.

    Args:
        directory: Folder to scan. When omitted, the module-level ``path``
            variable is used, exactly as the original zero-argument version
            did (it must be defined by the caller's module).

    Returns:
        The file names matching ``*.pdf`` in that folder, relative to it.
        Each name is also printed.

    Note:
        The process's working directory is changed as a side effect.
    """
    import glob  # local import: the snippet does not import glob before this point

    os.chdir(path if directory is None else directory)
    pdfs = []
    for file_list in glob.glob("*.pdf"):
        print(file_list)
        pdfs.append(file_list)
    # Hand the collected names back; the original built the list and then
    # discarded it.
    return pdfs
readfiles()
Simply convert the variable pdf_filename to a list using this code snippet:
import glob
# glob.glob() already returns a list of the matching paths.
pdf_filename = glob.glob("your_preferred_path/*.pdf")
which will get you all the pdf files you want and store it into a list.
Or simply use any of the methods posted here:
How do I list all files of a directory?
Once you do that, you now have a list of pdf files.
Now iterate over the list of PDFs, one at a time, which will give you a list of text files.
You can use it something like this code snippet:
for one_pdf in pdf_filename:
#* your code to convert the files *#
Hope this helps.

Categories