Separate large pdf document into small pages based on text - python

I am new to Python scripting. I need to separate a large PDF into individual PDFs based on the text found on its pages. This is what I have tried:
import os
import re
from glob import glob

from PyPDF2 import PdfFileWriter, PdfFileReader


def getPagebreakList(file_name: str) -> list:
    """Return the 0-based indices of the pages whose text contains "tool used".

    Args:
        file_name: Path of the PDF file to scan.

    Returns:
        The matching page indices in ascending order (empty if no page matches).
    """
    page_breaks = []
    # Extract the text inside the `with` block: the reader pulls page data
    # from the open stream, so the file must stay open until we are done.
    with open(file_name, "rb") as pdf_stream:
        pdf_file = PdfFileReader(pdf_stream)
        for i in range(pdf_file.getNumPages()):
            text = pdf_file.getPage(i).extractText()
            if re.search(r"tool used", text):
                page_breaks.append(i)
    return page_breaks
from PyPDF2 import PdfFileReader, PdfFileWriter

# Split one PDF into consecutive sections. Each section ends on a page index
# returned by getPagebreakList(); pages after the last marker form a final section.
source_path = 'couch.pdf'
with open(source_path, "rb") as source_stream:
    inputpdf = PdfFileReader(source_stream)
    num_pages = inputpdf.numPages
    # Scan the same file that is being split so the indices refer to its pages.
    page_breaks = getPagebreakList(source_path)
    i = 0
    while i < num_pages:
        if page_breaks:
            page_break = page_breaks.pop(0)
        else:
            # No marker left: the section runs to the last valid index,
            # which is num_pages - 1 (num_pages itself is out of range).
            page_break = num_pages - 1
        output = PdfFileWriter()
        # `<=` keeps the marker page inside the section it closes.
        while i <= page_break:
            output.addPage(inputpdf.getPage(i))
            i = i + 1
        # Write while the source is still open; `i` is now the 1-based number
        # of the section's last page.
        with open("couch%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
I know I'm missing the print statements, and I am getting the error
`SyntaxError: 'return' outside function`
Any assistance would be greatly appreciated.

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
    """Write every page of a PDF to its own file, named after text on the page.

    The name is the text found between "Challenge 1:" and "Challenge 2:" on
    the page. If either marker is missing, or the text in between is empty,
    the base name of the source file is used instead. Output files are
    created in the current working directory as ``<name>_page_<n>.pdf``,
    where ``n`` is the 1-based page number.

    Args:
        pdf_path: Path of the PDF file to split.
    """
    fname = os.path.splitext(os.path.basename(pdf_path))[0]
    start_marker = "Challenge 1:"
    end_marker = "Challenge 2:"

    with open(pdf_path, 'rb') as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        for page_num in range(reader.getNumPages()):
            text = reader.getPage(page_num).extractText()

            # Locate the text between the two markers; str.find returns -1
            # when a marker is absent, which must not be used as a slice bound.
            loanAcctName = ''
            start = text.find(start_marker)
            if start != -1:
                name_start = start + len(start_marker)
                end = text.find(end_marker, name_start)
                if end != -1:
                    loanAcctName = text[name_start:end]

            # Remove characters that cannot appear in a file name and fall
            # back to the source file's name when nothing usable remains.
            loanAcctName = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', loanAcctName)
            loanAcctName = loanAcctName.strip(' _') or fname

            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(reader.getPage(page_num))
            output_filename = '{}_page_{}.pdf'.format(loanAcctName, page_num + 1)
            with open(output_filename, 'wb') as out:
                pdf_writer.write(out)
            print('Created: {}'.format(output_filename))


if __name__ == '__main__':
    pdf_path = r"C:\Users\FY22.pdf"
    pdf_splitter(pdf_path)

extracting text from multiple pdf files from a folder in python

I am trying to extract text from multiple pdf files which will serve as the knowledge base for a closed domain chatbot. I used this code
import pandas as pd
import PyPDF2
import glob
pdf_dir = "C:/Users/Arush/OneDrive/Desktop/sample"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)

# Collect one record per PDF and build the DataFrame once at the end.
# Starting from a pre-sized frame (index=[0]) is what put an all-NaN row
# at position 0 of the result.
rows = []
for file in pdf_files:
    with open(file, 'rb') as pdfFileObj:  # 'rb' for read binary mode
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Read every page before the file is closed.
        page_texts = [
            pdfReader.getPage(page_number).extractText()
            for page_number in range(pdfReader.numPages)
        ]
    # Drop newline characters, then split the remaining text on whitespace.
    text = ''.join(page_texts).replace('\n', '').split()
    rows.append({'FileName': file, 'Text': text})

output_data = pd.DataFrame(rows, columns=['FileName', 'Text'])
and I am getting output that consists mostly of symbols:
FileName Text
0 NaN NaN
1 C:/Users/Arush/OneDrive/Desktop/sample\Introdu... [Andreas, C.Müller, &, Sarah, Guido˜˚˛˝˙ˆˇ˘˛˙...
2 C:/Users/Arush/OneDrive/Desktop/sample\Machine... [áâáâÞ;áâáâÞ;;áâáâáâáâç...
Moreover, I think it is only fetching one page.
Can you please help me?

PyPDF to read each PDF in a folder

I have the code below which is working. But it only reads one file at a time, which I have to insert in the code. How could I make this code read every PDF in a directory? PDF by PDF.
import PyPDF2
import textract
import re
filename = 'file.pdf'

# Read all pages inside the `with` block so the file is closed afterwards.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    # Accumulate into the same variable that is printed and searched below.
    extractedtext = "".join(
        pdfReader.getPage(count).extractText() for count in range(num_pages)
    )

print(extractedtext)
# Matches "<7 digits>-<2 digits>"; each hit is a (digits, dash, digits) tuple.
keyword = re.findall(r'(\d{7})(\-)(\d{2})', extractedtext)
if keyword:
    print(keyword)
You can use glob to get a list of PDF files in a directory.
You can also accept a command-line argument for the directory within which to operate.
Call this program with: python3 this_script.py directory_to_read
import PyPDF2
import glob
import os
import re
import sys
if len(sys.argv) < 2:
    sys.exit("usage: python3 this_script.py directory_to_read")

dir_to_read = sys.argv[1]  # accept a command-line argument with the dir to read
pdf_files = glob.glob(os.path.join(dir_to_read, '*.pdf'))

for pdf_file in pdf_files:
    print(pdf_file)
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
        print(num_pages)
        # Build the text from scratch for each file, starting at page 0.
        # A page counter shared across files would skip the pages of every
        # PDF after the first one.
        extractedtext = "".join(
            pdfReader.getPage(count).extractText() for count in range(num_pages)
        )
    print(extractedtext)
    # Matches "<7 digits>-<2 digits>"; each hit is a (digits, dash, digits) tuple.
    keyword = re.findall(r'(\d{7})(\-)(\d{2})', extractedtext)
    if keyword:
        print(keyword)

Adding page numbers in a text from a PDF file in Python

The following Python program reads a PDF file and collects unique words used in that file.
import PyPDF2
import re
print('process started')

with open('pdf_file.pdf', 'rb') as pdfFile:
    pdfFileReader = PyPDF2.PdfFileReader(pdfFile)
    pdfFilePageCount = pdfFileReader.numPages
    # Join pages with a space so the last word of one page cannot fuse with
    # the first word of the next.
    pdfPageText = ' '.join(
        pdfFileReader.getPage(i).extractText() for i in range(pdfFilePageCount)
    )

pdfPageText = pdfPageText.lower()
# Replace every run of non-letters with a single space before splitting.
pdfPageText = re.sub(r'[^a-zA-Z]+', ' ', pdfPageText)
listOfWords = pdfPageText.split()

# Write each word once; sorting keeps the output file stable between runs.
setPage = set(listOfWords)
with open('text_file.txt', 'w') as textFile:
    textFile.writelines("%s\n" % item for item in sorted(setPage))

print('process ended')
Is it possible to add page numbers with each word (i.e. to indicate from which page they were picked)?
For instance, if a word "xyzabc" is found in multiple pages, I need to list them all as follows:
xyzabc (1,22,130, ...)
You can create a dict whose keys are the words, and values a list of page numbers.
Using a defaultdict makes it easy to append the page numbers.
You also have to update the dict in each loop, as shown in the modified code here:
import PyPDF2
import re
from collections import defaultdict
print('process started')

# Maps each word to the list of page numbers on which it appears.
wordsAndPages = defaultdict(list)

with open('pdf_file.pdf', 'rb') as pdfFile:
    pdfFileReader = PyPDF2.PdfFileReader(pdfFile)
    pdfFilePageCount = pdfFileReader.numPages
    for page in range(pdfFilePageCount):
        pdfPageText = pdfFileReader.getPage(page).extractText().lower()
        # Replace every run of non-letters with a single space before splitting.
        pdfPageText = re.sub(r'[^a-zA-Z]+', ' ', pdfPageText)
        # dict.fromkeys removes repeats within the page while keeping the
        # order of first appearance, so a page is listed once per word.
        for word in dict.fromkeys(pdfPageText.split()):
            # Report 1-based page numbers, as a reader would count them.
            wordsAndPages[word].append(page + 1)

with open('text_file.txt', 'w') as textFile:
    for word, pages in wordsAndPages.items():
        output = '{} ({})\n'.format(word, ','.join(str(number) for number in pages))
        textFile.write(output)

print('process ended')
and you can finally output the data in the expected format.

Issue with renaming PDFs in Python from CSV file

So I have this script that reads a CSV file (the file is as follows):
test0
test1
test2
test3
Then the script takes a multipage PDF file (4 pages in this example), and splits it into 4 separate pages, naming them as 'document-pages1', 'document-pages2' etc.
What I'd like it to do is name the split page files after their equivalent row in the CSV file.
So a 4 page PDF would correlate to the 4 row csv file. Unfortunately, I'm at a loss for how to implement this part.
My code so far is as follows:
from PyPDF2 import PdfFileWriter, PdfFileReader
import csv
def readcsv(filename):
    """Read a semicolon-delimited CSV file and return its rows.

    Args:
        filename: Path of the CSV file.

    Returns:
        A list with one entry per row; each entry is the list of that row's
        fields. An empty file yields an empty list.
    """
    # newline="" leaves newline handling to the csv module. The old "rU"
    # mode is rejected by open() on current Python versions.
    with open(filename, newline="") as ifile:
        return list(csv.reader(ifile, delimiter=";"))
import os

filepath = input("Filepath: ")
filename = input("File name: ")
# Store the rows under their own name: rebinding `csv` would hide the csv
# module that readcsv() relies on.
rows = readcsv(os.path.join(filepath, filename))

# Keep the source PDF open while its pages are copied, then close it.
with open("test.pdf", "rb") as pdf_stream:
    inputpdf = PdfFileReader(pdf_stream)
    for i in range(inputpdf.numPages):
        # One writer per page, so every output file holds a single page.
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open("document-page%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
Any help at all would be immensely appreciated.
I ended up fixing my own issue as follows:
from PyPDF2 import PdfFileWriter, PdfFileReader
import csv
def readcsv(filename):
    """Read a semicolon-delimited CSV file and return its rows.

    Args:
        filename: Path of the CSV file.

    Returns:
        A list with one entry per row; each entry is the list of that row's
        fields. An empty file yields an empty list.
    """
    # newline="" leaves newline handling to the csv module. The old "rU"
    # mode is rejected by open() on current Python versions.
    with open(filename, newline="") as ifile:
        return list(csv.reader(ifile, delimiter=";"))
import os

filepath = input("Filepath: ")
filename = input("File name: ")
# Store the rows under their own name: rebinding `csv` would hide the csv
# module that readcsv() relies on.
rows = readcsv(os.path.join(filepath, filename))

# Keep the source PDF open while its pages are copied, then close it.
with open("test.pdf", "rb") as pdf_stream:
    inputpdf = PdfFileReader(pdf_stream)
    # Check up front, so a short CSV does not leave a half-finished set of files.
    if len(rows) < inputpdf.numPages:
        raise ValueError(
            "CSV has %d rows but the PDF has %d pages"
            % (len(rows), inputpdf.numPages)
        )
    for i in range(inputpdf.numPages):
        # One writer per page, so every output file holds a single page.
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        # A row is a list of fields. A single-column row yields just that
        # field; several fields are joined with ", ".
        page_name = ", ".join(rows[i])
        with open(page_name + ".pdf", "wb") as outputStream:
            output.write(outputStream)

Categories