The following Python program reads a PDF file and collects unique words used in that file.
import PyPDF2
import re
print('process started')
pdfFile = open('pdf_file.pdf', 'rb')
pdfFileReader = PyPDF2.PdfFileReader(pdfFile)
pdfFilePageCount = pdfFileReader.numPages
pdfPageText = ""
for i in range(pdfFilePageCount):
pageObj = pdfFileReader.getPage(i)
pdfPageText += pageObj.extractText()
pdfPageText = pdfPageText.lower()
pdfPageText = re.sub(r'[^a-zA-Z]+', ' ', pdfPageText)
listOfWords = pdfPageText.split()
setPage = set(listOfWords)
textFile = open('text_file.txt', 'w')
for item in listOfWords:
textFile.write("%s\n" % item)
textFile.close()
pdfFile.close()
print('process ended')
Is it possible to add page numbers with each word (i.e. to indicate from which page they were picked)?
For instance, if a word "xyzabc" is found in multiple pages, I need to list them all as follows:
xyzabc (1,22,130, ...)
You can create a dict whose keys are the words, and values a list of page numbers.
Using a defaultdict makes it easy to append the page numbers.
You also have to update the dict in each loop, as shown in the modified code here:
import PyPDF2
import re
from collections import defaultdict
print('process started')
pdfFile = open('pdf_file.pdf', 'rb')
pdfFileReader = PyPDF2.PdfFileReader(pdfFile)
pdfFilePageCount = pdfFileReader.numPages
wordsAndPages = defaultdict(list)
pdfPageText = ""
for page in range(pdfFilePageCount):
pageObj = pdfFileReader.getPage(page)
pdfPageText = pageObj.extractText()
print(pdfPageText)
pdfPageText = pdfPageText.lower()
pdfPageText = re.sub(r'[^a-zA-Z]+', ' ', pdfPageText)
listOfWords = pdfPageText.split()
for word in listOfWords:
wordsAndPages[word].append(page)
textFile = open('text_file.txt', 'w')
for word, pages in wordsAndPages.items():
output = '{} ({})\n'.format(word, ','.join([str(page) for page in pages]))
textFile.write(output)
textFile.close()
pdfFile.close()
print('process ended')
and you can finally output the data in the expected format.
Related
I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
directory = "C:\\Users\\Docs\\"
fname = os.path.splitext(os.path.basename(pdf_path))[0]
print(fname)
object = PdfFileReader(pdf_path)
NumPages = object.getNumPages()
print(NumPages)
string = "Challenge 1:"
string2 = "Challenge 2:"
res=0
pageList=[]
for txt in range(0,NumPages):
pdf_writer = PdfFileWriter()
pageObject = object.getPage(txt)
Text = pageObject.extractText()
print(Text)
acc_pos = Text.find(string)
print(acc_pos)
Cur_pos = Text.find(string2)
print(Cur_pos)
loanAcctName = Text[acc_pos+12:Cur_pos]
print (loanAcctName)
# pageList.append(txt)
# print(pageList)
ReSearch = re.search(string, Text)
if ReSearch != None:
pageList.append(txt)
elif ReSearch ==None:
pageList.append(txt)
print(pageList)
res = res + 1
pdf_writer.addPage(object.getPage(page_num))
output_filename = '{}_page_{}.pdf'.format(loanAcctName,page + 1)
with open(output_filename, 'wb') as out:
pdf_writer.write(out)
print('Created: {}'.format(output_filename))
out.close()
res = res + 1
if __name__ == '__main__':
pdf_path = r"C:\Users\FY22.pdf"
pdf_splitter(pdf_path)
New to python scripting. I need to Separate a large PDF into individual pdfs based on the text I have tried.
def getPagebreakList(file_name: str)->list:
import os
from glob import glob
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf_file = PyPDF2.PdfFileReader(file_name)
num_pages = pdf_file.getNumPages()
page_breaks = list()
for i in range(0, num_pages):
Page = file.getPage(i)
Text = PageObj.extractText()
if re.search(r"tool used", Text):
page_breaks.append(i)
return page_breaks
inputpdf = PdfFileReader(open('couch.pdf', "rb"))
num_pages = inputpdf.numPages
page_breaks = getPagebreakList('yourPDF.pdf')
i = 0
while (i < num_pages):
if page_breaks:
page_break = page_breaks.pop(0)
else:
page_break = num_pages
output = PdfFileWriter()
while (i != page_break + 1):
output.addPage(inputpdf.getPage(i))
i = i + 1
with open("couch%s.pdf" % i, "wb") as outputStream:
output.write(outputStream)
I know I'm missing the print statements and I am getting the error
`syntaxError: 'return ' outside function``
Any assistance would be grateful.
Any advice is much appreciated. I returned some text to the console and I want to save the string as a csv file. I have tried a couple different ways to save it to no avail. One of the other ways that I have tried is turning the output into an array but that did not work either. If you have thoughts about returning console text to csv file please let me know thank you.
import os
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import TextConverter
import io
from PyPDF2 import PdfFileMerger, PdfFileReader
class Transform:
#method for extracting data and merging it into one pdf
def __init__(self):
try:
source_dir = os.getcwd()
merger = PdfFileMerger()
for item in os.listdir(source_dir):
if item.endswith("pdf"):
merger.append(item)
except Exception:
print("unable to collect")
finally:
merger.write("test.pdf")
merger.close()
#running that method extract
def extract(self):
resource_manager = PDFResourceManager()
file = io.StringIO()
converter = TextConverter(resource_manager, file, laparams=LAParams())
page_interpreter = PDFPageInterpreter(resource_manager, converter)
with open('test.pdf', 'rb') as fh:
for page in PDFPage.get_pages(fh,
caching=True,
check_extractable=True):
page_interpreter.process_page(page)
text = file.getvalue()
# close open handles
converter.close()
file.close()
return text
# def convertoarry(self, text):
# listToPrint = []
# for text in dict.keys():
# listToPrint.append(text)
# listToPrint.append(dict[text])
# stringToPrint = ",".join(listToPrint)
# return stringToPrint
#
# stringToPrint = convertoarry(self, text)
# print(stringToprint)
def modify(self, text):
words = text.split()
combine = [words.index()]
with open("text.csv", "w") as f:
f.write(combine)
f.close()
return "compeleted"
program = Transform()
print(program.modify())
I assume the error you got comes from this line:
f.write(combine)
combine is a list object, while the write method of f wants a str object. This is what you should do:
f.write(str(combine))
Since you're creating a csv however, it is likely that you don't want the "[" and "]" in your file. Furthermore, by doing it like this, you will also include the "'" in your file, which it is likely you don't want. The safest way is simply to iterate over words rather than creating combine:
def modify(self, text):
words = text.split()
with open("text.csv", "w") as f:
f.write(words[0])
for word in words[1:]:
f.write(f", {word}")
f.close()
return "compeleted"
By the way, you dont' need to explicitely close the file if you're working with a context manager (that is, if you use with). Hence you can simplify this code as follows:
def modify(self, text):
words = text.split()
with open("text.csv", "w") as f:
f.write(words[0])
for word in words[1:]:
f.write(f", {word}")
return "compeleted"
I am trying to merge all the values into one list when I run my for loop. However I keep getting to separate brackets in one list.
For example, when I run this code:
import glob
import re
#import PyPDF2
folder_path='/Users/my_path/cb_files'
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern, recursive=True)
#IP Bank
import re
ip = re.compile(r"((?:^|\b)(?:h[tTxX]ps?://)?(?:\d{1,3}\[?\.\]?){3}\d{1,3}(?:\b|$))")
hash_ = re.compile(r"((?:^|\b)[a-fA-F0-9]{32,64}(?:\b|$))")
domain = re.compile(r"((?:^|\b)(?:h[xXtT]{2}ps?:|meows?:)?//(?:[a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDFC\uFDF0-\uFFEF_.\[\]-]+)(?:\[?\.\]?[a-z]+)+(?::\d+)?(?:[/?][^\s]*)?)")
ip_list=[]
for file in folder_contents:
if re.search(r".*(?=pdf$)",file):
#this is pdf
pdfFileObj = open('pdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
read_file=pageObj.extractText()
elif '.' not in file:
continue
else:
read_file = open(file, 'rt', encoding="latin-1").read()
if ip.findall(read_file) or hash_.findall(read_file) or domain.findall(read_file):
ips =ip.findall(read_file)
hashs= hash_.findall(read_file)
domains=domain.findall(read_file)
# print("IPS",', '.join(ips))
ip_list.append(ips)
print(ip_list)
Here is my output:
[['000.000.0.1', '111.111.1.1'], ['222.222.2.2','333.333.3.3']]
So it looks like for each file it loops over, it is putting it in its own list.
I want the output to look like this:
['000.000.0.1', '111.111.1.1','222.222.2.2','333.333.3.3']
Any changes in my code that will produce these results?
Try changing:-
ip_list.append(ips)
to
ip_list.extend(ips)
This may just be due to PyPdf2's extract text function but when I run the code below in order to rename the files, a lot of the most common words come out like "Nthe", "Nfrom" and "Ncommunications". I'm not sure what I can do to stop this happening or alternatively how to work around it.
What causes a problem like this?
Where are the N's coming from?
Other PDFs do perfectly what I want so I'm not sure where to go from here.
import PyPDF2
import re
from collections import Counter
import os.path
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
for file in files:
pdfFileObj = open('{0}'.format(file), 'rb') #Open the File
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) #Read the file
frequency = {} #Create dict
ignore = {'the','a','if','in','it','of','or','and','for','can','that','are','this','your','you','will','not','have','its','with','need','has','from','more'} #Ignore dese ones
print "Number of Pages %s " % pdfReader.numPages #Print Num Pages
word_list = []
for i in range(pdfReader.numPages):
pageObj = pdfReader.getPage(i) # Get the first page
word_list.append(pageObj.extractText()) #Add the pages together
match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(word_list)) #Find the text
cnt = Counter()
for words in match_pattern: #Start counting the frequency
words.lower() # Lower Case Words
if words not in ignore: #Ignore common words
count = frequency.get(words,0) #Start the count?
frequency[words] = count + 1 #Add one
fl = sorted(frequency, key=frequency.__getitem__, reverse = True)[:3] #Sort according to frequency
pdfFileObj.close() #Close the PDF
newtitle = ' '.join(map(str,fl, )).title() #Join the title list together
try:
print newtitle #Print the title
os.rename('{0}'.format(file), '{0}.pdf'.format(newtitle))#Rename the file
except:
print "Unable to Rename File"