extracting text from multiple pdf files from a folder in python

extracting text from multiple pdf files from a folder in python - python

I am trying to extract text from multiple pdf files which will serve as the knowledge base for a closed domain chatbot. I used this code
import pandas as pd
import PyPDF2
import glob
pdf_dir = "C:/Users/Arush/OneDrive/Desktop/sample"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
output_data = pd.DataFrame(index = [0], columns = ['FileName','Text'])
fileIndex = 0
for file in pdf_files:
pdfFileObj = open(file,'rb') #'rb' for read binary mode
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
startPage = 0
text = ''
cleanText = ''
while startPage <= pdfReader.numPages -1:
pageObj = pdfReader.getPage(startPage)
text += pageObj.extractText()
startPage += 1
pdfFileObj.close()
for myWord in text:
if myWord != '\n':
cleanText += myWord
text = cleanText.split()
newRow = pd.DataFrame(index = [0], columns = ['FileName', 'Text'])
newRow.iloc[0]['FileName'] = file
newRow.iloc[0]['Text'] = text
output_data = pd.concat([output_data, newRow], ignore_index=True)
and getting data in symbols only
FileName Text
0 NaN NaN
1 C:/Users/Arush/OneDrive/Desktop/sample\Introdu... [Andreas, C.Müller, &, Sarah, Guido˜˚˛˝˙ˆˇ˘˛˙...
2 C:/Users/Arush/OneDrive/Desktop/sample\Machine... [áâáâÞ;áâáâÞ;;áâáâáâáâç...
moreover I think it only fetching 1 page
can you guy please help me ?

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
directory = "C:\\Users\\Docs\\"
fname = os.path.splitext(os.path.basename(pdf_path))[0]
print(fname)
object = PdfFileReader(pdf_path)
NumPages = object.getNumPages()
print(NumPages)
string = "Challenge 1:"
string2 = "Challenge 2:"
res=0
pageList=[]
for txt in range(0,NumPages):
pdf_writer = PdfFileWriter()
pageObject = object.getPage(txt)
Text = pageObject.extractText()
print(Text)
acc_pos = Text.find(string)
print(acc_pos)
Cur_pos = Text.find(string2)
print(Cur_pos)
loanAcctName = Text[acc_pos+12:Cur_pos]
print (loanAcctName)
# pageList.append(txt)
# print(pageList)
ReSearch = re.search(string, Text)
if ReSearch != None:
pageList.append(txt)
elif ReSearch ==None:
pageList.append(txt)
print(pageList)
res = res + 1
pdf_writer.addPage(object.getPage(page_num))
output_filename = '{}_page_{}.pdf'.format(loanAcctName,page + 1)
with open(output_filename, 'wb') as out:
pdf_writer.write(out)
print('Created: {}'.format(output_filename))
out.close()
res = res + 1
if __name__ == '__main__':
pdf_path = r"C:\Users\FY22.pdf"
pdf_splitter(pdf_path)

Separate large pdf document into small pages based on text

New to python scripting. I need to Separate a large PDF into individual pdfs based on the text I have tried.
def getPagebreakList(file_name: str)->list:
import os
from glob import glob
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf_file = PyPDF2.PdfFileReader(file_name)
num_pages = pdf_file.getNumPages()
page_breaks = list()
for i in range(0, num_pages):
Page = file.getPage(i)
Text = PageObj.extractText()
if re.search(r"tool used", Text):
page_breaks.append(i)
return page_breaks
inputpdf = PdfFileReader(open('couch.pdf', "rb"))
num_pages = inputpdf.numPages
page_breaks = getPagebreakList('yourPDF.pdf')
i = 0
while (i < num_pages):
if page_breaks:
page_break = page_breaks.pop(0)
else:
page_break = num_pages
output = PdfFileWriter()
while (i != page_break + 1):
output.addPage(inputpdf.getPage(i))
i = i + 1
with open("couch%s.pdf" % i, "wb") as outputStream:
output.write(outputStream)
I know I'm missing the print statements and I am getting the error
`syntaxError: 'return ' outside function``
Any assistance would be grateful.

How to enumerate some text files using python?

I want to make a program to automate excel task using Openpyxl. I am using "enumerate" to open some text files and then auto input to excel file.
import os
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
for i, filename in enumerate(filenames):
file = open(filename, 'r')
text = file.read().replace('\u2014', '-').replace('â€”', '-')
start = 0
startcheck = True
end = 0
endcheck = True
for idx, letter in enumerate(text):
if text[idx:idx+4] == 'NPWP' and startcheck:
start = idx + 7
startcheck = False
if text[idx:idx+7] == 'Pembeli' and endcheck:
end = idx
endcheck = False
data = text[start:end]
from openpyxl import load_workbook
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
wb.save(filename = r'F:\tes\Form 1771.xlsx')
I've tried to open 2 text files but it only open and input 1 text file to excel. How do I code it to open multiple text files?

Relocate some of the statements and ensure statements are in the appropriate loops (correct indentation). I have not tested this but it looks correct.
First all imports and setup at the beginning.
import os
from openpyxl import load_workbook
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
Then start the iteration.
# outer loop
for i, filename in enumerate(filenames):
file = open(filename, 'r')
text = file.read().replace('\u2014', '-').replace('â€”', '-')
# explicitly close the file
file.close()
start = 0
startcheck = True
end = 0
endcheck = True
# find the data
# inner loop
for idx, letter in enumerate(text):
if text[idx:idx+4] == 'NPWP' and startcheck:
start = idx + 7
startcheck = False
if text[idx:idx+7] == 'Pembeli' and endcheck:
end = idx
endcheck = False
# ensure this section in the outer loop
data = text[start:end]
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
Finally save the workbook. Indentation ensures it is saved after all data has been written to it.
wb.save(filename = r'F:\tes\Form 1771.xlsx')
It is probably best to open a file using the with keyword which will ensure that the file is closed.
with open(filename, 'r') as f:
text = f.read().replace('\u2014', '-').replace('â€”', '-')
In your example you iterate over each character in the file using enumerate to find the index of the start and end of your data, text[idx:idx+4] == 'NPWP'.
strings have a find method that will do that for you.
start = text.find('NPWP')
end = text.find('Pembeli', start)
data = text[start:end]
With these changes your code would look like this:
import os
from openpyxl import load_workbook
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
for i, filename in enumerate(filenames):
with open(filename, 'r') as f:
text = f.read().replace('\u2014', '-').replace('â€”', '-')
start = text.find('NPWP')
end = text.find('Pembeli', start)
data = text[start:end]
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
wb.save(filename = r'F:\tes\Form 1771.xlsx')

Exporting data from Word tables to Excel using Python

I have thousands of word documents that have one table on the first page with information that I need outputted into Excel. So far I have this, but I'm not sure why it's not working
import win32com.client as win32
import os
myDir = r'C:\Projects\Capital'
XL = win32.Dispatch('Excel.Application')
XL.Visible = 1
XLbook = XL.Workbooks.Open(os.path.join(myDir,'Dealflow.xlsx'))
XLsheet = XLbook.Worksheets(1)
XLrow = 2
for myFile in os.listdir(myDir):
filepath = os.path.join(myDir,myFile)
filename = os.path.splitext(myFile)[0]
ext = os.path.splitext(myFile)[1]
if ext == '.docx':
word = win32.Dispatch('Word.Application')
word.Visible = 1
word.Documents.Open(filepath)
doc = word.ActiveDocument
table = doc.Tables(1)
XLsheet.Cells(XLrow,1).Value = table.Cell(Row=1, Column=1).Range.Text
XLsheet.Cells(XLrow,2).Value = table.Cell(Row=2, Column=3).Range.Text
XLrow = XLrow +1
doc.Close()

PyPdf2 extracting text with n in front of certain letters

This may just be due to PyPdf2's extract text function but when I run the code below in order to rename the files, a lot of the most common words come out like "Nthe", "Nfrom" and "Ncommunications". I'm not sure what I can do to stop this happening or alternatively how to work around it.
What causes a problem like this?
Where are the N's coming from?
Other PDFs do perfectly what I want so I'm not sure where to go from here.
import PyPDF2
import re
from collections import Counter
import os.path
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
for file in files:
pdfFileObj = open('{0}'.format(file), 'rb') #Open the File
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) #Read the file
frequency = {} #Create dict
ignore = {'the','a','if','in','it','of','or','and','for','can','that','are','this','your','you','will','not','have','its','with','need','has','from','more'} #Ignore dese ones
print "Number of Pages %s " % pdfReader.numPages #Print Num Pages
word_list = []
for i in range(pdfReader.numPages):
pageObj = pdfReader.getPage(i) # Get the first page
word_list.append(pageObj.extractText()) #Add the pages together
match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(word_list)) #Find the text
cnt = Counter()
for words in match_pattern: #Start counting the frequency
words.lower() # Lower Case Words
if words not in ignore: #Ignore common words
count = frequency.get(words,0) #Start the count?
frequency[words] = count + 1 #Add one
fl = sorted(frequency, key=frequency.__getitem__, reverse = True)[:3] #Sort according to frequency
pdfFileObj.close() #Close the PDF
newtitle = ' '.join(map(str,fl, )).title() #Join the title list together
try:
print newtitle #Print the title
os.rename('{0}'.format(file), '{0}.pdf'.format(newtitle))#Rename the file
except:
print "Unable to Rename File"

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

extracting text from multiple pdf files from a folder in python - python

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

Separate large pdf document into small pages based on text

How to enumerate some text files using python?

Exporting data from Word tables to Excel using Python

PyPdf2 extracting text with n in front of certain letters

Categories

Resources