extracting text from multiple pdf files from a folder in python - python

I am trying to extract text from multiple pdf files which will serve as the knowledge base for a closed domain chatbot. I used this code
import pandas as pd
import PyPDF2
import glob
pdf_dir = "C:/Users/Arush/OneDrive/Desktop/sample"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
output_data = pd.DataFrame(index = [0], columns = ['FileName','Text'])
fileIndex = 0
for file in pdf_files:
pdfFileObj = open(file,'rb') #'rb' for read binary mode
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
startPage = 0
text = ''
cleanText = ''
while startPage <= pdfReader.numPages -1:
pageObj = pdfReader.getPage(startPage)
text += pageObj.extractText()
startPage += 1
pdfFileObj.close()
for myWord in text:
if myWord != '\n':
cleanText += myWord
text = cleanText.split()
newRow = pd.DataFrame(index = [0], columns = ['FileName', 'Text'])
newRow.iloc[0]['FileName'] = file
newRow.iloc[0]['Text'] = text
output_data = pd.concat([output_data, newRow], ignore_index=True)
and getting data in symbols only
FileName Text
0 NaN NaN
1 C:/Users/Arush/OneDrive/Desktop/sample\Introdu... [Andreas, C.Müller, &, Sarah, Guido˜˚˛˝˙ˆˇ˘˛˙...
2 C:/Users/Arush/OneDrive/Desktop/sample\Machine... [áâáâÞ;áâáâÞ;;áâáâáâáâç...
moreover I think it only fetching 1 page
can you guy please help me ?

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
directory = "C:\\Users\\Docs\\"
fname = os.path.splitext(os.path.basename(pdf_path))[0]
print(fname)
object = PdfFileReader(pdf_path)
NumPages = object.getNumPages()
print(NumPages)
string = "Challenge 1:"
string2 = "Challenge 2:"
res=0
pageList=[]
for txt in range(0,NumPages):
pdf_writer = PdfFileWriter()
pageObject = object.getPage(txt)
Text = pageObject.extractText()
print(Text)
acc_pos = Text.find(string)
print(acc_pos)
Cur_pos = Text.find(string2)
print(Cur_pos)
loanAcctName = Text[acc_pos+12:Cur_pos]
print (loanAcctName)
# pageList.append(txt)
# print(pageList)
ReSearch = re.search(string, Text)
if ReSearch != None:
pageList.append(txt)
elif ReSearch ==None:
pageList.append(txt)
print(pageList)
res = res + 1
pdf_writer.addPage(object.getPage(page_num))
output_filename = '{}_page_{}.pdf'.format(loanAcctName,page + 1)
with open(output_filename, 'wb') as out:
pdf_writer.write(out)
print('Created: {}'.format(output_filename))
out.close()
res = res + 1
if __name__ == '__main__':
pdf_path = r"C:\Users\FY22.pdf"
pdf_splitter(pdf_path)

Separate large pdf document into small pages based on text

New to python scripting. I need to Separate a large PDF into individual pdfs based on the text I have tried.
def getPagebreakList(file_name: str)->list:
import os
from glob import glob
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf_file = PyPDF2.PdfFileReader(file_name)
num_pages = pdf_file.getNumPages()
page_breaks = list()
for i in range(0, num_pages):
Page = file.getPage(i)
Text = PageObj.extractText()
if re.search(r"tool used", Text):
page_breaks.append(i)
return page_breaks
inputpdf = PdfFileReader(open('couch.pdf', "rb"))
num_pages = inputpdf.numPages
page_breaks = getPagebreakList('yourPDF.pdf')
i = 0
while (i < num_pages):
if page_breaks:
page_break = page_breaks.pop(0)
else:
page_break = num_pages
output = PdfFileWriter()
while (i != page_break + 1):
output.addPage(inputpdf.getPage(i))
i = i + 1
with open("couch%s.pdf" % i, "wb") as outputStream:
output.write(outputStream)
I know I'm missing the print statements and I am getting the error
`syntaxError: 'return ' outside function``
Any assistance would be grateful.

How to enumerate some text files using python?

I want to make a program to automate excel task using Openpyxl. I am using "enumerate" to open some text files and then auto input to excel file.
import os
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
for i, filename in enumerate(filenames):
file = open(filename, 'r')
text = file.read().replace('\u2014', '-').replace('—', '-')
start = 0
startcheck = True
end = 0
endcheck = True
for idx, letter in enumerate(text):
if text[idx:idx+4] == 'NPWP' and startcheck:
start = idx + 7
startcheck = False
if text[idx:idx+7] == 'Pembeli' and endcheck:
end = idx
endcheck = False
data = text[start:end]
from openpyxl import load_workbook
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
wb.save(filename = r'F:\tes\Form 1771.xlsx')
I've tried to open 2 text files but it only open and input 1 text file to excel. How do I code it to open multiple text files?
Relocate some of the statements and ensure statements are in the appropriate loops (correct indentation). I have not tested this but it looks correct.
First all imports and setup at the beginning.
import os
from openpyxl import load_workbook
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
Then start the iteration.
# outer loop
for i, filename in enumerate(filenames):
file = open(filename, 'r')
text = file.read().replace('\u2014', '-').replace('—', '-')
# explicitly close the file
file.close()
start = 0
startcheck = True
end = 0
endcheck = True
# find the data
# inner loop
for idx, letter in enumerate(text):
if text[idx:idx+4] == 'NPWP' and startcheck:
start = idx + 7
startcheck = False
if text[idx:idx+7] == 'Pembeli' and endcheck:
end = idx
endcheck = False
# ensure this section in the outer loop
data = text[start:end]
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
Finally save the workbook. Indentation ensures it is saved after all data has been written to it.
wb.save(filename = r'F:\tes\Form 1771.xlsx')
It is probably best to open a file using the with keyword which will ensure that the file is closed.
with open(filename, 'r') as f:
text = f.read().replace('\u2014', '-').replace('—', '-')
In your example you iterate over each character in the file using enumerate to find the index of the start and end of your data, text[idx:idx+4] == 'NPWP'.
strings have a find method that will do that for you.
start = text.find('NPWP')
end = text.find('Pembeli', start)
data = text[start:end]
With these changes your code would look like this:
import os
from openpyxl import load_workbook
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
for i, filename in enumerate(filenames):
with open(filename, 'r') as f:
text = f.read().replace('\u2014', '-').replace('—', '-')
start = text.find('NPWP')
end = text.find('Pembeli', start)
data = text[start:end]
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
wb.save(filename = r'F:\tes\Form 1771.xlsx')

Exporting data from Word tables to Excel using Python

I have thousands of word documents that have one table on the first page with information that I need outputted into Excel. So far I have this, but I'm not sure why it's not working
import win32com.client as win32
import os
myDir = r'C:\Projects\Capital'
XL = win32.Dispatch('Excel.Application')
XL.Visible = 1
XLbook = XL.Workbooks.Open(os.path.join(myDir,'Dealflow.xlsx'))
XLsheet = XLbook.Worksheets(1)
XLrow = 2
for myFile in os.listdir(myDir):
filepath = os.path.join(myDir,myFile)
filename = os.path.splitext(myFile)[0]
ext = os.path.splitext(myFile)[1]
if ext == '.docx':
word = win32.Dispatch('Word.Application')
word.Visible = 1
word.Documents.Open(filepath)
doc = word.ActiveDocument
table = doc.Tables(1)
XLsheet.Cells(XLrow,1).Value = table.Cell(Row=1, Column=1).Range.Text
XLsheet.Cells(XLrow,2).Value = table.Cell(Row=2, Column=3).Range.Text
XLrow = XLrow +1
doc.Close()

PyPdf2 extracting text with n in front of certain letters

This may just be due to PyPdf2's extract text function but when I run the code below in order to rename the files, a lot of the most common words come out like "Nthe", "Nfrom" and "Ncommunications". I'm not sure what I can do to stop this happening or alternatively how to work around it.
What causes a problem like this?
Where are the N's coming from?
Other PDFs do perfectly what I want so I'm not sure where to go from here.
import PyPDF2
import re
from collections import Counter
import os.path
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
for file in files:
pdfFileObj = open('{0}'.format(file), 'rb') #Open the File
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) #Read the file
frequency = {} #Create dict
ignore = {'the','a','if','in','it','of','or','and','for','can','that','are','this','your','you','will','not','have','its','with','need','has','from','more'} #Ignore dese ones
print "Number of Pages %s " % pdfReader.numPages #Print Num Pages
word_list = []
for i in range(pdfReader.numPages):
pageObj = pdfReader.getPage(i) # Get the first page
word_list.append(pageObj.extractText()) #Add the pages together
match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(word_list)) #Find the text
cnt = Counter()
for words in match_pattern: #Start counting the frequency
words.lower() # Lower Case Words
if words not in ignore: #Ignore common words
count = frequency.get(words,0) #Start the count?
frequency[words] = count + 1 #Add one
fl = sorted(frequency, key=frequency.__getitem__, reverse = True)[:3] #Sort according to frequency
pdfFileObj.close() #Close the PDF
newtitle = ' '.join(map(str,fl, )).title() #Join the title list together
try:
print newtitle #Print the title
os.rename('{0}'.format(file), '{0}.pdf'.format(newtitle))#Rename the file
except:
print "Unable to Rename File"

Categories