Just trying to work out how to mark something as a fail in PyPDF2 if there is a match on any page of a PDF document. I have been using the code below, which I have partly recycled and partly built myself.
The problem is that it prints Fail for every single line, which I don't need. I am trying to change it to print Fail only once if there are no matches on any page.
import PyPDF2
import re
import os
#create filereader object to read the PDF using PyPDF2
object = PyPDF2.PdfFileReader("shopping.pdf")
NumPages = object.getNumPages()
print(f"This document has {NumPages} pages")
for i in range(0, NumPages):
    page = object.getPage(i)
    text = page.extractText()
    for line in text.splitlines():
        if re.match('milk', line):
            print("Pass the keyword is matched on page " + str(i), ": " + line)
        else:
            print("Fail")
re.match only returns a match if it exists at the beginning of a string. What you're probably looking for is re.search
Documentation: https://docs.python.org/3/library/re.html#search-vs-match
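For example, a quick sketch of the difference:
import re

line = "buy milk today"
print(re.match('milk', line))   # None - 'milk' is not at the start of the string
print(re.search('milk', line))  # match object - 'milk' occurs somewhere in the string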
The solution is storing the matches in a list instead of printing an immediate result.
The print should be done only after reading the whole file:
# [...]
loi = [] # Lines of Interest
for i in range(0, NumPages):
    page = object.getPage(i)
    text = page.extractText()
    for line in text.splitlines():
        if re.search('milk', line): # search instead of match, so the keyword can be anywhere in the line
            loi.append(f'{i}:{line}')
# Result
if len(loi) > 0: # or greater than a threshold
    print('Pass. The keyword is matched on the following pages:')
    print('\n'.join(loi))
else:
    print('Fail.')
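If you only need a single Pass/Fail verdict and not the matching lines, a flag via any() avoids collecting the list at all; a minimal sketch reusing the object reader and NumPages from the question:

found = any(
    re.search('milk', line)
    for i in range(NumPages)
    for line in object.getPage(i).extractText().splitlines()
)
print('Pass' if found else 'Fail')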
I am searching for some specific strings, but I find matches in places where they shouldn't be, because I convert the PDF files to .txt via HTML. I found a pattern that lets me reach the string I don't want.
The string I want to delete is "6\n5\n4\n3\n2\n1\nD\nC\nB\nA". I am looking for "\nC\n" in some other text block, not here. So if I find the pattern "\nC\nB", I can delete it, but I can't figure out how. I wrote the inplace_change function below, but I got an error message like this:
"subprocess.CalledProcessError: Command '['java', '-jar', 'C:\Users\Kronos\AppData\Roaming\Python\Python310\site-packages\tabula\tabula-1.0.5-jar-with-dependencies.jar', '--pages', '1', '--guess', '--format', 'JSON', '383026_C.pdf']' returned non-zero exit status 1."
def findWordInText(name):
    a = name + ".txt"
    count = 0
    with open(a, 'r', encoding='utf-8', errors="ignore") as f:
        line = f.read()
    i = 0
    if r"\nC\n" in line:
        String = r"\nC\n"
        ch = "B"
        if (String + ch) in line:
            print('Need to remove')
            txt = String + ch
            # the file is already closed here, so inplace_change can rewrite it safely
            inplace_change(a, txt, "removed")
    with open(a, 'r', encoding='utf-8', errors="ignore") as f:
        line = f.read()
        if r"\nC\n" in line:
            txt = "C"
            writeOnExcel(name, txt) # writeOnExcel is defined elsewhere in my code
            count += 1
def inplace_change(file, old, new):
    with open(file) as f:
        s = f.read()
    if old not in s:
        print(f'"{old}" not found in {file}.')
        return
    with open(file, 'w') as f:
        print(f'Changing "{old}" to "{new}" in {file}')
        s = s.replace(old, new)
        f.write(s)
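For reference, a hypothetical call matching the question's pattern (the file name here is invented for illustration):

inplace_change("383026_C.txt", r"\nC\nB", "removed")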
If you have a string with a lot of newline characters, each followed by a character like a number or letter, you can use a regex to substitute them with an empty string. Instead of going through it line by line, you can process them all at once, like this:
a = r"""python
\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz
\n1\n2\n3\n4\n5\n6\n7\n8\n9\n0Message Here\na\nb\nc\nd\ne
\nA\nB\nC\nD\nE\nF\nG\nH\nI\nJ\nK\nL\nM\nN\nO\nPSome Other Message\nQ\nR\nS\nT\nU\nV\nW\nX\nY\nZ
\n!\n#\n$\n%\n^\n&\n*\n(\n)\n_\n-\n=\n+\n{\n}\n[\n]\n|\n;\n:\n'\n,\n.\n<\n>\n?\n/\n`\n~
"""
# regex to clear \n followed by a character
import re
a = re.sub(r"\\n[^\\]", "", a)
print(a)
Output:
Message Here
Some Other Message
In the above PDF file, my code has to extract keywords and table names like Table 1 and Table 2, and titles in bold letters like INTRODUCTION and CASE PRESENTATION, from all pages of the given PDF.
I wrote a small program to extract text from the PDF file:
punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
stop_words = stopwords.words('english')
keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
print(keywords)
and the output I got was as below.
From the above output, how do I extract keywords like INTRODUCTION, CASE PRESENTATION, and Table 1, along with the page number, and save them in an output file?
Output Format
INTRODUCTION in Page 1
CASE PRESENTATION in Page 3
Table 1 (Descriptive Statistics) in Page 5
Need help in obtaining output of this format.
Code
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def main():
    file_name = open("Test1.pdf", "rb")
    readpdf = PyPDF2.PdfFileReader(file_name)

    #Parse through each page to extract the text
    pdfPages = readpdf.numPages
    count = 0
    text = ""
    print()

    #The while loop will read each page.
    while count < pdfPages:
        pageObj = readpdf.getPage(count)
        count += 1
        text += pageObj.extractText()

    #This if statement exists to check if the above library returned words.
    #It's done because PyPDF2 cannot read scanned files.
    if text != "":
        text = text
    #If the above returns as False, we run the OCR library textract to
    #convert scanned/image-based PDF files into text.
    else:
        text = textract.process(fileurl, method='tesseract', language='eng') # fileurl is defined elsewhere in the original script

    #PRINT THE TEXT EXTRACTED FROM THE GIVEN PDF
    #print(text)

    #The function will break the text into individual words
    tokens = word_tokenize(text)
    #print('TOKENS')
    #print(tokens)

    #Clean out the punctuation that is not required.
    punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
    stop_words = stopwords.words('english')
    keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
    print(keywords)
If you want information about the page on which some text appears, then you shouldn't add everything to one string; you should work with every page separately (in a for-loop).
It could be something similar to this. It is code without tesseract, which would need a method to split the PDF into separate pages, and it works with every page separately:
pdfPages = readpdf.numPages

# create these before the loop
punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
stop_words = stopwords.words('english')

#all_pages = []

# work with every page separately
for count in range(pdfPages):
    pageObj = readpdf.getPage(count)
    page_text = pageObj.extractText()
    page_tokens = word_tokenize(page_text)
    page_keywords = [word for word in page_tokens if not word in stop_words and not word in punctuations]
    page_uppercase_words = [word for word in page_keywords if word.isupper()]

    #all_pages.append( (count, page_keywords, page_uppercase_words) )

    print('page:', count)
    print('keywords:', page_keywords)
    print('uppercase:', page_uppercase_words)

    # TODO: append/save page to file
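One way to fill in that TODO is to append each page's results to a text file inside the for-loop; a minimal sketch, assuming an output file name of results.txt:

    with open('results.txt', 'a', encoding='utf-8') as out:
        out.write(f'page: {count}\n')
        out.write('uppercase: ' + ', '.join(page_uppercase_words) + '\n')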
Issue partially resolved here: https://github.com/konfuzio-ai/document-ai-python-sdk/issues/6#issue-876036328
Check: https://github.com/konfuzio-ai/document-ai-python-sdk
# pip install konfuzio_sdk
# in working directory
# konfuzio_sdk init
from konfuzio_sdk.api import get_document_annotations
document_first_annotation = get_document_annotations(document_id=1111)[0]
page_index = document_first_annotation['bbox']['page_index']
keyword = document_first_annotation['offset_string']
The Annotation object in the Konfuzio SDK allows direct access to the keyword string but, at the moment, not directly to the page index. This attribute will be added soon.
An example of accessing the first annotation in the first training document of your project would be:
# pip install konfuzio_sdk
# in working directory
# konfuzio_sdk init
from konfuzio_sdk.data import Project
my_project = Project()
annotations_first_doc = my_project.documents[0].annotations()
first_annotation = annotations_first_doc[0]
keyword = first_annotation.offset_string
import PyPDF2
import pandas
import numpy
import re
import os, sys
import nltk
import fitz
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']

def main():
    file_name = open("File1.pdf", "rb")
    readPDF = PyPDF2.PdfFileReader(file_name)
    call_function(file_name, readPDF)

def call_function(fname, readpdf):
    pdfPages = readpdf.numPages
    doc_name = fitz.open(fname.name) # open the same file with PyMuPDF once, outside the loop
    for pageno in range(pdfPages):
        page = word_tokenize(doc_name[pageno].get_text())
        page_texts = [word for word in page if not word in stop_words and not word in punctuations]
        print('Page Number:', pageno)
        print('Page Texts :', page_texts)

if __name__ == "__main__":
    main()
I'm trying to update a program to pull/read 10-K HTML filings and am getting a FileNotFoundError. The error is thrown during the readHTML function. It looks like the FileName parameter is looking for a path to the Form10KName column, when it should be looking at the FileName column. I have no idea why this is happening; any help?
Here is the error:
File "C:/Users/crabtreec/Downloads/4_ReadHTML.py", line 105, in <module>
main()
File "C:/Users/crabtreec/Downloads/4_ReadHTML.py", line 92, in main
match=readHTML(FileName)
File "C:/Users/crabtreec/Downloads/4_ReadHTML.py", line 18, in readHTML
input_file = open(input_path,'r+')
FileNotFoundError: [Errno 2] No such file or directory: './HTML/a10-k20189292018.htm'
And here is what I'm running.
from bs4 import BeautifulSoup #<---- Need to install this package manually using pip
from urllib.request import urlopen
import csv
import os
import re

os.chdir('C:/Users/crabtreec/Downloads/') # The location of the file "CompanyList.csv"

htmlSubPath = "./HTML/" #<===The subfolder with the 10-K files in HTML format
txtSubPath = "./txt/" #<===The subfolder with the extracted text files
DownloadLogFile = "10kDownloadLog.csv" #a csv file (output of the 3DownloadHTML.py script) with the download history of 10-K forms
ReadLogFile = "10kReadlog.csv" #a csv file (output of the current script) showing whether item 1 is successfully extracted from 10-K forms
def readHTML(FileName):
    input_path = htmlSubPath + FileName
    output_path = txtSubPath + FileName.replace(".htm", ".txt")
    input_file = open(input_path, 'r+')
    page = input_file.read() #<===Read the HTML file into Python

    #Pre-processing the html content by removing extra white space and combining them into one line.
    page = page.strip() #<=== remove white space at the beginning and end
    page = page.replace('\n', ' ') #<===replace the \n (new line) character with space
    page = page.replace('\r', '') #<===replace the \r (carriage return - if you're on Windows) with space
    page = page.replace('&nbsp;', ' ') #<===replace "&nbsp;" (a special character for space in HTML) with space
    page = page.replace('&#160;', ' ') #<===replace "&#160;" (a special character for space in HTML) with space
    while '  ' in page:
        page = page.replace('  ', ' ') #<===remove extra space

    #Using regular expression to extract texts that match a pattern
    #Define pattern for regular expression.
    #The following patterns find ITEM 1 and ITEM 1A as displayed as subtitles
    #(.+?) represents everything between the two subtitles
    #If you want to extract something else, here is what you should change
    #Define a list of potential patterns to find ITEM 1 and ITEM 1A as subtitles
    regexs = ('bold;\">\s*Item 1\.(.+?)bold;\">\s*Item 1A\.', #<===pattern 1: with an attribute bold before the item subtitle
              'b>\s*Item 1\.(.+?)b>\s*Item 1A\.', #<===pattern 2: with a tag <b> before the item subtitle
              'Item 1\.\s*<\/b>(.+?)Item 1A\.\s*<\/b>', #<===pattern 3: with a tag </b> after the item subtitle
              'Item 1\.\s*Business\.\s*<\/b(.+?)Item 1A\.\s*Risk Factors\.\s*<\/b') #<===pattern 4: with a tag </b> after the item+description subtitle

    #Now we try to see if a match can be found...
    for regex in regexs:
        match = re.search(regex, page, flags=re.IGNORECASE) #<===search for the pattern in HTML using re.search from the re package. Ignore case.
        #If a match exists....
        if match:
            #Now we have the extracted content still in an HTML format
            #We now turn it into a beautiful soup object
            #so that we can remove the html tags and only keep the texts
            soup = BeautifulSoup(match.group(1), "html.parser") #<=== match.group(1) returns the texts inside the parentheses (.+?)
            #soup.text removes the html tags and only keeps the texts
            rawText = soup.text.encode('utf8') #<=== encode the unicode text
            #remove space at the beginning and end and the subtitle "business" at the beginning
            #^ matches the beginning of the text
            outText = re.sub("^business\s*", "", rawText.strip(), flags=re.IGNORECASE)
            output_file = open(output_path, "w")
            output_file.write(outText)
            output_file.close()
            break #<=== if a match is found, we break the for loop. Otherwise the for loop continues
    input_file.close()
    return match
def main():
    if not os.path.isdir(txtSubPath): ### <=== keep all text files in this subfolder
        os.makedirs(txtSubPath)
    csvFile = open(DownloadLogFile, "r") #<===A csv file with the list of 10k file names (the file should have no header)
    csvReader = csv.reader(csvFile, delimiter=",")
    csvData = list(csvReader)
    logFile = open(ReadLogFile, "a+") #<===A log file to track which file is successfully extracted
    logWriter = csv.writer(logFile, quoting=csv.QUOTE_NONNUMERIC)
    logWriter.writerow(["filename", "extracted"])
    i = 1
    for rowData in csvData[1:]:
        if len(rowData):
            FileName = rowData[7]
            if ".htm" in FileName:
                match = readHTML(FileName)
                if match:
                    logWriter.writerow([FileName, "yes"])
                else:
                    logWriter.writerow([FileName, "no"])
                i = i + 1
    csvFile.close()
    logFile.close()
    print("done!")

if __name__ == "__main__":
    main()
CSV of file info
Your error message shows that it is not looking inside the "HTML" directory for the file.
I would avoid using os.chdir to change the working directory; it is likely to complicate things. Instead, use pathlib and join paths correctly to make file paths less error prone.
Try with this:
from pathlib import Path

base_dir = Path('C:/Users/crabtreec/Downloads/') # The location of the file "CompanyList.csv"
htmlSubPath = base_dir.joinpath("HTML") #<===The subfolder with the 10-K files in HTML format
txtSubPath = base_dir.joinpath("txt") #<===The subfolder with the extracted text files
DownloadLogFile = "10kDownloadLog.csv" #a csv file (output of the 3DownloadHTML.py script) with the download history of 10-K forms
ReadLogFile = "10kReadlog.csv" #a csv file (output of the current script) showing whether item 1 is successfully extracted from 10-K forms

def readHTML(FileName):
    input_path = htmlSubPath.joinpath(FileName)
    output_path = txtSubPath.joinpath(FileName.replace(".htm", ".txt"))
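Since Python 3.6 the built-in open() accepts Path objects, so the rest of readHTML can stay unchanged. If you want to fail more gracefully than with a raised FileNotFoundError, you could also check the path first (an optional addition, not required for the fix):

    if not input_path.exists():
        print(f"Missing input file: {input_path}")
        return None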
I have a text file (heavily modified for this example) which has some data that I want to extract and do some calculations with it. However the text file is extremely messy, so I'm trying to clean it up and write it out to new files first.
Here is the .txt file I'm working with: http://textuploader.com/5elql
I am trying to extract the data which is under the titles (called “Important title”). The only possible way to do that is to first locate a string which always occurs in the file, called “DATASET”, because the mess above and below the important data covers an arbitrary number of lines and is difficult to remove manually. Once that’s done, I want to store the data in separate files so that it is easier to analyse, like this:
http://textuploader.com/5elqw
The file names will be concatenated with the title + the date.
Here is what I have tried so far
with open("example.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
OUTPUT
IMPORTANT TITLE 1
DATE: 12/30/2015
IMPORTANT TITLE 2
DATE: 01/03/2016
So it appears my loop manages to locate the lines where the titles are inside the file and print them out. But this is where I run out of steam. I have no idea how to extract the data under those titles from there onwards. I have tried using file.readlines(), but it outputs all the mess that is in between Important Title 1 and Important Title 2.
Any advice on how I can read all the data under the titles and output them into separate files? Thanks for your time.
You could use regex.
import re

pattern = r"(\s+X\s+Y\s*)|(\s*\d+\s+\d+\s*)"
prog = re.compile(pattern)

with open("example.txt") as file:
    cur_filename = ''
    content = ""
    for line in file:
        if line.startswith('DATASET:'):
            fileTitle = line[9:]
        elif line.startswith("DATE:"):
            fileDate = line[6:]
            cur_filename = (fileTitle.strip() + fileDate.strip()).replace('/', '-')
            print(cur_filename)
            content_title = fileTitle + line
        elif prog.match(line):
            content += line
        elif cur_filename and content:
            with open(cur_filename, 'w') as fp:
                fp.write(content_title)
                fp.write(content)
            cur_filename = ''
            content = ''
I don't know exactly how you want to store your data, but assuming you want a dictionary, you could use regex to check if the incoming line matches the pattern; then, because fileTitle isn't global, you could use it as the key and add the values under it. I also added rstrip('\r\n') to remove the newline characters after fileTitle.
import re

# if you don't want to store the X and Y, just use re.compile('\d\s+\d+')
p = re.compile('(\d\s+\d+)|(X\s+Y)')
data = {}
with open("input.txt") as file:
    for line in file:
        if line.startswith('DATASET:'):
            fileTitle = line[9:].rstrip('\r\n')
        if line.startswith("DATE:"):
            fileDate = line[:]
            print(fileTitle + fileDate)
        if p.match(line):
            if fileTitle not in data:
                data[fileTitle] = []
            line = line.rstrip('\r\n')
            data[fileTitle].append(line.split('\t'))
            if len(data[fileTitle][len(data[fileTitle]) - 1]) == 3:
                data[fileTitle][len(data[fileTitle]) - 1].pop()
print(data)
Yet another regex solution:
import re

sep = '*************************\n'

pattern = r'DATASET[^%]*'
good_stuff = re.compile(pattern)

pattern = r'^DATASET: (.*?)$'
title = re.compile(pattern, flags=re.MULTILINE)

pattern = r'^DATE: (.*?)$'
date = re.compile(pattern, flags=re.MULTILINE)

with open(r'foo.txt') as f:
    data = f.read()

for match in good_stuff.finditer(data):
    data = match.group()
    important_title = title.search(data).group(1)
    important_date = date.search(data).group(1)
    important_date = important_date.replace(r'/', '-')
    fname = important_title + important_date + '.txt'
    print(sep, fname)
    print(data)
    ##with open(fname, 'w') as f:
    ##    f.write(data)
I have been trying to print the output to a new text file, but I get the error
TypeError: expected a character buffer object
What I'm trying to do is convert a PDF to text and copy the obtained text into a new file.
import pyPdf

def getPDFContent():
    content = ""
    # Load PDF into pyPDF
    pdf = pyPdf.PdfFileReader(file("D:\output.pdf", "rb"))
    # Iterate pages
    for i in range(0, pdf.getNumPages()):
        # Extract text from page and add to content
        #content += pdf.getPage(i).extractText() + "\n"
        print pdf.getPage(i).extractText().encode("ascii", "ignore")
    # Collapse whitespace
    #content = " ".join(content.replace(u"\xa0", " ").strip().split())
    #return content

#getPDFContent().encode("ascii", "ignore")
getPDFContent()
s = getPDFContent()
with open('D:\pdftxt.txt', 'w') as pdftxt:
    pdftxt.write(s)
I did try to initialize s as a str, but then I get the error "can't assign to function call".
You are not returning anything from getPDFContent(), so basically you are writing None.
result = []
for i in range(0, pdf.getNumPages()):
    result.append(pdf.getPage(i).extractText().encode("ascii", "ignore")) # store all pages in a list
return result

s = getPDFContent()
with open('D:\pdftxt.txt', 'w') as pdftxt:
    pdftxt.writelines(s) # use writelines to write list content
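Note that writelines does not insert newline characters between list items; if you want each page's text on its own line, joining is one option (an assumption about the desired layout):

pdftxt.write("\n".join(s))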
How your code should look:
def getPDFContent():
    # Load PDF into pyPDF
    pdf = pyPdf.PdfFileReader(file("D:\output.pdf", "rb"))
    # Iterate pages
    result = []
    for i in range(0, pdf.getNumPages()):
        result.append(pdf.getPage(i).extractText().encode("ascii", "ignore"))
    return result

s = getPDFContent()
with open('D:\pdftxt.txt', 'w') as pdftxt:
    pdftxt.writelines(s)