Find text position in PDF file - python

I have a PDF file and I am trying to find a specific text in the PDF and highlight it using Python.
I found PyPDF2, which can highlight part of a PDF when given the coordinates of the desired highlight position in the file.
Now I am trying to find a tool that can give me the position of a given piece of text in the PDF.

PyMuPDF can find text and return its coordinates. You can use this in conjunction with the PyPDF2 highlighting method to accomplish what you're describing, or you can just use PyMuPDF to highlight the text.
Here is sample code for finding text and highlighting with PyMuPDF:
import fitz

### READ IN PDF
doc = fitz.open("input.pdf")

for page in doc:
    ### SEARCH
    text = "Sample text"
    text_instances = page.search_for(text)

    ### HIGHLIGHT
    for inst in text_instances:
        highlight = page.add_highlight_annot(inst)
        highlight.update()

### OUTPUT
doc.save("output.pdf", garbage=4, deflate=True, clean=True)
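If you only need the position rather than the highlight, each item returned by search_for() is a fitz.Rect with x0, y0, x1, y1 attributes. A minimal sketch (file name and search string are placeholders):

import fitz

doc = fitz.open("input.pdf")  # placeholder file name
for page_number, page in enumerate(doc, start=1):
    # search_for() returns one fitz.Rect per occurrence of the text
    for rect in page.search_for("Sample text"):
        print(f"page {page_number}: x0={rect.x0}, y0={rect.y0}, x1={rect.x1}, y1={rect.y1}")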

With newer versions of PyMuPDF, some methods have been deprecated.
Here is sample code for the recent version. I have also added a comment to each highlight, which makes it easier for the user to navigate the annotations.
import fitz

pdfIn = fitz.open("page-4.pdf")

for page in pdfIn:
    print(page)
    texts = ["SEPA", "voorstelnummer"]
    text_instances = [page.search_for(text) for text in texts]

    # coordinates of each word found on the PDF page
    print(text_instances)

    # iterate through each instance for highlighting
    for inst in text_instances:
        annot = page.add_highlight_annot(inst)
        # annot = page.add_rect_annot(inst)

        # add a comment to the highlighted text
        info = annot.info
        info["title"] = "word_diffs"
        info["content"] = "diffs"
        annot.set_info(info)
        annot.update()

# save the PDF output
pdfIn.save("page-4_output.pdf")

If you are on Windows and have Acrobat Pro (not Reader), you can try the old Component Object Model (COM) interface from Python or VBA.
import win32com.client.gencache
import winerror, os
from win32com.client.dynamic import ERRORS_BAD_CONTEXT

src = r"C:\path\to\input.pdf"    # placeholder: path to the PDF to annotate
keyword = "Sample text"          # placeholder: text to search for

ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
win32com.client.gencache.EnsureModule('{E64169B3-3592-47d2-816E-602C5C13F328}', 0, 1, 1)

avDoc = win32com.client.DispatchEx('AcroExch.AVDoc')
avDoc.Open(src, src)
avDoc.BringToFront()
pdDoc = avDoc.GetPDDoc()
jsoObject = pdDoc.GetJSObject()

for pageNo in range(1):
    pdfPage = pdDoc.AcquirePage(pageNo)

    # select (up to) the first 9000 words of the page
    pageHL = win32com.client.DispatchEx('AcroExch.HiliteList')
    _ = pageHL.Add(0, 9000)
    pageSel = pdfPage.CreatePageHilite(pageHL)

    pdfText = ""
    for wordNo in range(pageSel.GetNumText()):
        word = pageSel.GetText(wordNo)
        pdfText += word

        if keyword in pdfText:
            # highlight the current word and add a square annotation over it
            wordToHl = win32com.client.DispatchEx('AcroExch.HiliteList')
            wordToHl.Add(wordNo, 1)
            wordHl = pdfPage.CreateWordHilite(wordToHl)
            rect = wordHl.GetBoundingRect()

            annot = jsoObject.AddAnnot()
            props = annot.GetProps()
            props.Type = "Square"
            props.Page = pageNo
            props.Hidden = False
            props.Lock = True
            props.Name = word
            props.NoView = False
            props.Opacity = 0.3
            props.ReadOnly = True
            props.Style = "S"
            props.ToggleNoView = False
            props.PopupOpen = False
            popupRect = [rect.Left - 5, rect.Top + 5, rect.Left + 40, rect.Top - 20]
            props.Rect = popupRect
            props.PopupRect = popupRect
            props.StrokeColor = jsoObject.Color.Red
            props.FillColor = jsoObject.Color.Yellow
            annot.SetProps(props)
            print(f'Found {keyword}')

Related

How to extract radiobutton / checkbox information with python from a pdf-file?

I would like to get the radio-button / checkbox information from a PDF document.
I had a look at pdfplumber and PyPDF2, but was not able to find a solution with these modules.
I can parse the text using the code below, but for the radio buttons I only get the text, with no information about which button (or checkbox) is selected.
import pdfplumber
import os
import sys

if __name__ == '__main__':
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    fn = os.path.join(path, "input.pdf")

    pdf = pdfplumber.open(fn)
    page = pdf.pages[0]
    text = page.extract_text()
I have also uploaded an example file here:
https://easyupload.io/8y8k2v
Is there any way to get this information from the pdf-file using python?
I think I found a solution using pdfplumber
(probably not elegant, but I can check whether the radio buttons are selected or not).
Generally:
- I read all chars and all curves for all pages
- then I sort all elements by x and y (to get the chars and curves in the same order as in the PDF)
- then I concatenate the chars and add blanks when the distance between the chars is larger than within a word
- I check the pts information of the curves to find out whether the radio button is selected or not
- the final lines and the yes/no information are stored in a list line by line for further processing
import pdfplumber
import os
import sys

path = os.path.abspath(os.path.dirname(sys.argv[0]))
fn = os.path.join(path, "input.pdf")
pdf = pdfplumber.open(fn)

finalContent = []
for idx, page in enumerate(pdf.pages, start=1):
    print(f"Reading page {idx}")

    # collect all chars and curves of the page together with their coordinates
    contList = []
    for e in page.chars:
        tmpRow = ["char", e["text"], e["x0"], e["y0"]]
        contList.append(tmpRow)
    for e in page.curves:
        tmpRow = ["curve", e["pts"], e["x0"], e["y0"]]
        contList.append(tmpRow)

    # sort by x, then by y (top to bottom), so the elements appear in reading order
    contList.sort(key=lambda x: x[2])
    contList.sort(key=lambda x: x[3], reverse=True)

    workContent = []
    workText = ""
    workDistCharX = False
    for e in contList:
        if e[0] == "char":
            # insert a separator when the gap to the previous char is too large
            if workDistCharX != False and \
                    (e[2] - workDistCharX > 20 or e[3] - workDistCharY < -2):
                workText += " / "
            workText += e[1]
            workDistCharX = e[2]
            workDistCharY = e[3]
            continue

        if e[0] == "curve":
            if workText != "":
                workContent.append(workText)
                workText = ""
            # the pts information tells whether the radio button is selected
            if e[1][0][0] < 100:
                tmpVal = "SELECT-YES"
            else:
                tmpVal = "SELECT-NO"
            workContent.append(f"CURVE {tmpVal}, None, None")

    finalContent.extend(workContent)
    workContent = "\n".join(workContent)
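A hedged usage sketch (assuming the layout above, where each "CURVE ..." line follows the text it belongs to) that pairs every curve marker with the preceding line:

# hypothetical post-processing of the finalContent list built above
for i, line in enumerate(finalContent):
    if line.startswith("CURVE"):
        label = finalContent[i - 1] if i > 0 else ""
        state = "selected" if "SELECT-YES" in line else "not selected"
        print(f"{label} -> {state}")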

How to change style of a title with python-docx?

I'm using python-docx, but I can't find any way to change the style of the title (from bold to normal). My code is:
import docx
from docx.shared import RGBColor
from docx.shared import Pt
from docx.dml.color import ColorFormat
from docx.enum.style import WD_STYLE_TYPE

# format only the filename as return text
def format_filename(fname):
    index = fname.rfind('\\')
    font.color.rgb = RGBColor(255, 0, 0)
    # IF statement for structuring the file name
    if index > 0:
        filename = fname[index + 1 : len(fname)]
    else:
        index = fname.rfind('/')
        filename = fname[index + 1 : len(fname)]
    return filename

# print the whole file into the docx
def print_file(file):
    font.bold = False
    font.color.rgb = RGBColor(0, 0, 0)
    cnt = 0
    fp = open(file, 'r')
    # read the whole file and use every single line
    for line in fp.readlines():
        cnt += 1
        # if it's the first line, add a paragraph
        if cnt == 1:
            paragraph = document.add_paragraph(line)
        # else continue the paragraph
        else:
            paragraph.add_run(line)

# open file
document = docx.Document()
filepath = '../cap1/prg1.txt'

# set the font in the paragraph
run = document.add_paragraph().add_run()
style = document.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(10.5)
font.bold = True
font.color.rgb = RGBColor(255, 0, 0)

# print the filename as a heading
filename = format_filename(filepath)  # helper procedure to format the filename
document.add_heading(filename, level=2)

# print the whole file
print_file(filepath)
document.save('my_cake_file.docx')
Here is how the title looks after coloring:
You need to update the color of the Heading 2 style. Here is an example:
import docx
# Create doc
document = docx.Document()
# Add black title
styles = document.styles
styles['Heading 2'].font.color.rgb = docx.shared.RGBColor(0, 0, 0)
document.add_heading('Title', level=2)
# Add text
paragraph = document.add_paragraph()
paragraph.add_run('text')
# Save file
document.save('output.docx')
Output:
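Since the question is specifically about turning the bold title into normal weight, the same styles-based approach can disable bold on the heading. A minimal sketch (assuming the title uses the built-in Heading 2 style):

import docx

document = docx.Document()

# Heading 2 controls the level=2 title; turning off bold here affects every
# heading added with that style (assumption: the title uses Heading 2)
heading_style = document.styles['Heading 2']
heading_style.font.bold = False
heading_style.font.color.rgb = docx.shared.RGBColor(0, 0, 0)

document.add_heading('Title', level=2)
document.save('output.docx')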

Finding a piece of information in a document and deleting everything before and after

I have some .docx files that are very specifically formatted.
I have copied the file 5 times, one copy for each of the 5 different strings that I need to be "found", with everything else removed.
#! python3
import docx
import os
import shutil
import readDocx as rD  # local helper module that exposes the selected file path

def delete_paragraph(paragraph):
    p = paragraph._element
    p.getparent().remove(p)
    p._p = p._element = None

# Select the file you want to work with
fP = rD.file

# get the working directory for the file
nfP = os.path.dirname(os.path.abspath(fP))
#print (nfP)

# Break the filepath into parts
fileSplit = fP.split('/')

# Get the filename only
fileCode = fileSplit[-1]
#print (fileCode)

# Separate the course code
nameSplit = fileCode.split(' ')
courseCode = nameSplit[0]
#print (courseCode)

# List of files that we need to create
a1 = "Assessment Summary"
a2 = "Back to Business project"
a3 = "Back to Business Checklist"
a4 = "Skills Demonstration"
a5 = "Skills Demonstration Checklist"
names = [a1, a2, a3, a4, a5]

# Creates a list for the new filenames to sit in
newFiles = []

# Creates the files from the original
for name in names:
    fileName = os.path.join(nfP + '\\' + courseCode + ' ' + str(name) + ' ' + 'Version 1.0' + '.docx')
    shutil.copy(fP, fileName)
    #print(fileName)
    newFiles.append(fileName)
#print (newFiles)

# Need to iterate through the files and start deleting data.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h3 = "Assessor Observation Checklist / Marking Guide"
h4 = "Skills Demonstration"
h5 = "Assessor Observation Checklist / Marking Guide"
This is where I start to fail with my limited skill. The h1-h5 variables represent the headings of the document sections that I want to keep.
How can I iterate through the document, find the heading and delete everything before / after these paragraphs?
I don't necessarily need the answer, just more of a "look in this direction".
Thanks
Try this. The comments explain what the code does.
from docx import Document  # the "python-docx" package needs to be installed to import this
import pandas as pd

# Read the document into a python-docx Document object
document = Document('Path/to/your/input/.docx/document')

# Initialize an empty dataframe to store the .docx document along with the style of each paragraph
document_text_dataframe = pd.DataFrame(columns=['para_text', 'style'])

# Iterate through the "document" object, extracting the paragraph texts along with their styles
# into the dataframe "document_text_dataframe"
for para in document.paragraphs:
    # Extract the paragraph style
    style = str(para.style.name)

    ##### Headings created with the NORMAL style but written in BOLD need to be detected as well -
    ##### ideally these represent headings too.
    runboldtext = ''
    for run in para.runs:
        if run.bold:
            runboldtext = runboldtext + run.text
    if runboldtext == str(para.text) and runboldtext != '':
        print("Bold True for:", runboldtext)
        style = 'Heading'
    #################################################################

    # Now append each paragraph along with its style to "document_text_dataframe"
    dftemp = pd.DataFrame({'para_text': [para.text], 'style': [style]})
    document_text_dataframe = document_text_dataframe.append(dftemp, sort=False)

document_text_dataframe = document_text_dataframe.reset_index(drop=True)

# Need to iterate through the files and start deleting data.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h3 = "Assessor Observation Checklist / Marking Guide"
h4 = "Skills Demonstration"
h5 = "Assessor Observation Checklist / Marking Guide"
h_list = [h1, h2, h3, h4]

# Initialize a list to store the extracted information relevant to each "h" value
extracted_content = []

for h in h_list:
    df_temp = pd.DataFrame(columns=['para_text', 'style'])

    ########### Loop through the document to extract the content related to each "h" value ######
    start_index = 0
    end_index = 0
    for index, row in document_text_dataframe.iterrows():
        if h == row['para_text']:
            print("Found match in document for: ", h)
            start_index = index
            print("Matching index=", index)
            break
    if start_index != 0:
        for i in range(start_index + 1, len(document_text_dataframe) - 1):
            if 'Heading' in document_text_dataframe.loc[i, 'style']:
                end_index = i
                break
    if end_index != 0:
        for i in range(start_index, end_index):
            df_temp = df_temp.append(document_text_dataframe.loc[i])
    ############################################################################################

    # Append every extracted block to the list "extracted_content"
    if start_index != 0 and end_index != 0:
        extracted_content.append(df_temp)

# The list "extracted_content" consists of dataframes. Each dataframe corresponds to the
# extracted information for one "h" value.
print(extracted_content)
Now, using extracted_content, you can write every entry in the list extracted_content to a separate .docx document using your code.
Cheers!
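A minimal sketch of that last step (assuming extracted_content from the answer and newFiles from the question's code, and assuming the i-th extracted block belongs in the i-th copied file):

from docx import Document

# hypothetical follow-up: write each extracted block into its own .docx file
for df, out_path in zip(extracted_content, newFiles):
    out_doc = Document()
    for _, row in df.iterrows():
        # the first row is the heading itself, the rest is the body text
        out_doc.add_paragraph(row['para_text'])
    out_doc.save(out_path)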

Excel List iteration

I am working on a text search project. I have 2 lists.
a = ['ibm', 'dell']
b = ['strength', 'keyword']  # this is a list of keywords given by the user
Now I create combinations for searching Google.
lst = list(itertools.product(a, b))
What I need help with is below:
Using the code below I search for the text with the different keywords and their lemmas. After that I need to write the searched text to an Excel file. I need to create worksheets named after the entries in list a and write only the searched text into those different worksheets. I am not able to figure this out. Below is part of my code.
# Python 2 code; the helper functions pdf_to_text(), words() and smart_str() are defined elsewhere
import re
import random
import mechanize
import bs4
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from readability import Document  # assumption: Document comes from readability-lxml
from textblob import TextBlob

def getarticle(url, n):
    final = []
    regex = '(.*).pdf'
    pattern = re.compile(regex)
    if re.match(pattern, url) is not None:
        text = pdf_to_text(url)
        final.append('')
        final.append(url)
        final.append(text)
        New_file = open('text' + str(round(random.random(), 2)) + '.txt', 'w+')
        New_file.write(smart_str(unicode(text, 'utf-8')))
        New_file.close()
    else:
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Chrome')]
        html = br.open(url).read()
        titles = br.title()
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        soup = bs4.BeautifulSoup(readable_article)
        Final_Article = soup.text
        final.append(titles)
        final.append(url)
        final.append(Final_Article)

        raw = nltk.clean_html(html)
        cleaned = re.sub(r'& ?(ld|rd)quo ?[;\]]', '\"', raw)
        tokens = nltk.wordpunct_tokenize(raw)
        lmtzr = WordNetLemmatizer()
        t = [lmtzr.lemmatize(t) for t in tokens]
        text = nltk.Text(t)

        word = words(n)
        find = ' '.join(str(e) for e in word)
        search_words = set(find.split(' '))
        sents = ' '.join([s.lower() for s in text])
        blob = TextBlob(sents.decode('ascii', 'ignore'))
        matches = [map(str, blob.sentences[i-1:i+2])   # from previous sentence to the one after next
                   for i, s in enumerate(blob.sentences)  # i is the index, s is the sentence
                   if search_words & set(s.words)]
        return ''.join(str(y).replace('& rdquo', '').replace('& rsquo', '') for y in matches)
This returns the text; now I need to write it to Excel files, which I am unable to code.
As far as writing text out to a file Excel can read is concerned, you might want to look at Python's csv library, which provides lots of useful .csv manipulation tools.
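A minimal csv-based sketch of that idea (assuming a and lst from the question, and a results dict, hypothetical here, mapping each (company, keyword) pair to the text returned by getarticle()):

import csv

# hypothetical sketch: one .csv file per company in list a, which Excel can open directly
for company in a:
    with open(company + '.csv', 'wb') as fh:  # 'wb' because the surrounding code is Python 2
        writer = csv.writer(fh)
        writer.writerow(['keyword', 'text'])
        for comp, keyword in lst:
            if comp == company:
                writer.writerow([keyword, results[(comp, keyword)]])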

Parsing GeoRSS feed with python scripting

I have an exam tomorrow and need to get my program.py file working. I need to parse a GeoRSS feed (https://www.tvfoodmaps.com/MVFN.xml to be specific) and collect the "Latitude", "Longitude", "Title" and "Description" attributes into separate lists. After creating these lists, I need to create a feature class that holds these points and their data in ArcMap. The script is to be run in ArcMap to map out the restaurant locations with their information.
Right now I'm stuck on getting all the data into the lists. The problem is in the title and description parts: it seems that if I get the Titles, then the Descriptions part doesn't run, and vice versa. Any help would be really appreciated! Here's what I have so far:
import os, urllib

# store the pathname to where you want to add the text file
#path = arcpy.GetParameterAsText(0)  # pathname to folder
#FullFCOutputPath = arcpy.GetParameterAsText(1)
path = "https://www.tvfoodmaps.com/MVFN.xml"

f = urllib.urlopen(path)
myfile = f.read()

lstFieldNames = ["Latitude", "Longitude", "Title", "Description"]

lstPoints = myfile.split('<georss:point>')
#print lstPoints[1]
Latitudes = []
Longitudes = []
for Gval in lstPoints:
    if Gval.find('</georss:point>') <> -1:
        LatPos1 = 0
        LatPos2 = Gval.index(' ')
        LonPos1 = Gval.index(' ') + 1
        LonPos2 = Gval.index('</georss:point>')
        Latitudes.append(Gval[LatPos1:LatPos2])
        Longitudes.append(Gval[LonPos1:LonPos2])

lstTitles = myfile.split('<item>')
Titles = []
Descriptions = []
#print lstTitles[1]
for Tval in lstTitles:
    if Tval.find('<item>') <> -1:
        TlePos1 = Tval.index('<title>') + 7
        TlePos2 = Tval.index('</title>')
        Title = Tval[TlePos1:TlePos2]
        Title = Title.replace('&apos;', "'")
        Titles.append(Title)
    elif Tval.find('</description>') <> -1:
        DesPos1 = Tval.index('<description>') + 13
        DesPos2 = Tval.index('</description>')
        Description = Tval[DesPos1:DesPos2]
        Description = Description.replace('&apos;', "'")
        Descriptions.append(Description)
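No answer is shown here, but one way to see the problem: after splitting on '<item>', each chunk holds both a <title> and a <description>, yet the if/elif means only one branch can run per chunk. A hedged sketch of a replacement for that final loop, pulling both fields from the same chunk in the same Python 2 string-slicing style as above:

for Tval in lstTitles:
    # a real item chunk contains both a title and a description
    if Tval.find('</title>') <> -1 and Tval.find('</description>') <> -1:
        TlePos1 = Tval.index('<title>') + 7
        TlePos2 = Tval.index('</title>')
        Titles.append(Tval[TlePos1:TlePos2].replace('&apos;', "'"))

        DesPos1 = Tval.index('<description>') + 13
        DesPos2 = Tval.index('</description>')
        Descriptions.append(Tval[DesPos1:DesPos2].replace('&apos;', "'"))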
