Keeping Formatting Stable With python-docx - python

My Python code automates replacing the Job Role and Hiring Manager in a cover letter.
The code works, but it makes the document's formatting go all wonky. For context, my cover letter includes images and lines, and the code moves all of them from their original positions.
Dictionary = {"Hiring Manager": hiringManager, "Job_Role": jobRole}
for i in Dictionary:
    for p in document.paragraphs:
        if p.text.find(i) >= 0:
            p.text = p.text.replace(i, Dictionary[i])
for p in document.paragraphs:
    p.style = document.styles['Normal']
    for run in p.runs:
        run.font.size = Pt(10.5)
        run.font.name = 'Calibri'
savedFile = location + saveText + ' - ' + jobRole + ".docx" # File name and location
document.save(savedFile)
convert(savedFile) # Convert into PDF using docxtopdf
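In python-docx, assigning to p.text replaces the paragraph's existing runs with a single new run, so run-level formatting in the matched paragraphs is lost, and the second loop then forces every paragraph back to the 'Normal' style, which disturbs the layout further. Below is a minimal sketch (not the original script) that replaces text run by run so the existing fonts stay untouched; it assumes each placeholder sits entirely inside one run, and templateFile is a hypothetical variable holding the path to the cover-letter template.

# A minimal sketch of run-level replacement that keeps existing formatting.
# Assumptions: each placeholder sits entirely inside one run, and
# templateFile is a hypothetical path to the cover-letter template.
from docx import Document

replacements = {"Hiring Manager": hiringManager, "Job_Role": jobRole}
document = Document(templateFile)
for p in document.paragraphs:
    for run in p.runs:
        for old, new in replacements.items():
            if old in run.text:
                # Editing run.text in place keeps the run's font, size and style
                run.text = run.text.replace(old, new)
document.save(savedFile)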

Related

Parsing PDF to extract Abstract

I'm trying to write a Python script that automatically retrieves the abstract from any PDF, as long as it contains one.
Does anyone have an idea how to write an automated Python script, requiring only the input file, that would extract the abstract from that PDF?
Here is what I have obtained so far. This script is not automated, since it requires specific words to delimit the text to extract...
pdfFileObj = open('3D Printing in Pharmaceutical Sector: An Overview.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
count = 0
pagecontent = ""
while count < num_pages: # The while loop will read each page
    #print(count)
    pageObj = pdfReader.getPage(count)
    count += 1
    pagecontent += pageObj.extractText()

def between(value, a, b):
    # Find and validate before-part.
    pos_a = value.find(a)
    if pos_a == -1: return ""
    # Find and validate after part.
    pos_b = value.rfind(b)
    if pos_b == -1: return ""
    # Return middle part.
    adjusted_pos_a = pos_a + len(a)
    if adjusted_pos_a >= pos_b: return ""
    return value[adjusted_pos_a:pos_b]

desired = between(pagecontent, "Abstract", "Keywords")
print('The abstract of the document is :' + desired)
text = desired.encode('ascii', 'ignore').lower() # ASCII-encode (dropping other characters) and lowercase the text
text = text.decode('ISO-8859-1')
keywords = re.findall(r'[a-zA-Z]\w+', text)
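To get closer to a single-input workflow, the page loop and the between() helper above can be wrapped in one function that only takes the file path. This is a minimal sketch under the same assumption as the snippet above, namely that the abstract is delimited by the words "Abstract" and "Keywords"; extract_abstract is a hypothetical helper name.

# A minimal sketch: one function, one input file, reusing between() from above.
import PyPDF2

def extract_abstract(path, start_marker="Abstract", end_marker="Keywords"):
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfFileReader(f)
        content = ""
        for page_no in range(reader.numPages):
            content += reader.getPage(page_no).extractText()
    return between(content, start_marker, end_marker)

print(extract_abstract('3D Printing in Pharmaceutical Sector: An Overview.pdf'))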

I can not find a way to deal with new pages in docx using Python

I have a docx file with 40 pages of text and I want to separate each page and import its content into a list. Is this possible? The only way I have found is to look for the empty spots in my list, but those do not always mean a page break. With my code I get the text after the word "Subject" is found, and it stops after a blank spot is found. The thing is that I need a way to recognise page breaks in my code to solve some issues; right now a page break is also just being treated as a " ". Thanks in advance
import os
import docx

def read(name):
    doc = docx.Document(name)
    text = []
    for par in doc.paragraphs:
        text.append(par.text)
    return text

for basename in os.listdir('files'):
    path = os.path.join('files', basename)
    jerk = read(path)
lari = []
vaccum = []
indices = []
for i in jerk:
    if not i.find('Subject'):
        lari.append(jerk.index(i))
        indices.append(jerk.index(i))
for j in jerk:
    if jerk.index(j) in lari:
        for k in range(20):
            if jerk[jerk.index(j) + k] != '':
                vaccum.append(jerk[jerk.index(j) + k + 1])
            else:
                break
final = []
var = ''
for k in vaccum:
    var = var + k
    if k == '':
        final.append(var)
        var = ''
print(vaccum)
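On the page-break question itself: python-docx does not model pages directly, but the underlying XML does record breaks. An explicit page break inserted by the author shows up as a <w:br w:type="page"/> element inside a run, and the pagination Word computed the last time the file was saved is stored as <w:lastRenderedPageBreak/> elements. Below is a minimal sketch (not the asker's code) that uses these markers to split the paragraph texts into per-page lists; it drops down to the run's XML element, so it relies on python-docx internals, and rendered breaks are only as accurate as the last save from Word.

import docx
from docx.oxml.ns import qn

def split_into_pages(path):
    doc = docx.Document(path)
    pages = [[]]  # one list of paragraph texts per page
    for par in doc.paragraphs:
        for run in par.runs:
            # explicit page breaks: <w:br w:type="page"/>
            for br in run._element.findall(qn('w:br')):
                if br.get(qn('w:type')) == 'page':
                    pages.append([])
            # breaks Word computed when the file was last saved
            if run._element.find(qn('w:lastRenderedPageBreak')) is not None:
                pages.append([])
        pages[-1].append(par.text)  # a paragraph spanning a break lands on the later page
    return pages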

Finding a piece of information in a document and deleting everything before and after

I have some .docx files that are very specifically formatted.
I have copied the file 5 times to represent the 5 different strings that I require to be "found" and everything else removed.
#! python 3
import docx
import os
import shutil
import readDocx as rD
def delete_paragraph(paragraph):
p = paragraph._element
p.getparent().remove(p)
p._p = p._element = None
#Select the file you want to work with
fP = rD.file
#get the working directory for the file
nfP = os.path.dirname(os.path.abspath(fP))
#print (nfP)
#Break the filepath into parts
fileSplit = fP.split('/')
#Get the filename only
fileCode = fileSplit[-1]
#print (fileCode)
#Seperate the course code
nameSplit = fileCode.split(' ')
courseCode = nameSplit[0]
#print (courseCode)
#List of files that we need to create
a1 = "Assessment Summary"
a2 = "Back to Business project"
a3 = "Back to Business Checklist"
a4 = "Skills Demonstration"
a5 = "Skills Demonstration Checklist"
names = [a1, a2, a3, a4, a5]
#Creates a list for the new filenames to sit in
newFiles = []
#Creates the files from the original
for name in names:
fileName = os.path.join(nfP + '\\' + courseCode + ' ' + str(name) + ' ' +'Version 1.0' + '.docx')
shutil.copy(fP, fileName)
#print(fileName)
newFiles.append(fileName)
#print (newFiles)
#Need to iterate through the files and start deleting data.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h3 = "Assessor Observation Checklist / Marking Guide"
h4 = "Skills Demonstration"
h5 = "Assessor Observation Checklist / Marking Guide"
This is where I start to fail with my limited skill. The h1-h5 tags represent the headings of the document sections that I want to keep.
How can I iterate through the document, find the headings, and delete everything before / after these paragraphs?
I don't necessarily need the answer, just more of a "look in this direction".
Thanks
Try this. The comments explain clearly what the code does.
from docx import Document # Package "python-docx" needs to be installed to import this
import pandas as pd

# Read the document into a python-docx Document object
document = Document('Path/to/your/input/.docx/document')

# Initialize an empty dataframe to store the .docx document along with the style of each paragraph
document_text_dataframe = pd.DataFrame(columns=['para_text', 'style'])

# Iterate through the "document" object, extracting the paragraph texts along with their styles into the dataframe "document_text_dataframe"
for para in document.paragraphs:
    # Extract paragraph style
    style = str(para.style.name)
    ##### Headings which are created with the NORMAL style but are BOLD need to be extracted as well -
    ##### ideally these represent headings too.
    runboldtext = ''
    for run in para.runs:
        if run.bold:
            runboldtext = runboldtext + run.text
    if runboldtext == str(para.text) and runboldtext != '':
        print("Bold True for:", runboldtext)
        style = 'Heading'
    #################################################################
    dftemp = pd.DataFrame({'para_text': [para.text], 'style': [style]})
    # Now append each paragraph along with its style to "document_text_dataframe"
    document_text_dataframe = document_text_dataframe.append(dftemp, sort=False)

document_text_dataframe = document_text_dataframe.reset_index(drop=True)

# Need to iterate through the files and start deleting data.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h3 = "Assessor Observation Checklist / Marking Guide"
h4 = "Skills Demonstration"
h5 = "Assessor Observation Checklist / Marking Guide"
h_list = [h1, h2, h3, h4]

# Initialize a list to store the extracted content relevant to each "h" value
extracted_content = []
for h in h_list:
    df_temp = pd.DataFrame(columns=['para_text', 'style'])
    ########### Loop through the document to extract the content related to each "h" value ######
    start_index = 0
    end_index = 0
    for index, row in document_text_dataframe.iterrows():
        if h == row['para_text']:
            print("Found match in document for: ", h)
            start_index = index
            print("Matching index=", index)
            break
    if start_index != 0:
        for i in range(start_index + 1, len(document_text_dataframe) - 1):
            if 'Heading' in document_text_dataframe.loc[i, 'style']:
                end_index = i
                break
    if end_index != 0:
        for i in range(start_index, end_index):
            df_temp = df_temp.append(document_text_dataframe.loc[i])
    ############################################################################################
    # Append every extracted content chunk to the list "extracted_content"
    if start_index != 0 and end_index != 0:
        extracted_content.append(df_temp)

# The list "extracted_content" will consist of dataframes. Each dataframe corresponds to the extracted content of one "h" value.
print(extracted_content)
Now, using extracted_content, you can write every entry in the list extracted_content to a separate .docx document using your code.
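As a rough sketch of that last step (assuming the entries in newFiles from the question line up one-to-one with h_list and therefore with extracted_content), each extracted chunk could be written back out with python-docx like this; note that add_paragraph only carries over the text, not the original run-level formatting:

from docx import Document

for file_name, df in zip(newFiles, extracted_content):
    out_doc = Document()
    for _, row in df.iterrows():
        out_doc.add_paragraph(row['para_text'])  # plain paragraphs; character formatting is not preserved
    out_doc.save(file_name)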
Cheers!

split() issues with pdf extractText()

I'm working on a small content analysis program that I was hoping to run over several PDF files and have it return the total frequency with which some specific words are mentioned in the text. The words that are searched for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when running the program on a .pdf file. To illustrate, the test text that I have the program running through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words, grouped into categories, is the following (categories are marked in the .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
Running the code on the .txt file produces the counts I expect, whereas running it on the .pdf does not (screenshots of the two outputs are not reproduced here).
My issue pertains to the splitting of the words: in the .pdf output a string like "world" can be split into 'w', 'o', 'rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a direction to where I can find an answer to why this happens, should you know any source.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.txt'
    textfile = open(f)
    text = textfile.read().split() # lowercase the text
    print(text)
    textfile.close()
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in text:
    for pattern in dic.keys():
        if pattern.match(token):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print(os.path.basename(f))
for key in scores.keys():
    print(key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.pdf'
    textfile = open(f, 'rb')
    text = PyPDF2.PdfFileReader(textfile) # lowercase the text
    for pageNum in range(0, text.numPages):
        texts = text.getPage(pageNum)
        textfile = texts.extractText().split()
        print(textfile)
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in textfile:
    for pattern in dic.keys():
        if pattern.match(token):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print(os.path.basename(f))
for key in scores.keys():
    print(key, ":", scores[key])
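The legacy PyPDF2 extractText() call often inserts stray line breaks and spaces inside words, depending on how the PDF encodes its text, which is why splitting its output on whitespace produces fragments like 'w', 'o', 'rld'. One common workaround, shown as a minimal sketch below (not part of the original post), is to switch to the newer pypdf package, whose extract_text() handles spacing more reliably, and to tokenize with a regular expression instead of str.split(); the resulting tokens can then be fed to the same scoring loop as in the .txt version.

import re
from pypdf import PdfReader  # pip install pypdf

reader = PdfReader('annual_report_2011.pdf')
page_text = " ".join(page.extract_text() or "" for page in reader.pages)
# Tokenize on word characters instead of splitting on whitespace
tokens = re.findall(r'[A-Za-z]\w+', page_text)
# "tokens" can now replace "textfile" in the pattern-matching loop above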

Find text position in PDF file

I have a PDF file and I am trying to find a specific text in the PDF and highlight it using Python.
I found PyPDF2, which can highlight part of a PDF when given the coordinates of the desired highlight position in the file.
I am trying to find a tool which can give me the position of a given text in the PDF.
PyMuPDF can find text and return its coordinates. You can use this in conjunction with the PyPDF2 highlighting method to accomplish what you're describing, or you can just use PyMuPDF to highlight the text.
Here is sample code for finding text and highlighting with PyMuPDF:
import fitz

### READ IN PDF
doc = fitz.open("input.pdf")

for page in doc:
    ### SEARCH
    text = "Sample text"
    text_instances = page.search_for(text)

    ### HIGHLIGHT
    for inst in text_instances:
        highlight = page.add_highlight_annot(inst)
        highlight.update()

### OUTPUT
doc.save("output.pdf", garbage=4, deflate=True, clean=True)
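As a usage note, search_for returns a list of rectangles, one per occurrence of the text on the page, so those rectangles are exactly the position information the question asks for; add_highlight_annot then draws the highlight directly from each rectangle.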
With the new version of PyMuPDF, some methods have been deprecated.
Here is the sample code as per the recent version. Secondly, I've also added a comment to each highlight, which makes it easier for the user to traverse them.
import fitz  # PyMuPDF

pdfIn = fitz.open("page-4.pdf")
for page in pdfIn:
    print(page)
    texts = ["SEPA", "voorstelnummer"]
    text_instances = [page.search_for(text) for text in texts]

    # coordinates of each word found on the PDF page
    print(text_instances)

    # iterate through each instance for highlighting
    for inst in text_instances:
        annot = page.add_highlight_annot(inst)
        # annot = page.add_rect_annot(inst)

        # Adding a comment to the highlighted text
        info = annot.info
        info["title"] = "word_diffs"
        info["content"] = "diffs"
        annot.set_info(info)
        annot.update()

# Saving the PDF output
pdfIn.save("page-4_output.pdf")
If you are on Windows and have Acrobat Pro (not Reader), you can try the old Component Object Model (COM) with Python or VBA.
import win32com, winerror, os
from win32com.client.dynamic import ERRORS_BAD_CONTEXT

ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
win32com.client.gencache.EnsureModule('{E64169B3-3592-47d2-816E-602C5C13F328}', 0, 1, 1)

avDoc = win32com.client.DispatchEx('AcroExch.AVDoc')
avDoc.Open(src, src)  # src: path to the source PDF, assumed to be defined earlier
avDoc.BringToFront()
pdDoc = avDoc.GetPDDoc()
jsoObject = pdDoc.GetJSObject()

for pageNo in range(1):
    pdfPage = pdDoc.AcquirePage(pageNo)
    pageHL = win32com.client.DispatchEx('AcroExch.HiliteList')
    _ = pageHL.Add(0, 9000)
    pageSel = pdfPage.CreatePageHilite(pageHL)

    pdfText = ""
    for wordNo in range(pageSel.GetNumText()):
        word = pageSel.GetText(wordNo)
        pdfText += word
        if keyword in pdfText:  # keyword: the text to find, assumed to be defined earlier
            wordToHl = win32com.client.DispatchEx('AcroExch.HiliteList')
            wordToHl.Add(wordNo, 1)
            wordHl = pdfPage.CreateWordHilite(wordToHl)
            rect = wordHl.GetBoundingRect()
            annot = jsoObject.AddAnnot()
            props = annot.GetProps()
            props.Type = "Square"
            props.Page = pageNo
            props.Hidden = False
            props.Lock = True
            props.Name = word
            props.NoView = False
            props.Opacity = 0.3
            props.ReadOnly = True
            props.Style = "S"
            props.ToggleNoView = False
            props.PopupOpen = False
            popupRect = [rect.Left - 5, rect.Top + 5, rect.Left + 40, rect.Top - 20]
            props.Rect = popupRect
            props.PopupRect = popupRect
            props.StrokeColor = jsoObject.Color.Red
            props.FillColor = jsoObject.Color.Yellow
            annot.SetProps(props)
            print(f'Found {keyword}')
