I am trying to use python-docx to read the content of a Word file.
For example, the attached demo Word file contains several paragraphs. Some paragraphs start with a heading number, such as 1.3 or 1.4.1.
My program opens the docx file and searches for a keyword in each paragraph. If the keyword exists in a particular paragraph, it should print that paragraph together with its heading number.
However, it fails to print the heading number. For example, when I search for the keyword "wall", it prints only the paragraph containing "wall", without the heading number 1.4.1.
I need the number too.
def search_word(filename, word):
    """Print every paragraph of a Word document that contains ``word``.

    The search is case-insensitive and ``word`` is matched literally: regex
    metacharacters in it are escaped before the pattern is compiled.

    Args:
        filename: Path of the .docx file, opened with ``Document``.
        word: Keyword to look for in each paragraph.

    Returns:
        None. Matching paragraphs are printed between separator lines.
    """
    # Open the Word file.
    document = Document(filename)
    # Compile once, outside the loop; re.escape keeps keywords such as "1.4"
    # or "C++" from being interpreted as regex syntax.
    keyword = re.compile(re.escape(word), re.I)
    # paragraph.text is searched directly as text. The previous
    # encode -> str() -> strip("b'") round-trip mangled non-ASCII characters
    # and trimmed legitimate leading/trailing "b" and quote characters.
    result = []
    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if keyword.search(text):
            result.append(text)
    print(filename + "=" * 30 + "Search Result" + "=" * 30)
    print("-" * 150)
    for text in result:
        print(text + "\n" * 1)
    print("-" * 150 + "\n" * 2)
In the end, I found a crude workaround to capture each paragraph heading together with its content.
I first convert the docx to HTML, then use BeautifulSoup and re to search for my keyword.
def search_file(file, word):
    """Search an HTML export of a Word document for ``word``, clause by clause.

    The text of every h1-h5 and p element is read in document order. A line
    that starts with a digit 1-9 opens a new clause; the lines that follow it
    form that clause's content. For each clause whose content contains
    ``word`` (case-insensitive, matched literally), the clause heading and the
    matching content lines are appended to the global ``output_content`` via
    ``print_result``.

    Args:
        file: Path of the HTML file; the part after the last backslash is
            used in the result banner.
        word: Keyword to look for.

    Returns:
        Whatever ``print_result`` returns.
    """
    global output_content
    output_content = output_content + "\n" + "*" * 30 + file.split("\\")[-1] + " Search Result" + "*" * 30 + "\n" * 2

    # The with-block closes the file as soon as it has been parsed.
    with open(file, 'r', encoding='utf-8') as htmlfile:
        soup = BeautifulSoup(htmlfile.read(), 'lxml')

    # Keep each distinct text once, in first-seen order.
    seen = set()
    lines = []
    for item in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p']):
        if item.text not in seen:
            seen.add(item.text)
            lines.append(item.text)

    # Decide whether a paragraph starts with a heading number.
    heading_pattern = re.compile(r"(^[1-9].+)")
    # (heading, content) pairs. A list rather than a dict, so clauses with the
    # same heading or the same content are all kept and reported correctly.
    sections = []
    heading = ""
    content = ""
    for line in lines:
        line = str(line.replace("\n", " "))
        if heading_pattern.search(line):
            # A new heading closes the clause collected so far.
            sections.append((heading, content))
            heading = line
            content = ""
        else:
            content = content + line + "\n"
    # Store the final clause too; without this it would never be searched.
    sections.append((heading, content))

    keyword_pattern = re.compile(r".*%s.*" % re.escape(word), re.I | re.M)
    result = []
    for heading, content in sections:
        rel = keyword_pattern.findall(content)
        if rel:
            result.append(heading)
            result.append(list(rel))
            result.append("\n")
    return print_result(file, result)
def print_result(file, nums):
    """Append every string in ``nums`` to the global ``output_content``.

    Args:
        file: Passed through unchanged to the recursive calls; not otherwise
            used here.
        nums: List whose items are strings or further (nested) lists of
            strings. Items are appended in depth-first order, with no
            separator added between them.

    Returns:
        None.
    """
    global output_content
    for entry in nums:
        if isinstance(entry, list):
            # Nested list: flatten it by recursing.
            print_result(file, entry)
        else:
            output_content = output_content + entry
Related
I am trying to detect specific words (using a regex pattern that I already have) in a Word document. I want not only to detect each word but also to know on which page it appears; I am thinking of something like a list of tuples: [(WordA, 10), (WordB, 4) ....]
I am able to extract the text from the Word document and detect all the words that match the regex pattern, but I am not able to tell on which page each word appears. Also, I want to detect all occurrences, regardless of whether they appear in the header, body or footnotes.
Here is my regex pattern:
# Matches "DOC", one hyphen / en dash / em dash, then exactly nine digits:
# \b requires a word boundary before "DOC" and (?!\d) rejects a tenth digit.
pattern = re.compile(r'\bDOC[-–—]\d{9}(?!\d)')
Extraction of text:
import docx2txt
# Run docx2txt over the document and keep what process() returns in `result`.
result = docx2txt.process("Word_Document.docx")
Thank you in advance,
I just wanted to say thank you to those who tried to answer this question. I found two solutions:
With Word Documents, splitting them into one word document per page with Aspose:
https://products.aspose.cloud/words/python/split/
Convert the Word Document into PDF and then create one PDF per page with PyPDF2 or other library
E
Ok, after a while of trying to figure this out, I managed to get this:
import docx2txt.docx2txt as docx2txt
import re
page_contents = []
def xml2text(xml):
    """Convert WordprocessingML to plain text and record the text of each page.

    Besides building the text (tabs, line breaks and paragraph breaks become
    whitespace), every ``w:lastRenderedPageBreak`` element closes the current
    page: the text gathered since the previous break is appended to the
    module-level list ``page_contents``. The text after the last break is
    appended once the whole tree has been walked.

    Args:
        xml: XML source passed to ``docx2txt.ET.fromstring``.

    Returns:
        The complete text extracted from ``xml``.
    """
    # Resolve the qualified tag names once instead of on every element.
    qn = docx2txt.qn
    text_tag = qn('w:t')
    tab_tag = qn('w:tab')
    break_tags = (qn('w:br'), qn('w:cr'))
    paragraph_tag = qn("w:p")
    page_break_tag = qn('w:lastRenderedPageBreak')

    text = u''
    page_start = 0  # offset in `text` where the current page begins
    for child in docx2txt.ET.fromstring(xml).iter():
        tag = child.tag
        if tag == text_tag:
            text += child.text if child.text is not None else ''
        elif tag == tab_tag:
            text += '\t'
        elif tag in break_tags:
            text += '\n'
        elif tag == paragraph_tag:
            text += '\n\n'
        elif tag == page_break_tag:
            # Everything gathered since the previous break is one page.
            page_contents.append(text[page_start:])
            page_start = len(text)
    # Whatever follows the final break is the last page of this XML part.
    page_contents.append(text[page_start:])
    return text
# Swap in the page-aware parser, then let docx2txt walk the document; the
# parser fills page_contents as a side effect.
docx2txt.xml2text = xml2text
docx2txt.process('test_file.docx')  # use your filename

pattern = re.compile(r'\bDOC[-–—]\d{9}(?!\d)')
# One (match, page number) tuple per occurrence; pages are numbered from 1.
matches = [
    (match, page_num)
    for page_num, page_content in enumerate(page_contents, start=1)
    for match in pattern.findall(page_content)
]
print(matches)
This replaces the module's xml2text parser with a version that additionally detects page breaks: each time it finds one, it appends the contents of the page just completed to the module-level list page_contents, so an item's index + 1 is its page number. It relies on the 'lastRenderedPageBreak' tag; the one caveat is that you should save the file again after editing it, so that the placement of these tags is also updated.
I'm writing a Python script to read a PDF file and record both the string that appears after every instance of "time" and the page number it is mentioned on.
I have gotten it to recognize when each page has the string "time" on it and send me the page number, however if the page has "time" more than once, it does not tell me. I'm assuming this is because it has already fulfilled the criteria of having the string "time" on it at least once, and therefore it skips to the next page to perform the check.
How would I go about finding multiple instances of the word "time"?
This is my code:
import PyPDF2
def pdf_read():
    """Print the number of every page whose text contains "Time" or "time".

    Page numbers come from ``range(pageCount)``, so the first page is 0.
    A page is reported once, however many times the word occurs on it.
    """
    # Raw string: "\d" is not a valid escape sequence, so the plain literal
    # only worked by accident and triggers a warning on current Python.
    pdfFile = r"records\document.pdf"
    pdf = PyPDF2.PdfFileReader(pdfFile)
    pageCount = pdf.getNumPages()
    for pageNumber in range(pageCount):
        pageContent = pdf.getPage(pageNumber).extractText()
        if "Time" in pageContent or "time" in pageContent:
            print(pageNumber)
Also as a side note, this pdf is a scanned document and therefore when I read the text on python (or copy and paste onto word) there are a lot words which come up with multiple random symbols and characters even though its perfectly legible. Is this a limitation of computer programming without having to apply more complex concepts such as machine learning in order to read the files accurately?
One solution is to build a list of words from pageContent and count how often the word 'time' occurs in that list. This also makes it easy to select the word following 'time': you simply retrieve the next item in the list:
import PyPDF2
import string
# Raw string: "\d" is not a valid escape sequence in a normal string literal.
pdfFile = r"records\document.pdf"
pdf = PyPDF2.PdfFileReader(pdfFile)
pageCount = pdf.getNumPages()
for pageNumber in range(pageCount):
    page = pdf.getPage(pageNumber)
    # str.split() with no argument already splits on newlines. Joining the
    # lines with '' first would glue the last word of one line to the first
    # word of the next ("time\nafter" -> "timeafter").
    pageContent = page.extractText().split()  # words to list
    # Lower-case every word and drop its punctuation characters.
    pageContent = ["".join(ch.lower() for ch in word if ch not in string.punctuation) for word in pageContent]
    # Every word is lower-case by now, so only 'time' can occur.
    print(pageContent.count('time'))  # count occurrences of time in list
    print([(word, pageContent[i + 1] if i + 1 < len(pageContent) else '') for i, word in enumerate(pageContent) if word == 'time'])  # list time and following word
Note that this example also strips all words from characters that are not letters or digits. Hopefully this sufficiently cleans up the bad OCR.
I am text mining a large document. I want to extract a specific line.
CONTINUED ON NEXT PAGE CONTINUATION SHEET REFERENCE NO. OF DOCUMENT BEING CONTINUED: PAGE 4 OF 16 PAGES
SPE2DH-20-T-0133 SECTION B
PR: 0081939954 NSN/MATERIAL: 6530015627381
ITEM DESCRIPTION
BOTTLE, SAFETY CAP
BOTTLE, SAFETY CAP RPOO1: DLA PACKAGING REQUIREMENTS FOR PROCUREMENT
RAQO1: THIS DOCUMENT INCORPORATES TECHNICAL AND/OR QUALITY REQUIREMENTS (IDENTIFIED BY AN 'R' OR AN 'I' NUMBER) SET FORTH IN FULL TEXT IN THE DLA MASTER LIST OF TECHNICAL AND QUALITY REQUIREMENTS FOUND ON THE WEB AT:
I want to extract the description immediately under ITEM DESCRIPTION.
I have tried many unsuccessful attempts.
My latest attempt was:
# Walk over whatever iterating `text` yields.
# NOTE(review): if `text` is one big string this visits single characters,
# not lines — confirm how `text` was built.
for line in text:
# `'ITEM' and 'DESCRIPTION'in line` treats the non-empty literal 'ITEM' as
# true and then tests only `'DESCRIPTION' in line`; 'ITEM' is never searched for.
if 'ITEM' and 'DESCRIPTION'in line:
print ('Possibe Descript:\n', line)
But it did not find the text.
Is there a way to find ITEM DESCRIPTION and get the line after it or something similar?
The following function finds the description on the line below some given pattern, e.g. "ITEM DESCRIPTION", and also ignores any blank lines that may be present in between. However, beware that the function does not handle the special case when the pattern exists, but the description does not.
txt = '''
CONTINUED ON NEXT PAGE CONTINUATION SHEET REFERENCE NO. OF DOCUMENT BEING CONTINUED: PAGE 4 OF 16 PAGES
SPE2DH-20-T-0133 SECTION B
PR: 0081939954 NSN/MATERIAL: 6530015627381
ITEM DESCRIPTION
BOTTLE, SAFETY CAP
BOTTLE, SAFETY CAP RPOO1: DLA PACKAGING REQUIREMENTS FOR PROCUREMENT
RAQO1: THIS DOCUMENT INCORPORATES TECHNICAL AND/OR QUALITY REQUIREMENTS (IDENTIFIED BY AN 'R' OR AN 'I' NUMBER) SET FORTH IN FULL TEXT IN THE DLA MASTER LIST OF TECHNICAL AND QUALITY REQUIREMENTS FOUND ON THE WEB AT:
'''
I've assumed you got your text as a text string, and thus the function below will split it into a list of lines ..
pattern = "ITEM DESCRIPTION" # to search for
def find_pattern_in_txt(txt, pattern):
    """Return the first non-blank line that follows the line equal to ``pattern``.

    Lines are compared after stripping surrounding whitespace, and blank or
    whitespace-only lines are ignored, so the description may be separated
    from the pattern line by empty lines.

    Args:
        txt: The full text as one string.
        pattern: The exact content of the line to look for.

    Returns:
        The stripped line after the first occurrence of ``pattern``, or None
        when ``pattern`` is absent or nothing but blank lines follow it.
    """
    # splitlines() also copes with "\r\n" line endings.
    lines = [line.strip() for line in txt.splitlines()]
    lines = [line for line in lines if line]  # remove empty lines
    try:
        return lines[lines.index(pattern.strip()) + 1]
    except (ValueError, IndexError):
        # ValueError: pattern not present; IndexError: pattern is the last line.
        return None
print(find_pattern_in_txt(txt, pattern)) # prints: "BOTTLE, SAFETY CAP"
Test like this :
# Becomes True at the first line containing 'ITEM DESCRIPTION' and is never
# reset, so that line and every line after it are printed.
description = False
for line in text:
    if 'ITEM DESCRIPTION' in line:
        description = True
    if description:
        print(line)
Now this will work, but you need something to stop reading the description, perhaps another title, like this:
# True while the loop is between the 'ITEM DESCRIPTION' line and the end marker.
description = False
for line in text:
    if 'ITEM DESCRIPTION' in line:
        description = True
    if description:
        print(line)
    # Test the marker against the line. A bare non-empty string literal is
    # always true, which would switch printing off again on every iteration.
    if "END OF SOMETHING" in line:
        description = False
Use the string method 'find' as in the following. 'find' returns the index of the first occurrence of the string you are looking for, or -1 if it is not present, so any non-negative number shows that you have found it.
code:
txt = "Hello, welcome to my world."
x = txt.find("welcome")
# str.find returns -1 when the substring is absent and 0 when the string
# starts with it, so compare against -1 instead of testing for x > 0.
if x != -1:
    print(x)
***
output:
***
7
# The with-block closes aa.txt once every line has been split into words.
with open("aa.txt", "r") as f:
    a = [line.split() for line in f]

# t1 ends up as the index of the last line holding "ITEM" directly followed
# by "DESCRIPTION"; it stays 0 when no line does.
t1 = 0
for j, words in enumerate(a):
    # Pairing each word with its successor never indexes past the end of the
    # line, unlike a[j][i+1] when "ITEM" is the final word of a line.
    if ("ITEM", "DESCRIPTION") in zip(words, words[1:]):
        t1 = j

# Emit every word after that line, separated by spaces on one output line,
# which is what the Python 2 "print x," idiom produced.
for words in a[t1 + 1:]:
    for word in words:
        print(word, end=" ")
print()
Use regex
import re
# One pattern covers both layouts: the information directly below the
# heading, or separated from it by blank lines. \s* skips the line break and
# any blank lines, then (.*) captures the first line of the information.
pattern = re.compile(r"ITEM DESCRIPTION[ \t]*\n\s*(.*)")

# Search the whole file at once: a pattern that spans a line break can never
# match when the file is fed to it one line at a time.
with open('file.txt') as f:
    content = f.read()

for match in pattern.finditer(content):
    # 1-based number of the line on which the captured information starts.
    line_no = content.count("\n", 0, match.start(1)) + 1
    print('Found on line %s: %s' % (line_no, match.group(1)))
My question:
I want to parse a plain text with headings and listings into a single Python object, where headings as dict key and listings as list of values. The text is shown below:
Playing cricket is my hobby:
(a) true.
(b) false.
Furthermore, the heading does not include:
(a) Singlets.
(b) fabrics.
(c) Smocks.
My desired output is:
{"Playing cricket is my hobby:":["(a)true.","(b)false."],"Furthermore, the heading does not include:":["(a) Singlets.","(b) Garments.","(c) Smocks."]}
What I have done
I firstly convert text to list of string:
plaintxtlist=['Playing cricket is my hobby:','(a) true.','(b) false.','Furthermore, the heading does not include:','(a) Singlets.',' (b) fabrics.','(c) Smocks.']
I tried to convert the list above into a dictionary which its keys are the index of heading and values and lists of text. Here is the code:
import re
data = {}  # maps the group index (as a string) to [heading, item, item, ...]
lst = []  # unused; kept so the script still defines the same names
regalter = r"^\s*\(([^\)]+)\).*|^\s*\-.*"  # lines shaped like "(a) ..." / "(A) ..." or "- ..."
j = 0  # index of the group currently being filled
sub = []  # lines of the current group; empty only before the first line
plaintxtlist = ['Playing cricket is my hobby:', '(a) true.', '(b) false.', 'Furthermore, the heading does not include:', '(a) Singlets.', ' (b) fabrics.', '(c) Smocks.']
for i in plaintxtlist:  # the lines of the text file, already converted to a list of strings
    if sub and re.match(regalter, i):
        # A listing line extends the group that is currently open.
        sub.append(i)
    else:
        # Any other line is a heading and opens a new group. Only the very
        # first line keeps index 0; later headings advance the index.
        if sub:
            j = j + 1
        sub = [i]
    # Always store the list itself, so every value in data is a list, even
    # for a heading that has no listing lines.
    data[str(j)] = sub
print(data)
And the output from the code below:
{"0":["Playing cricket is my hobby:","(a)true.","(b)false."],"1":["Furthermore, the heading does not include:","(a) Singlets.","(b) Garments.","(c) Smocks."]}
You don't need to transfer each line to fit into a list at first. To make it simpler, you can firstly organize the raw text content by regex, then parse them into the dictionary you want.
You can find each group by matching the text up to a period that is not followed by a "(" at the start of the next line.
Suppose the text content is saved in a file called a_text_file.txt. The full code lies here:
import re
with open('a_text_file.txt') as f:
    s = f.read()

# Lazily match text up to a period that is not followed by a line break and
# "(", i.e. up to the end of the last listing line of a group.
pattern = re.compile(r'[\w\s\().:,]+?\.(?!\n\()')
data = {}
for m in pattern.findall(s):
    # Each match is one group: the heading line followed by its listing lines.
    lst = m.strip().split('\n')
    # Strip out spaces in key and values, then store the group.
    key = lst[0].strip()
    data[key] = [item.strip() for item in lst[1:]]
print(data)
The final output:
{'Playing cricket is my hobby:': ['(a) true.', '(b) false.'], 'Furthermore, the heading does not include:': ['(a) Singlets.', '(b) fabrics.', '(c) Smocks.']}
I am new to python and i am trying to append new data to existing docx file using Python.
from docx import Document # for Word document
# Build a fresh document with five paragraphs and save it.
document = Document()
document.add_paragraph('My first paragraph')
document.add_paragraph("Second paragraph")
document.add_paragraph("Third paragraph")
document.add_paragraph("fourth paragraph")
document.add_paragraph("fifth paragraph")
document.save("testDocmod.docx")
# Reopen the saved file and walk over the runs of its paragraphs.
document = Document('testDocmod.docx')
paragraphs = document.paragraphs
# incr is a counter starting at 1; the loop below compares it with 2.
incr=1
for paragraph in paragraphs:
runs = paragraph.runs
for run in runs:
# When incr == 2 the current run's text is overwritten in place; nothing is
# inserted before it.
if(incr == 2):
run.text = 'Updatd text'
print run.text
incr = incr + 1
But it just updates the second element, whereas I need to insert the new text before the second element.
Depending on the result you want, you can:
1) Delete all the content of the second paragraph and re-create it:
from docx import Document
document = Document('testDocmod.docx')
paragraphs = document.paragraphs
# Store content of second paragraph
text = paragraphs[1].text
# Clear content through the public Paragraph.clear() API: it removes the
# paragraph's content but keeps its paragraph-level formatting (such as the
# style), whereas clearing the private lxml element `_p` discards that too.
paragraphs[1].clear()
# Recreate second paragraph with the new text placed in front of the old text
paragraphs[1].add_run('Appended part ' + text)
document.save("testDocmod.docx")
Result:
My first paragraph
Appended part Second paragraph
Third paragraph
fourth paragraph
fifth paragraph
2) Simply add the text to the end of the first paragraph:
from docx import Document
from docx.enum.text import WD_BREAK
document = Document('testDocmod.docx')
first_paragraph = document.paragraphs[0]
# End the existing text with a line break placed in the paragraph's last run ...
last_run = first_paragraph.runs[-1]
last_run.add_break(WD_BREAK.LINE)
# ... then continue the same paragraph with a new run holding the new text.
first_paragraph.add_run('New text')
document.save("testDocmod.docx")
Result:
My first paragraph
New text
Second paragraph
Third paragraph
fourth paragraph
fifth paragraph