text format in python-docx - python

I need to enter text into thousands of Word documents. I found python-docx suitable for this purpose. But the string I have to enter has font color and style, like:
{\color{Red}Mr Mike} Bold Class, Italics College, City
Is it possible to format that?
As an mwe,
from docx import Document

Name = ["Name1", "Name2", "Name3"]
Class = ["Class1", "Class2", "Class3"]

# Produce one output document per (name, class) pair.
for name, class_name in zip(Name, Class):
    # Reload the template for every entry: once "REPLACE" has been substituted
    # in a loaded document the placeholder is gone, so reusing one Document
    # would leave every later file carrying the first entry's text.
    document = Document('doc.docx')
    string = name + ", " + class_name + " Class"
    # Name[i] should be red, bold, 12 pt; Class[i] should be 12 pt, italics
    for p in document.paragraphs:
        if "REPLACE" in p.text:
            p.text = p.text.replace("REPLACE", string)
    document.save(name + '.docx')

Character formatting is applied at the run level. So you will need to work at that level if you want "inline" differences in character formatting. In general, a paragraph is composed of zero or more runs. All characters in a run share the same character-level formatting ("font"). Pseudo-code for a possible approach is:
old = "foo"
new = "bar"
text = paragraph.text
start = text.find(old)
# Only rewrite the paragraph when `old` is actually present: `find` returns -1
# otherwise, and slicing with -1 would cut the text in the wrong place.
if start >= 0:
    prefix = text[:start]
    suffix = text[start + len(old):]
    # --- assigning to `paragraph.text` leaves the paragraph with
    # --- exactly one run
    paragraph.text = prefix
    # --- add an additional run for your replacement word, keeping
    # --- a reference to it so you can set its visual attributes
    run = paragraph.add_run(new)
    run.italic = True
    # --- etc. to get the formatting you want ---
    # --- add the suffix to complete the original paragraph text ---
    paragraph.add_run(suffix)

Related

On transcribing a .docx file to a new .docx document how to insert <w:numPr> and associated data on a paragraph basis?

# Walk the paragraphs of the source document and add a corresponding paragraph
# to the target document, copying selected formatting.
# NOTE(review): indentation was lost in this listing, so which statements sit
# inside the loop / the `if` below must be confirmed against the real file.
for src_paragraph in src_doc.paragraphs:
src_paragraph_format = src_paragraph.paragraph_format
# print(src_paragraph.text)
# Handle Headers/Footers Headers not implemented
#
sections = trgt_doc.sections # there's only 1 section
section = sections[0]
footer = section.footer # get the footer section of the section
paragraph = footer.paragraphs[0] # footer has 1 paragraph
# NOTE(review): page_number and printed_time_stamp are defined outside this
# snippet; if this line is inside the loop the footer text is rewritten for
# every source paragraph — confirm.
paragraph.text = f'{page_number} \t\t\t {printed_time_stamp}'
# Transcribe paragraph settings - Build the target
#
trgt_paragraph = trgt_doc.add_paragraph(style = src_paragraph.style )
# Debug output for source paragraphs that carry a <w:numPr> element.
# NOTE(review): `pPr` is dereferenced without a None check — presumably every
# source paragraph has paragraph properties; confirm.
if src_paragraph._p.pPr.numPr is not None:
print('\n <w:pStyle> :', src_paragraph._p.pPr.pStyle)
print ('<w:numPr> :', src_paragraph._p.pPr.numPr)
print ('\t<w:ilvl> :', src_paragraph._p.pPr.numPr.ilvl)
print ('\t<w:numId> :', src_paragraph._p.pPr.numPr.numId)
print('\n', src_paragraph.text)
trgt_paragraph_format = trgt_paragraph.paragraph_format
# NOTE(review): this assigns to the `name` attribute of the style object
# itself rather than selecting a style for the paragraph — confirm intended.
trgt_paragraph.style.name = src_paragraph.style.name
trgt_paragraph_format.left_indent = src_paragraph_format.left_indent # inherited from style hierarchy
trgt_paragraph_format.right_indent = src_paragraph_format.right_indent
# print('S_INDENT -------|', src_paragraph_format.left_indent)
# print('T_INDENT -------|', trgt_paragraph_format.left_indent)
trgt_paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
trgt_paragraph_format.widow_control = True
# The font is set on the paragraph's style object, not on individual runs.
font = trgt_paragraph.style.font
font.name = 'Times'
font.size = Pt(11)
I'm transcribing Word files into similar documents with the same information content but with modifications and additions. I'm building the target files by iterating through the source paragraphs and then creating the target paragraphs/runs.
This is mostly done, except for capturing numeric bullets. I can capture the <w:ilvl> and <w:numId> values but don't know, at this point, how to get these values into each target paragraph.
This is my first project with .docx data and I'm researching this as I go.
In attempting to just insert into the target .docx file I'm generating I tried this approach here
# Generate new Target file from Source File
for src_paragraph in src_doc.paragraphs:
src_paragraph_format = src_paragraph.paragraph_format
# Get Target section(s)
sections = trgt_doc.sections
section = sections[0]
sectPr = section._sectPr
lnNumType = OxmlElement('w:lnNumType')
lnNumType.set('fooAttrib', '42')
sectPr.append(lnNumType)
print('STUBB')
This is line numbering here, not outline-style list numbering. I just wanted to do an initial insert to see if it would work; it didn't.
# Add Numbered List to Target paragraphs.
# Isolate the number bulleted paragraphs
p_pr = src_paragraph._p.pPr
# Compare against None explicitly: a paragraph may have no properties element
# at all, and truth-testing an XML element does not mean "element is present".
if p_pr is not None and p_pr.numPr is not None:
    # SOURCE XML Paragraphs containing numPr
    print('--------------------------------------------')
    print('TEXT_SRC', src_paragraph.text,'\n')
    print('SRC ParXML \n', src_paragraph._p.xml)
    print('--------------------------------------------')
I'm able to find the decimal numbers in the source .docx this way; the trick is just to get them over into the target I'm generating.

Python3; Docx; replacing text from word doc modifies spacing after paragraph--how to avoid or change back?

I have some functions that successfully replace specific strings from a .docx file (in both paragraphs and tables) with new strings.
Everything with the replacing part of the function works great. However, the "spacing after paragraph" values change when I replace the strings in the paragraph. I want the spacing to stay the same, or I want to be able to change the spacing back to its original format after the replacement occurs.
I am unfamiliar with docx, so maybe it's just a simple solution that I missed?
A sample of the word .docx file can be downloaded here: https://filebin.net/9t5v96tb5y7z0e60
The working paragraph and table string replacement code:
"""
Script to replace specific texts within word doc templates
"""
import re, docx
from docx import Document
def clear_paragraph(self, paragraph):
    """Detach every child element from the paragraph's underlying XML element.

    ``self`` is accepted for call-site symmetry but is not used.

    NOTE(review): every child is removed, which presumably includes the
    paragraph-properties element that holds spacing settings — confirm.
    """
    element = paragraph._p
    # Snapshot the children first so removal does not disturb the iteration.
    for child in list(element.iterchildren()):
        element.remove(child)
def paragraph_replace(self, search, replace):
    """Substitute ``search`` with ``replace`` in every matching paragraph.

    Args:
        self: Document-like object exposing ``paragraphs``.
        search: Regular-expression pattern to look for.
        replace: Replacement text (regex replacement syntax applies).

    Returns:
        The last paragraph visited, or ``None`` when there are no paragraphs.
    """
    searchre = re.compile(search)
    # Pre-bind so the final return is valid even for an empty document.
    paragraph = None
    for paragraph in self.paragraphs:
        paragraph_text = paragraph.text
        if paragraph_text and searchre.search(paragraph_text):
            # The paragraph is rebuilt as a single run holding the new text.
            clear_paragraph(self, paragraph)
            paragraph.add_run(searchre.sub(replace, paragraph_text))
    return paragraph
def table_replace(self, text_value, replace):
    """Substitute ``text_value`` with ``replace`` in every matching table cell.

    Args:
        self: Document-like object exposing ``tables``.
        text_value: Regular-expression pattern to look for in each cell.
        replace: Replacement text (regex replacement syntax applies).

    Returns:
        True if at least one cell was changed, otherwise False.
    """
    result = False
    tbl_regex = re.compile(text_value)
    for table in self.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text
                if cell_text and tbl_regex.search(cell_text):
                    # Substitute only the match so surrounding cell text is
                    # kept, mirroring what paragraph_replace does.
                    cell.text = tbl_regex.sub(replace, cell_text)
                    result = True
    return result
# Placeholder patterns, listed in the same order as their replacement values.
regex1 = [
    "<<authors>>", "<<author>>", "<<id>>",
    "<<title>>", "<<date>>", "<<discipline>>",
    "<<countries>>",
]

author = "Robert"
authors = "Robert, John; Bob, Billy; Duck, Donald"
ms_id = "2020-34-2321"
title = "blah blah blah and one more blah"
date = "31-03-2020"
discipline = "BE"
countries = "United States, Japan, China, South Africa"
replace1 = [authors, author, ms_id, title, date, discipline, countries]

filename = "Sample Template.docx"
doc = Document(filename)

# Apply each placeholder/value pair to body paragraphs and to tables.
for pattern, value in zip(regex1, replace1):
    paragraph_replace(doc, pattern, value)
    table_replace(doc, pattern, value)

doc.save(author + '_updated.docx')
After reading more of Docx documentation and some testing, the solution to this problem was easy.
def clear_paragraph(self, paragraph):
    """Remove the paragraph's content while keeping its paragraph-level formatting.

    Uses python-docx's ``Paragraph.clear()``, which preserves paragraph
    properties such as style and spacing; removing every XML child by hand
    would discard them as well.

    Args:
        self: Unused; kept so existing call sites keep working.
        paragraph: The python-docx paragraph to empty.
    """
    paragraph.clear()
def paragraph_replace(self, search, replace, x):
    """Substitute ``search`` with ``replace`` in matching paragraphs and restyle them.

    Each rewritten paragraph becomes a single 10 pt run, with zero space after
    the paragraph and single line spacing.

    Args:
        self: Document-like object exposing ``paragraphs``.
        search: Regular-expression pattern to look for.
        replace: Replacement text (regex replacement syntax applies).
        x: Caller-supplied index; the rewritten run is bold only when it equals 2.

    Returns:
        The last paragraph visited, or ``None`` when there are no paragraphs.
    """
    # Imported locally because this snippet uses Pt without importing it.
    from docx.shared import Pt

    searchre = re.compile(search)
    # Pre-bind so the final return is valid even for an empty document.
    paragraph = None
    for paragraph in self.paragraphs:
        paragraph_text = paragraph.text
        if paragraph_text and searchre.search(paragraph_text):
            clear_paragraph(self, paragraph)
            run = paragraph.add_run(searchre.sub(replace, paragraph_text))
            run.font.size = Pt(10)
            # Value comparison: `is` tests object identity, not equality.
            run.bold = x == 2
            paragraph_format = paragraph.paragraph_format
            paragraph_format.space_after = Pt(0)
            paragraph_format.line_spacing = 1.0
    return paragraph
def table_replace(self, text_value, replace):
    """Substitute ``text_value`` in table cells and set every cell run to 10 pt.

    Args:
        self: Document-like object exposing ``tables``.
        text_value: Regular-expression pattern to look for in each cell.
        replace: Replacement text (regex replacement syntax applies).

    Returns:
        True if at least one cell was changed, otherwise False.
    """
    # Imported locally because this snippet uses Pt without importing it.
    from docx.shared import Pt

    result = False
    tbl_regex = re.compile(text_value)
    font_size = Pt(10)
    for table in self.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text
                if cell_text and tbl_regex.search(cell_text):
                    # Substitute only the match so surrounding cell text is kept.
                    cell.text = tbl_regex.sub(replace, cell_text)
                    result = True
                # Size the runs after any replacement: assigning cell.text
                # replaces the cell's runs, so sizing them beforehand would
                # leave the new text unsized.
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.font.size = font_size
    return result
I added the x argument in the paragraph_replace function because I wanted the first line of my document to be bold. All my issues are now resolved with these simple additions to the code.

Add text in word based on content

I have a batch of .doc documents; in the first line of each document I have the name of a person written. I would like to add to each document the email address of the person, based on a list I have. How can I use Python or VBA to program something that does the job for me?
I tried this VBA code, which finds the name of the person and then writes the email; I was thinking of looping over it. However, even this minimum working example does not actually work. What am I doing wrong?
' Configures a find-and-replace on the current Selection that swaps the
' "E-mail:" label for the label followed by an address, and runs it for all
' matches, but only when the If test below succeeds.
Sub email()
Selection.find.ClearFormatting
Selection.find.Replacement.ClearFormatting
' NOTE(review): this compares the Find object's Text property with the name;
' no search for the name has been executed at this point, so the test does not
' check whether the document contains the name - confirm intent.
If Selection.find.Text = "Chiara Gatta" Then
With Selection.find
.Text = "E-mail:"
.Replacement.Text = "E-mail: chiara.gatta#gmail.com"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchByte = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
' Replace every occurrence found with the settings above.
Selection.find.Execute replace:=wdReplaceAll
End If
End Sub
The question lacks the minimum details and code required for help. However, I am trying to give you code that picks up person names and email addresses from a table in the document containing the code. The table should have 3 columns: the 1st column contains the name of the person, the 2nd column contains the email address, and the 3rd column is left blank for remarks written by the code. See image
On running the code you would be prompted to select the word files that would be replaced by the email address. On trial use only copy files and may try only a handful of files at a time (if file sizes are large). It is assumed that files will contain Name and word “E-mail:” (if "E-mail:" word is not in the file try to modify the code as commented)
Code:
' For each name/e-mail pair in Table 1 of this document, opens the Word files
' picked in a file dialog, looks for the name, and when it is found replaces
' the first "E-mail:" label with the label plus the address. The outcome is
' written to column 3 of the table row.
Sub test2()
    Dim Fldg As FileDialog, Fl As Variant
    Dim Thdoc As Document, Edoc As Document
    Dim Tbl As Table, Rw As Long, Fnd As Boolean
    Dim xName As String, xEmail As String

    Set Thdoc = ThisDocument
    Set Tbl = Thdoc.Tables(1)
    Set Fldg = Application.FileDialog(msoFileDialogFilePicker)
    With Fldg
        .Filters.Clear
        ' Extensions within one filter are separated by semicolons.
        .Filters.Add "Word Documents ", "*.doc;*.dot;*.docx;*.docm;*.dotm", 1
        .AllowMultiSelect = True
        .InitialFileName = "C:\users\user\desktop\folder1\*.doc*" 'use your choice of folder
        If .Show <> -1 Then Exit Sub
    End With

    'Search for each Name in Table 1 column 1
    For Rw = 1 To Tbl.Rows.Count
        xName = Tbl.Cell(Rw, 1).Range.Text
        xEmail = Tbl.Cell(Rw, 2).Range.Text
        If Len(xName) > 2 And Len(xEmail) > 2 Then
            xName = Left(xName, Len(xName) - 2)    'Clean special characters in word cell text
            xEmail = Left(xEmail, Len(xEmail) - 2) 'Clean special characters in word cell text

            'open each Document selected & search for names
            For Each Fl In Fldg.SelectedItems
                Set Edoc = Documents.Open(Fl)
                Fnd = False
                With Edoc.Content.Find
                    .ClearFormatting
                    .Text = xName
                    .Replacement.Text = xName & vbCrLf & "E-mail: " & xEmail
                    .Wrap = wdFindContinue
                    .Execute Replace:=wdReplaceNone
                    '.Execute Replace:=wdReplaceOne
                    Fnd = .Found
                End With

                'if Word "E-mail is not already in the file, delete next if Fnd Branch"
                ' And use .Execute Replace:=wdReplaceOne instead of .Execute Replace:=wdReplaceNone
                If Fnd Then ' If Name is found then Search for "E-Mail:"
                    Fnd = False
                    With Edoc.Content.Find
                        .ClearFormatting
                        .Text = "E-mail:"
                        .Replacement.Text = "E-mail: " & xEmail
                        .Wrap = wdFindContinue
                        .Execute Replace:=wdReplaceOne
                        Fnd = .Found
                    End With
                End If

                If Fnd Then
                    Edoc.Save
                    Tbl.Cell(Rw, 3).Range.Text = "Found & Replaced in " & Fl
                Else
                    Tbl.Cell(Rw, 3).Range.Text = "Not found in any selected document"
                End If

                ' Always close the opened document before moving on; leaving
                ' the loop first would keep the matched file open.
                Edoc.Close False
                If Fnd Then Exit For
            Next Fl
        End If
    Next Rw
End Sub
Its operation would be like this. Try to understand each action in the code and modify it to your requirements.

pyparsing how to SkipTo end of indented block?

I am trying to parse a structure like this with pyparsing:
identifier: some description text here which will wrap
on to the next line. the follow-on text should be
indented. it may contain identifier: and any text
at all is allowed
next_identifier: more description, short this time
last_identifier: blah blah
I need something like:
# Sketch of the desired grammar: one or more "term: description" definitions.
import pyparsing as pp
colon = pp.Suppress(':')
term = pp.Word(pp.alphanums + "_")
# NOTE(review): next_identifier is not defined in this snippet; it stands for
# the not-yet-written expression that should mark the end of a description.
description = pp.SkipTo(next_identifier)
definition = term + colon + description
grammar = pp.OneOrMore(definition)
But I am struggling to define the next_identifier of the SkipTo clause since the identifiers may appear freely in the description text.
It seems that I need to include the indentation in the grammar, so that I can SkipTo the next non-indented line.
I tried:
# Attempt: take the rest of the first line, then an indented block whose body
# is zero or more "skip to end of line" expressions.
# NOTE(review): indent_stack is not defined in this snippet.
description = pp.Combine(
pp.SkipTo(pp.LineEnd()) +
pp.indentedBlock(
pp.ZeroOrMore(
pp.SkipTo(pp.LineEnd())
),
indent_stack
)
)
But I get the error:
ParseException: not a subentry (at char 55), (line:2, col:1)
Char 55 is at the very beginning of the run-on line:
...will wrap\n on to the next line...
^
Which seems a bit odd, because that char position is clearly followed by the whitespace which makes it an indented subentry.
My traceback in ipdb looks like:
5311 def checkSubIndent(s,l,t):
5312 curCol = col(l,s)
5313 if curCol > indentStack[-1]:
5314 indentStack.append( curCol )
5315 else:
-> 5316 raise ParseException(s,l,"not a subentry")
5317
ipdb> indentStack
[1]
ipdb> curCol
1
I should add that the whole structure above that I'm matching may also be indented (by an unknown amount), so a solution like:
# Alternative: the first line, followed by zero or more lines that each start
# with space characters.
description = pp.Combine(
pp.SkipTo(pp.LineEnd()) + pp.LineEnd() +
pp.ZeroOrMore(
pp.White(' ') + pp.SkipTo(pp.LineEnd()) + pp.LineEnd()
)
)
...which works for the example as presented will not work in my case as it will consume the subsequent definitions.
When you use indentedBlock, the argument you pass in is the expression for each line in the block, so it shouldn't be an indentedBlock(ZeroOrMore(line_expression), stack), just indentedBlock(line_expression, stack). Pyparsing includes a builtin expression for "everything from here to the end of the line", titled restOfLine, so we will just use that for the expression for each line in the indented block:
import pyparsing as pp
# End-of-line marker, matched but left out of the results.
NL = pp.LineEnd().suppress()
# A label is a word starting with a letter, followed by a suppressed colon.
label = pp.ungroup(pp.Word(pp.alphas, pp.alphanums+'_') + pp.Suppress(":"))
# Indentation stack handed to indentedBlock, seeded with column 1.
indent_stack = [1]
# see corrected version below
#description = pp.Group((pp.Empty()
# + pp.restOfLine + NL
# + pp.ungroup(pp.indentedBlock(pp.restOfLine, indent_stack))))
# A description is the rest of the label's line plus, optionally, an indented
# block of further lines; the block is only tried when not at end of input.
description = pp.Group(pp.restOfLine + NL
+ pp.Optional(pp.ungroup(~pp.StringEnd()
+ pp.indentedBlock(pp.restOfLine,
indent_stack))))
# One parsed entry: results named "label" and "description".
labeled_text = pp.Group(label("label") + pp.Empty() + description("description"))
We use ungroup to remove the extra level of nesting created by indentedBlock but we also need to remove the per-line nesting that is created internally in indentedBlock. We do this with a parse action:
def combine_parts(tokens):
    """Parse action that flattens an entry's description into one list.

    The first description element is kept as is; every later element is
    replaced by its own first item. The flat list is stored both under the
    'description' name and, in place, in the entry's second positional slot.
    """
    entry = tokens[0]
    flat_lines = [entry.description[0]] + [part[0] for part in entry.description[1:]]
    # Update the named result and the positional slot with the same list.
    entry['description'] = flat_lines
    entry[1][:] = flat_lines


# Run the flattening step whenever labeled_text matches.
labeled_text.addParseAction(combine_parts)
At this point, we are pretty much done. Here is your sample text parsed and dumped:
# Parse one or more labeled entries and dump the first one.
# NOTE(review): `sample` is not defined in this snippet; presumably it holds
# the example text from the question.
parsed_data = (pp.OneOrMore(labeled_text)).parseString(sample)
print(parsed_data[0].dump())
['identifier', ['some description text here which will wrap', 'on to the next line. the follow-on text should be', 'indented. it may contain identifier: and any text', 'at all is allowed']]
- description: ['some description text here which will wrap', 'on to the next line. the follow-on text should be', 'indented. it may contain identifier: and any text', 'at all is allowed']
- label: 'identifier'
Or this code to pull out the label and description fields:
# Print each label, then its description lines prefixed with "..", then a
# blank line.
for entry in parsed_data:
    print(entry.label)
    print('..' + '\n..'.join(entry.description))
    print()
identifier
..some description text here which will wrap
..on to the next line. the follow-on text should be
..indented. it may contain identifier: and any text
..at all is allowed
next_identifier
..more description, short this time
last_identifier
..blah blah

How can I tokenize this text into sentences with Regex

"You could not possibly have come at a better time, my dear Watson,"
he said cordially. 'It is not worth your while to wait,' she went
on."You can pass through the door; no one hinders." And then, seeing that I smiled and shook my head, she suddenly threw aside her
constraint and made a step forward, with her hands wrung together.
Look at the highlighted area. How can I possibly distinguish a case where '"' is followed by a period (.) to end a sentence and a case where a period (.) is followed by a '"'
I have tried this piece for the tokenizer. It works well except for just that one part.
(([^।\.?!]|[।\.?!](?=[\"\']))+\s*[।\.?!]\s*)
Edit: I am not planning to use any NLP toolkit to solve this problem.
Use NLTK instead of regular expressions here:
from nltk import sent_tokenize
# NOTE(review): `your_string` is a placeholder for the text to split; it is
# not defined in this snippet.
parts = sent_tokenize(your_string)
# ['"You could not possibly have come at a better time, my dear Watson," he said cordially.', "'It is not worth your while to wait,' she went on.", '"You can pass through the door; no one hinders."', 'And then, seeing that I smiled and shook my head, she suddenly threw aside her constraint and made a step forward, with her hands wrung together.']
Found this function a while ago
def split_into_sentences(text):
    """Split ``text`` into sentences using punctuation heuristics.

    Periods that belong to titles, initials, acronyms, company suffixes and
    web domains are shielded with a ``<prd>`` marker; the remaining ``.``,
    ``?`` and ``!`` become sentence boundaries.

    Args:
        text: The text to split, as ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        A list of stripped sentences. Trailing text without a terminator is
        returned as a final sentence instead of being dropped.
    """
    caps = r"([A-Z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = r"[.](com|net|org|io|gov|mobi|info|edu)"

    # `unicode` does not exist on Python 3; only raw bytes need decoding.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # Pad with spaces so the patterns that expect a leading space also match
    # at the very start of the text.
    text = " {0} ".format(text)
    text = text.replace("\n", " ")

    # Shield periods that do not end a sentence.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + r"[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside a closing double quote so the quote
    # stays attached to the sentence it closes.
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark sentence boundaries, then restore the shielded periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Keep every non-empty piece, including an unterminated trailing one.
    pieces = (piece.strip() for piece in text.split("<stop>"))
    return [piece for piece in pieces if piece]

Categories