search/replace content of xml - python

I've been successful using xml.etree.ElementTree to parse an XML file, search for content, and then write it to a different XML file. However, so far I have only worked with text inside a single tag.
import os, sys, glob, xml.etree.ElementTree as ET
# Root folder; each sub-folder is scanned for "*overall.xml" metadata files.
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
for fn in os.listdir(path):
    for filepath in glob.glob(path + os.sep + fn + os.sep + "*overall.xml"):
        (pa, filename) = os.path.split(filepath)
        working_xml = pa + os.sep + "base_metadata_overall.xml"

        # Grab the abstract text from the old, archived metadata file; this
        # text then gets put into the current, working .xml.
        archive_root = ET.parse(pa + os.sep + "archive" + os.sep + "base_metadata_overall.xml").getroot()
        abstracts = list(archive_root.iter("abstract"))
        if not abstracts:
            # Without this guard the text from a previously processed folder
            # (or an undefined name) would be written into this file.
            print("skipped --- " + filename + " (no abstract in archive)")
            continue
        # As in the original loop, the last <abstract> in the archive wins.
        correct_abstract = abstracts[-1].text

        root2 = ET.parse(working_xml).getroot()
        # Materialise the list first: children are removed while we walk.
        for descript in list(root2.iter("descript")):
            # The <abstract> is a child of <descript>; testing the tag of the
            # <descript> element itself could never match.
            old_abstract = descript.find("abstract")
            if old_abstract is None:
                continue
            descript.remove(old_abstract)
            # NOTE(review): the replacement element is tagged "title", exactly
            # as posted -- confirm this is intended rather than "abstract".
            new_symbol_abstract = ET.SubElement(descript, "title")
            new_symbol_abstract.text = correct_abstract

        ET.ElementTree(root2).write(working_xml)
        print("created --- " + filename + " metadata")
But now, I need to:
1) search an xml and grab everything between "attr" tags, below is example:
<attr><attrlabl Sync="TRUE">OBJECTID</attrlabl><attalias Sync="TRUE">ObjectIdentifier</attalias><attrtype Sync="TRUE">OID</attrtype><attwidth Sync="TRUE">4</attwidth><atprecis Sync="TRUE">0</atprecis><attscale Sync="TRUE">0</attscale><attrdef Sync="TRUE">Internal feature number.</attrdef></attr>
2) Now, I need to open a different xml and search for all content between the same "attr" tag and replace with the above.
Basically, the same as what I was doing before, but ignoring the subelements, attributes, etc. between the "attr" tags and treating everything between them as text.
thanks!!
Please bear with me; posting on this forum is a little different than I'm used to!
Here's what I have so far:
import os, sys, glob, re, xml.etree.ElementTree as ET
from lxml import etree
# Work-in-progress attempt: pull <attr>...</attr> spans out of attributes.xml
# with a regex and try to swap them into base_metadata_overall.xml via lxml.
path = r"C:\\temp\\python\\xml"
for fn in os.listdir(path):
filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
for filepath in filepaths:
(pa, filename) = os.path.split(filepath)
xml = open(pa + os.sep + "attributes.xml")
xmltext = xml.read()
# NOTE(review): this pattern requires a space right after "<attr>", unlike
# the pattern used for old_attrs below.
correct_attrs = re.findall("<attr> (.*?)</attr>",xmltext,re.DOTALL)
for item in correct_attrs:
correct_attribute = "<attr>" + item + "</attr>"
xml2 = open(pa + os.sep + "base_metadata_overall.xml")
xmltext2 = xml2.read()
# NOTE(review): searches xmltext (the source text) again; xmltext2 is read
# above but never used.
old_attrs = re.findall("<attr>(.*?)</attr>",xmltext,re.DOTALL)
for item2 in old_attrs:
# NOTE(review): uses item (left over from the earlier loop), not item2.
old_attribute = "<attr>" + item + "</attr>"
old = etree.fromstring(old_attribute)
# NOTE(review): `new` is never assigned in this snippet, and neither `copy`
# nor `lxml` is imported by the import lines above it.
replacement = new.xpath('//attr')
for attr in old.xpath('//attr'):
# NOTE(review): replacement is the whole xpath result, presumably a list
# rather than a single element -- confirm against the lxml documentation.
attr.getparent().replace(attr, copy.deepcopy(replacement))
print lxml.etree.tostring(old)
got this working, see below, even figured out how to export to new .xml
However, if the number of attr's differs between source and destination, I get the following error. Any suggestions?
node = replacements.pop()
IndexError: pop from empty list
import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
# Replaces every <attr> of base_metadata_overall.xml with the <attr> at the
# same position in attributes.xml and writes the result to a new file.
path = r"C:\\temp\\python\\xml"
for fn in os.listdir(path):
filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
for filepath in filepaths:
# NOTE(review): `pa` is used below but never assigned in this snippet; the
# other versions set it with os.path.split(filepath).
xmlatributes = open(pa + os.sep + "attributes.xml")
xmlatributes_txt = xmlatributes.read()
xmltarget = open(pa + os.sep + "base_metadata_overall.xml")
xmltarget_txt = xmltarget.read()
source = lxml.etree.fromstring(xmlatributes_txt)
dest = lxml.etree.fromstring(xmltarget_txt)
replacements = source.xpath('//attr')
# Reversed so that pop() hands the source nodes back in their original order.
replacements.reverse()
for attr in dest.xpath('//attr'):
# One pop per destination <attr>: when dest holds more <attr> nodes than
# source, the list runs dry and pop() raises the IndexError quoted above.
node = replacements.pop()
attr.getparent().replace(attr, copy.deepcopy(node))
#print lxml.etree.tostring(dest)
tree = ET.ElementTree(dest)
tree.write (pa + os.sep + "edited_metadata.xml")
print fn + "--- sucessfully edited"
update 5/16/2011
restructured a few things to fix the "IndexError: pop from empty list" error mentioned above. Realized that the replacement of the "attr" tags will not always be a 1-to-1 replacement. For ex. sometimes the source .xml has 20 attr's and the destination .xml has 25 attr's. In this case, the 1-to-1 replacement would choke.
Anyway, the below will remove all attr's, then replace with the source attr's. It also checks for another tag, "subtype" if it exists, it adds them after the attr's, but inside the "detailed" tags.
thanks again to everyone who helped.
import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
# Root folder; each sub-folder holds "attributes.xml" (source of the correct
# <attr>/<subtype> elements) next to the "*overall.xml" metadata being fixed.
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
for fn in os.listdir(path):
    # Derived from the folder name; not used further in this excerpt.
    correct_title = fn.replace('_', ' ') + " various facilities"
    correct_fc_name = fn.replace('_', ' ')
    for filepath in glob.glob(path + os.sep + fn + os.sep + "*overall.xml"):
        print("-----" + fn + "-----")
        (pa, filename) = os.path.split(filepath)

        # Parsing by path lets lxml open and close the files itself (no
        # leaked handles) and honour the encoding declared in each file.
        source = lxml.etree.parse(pa + os.sep + "attributes.xml").getroot()
        dest = lxml.etree.parse(pa + os.sep + "base_metadata_overall.xml").getroot()
        replacements = source.xpath('//attr')
        replacesubtypes = source.xpath('//subtype')

        # Remove every existing <attr>: source and destination counts differ,
        # so a one-to-one replacement is not possible.
        for old_attr in dest.xpath('//attr'):
            old_attr.getparent().remove(old_attr)
            print(old_attr.tag + " removed")

        for detailed in dest.xpath('//detailed'):
            # Insert after the first child of <detailed>: first the attrs,
            # then the subtypes. The position advances so document order is
            # preserved (a fixed index would reverse the nodes), and each node
            # is deep-copied so it stays available for any further <detailed>.
            pos = 1
            for new_node in replacements + replacesubtypes:
                detailed.insert(pos, copy.deepcopy(new_node))
                pos += 1
            print("attr's replaced")
            if replacesubtypes:
                print("subtype's replaced")

        # Serialise with lxml itself rather than wrapping an lxml element in
        # an xml.etree ElementTree.
        dest.getroottree().write(pa + os.sep + "base_metadata_overall_v2.xml")
        print(fn + "--- sucessfully edited")

Here is an example of using lxml to do this. I'm not exactly sure how you want the <attr/> nodes replaced, but this example should provide a pattern you can reuse.
Update - I changed it to replace each <attr> in tree2 with the corresponding node from tree1, in document order:
import copy
import lxml.etree
xml1 = '''<root><attr><chaos foo="0"/></attr><attr><arena foo="1"/></attr></root>'''
xml2 = '''<tree><attr><one/></attr><attr><two/></attr></tree>'''
tree1 = lxml.etree.fromstring(xml1)
tree2 = lxml.etree.fromstring(xml2)
# Pair each <attr/> in tree2 with the <attr/> at the same position in tree1,
# in document order. zip() stops at the shorter list, so differing counts no
# longer raise "IndexError: pop from empty list"; surplus nodes in tree2 are
# simply left untouched.
for attr, node in zip(tree2.xpath('//attr'), tree1.xpath('//attr')):
    # replace the attr node in tree2 with a copy of its counterpart from tree1
    attr.getparent().replace(attr, copy.deepcopy(node))
print(lxml.etree.tostring(tree2))
Result:
<tree>
<attr><chaos foo="0"/></attr>
<attr><arena foo="1"/></attr>
</tree>

This sounds like something that XSL-T transformations were made for. Have you tried that?
I'd also recommend a library like Beautiful Soup for parsing and manipulating XML.

Related

Append to associate array

I have a Python script that iterates through a PDF file (loops over each page), and inside each page does some text manipulation. So basically two loops:
# Tries to build a page -> column-file mapping keyed by page index.
files = {}
# npages is the number of PDF pages in the specific file.
for n in range(npages):
path = pdf_name + str(n + 1) + '_1.txt'
# The value stored for page n is a plain string ...
files[int(n)] = path
for i, col in enumerate(COLUMNS):
path = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
# ... so indexing into it and assigning here is what raises the TypeError.
files[int(n)][int(i)] = path
Basically, I loop over each PDF page, and on each page I then do some further text manipulation.
I am trying to output it like:
- file_page_1.pdf
- file_page_1_col_1.pdf
- file_page_1_col_2.pdf
file_page_2.pdf
- file_page_2_col_1.pdf
- file_page_2_col_2.pdf
However, using the above code gives me the error below:
files[int(n)][int(i)] = path
TypeError: 'str' object does not support item assignment
I think the structure you're looking for is a dict that has string keys to list values.
# Map each page file (string key) to the list of its column files.
files = {}
for page in range(npages):
    # Use the loop variable itself; the posted version referred to an
    # undefined name `n` here.
    path = pdf_name + str(page + 1) + '_1.txt'
    files[path] = []
    for i, col in enumerate(COLUMNS):
        subpath = pdf_name + str(page + 1) + '_' + str(i + 2) + '.txt'
        files[path].append(subpath)

# For accessing items
for path, subpaths in files.items():
    # path is a string, the key in files dict
    print(path)
    # subpaths is a list of strings, the value in files dict
    for subpath in subpaths:
        print(subpath)
If you want the path/subpath pairs to be returned in the order it was inserted, you can use OrderedDict instead of dict.
from collections import OrderedDict
files = OrderedDict()
# code as above
it is because files[int(n)] returns you str and not a dictionary.
as you can see from your line.
files[int(n)] = path
you are trying to achieve a dictionary behavior from a str object.
to carry out what you are trying to do we can do something like.
from collections import defaultdict
# Map each page index to its own mapping:
#   'path_root' -> the page file, column index -> that column's file.
files = {}
for n in range(npages):
    # defaultdict() without a factory behaves like a plain dict here.
    page_files = defaultdict()
    page_files['path_root'] = pdf_name + str(n + 1) + '_1.txt'
    for i, col in enumerate(COLUMNS):
        # n and i are already ints, so no int() conversion is needed.
        page_files[i] = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
    files[n] = page_files
this should give you result like:
|-- nth file
| |
| |- path_root
| |- child1 (0)
| |- child2 (1)
..
A quick side note about defaultdict:
# A plain dict raises KeyError for a key that was never stored.
somedict = {}
print(somedict[3]) # KeyError
# A defaultdict creates the missing value by calling its factory (int here).
someddict = defaultdict(int) # or str
print(someddict[3]) # prints int(), i.e. 0 (a str factory would give '')

python recursively rename directories

I made a script to rename directories with a name that contains spaces or special characters recursively:
import os
import re
import pdb
# Recursively rename directories under `root`: spaces become underscores and
# every character outside [a-zA-Z0-9-_] is dropped.
def renameInvalid(root):
print("root is: " + root)
for f in os.listdir(root):
# NOTE(review): f is a bare entry name from os.listdir(root); isdir() and
# rename() below resolve it against the current working directory, not
# against root, so they only hit the right entry while root is the cwd.
if os.path.isdir(f):
old = f
f = f.replace(" ", "_")
f = re.sub(r'[^a-zA-Z0-9-_]', '',f)
if old != f:
print(root + " na substitutie")
os.rename(old,f)
print(root + " na hernoemen")
print("renamed " + old + " to " + f )
#pdb.set_trace()
# Build the child path by hand and recurse into it.
f = '/' + f
pad = root + f
renameInvalid(str(pad))
mountpunt = os.getcwd()
renameInvalid(mountpunt)
You can test this script by making two directories with names containing spaces. You place one of the directories inside the other and run the script from inside the first directory. The script renames the first directory but generates an OSError on isdir(f).
Does anyone know what is the problem here?
Regards,
I found the answer (thanks to timbaileyjones for his solution).
import os
import re
def renameInvalid(root):
    """Recursively sanitise the names of all files and folders under *root*.

    Spaces are replaced by underscores and every character outside
    ``[a-zA-Z0-9-_]`` is removed (this includes dots, so file extensions
    lose their separator). Entries are addressed by their full path, so the
    function works for any *root* and never changes the process's working
    directory.

    An entry is left untouched (and reported) when its sanitised name would
    be empty or would collide with an existing entry; renaming over an
    existing file could silently destroy it.
    """
    for old in os.listdir(root):
        new = re.sub(r'[^a-zA-Z0-9-_]', '', old.replace(" ", "_"))
        old_path = os.path.join(root, old)
        new_path = os.path.join(root, new)
        if new != old:
            if not new or os.path.exists(new_path):
                print("skipped " + old + " (no usable new name)")
                new_path = old_path  # still descend into it below
            else:
                os.rename(old_path, new_path)
                print("renamed " + old + " to " + new)
        # Do not follow symlinks: they may point outside the tree.
        if os.path.isdir(new_path) and not os.path.islink(new_path):
            renameInvalid(new_path)
renameInvalid(".")
One should only run this code if they know what they are doing. It renames all the folders and files with whitespace or special characters in the filename.
Regards,

Python - excluding tags from a list of a different tags

I have a python script for Editorial on iOS that I've modified, and I would like help tweaking it further.
I have .taskpaper files in a Dropbox folder that Editorial is pointed at. When I run this workflow, the script searches all the files and returns a list of lines that include "#hardware". This is working well, but the final list includes #hardware items that I've finished and tagged with #done. How can I exclude #hardware lines that also contain #done?
There are seven files that run. These two seem to be the ones that need to be modified:
Generate the list of hashtags
import editor
import console
import os
import re
import sys
import codecs
import workflow
# Collect every hashtag used in the .taskpaper files that live next to the
# document currently open in Editorial, and hand the sorted, de-duplicated
# list to the next workflow step.
pattern = re.compile(r'\s#{1}(\w+)', re.I|re.U)
p = editor.get_path()
from urllib import quote
dir = os.path.split(p)[0]
valid_extensions = set(['.taskpaper'])
tags = ['#hardware']
for dir_path, _dirnames, filenames in os.walk(dir):
    for name in filenames:
        full_path = os.path.join(dir_path, name)
        ext = os.path.splitext(full_path)[1]
        if ext.lower() not in valid_extensions:
            continue
        try:
            with codecs.open(full_path, 'r', 'utf-8') as f:
                for line in f:
                    for match in pattern.finditer(line):
                        # group(1) is the tag name without the leading '#'.
                        tags.append(match.group(1))
        except UnicodeDecodeError:
            # Deliberate best effort: files that are not valid UTF-8 are
            # skipped. ("except X, e" is Python-2-only syntax and the bound
            # name was unused.)
            pass
workflow.set_output('\n'.join(sorted(set(tags))))
and
Search documents with hashtags
import editor
import console
import os
import re
import sys
import codecs
import workflow
from StringIO import StringIO
# Search every .taskpaper file next to the current document for the hashtag
# stored in the workflow variable 'Search Term' and emit an HTML result list
# whose links open the matching file with the hit selected.
theme = editor.get_theme()
workflow.set_variable('CSS', workflow.get_variable('CSS Dark' if theme == 'Dark' else 'CSS Light'))
p = editor.get_path()
searchterm = workflow.get_variable('Search Term')
term = '#' + searchterm
pattern = re.compile(re.escape(term), flags=re.IGNORECASE)
from urllib import quote
dir = os.path.split(p)[0]
valid_extensions = set(['.taskpaper'])
html = StringIO()
match_count = 0
for dir_path, _dirnames, filenames in os.walk(dir):
    for name in filenames:
        full_path = os.path.join(dir_path, name)
        ext = os.path.splitext(full_path)[1]
        if ext.lower() not in valid_extensions:
            continue
        found_snippets = []
        # Character offset of the current line within the file, so that
        # per-line match positions become whole-file selection ranges.
        i = 0
        try:
            with codecs.open(full_path, 'r', 'utf-8') as f:
                for line in f:
                    for match in pattern.finditer(line):
                        # Up to 100 characters of context on either side.
                        start = max(0, match.start(0) - 100)
                        end = min(len(line)-1, match.end(0) + 100)
                        snippet = (line[start:match.start(0)],
                                   match.group(0),
                                   line[match.end(0):end],
                                   match.start(0) + i,
                                   match.end(0) + i)
                        found_snippets.append(snippet)
                    i += len(line)
        except UnicodeDecodeError:
            # Deliberate best effort: stop reading a file that is not valid
            # UTF-8 but keep whatever was found before the bad bytes.
            pass
        if len(found_snippets) > 0:
            match_count += 1
            root, rel_path = editor.to_relative_path(full_path)
            html.write('<h2>' + name + '</h2>')
            for snippet in found_snippets:
                start = snippet[3]
                end = snippet[4]
                select_url = 'editorial://open/' + quote(rel_path.encode('utf-8')) + '?root=' + root
                select_url += '&selection=' + str(start) + '-' + str(end)
                # NOTE(review): file name and snippet text are written into
                # the HTML unescaped; a '<' or '&' in a note will be
                # interpreted as markup.
                html.write('<a class="result-box" href="' + select_url + '">' + snippet[0] + '<span class="highlight">' + snippet[1] + '</span>' + snippet[2] + '</a>')
if match_count == 0:
    html.write('<p>No matches found.</p>')
workflow.set_output(html.getvalue())
Thank you.
Since the matching lines are stored in a list, you can use a list comprehension to exclude the ones you don't want. Something like this:
l = ['#hardware ttuff', 'stuff #hardware', 'things #hardware sett #done', '#hardware', '#hardware# #done']
print(l)
['#hardware ttuff', 'stuff #hardware', 'things #hardware sett #done', '#hardware', '#hardware# #done']
m = [ s for s in l if '#done' not in s]
print(m)
['#hardware ttuff', 'stuff #hardware', '#hardware']
A friend solved it for me.
We added:
if not "#done" in line:
in the "Search documents with hashtags" file after
for line in f:
Works great

os.walk() filename scope inside inner loop

Writing a script to help with data migration in renaming images. It seems as though when I try to access the variable filename from within the inner-for-loop, it's just printing .DS_Store
See commented lines for example:
#!/usr/bin/env python
import os
import csv
# For every file under FILE_PATH, look up its old name in the CSV and try to
# rename it to "<displayName>_<colour>-<number>.jpg".
FILE_PATH = '/Users/admin/Desktop/data-migration/images/product'
COUNT = 0
with open('paths_formatted.csv') as csvfile:
reader = csv.reader(csvfile)
# Walk the tree.
for root, directories, files in os.walk(FILE_PATH):
for filename in files:
# Join the two strings in order to form the full filepath.
filePath = os.path.join(root, filename)
#print(filePath) - this results in the actual file path
# NOTE(review): a csv.reader is a one-shot iterator over the open file.
# The first file visited consumes every row; for all later files this
# loop body never runs, so only the first filePath is ever seen in here.
for row in reader:
#print(filePath) - this results in .DS_Store
oldFilePath = row[1].strip()
displayName = row[0].strip()
colour = row[2].strip()
if " " in colour:
colour = colour.replace(" ", "-")
# Keep only the part after the last "/" of the old path.
slashIndex = oldFilePath.rfind("/")
oldFileName = oldFilePath[slashIndex+1:]
if oldFileName == filename:
number = 1;
newFileName = displayName + "_" + colour + "-" + str(number) + ".jpg"
# NOTE(review): leadingPath is not defined anywhere in this snippet.
while os.path.exists(FILE_PATH + leadingPath + newFileName):
number = number + 1
# NOTE(review): the comma makes newFileName a 2-tuple, and the ".jpg"
# suffix is missing on this line.
newFileName = filePath, displayName + "_" + colour + "-" + str(number)
# NOTE(review): os.rename needs a source and a destination argument.
os.rename(newFileName)
COUNT = COUNT+1
print(COUNT)
Why would this be?
After changing my code as per the comments, to store the results in a list, now the for root, directories, files in os.walk(FILE_PATH): is not being executed.
I verified that the FILE_PATH exists and printed it to console, also that it has contents.
My new code is as follows:
#!/usr/bin/env python
import os
import csv
# Second version: the CSV rows are read into a list first, then every file
# under FILE_PATH is compared against each stored row.
# NOTE(review): the indentation of this snippet was lost, so whether the
# os.walk loop sits inside the `with`/`for row` block cannot be determined
# from the text -- check the nesting first.
FILE_PATH = '/Users/admin/Desktop/data-migration/images/product'
COUNT = 0
productInfo = []
with open('paths_formatted.csv') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
productInfo.append(row)
for root, directories, files in os.walk(FILE_PATH):
for filename in files:
for info in productInfo:
# Unlike the first version, the fields are no longer strip()ped.
displayName = info[0]
oldFilePath = info[1]
colour = info[2]
slashIndex = oldFilePath.rfind("/")
oldFileName = oldFilePath[slashIndex+1:]
if " " in colour:
colour = colour.replace(" ", "-")
if oldFileName == filename:
number = 1;
newFileName = displayName + "_" + colour + "-" + str(number) + ".jpg"
# NOTE(review): leadingPath is not defined anywhere in this snippet.
while os.path.exists(FILE_PATH + leadingPath + newFileName):
number = number + 1
# NOTE(review): filePath is no longer assigned in this version, and the
# comma makes newFileName a 2-tuple.
newFileName = filePath, displayName + "_" + colour + "-" + str(number) + ".jpg"
# NOTE(review): os.rename needs a source and a destination argument.
os.rename(newFileName)
COUNT = COUNT + 1
print(COUNT)

Why does my Python XML parser break after the first file?

I am working on a Python (3) XML parser that should extract the text content of specific nodes from every xml file within a folder. Then, the script should write the collected data into a tab-separated text file. So far, all the functions seem to be working. The script returns all the information that I want from the first file, but it always breaks, I believe, when it starts to parse the second file.
When it breaks, it returns "TypeError: 'str' object is not callable." I've checked the second file and found that the functions work just as well on that as the first file when I remove the first file from the folder. I'm very new to Python/XML. Any advice, help, or useful links would be greatly appreciated. Thanks!
import xml.etree.ElementTree as ET
import re
import glob
import csv
import sys
# Extract text, quotes and two ids from every XML file in a folder and write
# them to two tab-separated text files.
content_file = open('WWP Project/WWP_texts.txt','wt')
quotes_file = open('WWP Project/WWP_quotes.txt', 'wt')
list_of_files = glob.glob("../../../Documents/WWPtextbase/distribution/*.xml")
# Prefix -> namespace URI map used by the 'wwp:' paths below.
ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}
# Whole-document text: newlines/tabs become spaces, runs of spaces are
# collapsed, result is lower-cased.
def content(tree):
lines = ''.join(ET.tostring(tree.getroot(),encoding='unicode',method='text')).replace('\n',' ').replace('\t',' ').strip()
clean_lines = re.sub(' +',' ', lines)
return clean_lines.lower()
# Text of every wwp:quote element, joined with spaces; tabs and newlines are
# removed and the result is lower-cased.
def quotes(tree):
quotes_list = []
for node in tree.findall('.//wwp:quote', namespaces=ns):
quote = ET.tostring(node,encoding='unicode',method='text')
clean_quote = re.sub(' +',' ', quote)
quotes_list.append(clean_quote)
return ' '.join(str(v) for v in quotes_list).replace('\t','').replace('\n','').lower()
# 'ref' attribute of the first matching persName, minus the
# 'personography.xml#' prefix; returns on the first node found.
def pid(tree):
for node in tree.findall('.//wwp:sourceDesc//wwp:author/wwp:persName[1]', namespaces=ns):
pid = node.attrib.get('ref')
return pid.replace('personography.xml#','') # will need to replace 'p:'
# 'n' attribute of the first wwp:sourceDesc; returns on the first node found.
def trid(tree): # this function will eventually need to call OT (.//wwp:publicationStmt//wwp:idno)
for node in tree.findall('.//wwp:sourceDesc',namespaces=ns):
trid = node.attrib.get('n')
return trid
# Header rows.
content_file.write('pid' + '\t' + 'trid' + '\t' +'text' + '\n')
quotes_file.write('pid' + '\t' + 'trid' + '\t' + 'quotes' + '\n')
for file_name in list_of_files:
file = open(file_name, 'rt')
tree = ET.parse(file)
file.close()
# NOTE(review): each assignment below rebinds the module-level function name
# to the value that function returned. On the second file `pid`, `trid`,
# `content` and `quotes` are therefore strings, and calling them raises
# "TypeError: 'str' object is not callable".
pid = pid(tree)
trid = trid(tree)
content = content(tree)
quotes = quotes(tree)
content_file.write(pid + '\t' + trid + '\t' + content + '\n')
quotes_file.write(pid + '\t' + trid + '\t' + quotes + '\n')
content_file.close()
quotes_file.close()
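You are overwriting your function names with the values the functions return (e.g. pid = pid(tree)), so from the second file onward pid is a string rather than a function. Changing the function names should fix it.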
import xml.etree.ElementTree as ET
import re
import glob
import csv
import sys
# Output files, opened for writing when the script starts and closed at its end.
content_file = open('WWP Project/WWP_texts.txt','wt')
quotes_file = open('WWP Project/WWP_quotes.txt', 'wt')
# Every XML file in the distribution folder is processed.
list_of_files = glob.glob("../../../Documents/WWPtextbase/distribution/*.xml")
# Prefix -> namespace URI map used by the 'wwp:' paths in the get_* functions.
ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}
def get_content(tree):
    """Return the whole text content of *tree* as one normalised string.

    *tree* is a parsed ``ElementTree``. Newlines and tabs are turned into
    spaces, leading/trailing whitespace is stripped, runs of spaces are
    collapsed to one and the result is lower-cased.
    """
    # tostring(..., encoding='unicode') already returns a str, so the
    # former ''.join(...) around it was a no-op and has been dropped.
    text = ET.tostring(tree.getroot(), encoding='unicode', method='text')
    text = text.replace('\n', ' ').replace('\t', ' ').strip()
    return re.sub(' +', ' ', text).lower()
def get_quotes(tree, namespaces=None):
    """Return the text of every ``wwp:quote`` in *tree* as one string.

    The quotes are taken in document order, whitespace inside each quote
    (including newlines and tabs) is collapsed to single spaces, the quotes
    are joined with one space and the result is lower-cased.

    *namespaces* is the prefix map used to resolve ``wwp:``; when omitted,
    the module-level ``ns`` mapping is used.
    """
    if namespaces is None:
        namespaces = ns
    quotes_list = []
    for node in tree.findall('.//wwp:quote', namespaces=namespaces):
        # itertext() yields only the text inside the element. Serialising the
        # node with tostring(method='text') also emits the node's tail, i.e.
        # the running text that follows the quote.
        text = ''.join(node.itertext())
        # split()/join turns newlines and tabs into spaces instead of deleting
        # them, so words separated only by a line break are not glued together.
        quotes_list.append(' '.join(text.split()))
    return ' '.join(quotes_list).lower()
def get_pid(tree, namespaces=None):
    """Return the person id of the first author ``persName`` in *tree*.

    The id is the element's ``ref`` attribute with the
    ``'personography.xml#'`` prefix removed. Returns ``None`` when no such
    element exists or when it carries no ``ref`` attribute (the latter used
    to raise ``AttributeError``).

    *namespaces* is the prefix map used to resolve ``wwp:``; when omitted,
    the module-level ``ns`` mapping is used.
    """
    if namespaces is None:
        namespaces = ns
    # find() returns the first match, which is what the former
    # "loop and return on the first pass" amounted to.
    node = tree.find('.//wwp:sourceDesc//wwp:author/wwp:persName[1]',
                     namespaces=namespaces)
    if node is None:
        return None
    ref = node.get('ref')
    if ref is None:
        return None
    return ref.replace('personography.xml#', '')  # will need to replace 'p:'
def get_trid(tree, namespaces=None):
    """Return the ``n`` attribute of the first ``wwp:sourceDesc`` in *tree*.

    Returns ``None`` when there is no ``sourceDesc`` or it has no ``n``
    attribute.

    *namespaces* is the prefix map used to resolve ``wwp:``; when omitted,
    the module-level ``ns`` mapping is used.
    """
    # this function will eventually need to call OT (.//wwp:publicationStmt//wwp:idno)
    if namespaces is None:
        namespaces = ns
    # find() returns the first match; the former loop returned on its
    # first pass anyway.
    node = tree.find('.//wwp:sourceDesc', namespaces=namespaces)
    if node is None:
        return None
    return node.get('n')
# Write one tab-separated row per XML file to each output file. The
# try/finally guarantees both output files are closed even when a file fails
# to parse.
try:
    content_file.write('pid' + '\t' + 'trid' + '\t' + 'text' + '\n')
    quotes_file.write('pid' + '\t' + 'trid' + '\t' + 'quotes' + '\n')
    for file_name in list_of_files:
        # Let the parser open the file itself so that the encoding declared
        # in the XML is honoured and the handle is always released.
        tree = ET.parse(file_name)
        # The id helpers return None when the element or attribute is
        # missing; write an empty field instead of failing on None + str.
        pid = get_pid(tree) or ''
        trid = get_trid(tree) or ''
        content = get_content(tree)
        quotes = get_quotes(tree)
        content_file.write(pid + '\t' + trid + '\t' + content + '\n')
        quotes_file.write(pid + '\t' + trid + '\t' + quotes + '\n')
finally:
    content_file.close()
    quotes_file.close()

Categories