Insert after x node minidom xml python - python

I'm appending a node to an XML document, but I want to insert it before certain tags. Is that possible?
newNode = xmldoc.createElement("tag2")
txt = xmldoc.createTextNode("value2")
newNode.appendChild(txt)
n.appendChild(newNode)
This is my XML. When I append the child, it is added after UniMed; I want it inserted after Cantidad and before UniMed. (This is a simplified version of my XML.) "Item" can have more children, and I do not know how many.
<ns0:Item>
<ns0:Cantidad>1</ns0:Cantidad>
<ns0:UniMed>L</ns0:UniMed>
</ns0:Item>
I think I can solve it by reading all the children of Item, removing them, and then adding them back in the order I want.
But I don't think that's the best idea...
Any ideas?
EDITED
SOLUTION
itemChildNodes = n.childNodes
n.insertBefore(newNode, itemChildNodes[itemChildNodes.length-2])

Use the insertBefore method to insert the newly created tag.
Demo:
>>> from xml.dom import minidom
>>> content = """
... <xml>
... <Item>
... <Cantidad>1</Cantidad>
... <UniMed>L</UniMed>
... </Item>
... </xml>
... """
>>> root = minidom.parseString(content)
>>> insert_tag = root.createElement("tag2")
>>> htext = root.createTextNode('test')
>>> insert_tag.appendChild(htext)
<DOM Text node "'test'">
>>>
>>> items = root.getElementsByTagName("Item")
>>> item = items[0]
>>> item_chidren = item.childNodes
>>> item.insertBefore(insert_tag, item_chidren[2])
<DOM Element: tag2 at 0xb700da8c>
>>> root.toxml()
u'<?xml version="1.0" ?><xml>\n\t<Item>\n\t <Cantidad>1</Cantidad><tag2>test</tag2>\n\t <UniMed>L</UniMed>\n\t</Item>\n</xml>'
>>>

Related

Removing empty xml nodes

I have an XML file that I'm trying to remove empty nodes from with Python. When I test whether the value is, say, 'shark', it works. But when I check for it being None, it doesn't remove the empty node.
for records in recordList:
for fieldGroup in records:
for field in fieldGroup:
if field.text is None:
fieldGroup.remove(field)
xpath is your friend here.
from lxml import etree
doc = etree.XML("""<root><a>1</a><b><c></c></b><d></d></root>""")
def remove_empty_elements(doc):
    """Remove every element under *doc* that has no child nodes at all.

    An element counts as empty when it contains neither text nor child
    elements (XPath ``//*[not(node())]``).  The match list is computed once,
    so a parent that becomes empty only because its children were removed
    is not removed by the same call; call the function again to prune it.

    Text that follows a removed element (its lxml ``tail``) is kept by
    moving it onto the previous sibling or, if there is none, onto the
    parent's text.  An empty root element is left in place because it has
    no parent to remove it from.
    """
    for element in doc.xpath('//*[not(node())]'):
        parent = element.getparent()
        if parent is None:
            # The root element has no parent and cannot be detached.
            continue
        if element.tail:
            # lxml's remove() drops the tail together with the element,
            # so re-attach it to whatever precedes the element first.
            previous = element.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + element.tail
            else:
                parent.text = (parent.text or '') + element.tail
        parent.remove(element)
Then:
>>> print etree.tostring(doc,pretty_print=True)
<root>
<a>1</a>
<b>
<c/>
</b>
<d/>
</root>
>>> remove_empty_elements(doc)
>>> print etree.tostring(doc,pretty_print=True)
<root>
<a>1</a>
<b/>
</root>
>>> remove_empty_elements(doc)
>>> print etree.tostring(doc,pretty_print=True)
<root>
<a>1</a>
</root>

lxml get full text of element

I have the following xml:
<text>test<br/><br/>All you need to know about British birds.<br/></text>
I want to set the whole content of the <text> tag to 11111.
I'm using Python and lxml, and the following is my code:
import nltk
import lxml.etree as le
current_file = '/Users/noor/Dropbox/apps/APIofLife/src/clear_description/bird.rdf'
f = open(current_file,'r')
doc=le.parse(f)
for elem in doc.xpath("//text"):
elem.text = "11111"
f.close()
f = open(current_file,'w')
f.write(le.tostring(doc))
f.close()
However, after running the above codes, my results are:
<text>11111<br/><br/>All you need to know about British birds.<br/></text>
I want to know why the whole content of the tag <text> has not been changed to 11111
According to the lxml.etree._Element documentation, the text property corresponds to the text before the first subelement.
You need to delete sub elements:
>>> import lxml.etree as le
>>>
>>> root = le.fromstring('''<text>test<br/><br/>
... All you need to know about British birds.
... <br/></text>''')
>>> for elem in root.xpath("//text"):
... elem.text = '1111'
... del elem[:] # <----------
...
>>> le.tostring(root)
'<text>1111</text>'

Parsing XML Dom to a Python List using minidom

I have a XML file that I fetch from a website. I have placed the XML in a DOM and I am able to get most of the needed information from it except where I have the following:
<response>
<result name="response" numFound="2567888" start="0">
<doc>
<int name="ImageCount">3</int>
<arr name="Images">
<str>binder/jnws/jnws40/images/p1120.jpg</str>
<str>binder/jnws/jnws40/images/g0753.jpg</str>
<str>binder/jnws/jnws40/images/p0754.jpg</str>
</arr>
</doc>
</result>
</response>
My Code is:
for node in solardom.getElementsByTagName('doc'):
# Get the Image Count & Video Counts for this doc element ..."
imageCount = int(getMyElementValue(node, "int", "ImageCount"))
videoCount = int(getMyElementValue(node, "int", "VideoCount"))
if imageCount > 0:
print "Image Count is: " + str(imageCount)
imageList = getMyList(node, "arr", "Images", imageCount)
def getMyList(n, ntype, s, num):
list = []
i = 0
for node in n.getElementsByTagName(ntype):
if node.getAttribute("name") == s:
print "Found Image Path!!!"
I see that I am at the correct level in the XML but I can't figure out how to stuff the string value for the image paths into a Python list.
Thanks for any assistance or pointers you can give me.
Jake
Try return [child.nodeValue for child in node.childNodes].
Try xmltodict module.
>>> import xmltodict
>>> obj = xmltodict.parse(xml)
>>> print(obj['response']['result']['doc']['arr']['str'])
>>> ['binder/jnws/jnws40/images/p1120.jpg', 'binder/jnws/jnws40/images/g0753.jpg', 'binder/jnws/jnws40/images/p0754.jpg']
Ok, try this
xml = '''
<response>
<result name="response" numFound="2567888" start="0">
<doc>
<int name="ImageCount">3</int>
<arr name="Images">
<str>binder/jnws/jnws40/images/p1120.jpg</str>
<str>binder/jnws/jnws40/images/g0753.jpg</str>
<str>binder/jnws/jnws40/images/p0754.jpg</str>
</arr>
</doc>
</result>
</response>
'''
>>> import xml.etree.ElementTree as ET
>>> root = ET.fromstring(xml)
>>> imgs = [img.text for img in root.findall(".//*[@name='Images']/str")]
>>> ['binder/jnws/jnws40/images/p1120.jpg', 'binder/jnws/jnws40/images/g0753.jpg', 'binder/jnws/jnws40/images/p0754.jpg']
You can read more here

extract tag inside text element of an xml tag

Suppose I have an XML document of the following form
<root>
<foos>
<foo>the quick <bar>brown </bar>fox</foo>
</foos>
<!-- Lots more <foo></foo> -->
</root>
How do I extract the full text string the quick fox as well as the string brown?
import xml.etree.ElementTree as ET
doc = ET.parse(xmldocument).getroot()
foos = doc.find('foos')
for foo in foos:
print foo.text # This will print 'the quick '
Not sure how to solve this problem.
You can also try something like this, which iterates in all nested tags automatically:
foos = doc.find('foos')
for foo in foos:
for text in foo.itertext():
print text.strip(),
print
from scrapy.selector import XmlXPathSelector
xml = \
"""
<root>
<foos>
<foo>the quick <bar>brown </bar>fox</foo>
</foos>
</root>
"""
hxs =XmlXPathSelector(text=xml)
foos = hxs.select('//foos')
for one in foos:
text = one.select('./foo//text()').extract()
text = ''.join(text)
print text

Getting unique value when the same tag is in children's tree in XML with Python

I have getElementText as follows which works pretty well with [0] as the XML that I'm working on doesn't have the duplicate tag.
from xml.dom import minidom
def getElementText(element, tagName):
return str(element.getElementsByTagName(tagName)[0].firstChild.data)
doc = minidom.parse("/Users/smcho/Desktop/hello.xml")
outputTree = doc.getElementsByTagName("Output")[0]
print getElementText(outputTree, "Number")
However, when I parse the following XML, I can't get the value <Number>0</Number> but <ConnectedTerminal><Number>1</Number></ConnectedTerminal> with getElementText(outputTree, "Number"), because the getElementText function returns the first of the two elements with the tag "Number".
<Output>
<ConnectedTerminal>
<Node>5</Node>
<Number>1</Number>
</ConnectedTerminal>
<Type>int8</Type>
<Number>0</Number>
</Output>
Any solution to this problem? Is there any way to get only <Number>0</Number> or <ConnectedTerminal><Number>1</Number></ConnectedTerminal>.
If lxml is an option (it's much nicer than minidom), you can do:
from lxml import etree
doc = etree.fromstring(xml)
node = doc.find('Number')
print node.text # 0
node = doc.xpath('//ConnectedTerminal/Number')[0]
print node.text # 1
Also see the xpath tutorial.
There's not a direct DOM method to do this, no. But it's fairly easy to write one:
def getChildElementsByTagName(element, tag):
    """Return the direct child elements of *element* whose tag is *tag*.

    Unlike the DOM method ``getElementsByTagName``, which searches all
    descendants, only the immediate children of *element* are inspected.
    Passing ``'*'`` as *tag* matches every child element.  Non-element
    children (text, comments, ...) are always skipped.

    Returns a list, empty when nothing matches.
    """
    return [
        child
        for child in element.childNodes
        if child.nodeType == child.ELEMENT_NODE
        and tag in (child.tagName, '*')
    ]
Plus here's a safer text-getting function, so you don't have to worry about multiple nodes, missing nodes due to blank strings, or CDATA sections.
def getTextContent(element):
    """Return the concatenated text of *element* and all its descendants.

    Child elements are visited recursively in document order.  Both plain
    text nodes and CDATA sections contribute their data; other node types
    (comments, processing instructions, ...) are ignored.  An element with
    no text yields an empty string.
    """
    texts = []
    for child in element.childNodes:
        if child.nodeType == child.ELEMENT_NODE:
            texts.append(getTextContent(child))
        elif child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            # CDATA sections carry their own node type in minidom, so a
            # TEXT_NODE check alone would silently drop their content.
            texts.append(child.data)
    return u''.join(texts)
then just:
>>> getTextContent(getChildElementsByTagName(doc, u'Number')[0])
u'0'
>>> getTextContent(getChildElementsByTagName(doc, u'Output')[0].getElementsByTagName(u'Number')[0])
u'1'

Categories