Parse XML (musicbrainz) In Python - python

I am trying to read URLs like this one (http://musicbrainz.org/ws/2/artist/72c536dc-7137-4477-a521-567eeb840fa8) in Python and extract the value of the "gender" element.
import urllib2
import codecs
import sys
import os
from xml.dom import minidom
import xml.etree.cElementTree as ET
#urlbob = urllib2.urlopen('http://musicbrainz.org/ws/2/artist/72c536dc-7137-4477-a521-567eeb840fa8')
url = 'dylan.xml'
#attempt 1 - using minidom
xmldoc = minidom.parse(url)
itemlist = xmldoc.getElementsByTagName('artist')
#attempt 2 - using ET
tree = ET.parse('dylan.xml')
root = tree.getroot()
for child in root:
    print child.tag, child.attrib
I can't seem to get at gender either via the minidom approach or the etree approach. In its current form, the script returns
{http://musicbrainz.org/ns/mmd-2.0#}artist {'type': 'Person', 'id': '72c536dc-7137-4477-a521-567eeb840fa8'}

That is because you are only looping over root, which is the root element of the tree. Iterating an element directly only yields its immediate children, so here you get the single artist element and stop.
You need to iterate the whole tree so every descendant node is returned, see this:
tree = ET.parse('dylan.xml')
root = tree.getroot()
# loop the root iterable which will keep returning next node
for node in root.iter(): # or root.getiterator() if < Python 2.7
    print node.tag, node.attrib, node.text
Results:
{http://musicbrainz.org/ns/mmd-2.0#}metadata {} None
{http://musicbrainz.org/ns/mmd-2.0#}artist {'type': 'Person', 'id': '72c536dc-7137-4477-a521-567eeb840fa8'} None
{http://musicbrainz.org/ns/mmd-2.0#}name {} Bob Dylan
{http://musicbrainz.org/ns/mmd-2.0#}sort-name {} Dylan, Bob
{http://musicbrainz.org/ns/mmd-2.0#}ipi {} 00008955074
{http://musicbrainz.org/ns/mmd-2.0#}ipi-list {} None
{http://musicbrainz.org/ns/mmd-2.0#}ipi {} 00008955074
{http://musicbrainz.org/ns/mmd-2.0#}ipi {} 00008955172
{http://musicbrainz.org/ns/mmd-2.0#}isni-list {} None
{http://musicbrainz.org/ns/mmd-2.0#}isni {} 0000000121479733
{http://musicbrainz.org/ns/mmd-2.0#}gender {} Male
{http://musicbrainz.org/ns/mmd-2.0#}country {} US
{http://musicbrainz.org/ns/mmd-2.0#}area {'id': '489ce91b-6658-3307-9877-795b68554c98'} None
{http://musicbrainz.org/ns/mmd-2.0#}name {} United States
{http://musicbrainz.org/ns/mmd-2.0#}sort-name {} United States
{http://musicbrainz.org/ns/mmd-2.0#}iso-3166-1-code-list {} None
{http://musicbrainz.org/ns/mmd-2.0#}iso-3166-1-code {} US
{http://musicbrainz.org/ns/mmd-2.0#}begin-area {'id': '04e60741-b1ae-4078-80bb-ffe8ae643ea7'} None
{http://musicbrainz.org/ns/mmd-2.0#}name {} Duluth
{http://musicbrainz.org/ns/mmd-2.0#}sort-name {} Duluth
{http://musicbrainz.org/ns/mmd-2.0#}life-span {} None
{http://musicbrainz.org/ns/mmd-2.0#}begin {} 1941-05-24
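As a side note, iter() also accepts a tag argument, so if gender is all you need you can filter to it directly (a small sketch, assuming the same dylan.xml and namespace URI):
ns = '{http://musicbrainz.org/ns/mmd-2.0#}'
for gender in root.iter(ns + 'gender'):
    print(gender.text)  # Male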

## This prints out the tree as the xml lib sees it
## (I found it made debugging a little easier)
#def print_xml(node, depth = 0):
#    for child in node:
#        print "\t"*depth + str(child)
#        print_xml(child, depth = depth + 1)
#print_xml(root)
# attempt 1
xmldoc = minidom.parse(url)
genders = xmldoc.getElementsByTagName('gender') # <== you want gender not artist
for gender in genders:
    print gender.firstChild.nodeValue
# attempt 2
ns = "{http://musicbrainz.org/ns/mmd-2.0#}"
xlpath = "./" + ns + "artist/" + ns + "gender"
genders = root.findall(xlpath) # <== xpath was made for this..
for gender in genders:
    print gender.text
So the problem with your first attempt is that you're looking at a list of all the artist elements, not the gender elements (gender is a child of the only artist element in that list).
The problem with your second attempt is that you are only looking at the direct children of the root element (which here is just the single artist element), not at the deeper descendants such as gender.
The underlying structure is:
<artist>
    <name>
    <sort-name>
    <ipi>
    <ipi-list>
        <ipi>
        <ipi>
    <isni-list>
        <isni>
    <gender>
    <country>
    <area>
        <name>
        <sort-name>
        <iso-3166-1-code-list>
            <iso-3166-1-code>
    <begin-area>
        <name>
        <sort-name>
    <life-span>
        <begin>
so you need to get root -> artist -> gender, or just search for the node you actually want (gender in this case).
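findall can also take a namespaces mapping, which keeps the XPath readable without string concatenation (a sketch, assuming the same root as above):
nsmap = {'mb': 'http://musicbrainz.org/ns/mmd-2.0#'}
for gender in root.findall('./mb:artist/mb:gender', nsmap):
    print(gender.text)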

Related

creating dynamic nested xml from Excel

I'm trying to convert an Excel file to nested XML, but I could not get the expected result.
Here is my code.
import openpyxl
import xml.etree.ElementTree as etree
# reading data from the source, xls
wb1 = openpyxl.load_workbook(filename='C:\GSH\parent_child.xlsx')
ws1 = wb1.get_sheet_by_name('Sheet1')
row_max = ws1.max_row
# creating xml tree structure
root = etree.Element('Hierarchy')
# iterating through the xls and creating children based on the condition
for row_values in range(2, row_max+1):
    parent = etree.SubElement(root, 'parent')
    parent.text = ws1.cell(column=1, row=row_values).value
    root.append(parent)
    if (ws1.cell(column=1, row = row_values).value == ws1.cell(column=2, row = row_values-1).value):
        print("------Inside if condition")
        print(ws1.cell(column=2, row=row_values).value)
        child = etree.SubElement(parent, 'child')
        child.text = ws1.cell(column=2, row=row_values).value
        parent.append(child)
        print("-------Inside if condition")
tree = etree.ElementTree(root)
tree.write('C:\GSH\gsh.xml')
I am getting XML like this..
However, my XML should look like this.
Any suggestions, please.
The above is the source XLS I am working from.
You can use the actual names as variable names instead of parent and child. This code only covers part of your list and may look tricky, but it works fine. d[child[i]].text = " " is only there so that both the opening and closing tags are written out. For creating variables in a loop with a dictionary, please refer to this.
import xml.etree.ElementTree as ET
India = ET.Element('India') # set root
parent = ['India', 'Telangana', 'Telangana', 'Telangana','Nalgonda'] # parent list
child = ['Telangana', 'Cyberabad', 'Warangal','Nalgonda','BusStation'] # child list
d = {} # use dictionary to define var in loop
d['India'] = India
for i in range(len(child)):
    for k, v in d.items():
        if k == parent[i]:
            pa = v
            break
    d[child[i]] = ET.SubElement(pa, child[i])
    d[child[i]].text = " " # to get both side of tags
tree = ET.ElementTree(India)
tree.write('gsh.xml')
# <India>
#     <Telangana>
#         <Cyberabad> </Cyberabad>
#         <Warangal> </Warangal>
#         <Nalgonda>
#             <BusStation> </BusStation>
#         </Nalgonda>
#     </Telangana>
# </India>
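If you want to eyeball the nesting, one option (not part of the original answer) is to round-trip the tree through minidom for pretty-printing:
from xml.dom import minidom
print(minidom.parseString(ET.tostring(India)).toprettyxml(indent='  '))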

Extracting nested XML elements of different sizes into Pandas

Let's assume we have an arbitrary XML document like the one below:
<?xml version="1.0" encoding="UTF-8"?>
<programs xmlns="http://something.org/schema/s/program">
    <program xmlns:xsd="http://www.w3.org/2001/XMLSchema"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://something.org/schema/s/program http://something.org/schema/s/program.xsd">
        <orgUnitId>Organization 1</orgUnitId>
        <requiredLevel>academic bachelor</requiredLevel>
        <requiredLevel>academic master</requiredLevel>
        <programDescriptionText xml:lang="nl">Here is some text; blablabla</programDescriptionText>
        <searchword xml:lang="nl">Scrum master</searchword>
    </program>
    <program xmlns:xsd="http://www.w3.org/2001/XMLSchema"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://something.org/schema/s/program http://something.org/schema/s/program.xsd">
        <requiredLevel>bachelor</requiredLevel>
        <requiredLevel>academic master</requiredLevel>
        <requiredLevel>academic bachelor</requiredLevel>
        <orgUnitId>Organization 2</orgUnitId>
        <programDescriptionText xml:lang="nl">Text from another organization about some stuff.</programDescriptionText>
        <searchword xml:lang="nl">Excutives</searchword>
    </program>
    <program xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <orgUnitId>Organization 3</orgUnitId>
        <programDescriptionText xml:lang="nl">Also another huge text description from another organization.</programDescriptionText>
        <searchword xml:lang="nl">Negotiating</searchword>
        <searchword xml:lang="nl">Effective leadership</searchword>
        <searchword xml:lang="nl">negotiating techniques</searchword>
        <searchword xml:lang="nl">leadership</searchword>
        <searchword xml:lang="nl">strategic planning</searchword>
    </program>
</programs>
Currently I'm looping over the elements I need by using their absolute paths, since I'm not able to use any of the get or find methods in ElementTree. As such, my code looks like below:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import itertools
tree = ET.parse('data.xml')
root = tree.getroot()
root.tag
dfcols=['organization','description','level','keyword']
organization=[]
description=[]
level=[]
keyword=[]
for node in root:
    for child in node.findall('.//{http://something.org/schema/s/program}orgUnitId'):
        organization.append(child.text)
    for child in node.findall('.//{http://something.org/schema/s/program}programDescriptionText'):
        description.append(child.text)
    for child in node.findall('.//{http://something.org/schema/s/program}requiredLevel'):
        level.append(child.text)
    for child in node.findall('.//{http://something.org/schema/s/program}searchword'):
        keyword.append(child.text)
The goal, of course, is to create one dataframe. However, since each node in the XML file contains one or multiple elements, such as requiredLevel or searchword, I'm currently losing data when I cast it to a dataframe by either:
df = pd.DataFrame(list(itertools.zip_longest(organization, description, level, keyword,
                                             fillvalue=np.nan)), columns=dfcols)
or using pd.Series as given here, or another solution from here which I can't seem to get to fit.
My best bet is not to use Lists at all, since they don't seem to index the data correctly. That is, I lose data from the 2nd to Xth child node. But right now I'm stuck, and don't see any other options.
What my end result should look like is this:
organization      description   level                     keyword
Organization 1    ....          academic bachelor,        Scrum master
                                academic master
Organization 2    ....          bachelor,                 Executives
                                academic master,
                                academic bachelor
Organization 3    ....                                    Negotiating,
                                                          Effective leadership,
                                                          negotiating techniques,
                                                          ....
Consider building a list of dictionaries with comma-collapsed text values. Then pass the list into the pandas.DataFrame constructor:
dicts = []
for node in root:
orgs = ", ".join([org.text for org in node.findall('.//{http://something.org/schema/s/program}orgUnitId')])
desc = ", ".join([desc.text for desc in node.findall('.//{http://something.org/schema/s/program}programDescriptionText')])
lvls = ", ".join([lvl.text for lvl in node.findall('.//{http://something.org/schema/s/program}requiredLevel')])
wrds = ", ".join([wrd.text for wrd in node.findall('.//{http://something.org/schema/s/program}searchword')])
dicts.append({'organization': orgs, 'description': desc, 'level': lvls, 'keyword': wrds})
final_df = pd.DataFrame(dicts, columns=['organization','description','level','keyword'])
Output
print(final_df)
# organization description level keyword
# 0 Organization 1 Here is some text; blablabla academic bachelor, academic master Scrum master
# 1 Organization 2 Text from another organization about some stuff. bachelor, academic master, academic bachelor Excutives
# 2 Organization 3 Also another huge text description from anothe... Negotiating, Effective leadership, negotiating...
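As a side note, findall also accepts a namespaces mapping, which avoids repeating the long {http://...} prefix; a small sketch of the first lookup in that style (assuming the same node loop as above):
ns = {'p': 'http://something.org/schema/s/program'}
orgs = ", ".join(org.text for org in node.findall('.//p:orgUnitId', ns))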
A lightweight xml_to_dict converter can be found here. It can be improved by this to handle namespaces.
# imports needed by the converter below
import io
from xml.etree import ElementTree

def xml_to_dict(xml='', remove_namespace=True):
    """Converts an XML string into a dict

    Args:
        xml: The XML as string
        remove_namespace: True (default) if namespaces are to be removed

    Returns:
        The XML string as dict

    Examples:
        >>> xml_to_dict('<text><para>hello world</para></text>')
        {'text': {'para': 'hello world'}}
    """
    def _xml_remove_namespace(buf):
        # Reference: https://stackoverflow.com/a/25920989/1498199
        it = ElementTree.iterparse(buf)
        for _, el in it:
            if '}' in el.tag:
                el.tag = el.tag.split('}', 1)[1]
        return it.root

    def _xml_to_dict(t):
        # Reference: https://stackoverflow.com/a/10077069/1498199
        from collections import defaultdict
        d = {t.tag: {} if t.attrib else None}
        children = list(t)
        if children:
            dd = defaultdict(list)
            for dc in map(_xml_to_dict, children):
                for k, v in dc.items():
                    dd[k].append(v)
            d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
        if t.attrib:
            d[t.tag].update(('#' + k, v) for k, v in t.attrib.items())
        if t.text:
            text = t.text.strip()
            if children or t.attrib:
                if text:
                    d[t.tag]['#text'] = text
            else:
                d[t.tag] = text
        return d

    buffer = io.StringIO(xml.strip())
    if remove_namespace:
        root = _xml_remove_namespace(buffer)
    else:
        root = ElementTree.parse(buffer).getroot()
    return _xml_to_dict(root)
So let s be the string which holds your XML. We can convert it to a dict via
d = xml_to_dict(s, remove_namespace=True)
Now the solution is straightforward:
rows = []
for program in d['programs']['program']:
    cols = []
    cols.append(program['orgUnitId'])
    cols.append(program['programDescriptionText']['#text'])
    try:
        cols.append(','.join(program['requiredLevel']))
    except KeyError:
        cols.append('')
    try:
        searchwords = program['searchword']['#text']
    except TypeError:
        searchwords = []
        for searchword in program['searchword']:
            searchwords.append(searchword['#text'])
        searchwords = ','.join(searchwords)
    cols.append(searchwords)
    rows.append(cols)
df = pd.DataFrame(rows, columns=['organization', 'description', 'level', 'keyword'])

How to simplify my code - extract all node values of same xml tag name

For example, here is the XML data:
<SOAP-ENV:Body>
    <reportList>
        <reportName>report 1</reportName>
    </reportList>
    <reportList>
        <reportName>report 2</reportName>
    </reportList>
    <reportList>
        <reportName>report 3</reportName>
    </reportList>
</SOAP-ENV:Body>
Here is my code to extract the node values of all reportName, and it works.
import xml.dom.minidom
...
node = xml.dom.minidom.parseString(xml_file.text).documentElement
reportLists = node.getElementsByTagName('reportList')
reports = []
for reportList in reportLists:
    reportObj = reportList.getElementsByTagName('reportName')[0]
    reports.append(reportObj)
for report in reports:
    nodes = report.childNodes
    for node in nodes:
        if node.nodeType == node.TEXT_NODE:
            print (node.data)
result:
report 1
report 2
report 3
Although it works, I want to simplify the code. How to achieve the same result using shorter code?
You can simplify both for loops using list comprehensions:
import xml.dom.minidom
node = xml.dom.minidom.parseString(xml_file.text).documentElement
reportLists = node.getElementsByTagName('reportList')
reports = [report.getElementsByTagName('reportName')[0] for report in reportLists]
node_data = [node.data for report in reports for node in report.childNodes if node.nodeType == node.TEXT_NODE]
node_data is now a list containing the information you were printing.
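If every reportName is guaranteed to sit inside a reportList (as in the sample), a shorter sketch is arguably possible, since getElementsByTagName already searches all descendants:
node = xml.dom.minidom.parseString(xml_file.text).documentElement
node_data = [r.firstChild.nodeValue for r in node.getElementsByTagName('reportName')]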

get text of xml element is not working: AttributeError: 'NoneType' object has no attribute 'findall'

The code below gets information about an artist using the last.fm API. It stores the artist's name in bands[Name] and each tag name (e.g. rock) in bands[Tags]. It works fine for getting and storing the name, but it is not working for the tags. It raises:
Traceback (most recent call last):
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 19, in <module>
for tag in artist.find('tags').findall('tag'):
AttributeError: 'NoneType' object has no attribute 'findall'
Minimal working example to demonstrate the error:
import xml.etree.ElementTree as ET
import requests
ID = 1
chosen = "U2"
artist = requests.get(
    'http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&artist=U2&api_key=b088cbedecd40b35dd89e90f55227ac2')
tree = ET.fromstring(artist.content)
for child in tree:
    for artist in child:
        print(artist)
        for tag in artist.find('tags').findall('tag'):
            print(tag.find('name').text)
The response has this format:
<lfm status="ok">
<artist>
<name>U2</name>
<tags>
<tag>
<name>rock</name>
<url>https://www.last.fm/tag/rock</url>
</tag>
<tag>
<tag>
<name>alternative</name>
<url>https://www.last.fm/tag/alternative</url>
</tag>
</tags>
</artist>
</lfm>
Full working example: the code gets the top artists from a specific country and then collects info about each artist (the part that gets and stores each artist's tags is commented out because it triggers the NoneType error above):
import xml.etree.ElementTree as ET
import requests
import json
ID = 1
api_key = "b088cbedecd40b35dd89e90f55227ac2"
bands = {}
# GET TOP ARTISTS
for i in range(2, 3):
    artistslist = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=geo.gettopartists&country=spain&page='+str(i) +'&api_key=' + api_key)
    tree = ET.fromstring(artistslist.content)
    for child in tree:
        for artist in child.findall('artist'):
            name = artist.find('name').text
            url = artist.find('url').text
            bands[ID] = {}
            bands[ID]['ID'] = ID
            bands[ID]['Name'] = name
            bands[ID]['URL'] = url
            ID += 1
# GET ARTIST INFO
for i, v in bands.items():
    chosen = bands[i]['Name'].replace(" ", "+")
    artist = requests.get(
        'http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&artist=' + chosen + '&api_key=' + api_key)
    tree = ET.fromstring(artist.content)
    for child in tree:
        for artist in child:
            #for tag in artist.find('tags').findall('tag'):
            #    print(tag.find('name').text)
            #    bands[i][Tags] = tag.find('name').text
            if (artist.get('size') == "large"):
                if (artist.text is not None):
                    bands[i]['Image'] = artist.text
            for bio in artist.findall('summary'):
                if (bio.text is not None):
                    bands[i]['Description'] = bio.text
                else:
                    bands[i]['Description'] = bio.text
    print(bands[i]['Name'] + " INFO RETRIEVED")
with open('artists.json', 'w') as outfile:
    json.dump(bands, outfile)
with open('artists.json') as data_file:
    bands = json.load(data_file)
data_file.close()
Do you know how to fix this issue?
Your loops go one level too deep.
<lfm status="ok"> --> tree
<artist> --> child in tree
<name>U2</name> --> for artist in child
<tags>..</tags>
<tags is already part of child and therefore artist.find('tags') will return None.
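A minimal fix for the example above (a sketch, assuming the artist element sits directly under the root as in the pasted response) is to fetch it with find instead of looping one level deeper:
artist_el = tree.find('artist')
for tag in artist_el.find('tags').findall('tag'):
    print(tag.find('name').text)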
You can shorten your loop to:
for band in bands.values():
    url = 'http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&artist={}&api_key={}'.format(band['Name'], api_key)
    artist = requests.get(url)
    tree = ET.fromstring(artist.content)
    if tree.find('artist') is None:
        continue
    for child in tree.find('artist').getchildren():
        if child.get('size') == "large":
            if (child.text is not None):
                band['Image'] = child.text
        for bio in child.findall('summary'):
            if bio.text is not None:
                band['Description'] = bio.text
            else:
                band['Description'] = ""
        for tag in child.findall('tag'):
            if band.get('Tags'):
                band['Tags'].append(tag.find('name').text)
            else:
                band['Tags'] = [tag.find('name').text]
    print(band['Name'] + " INFO RETRIEVED")
A few notes:
It is easier and more efficient to loop over a dict's keys with for k in my_dict, or over its values with for val in my_dict.values().
You are overwriting Tags with the last value; using a list and appending to it will make sure you save all values (see the snippet below).
Your if/else statement (if (bio.text is not None):) behaves identically regardless of the condition, since both branches assign bio.text.
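For the Tags note in particular, dict.setdefault keeps the append logic to one line (a small variation, not in the original answer):
for tag in child.findall('tag'):
    band.setdefault('Tags', []).append(tag.find('name').text)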

Python - Find descendants and ancestors from the unsorted hierarchy file

I have an unsorted parent-child hierarchy file (tab delimited) in the following format:
City1 Area1
City1 Area2
Continent1 Country1
Continent2 Country2
Continent3 Country3
Continent4 Country4
Continents Continent1
Continents Continent2
Continents Continent3
Continents Continent4
Country1 State1
Country2 State2
Country3 State3
Earth Continents
State1 City1
State1 City1.1
State2 City2
My goal is to find all the "descendants" and "ancestors" of a given member.
Here is what I have coded so far:
import sys, re
with open("input.txt", "r") as my_in:
    collections={}
    for line in my_in:
        parent, child=line.rstrip('\r\n').split('\t')
        collections.setdefault(parent, []).append(child)
print (collections)
'''
{'Continent4': ['Country4'], 'Continent2': ['Country2'],
'Continents': ['Continent1', 'Continent2', 'Continent3', 'Continent4'],
'Continent1': ['Country1'], 'Country2': ['State2'],
'Country3': ['State3'], 'State1': ['City1', 'City1.1'],
'Country1': ['State1'], 'State2': ['City2'],
'Earth': ['Continents'], 'City1': ['Area1', 'Area2'], 'Continent3': ['Country3']}
'''
def find_descendants(parent, collections):
    descendants = []
    for descendant in collections[parent]:
        if descendant in collections:
            descendants = descendants + find_descendants(descendant, collections)
        else:
            descendants.append(descendant)
    return descendants
# Get descendants of "Continent1":
lis=find_descendants("Continent1", collections)
print (lis) # It shows ['Area1', 'Area2', 'City1.1']
# Actually it should show ['Country1', 'State1', 'City1', 'Area1', 'Area2', 'City1.1']
def find_ancestors(child, collections):
    # pseudo code
    # link child to its parent and parent to its parent until no more parents are found
    pass
# lis=find_ancestors("City1.1", collections)
# should show ['Earth', 'Continents', 'Continent1', 'Country1', 'State1']
The find_descendants function is not working as expected. As for the find_ancestors function, although I know the pseudocode, I am not able to express it in Python.
Please help.
As I said in the comments, you forgot to append the descendant itself before looking deeper into your collection. This works:
def find_descendants(parent, collections):
    descendants = []
    for descendant in collections[parent]:
        descendants.append(descendant)
        if descendant in collections:
            descendants = descendants + find_descendants(descendant, collections)
    return descendants
For ancestors, just build another collection, say ancestors_collection, that stores the reverse descendant/ancestor relation. The function to find ancestors is then exactly the same as find_descendants, which you can rename accordingly.
EDIT:
Here is a complete working example; I use relative to refer to either an ancestor or a descendant:
import sys, re
with open("input.txt", "r") as my_in:
    descendants={}
    ancestors={}
    for line in my_in:
        parent, child=line.rstrip('\r\n').split('\t')
        descendants.setdefault(parent, []).append(child)
        ancestors.setdefault(child, []).append(parent)
def get_relatives(element, collection):
    relatives = []
    for relative in collection[element]:
        relatives.append(relative)
        if relative in collection:
            relatives = relatives + get_relatives(relative, collection)
    return relatives
return relatives
# Get descendants of "Continent1":
lis=get_relatives("Continent1", descendants)
print (lis)
# shows ['Country1', 'State1', 'City1', 'Area1', 'Area2', 'City1.1']
lis=get_relatives("City1.1", ancestors)
print (lis)
# shows ['Earth', 'Continents', 'Continent1', 'Country1', 'State1']
Here's a simpler solution that uses networkx:
import networkx as nx
coll = nx.DiGraph()
with open("input.txt") as f:
    for line in map(str.strip, f):
        ancestor, descendant = line.split("\t")
        coll.add_edge(ancestor, descendant)
print(nx.descendants(coll, "Continent1"))
# {'Area2', 'City1.1', 'Area1', 'City1', 'State1', 'Country1'}
print(nx.ancestors(coll, "City1.1"))
# {'Earth', 'Continent1', 'State1', 'Continents', 'Country1'}
Both functions return a set so the ancestors and descendants are not ordered.
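If you need them in hierarchy order instead, one option (assuming a single path between the two nodes, as in this data) is to read the path off the graph:
path = nx.shortest_path(coll, "Earth", "City1.1")
print(path[:-1])
# ['Earth', 'Continents', 'Continent1', 'Country1', 'State1']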
