encoding Lithuanian character in xml using python - python

I have a code:
def convert_df_to_xml(df, fd, ld):
    """Write the rows of ``df`` to a pretty-printed UTF-8 XML invoice file.

    Builds an ``<Invoices>`` root with ``from``/``till`` attributes and, for
    every row of ``df``, one ``<Invoice>`` element (attributes ``clientid``
    and ``imones_pavadinimas``) containing a single ``<Product>`` with
    ``Name``, ``Quantity`` and ``Price`` children. The document is written to
    ``bandomasis_itp_xml_failas_V_1.1.xml`` in the current directory.

    Args:
        df: DataFrame with the columns ``partner_id``, ``company_registry``,
            ``Name``, ``time_dif`` and ``price_unit``.
        fd: Start of the period; stored as ``str(fd)`` in the ``from``
            attribute.
        ld: End of the period; stored as ``str(ld)`` in the ``till``
            attribute.
    """
    # Create the main (root) element named Invoices.
    root = ET.Element("Invoices")
    root.set("from", str(fd))
    root.set("till", str(ld))

    # Walk the columns in lockstep so the rows are read by position and the
    # function does not depend on the DataFrame having a 0..n-1 index.
    rows = zip(
        df["company_registry"],
        df["partner_id"],
        df["Name"],
        df["time_dif"],
        df["price_unit"],
    )
    for registry, partner, name, quantity, price in rows:
        # Add the per-row sub-element.
        invoice = ET.SubElement(root, "Invoice")
        # Attribute values must be strings for ElementTree to serialise them.
        invoice.set("clientid", str(registry))
        invoice.set("imones_pavadinimas", str(partner))

        # Add the sub-sub-element and fill it with the row's values.
        product = ET.SubElement(invoice, "Product")
        ET.SubElement(product, "Name").text = str(name)
        ET.SubElement(product, "Quantity").text = str(quantity)
        ET.SubElement(product, "Price").text = str(price)

    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(
        indent=" ", encoding="UTF-8"
    ).decode("UTF-8")

    # The XML declaration says UTF-8, so the file must be written as UTF-8.
    # Without an explicit encoding, open() uses the locale's default codec,
    # which is what mangles characters such as "ė" on non-UTF-8 systems.
    with open("bandomasis_itp_xml_failas_V_1.1.xml", "w", encoding="utf-8") as f:
        f.write(xmlstr)
I'm creating an XML file from a pandas DataFrame. The problem is that in the XML file I get "?" marks instead of the "ė" character.
In the DataFrame I have strings containing the characters "ė, ą, š, ų" and I need them to appear in the XML file.
My dataframe:
df1 = pd.DataFrame({'partner_id': ['MED GRUPĖ, UAB'], 'Name':['Pirmas'], 'company_registry': ['3432543'],
'time_dif':['2'],'price_unit':['23']})
what is the problem with encoding here?

Related

supplementary quotes appearing in my csv using python code

I wrote some code to generate multiple IP addresses and export them to a CSV file:
import csv
import ipaddress
import random
from random import shuffle
# Address pools: one LAN range and two WAN ranges.
LAN = ipaddress.ip_network('192.168.0.0/16')
WAN1 = ipaddress.ip_network('11.10.8.0/22')
WAN2 = ipaddress.ip_network('12.10.8.0/22')
# Materialise the usable host addresses of each network so they can be indexed.
LAN_IP_Adresses = list(LAN.hosts())
WAN1_IP_Adresses = list(WAN1.hosts())
WAN2_IP_Adresses = list(WAN2.hosts())
# Index of the last host address in WAN1.
index_IP_GW = len(WAN1_IP_Adresses)-1
# Candidate site locations. Every entry must be followed by a comma: two
# adjacent string literals are silently concatenated by Python, which is how
# 'Kuala Lumpur' and 'Kyoto' previously ended up as a single bogus entry.
locations_list=['Abidjan','Abu Dhabi','Adana','Adelaide', 'Ahmadabad','Algiers','Amsterdam','Ankara','Anshan','Athens','BANGKOK','BUCHAREST','BUDAPEST','Bagdad','Bandung','Bangalore','Bangkok','Barcelona','Beirut','Belgrade','Bern','Bogota','Brasilia','Brazzaville','Brussels','Bursa','CAIRO','CARACAS','CONAKRY','Canberra','Casablanca','Changchun','Chengdu','Chicago','Copenhagen','Dakar','MINSK','Madrid','Medina','Nairobi','Napoli','Montreal',
'Odessa','Omdurman','Osaka','Ottawa','PYONGYANG','Paris','Pekin', 'Perth','Philadelphia','Phoenix','Pimpri Chinchwad','Porto','Porto Alegre','QUITO','Qingdao','Rabat','Rajkot','Riadh','Rio de Janeiro','Rome','SANTIAGO','Salvador','Samara','San Antonio','San Francisco','Sao Paulo','Sendai','Seongnam','Seoul','Shanghai','Singapore','Sydney','Taiyuan','Tehran','Tijuana','Tokyo','Toronto','Moscou','Moscow','Mumbai (Bombay)','Munich','México','Milan',
'Tripoli','Tunis','Vienna','Warsaw','Wuhan','Xian','Yaounde','Yokohama', 'Zapopan','hong kong','Dallas','Delhi','Doha','Dublin','Durban','Ecatepec','Frankfurt','Fukuoka','Giza','Hamburg','Havana','Helsinki','Houston','Hyderabad','Istanbul','Jaipur','Jakarta','Jeddah','Johannesburg','KIEV','Kaduna','Kano','Kazan','Kuala Lumpur','Kyoto','LUANDA','Lahore','Lanzhou','Le Caire','Leon','Lima','Lisbon','London','Los Angeles','Lyon','MANILA','Melbourne','New York']
#Site_Nmb=1
def initial_Sites_list_generator(filename='SITES_CI.csv', Number_of_Sites=1000):
# Writes one header row followed by Number_of_Sites data rows to `filename`
# through csv.writer, reading the module-level address lists and locations_list.
# (Indentation was lost in this paste; the statements below, up to and
# including file_to_output.close(), belong to the function.)
file_to_output = open(filename,'w',newline='')
csv_writer = csv.writer(file_to_output,delimiter=',')
# Site_Nbr numbers the sites from 1; index selects the matching address from 0.
Site_Nbr=1
index = 0
# Header row. Most of these strings already contain literal double quotes;
# csv.writer quotes any field that contains the quote character and doubles
# the embedded quotes, which is where the unwanted extra quotes come from.
csv_writer.writerow(["SITE_NAME", "SERIAL_NUMBER",'"LAN_IP_ADDRESS"','"WAN_IP_ADDRESS1"','"WAN_IP_ADDRESS2"','"GATEWAY_IP_ADDRESS1"','"GATEWAY_IP_ADDRESS2"','"ROUTE_REFLECTOR"','"LOCATIONS"','"HARDWAREMODEL"','"LANINT"','"WANINT1"','"WANINT2"','"BW_OUT"','"BW_IN"'])
for i in range(1,Number_of_Sites+1):
# shuffle() reorders the shared list in place; random.choice() then picks one entry.
shuffle(locations_list)
location = random.choice(locations_list)
# Data row: same manual quoting as the header. Both gateway columns use the
# fixed index_IP_GW entry, while the LAN/WAN columns advance with `index`.
csv_writer.writerow(['"SITE'+ str(Site_Nbr)+'"',"2e70129bde9c4426b9213d4408c300",f'"{(LAN_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index])}"',f'"{str(WAN2_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',f'"{str(WAN2_IP_Adresses[index_IP_GW])}"','"False"',f'"{location}"','"ONEv600"','"gigabitethernet0/2"','"gigabitethernet0/0"','"gigabitethernet0/1"','"10"','"20"'])
Site_Nbr = Site_Nbr+1
index = index+1
# NOTE(review): the file is closed explicitly rather than via `with`, so it
# stays open if any writerow call above raises.
file_to_output.close()
initial_Sites_list_generator('SITES_OVP.csv', 1000)
but I get unnecessary extra quotes in my CSV.
You are adding the extra quotes yourself. In your for loop, change this line:
csv_writer.writerow(['"SITE'+ str(Site_Nbr)+'"',"2e70129bde9c4426b9213d4408c300",f'"{(LAN_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index])}"',f'"{str(WAN2_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',f'"{str(WAN2_IP_Adresses[index_IP_GW])}"','"False"',f'"{location}"','"ONEv600"','"gigabitethernet0/2"','"gigabitethernet0/0"','"gigabitethernet0/1"','"10"','"20"'])
to this:
# Each list item becomes one CSV column. The site name and the serial number
# are two separate columns, so they must be separated by a comma rather than
# concatenated. No literal quotes are embedded in the values: csv.writer adds
# whatever quoting a field needs on its own.
csv_writer.writerow([
    'SITE' + str(Site_Nbr),
    "2e70129bde9c4426b9213d4408c300",
    str(LAN_IP_Adresses[index]),
    str(WAN1_IP_Adresses[index]),
    str(WAN2_IP_Adresses[index]),
    str(WAN1_IP_Adresses[index_IP_GW]),
    str(WAN2_IP_Adresses[index_IP_GW]),
    'False',
    str(location),
    'ONEv600',
    'gigabitethernet0/2',
    'gigabitethernet0/0',
    'gigabitethernet0/1',
    '10',
    '20',
])
The CSV writer already adds quotes to strings as appropriate.
I did
csv_writer = csv.writer(file_to_output,delimiter=",",quoting=csv.QUOTE_ALL)
and it worked !

Transform Nested XML

I am currently trying to parse a nested XML file into a pandas DataFrame so I can generate a CSV in which each column is an element name and each value is that element's text, but I am having some issues parsing the information out. Below is an example of the nested XML and what I have tried.
The below XML can be quite large with hundreds of different records. This is what I tried:
##Import modules
import xml.etree.ElementTree as ET
import pandas as pd
from lxml import etree
# Parse the document and take its root element.
tree = ET.parse("File.xml")
root = tree.getroot()

# First pass: print every element two levels below the root.
for record in root:
    for field in record:
        print(field.tag, ",", field.text, field.attrib, field.items())

# Second pass: print every element three levels below the root.
for record in root:
    for field in record:
        for leaf in field:
            print(leaf.tag, ",", leaf.text, leaf.attrib)
<?xml version="1.0" encoding="utf-16"?>
<test1 xmlns="test.xsd">
<test2 ID="123123123" test3="123123">
<test3>Separate</test3>
<test4>AA</test4>
<Comments>BB</Comments>
<test5>
<test6 ID="123123">
<test3>today</test3>
<test7>123 street</test7>
</test6>
</test5>
<test8>
<test10 ID="434234">
<test3>type of work</test3>
<test9>test work</test9>
</test10>
</test8>
<test11>
<test12 ID="234234234">
<test3>Social</test3>
<test14>test</test14>
</test12>
<test12 ID="123123">
<test3>Something Here</test3>
<test13>Some date</test13>
<test14>123123124433</test14>
</test12>
</test11>
<test15>
<test16 ID="6456456456">
<test3>Something Something</test3>
<test14>746745636</test14>
</test16>
</test15>
</test2>
<test2 ID="353453245" test3="list of something">
<test3>Somewhere</test3>
<test4>Someone</test4>
<Comments>Some comment</Comments>
<test5>
<test6 ID="567456756">
<test3>Not today</test3>
<test7>5634643643</test7>
<test17>Some Info</test17>
<test19>Somewhere</test19>
<test18>63243333</test18>
</test6>
</test5>
<test11>
<test12 ID="456436346">
<test3>Pattern</test3>
<test14>436346346</test14>
</test12>
<test12 ID="4364356">
<test3> ID</test3>
<test14>5674567457</test14>
</test12>
<test12 ID="123123123443">
<test3>Other ID</test3>
<test13>54234532452345</test13>
<test14>231423532452345</test14>
</test12>
</test11>
<test15>
<test16 ID="34252345">
<test3>None test</test3>
<test14>456436436346</test14>
</test16>
</test15>
</test2>
</test1>
Update So would the full code look something like this?
###TEST USING EXAMPLE HOTLIST
with open("file.csv", "w", newline='') as fout:
    # Output columns, in the order they appear in the CSV.
    header = ['test3','test4','test7','test9','test13','test14','test17','test18','test19','Comments']
    csvout = csv.DictWriter(fout, fieldnames=header)
    csvout.writeheader()
    row = {}
    for _event, node in ET.iterparse('file.xml'):
        # Remove a leading "{namespace}" from the tag, e.g. {Test.xsd}test14 -> test14.
        tag = re.sub("^{.*?}", "", node.tag)
        if tag == 'test2':
            # A completed test2 element ends the record: emit it and start afresh.
            if row:
                print(row)
                csvout.writerow(row)
            row = {}
        if len(node) != 0:
            # Only elements without children contribute a value.
            continue
        value = node.text
        previous = row.get(tag)
        if previous is None:
            # First time this tag is seen in the record.
            row[tag] = value
        elif isinstance(previous, str):
            # Second time: turn the stored value into a list.
            row[tag] = [previous, value]
        else:
            # Third time onwards: the list already exists.
            previous.append(value)
For nested XML you can use iterparse() function to iterate over all elements in the XML. You would then need to have logic to handle the elements depending on what tag it's looking at to add to a dictionary object to export as a row.
for _event, node in ET.iterparse('file.xml'):
    # Every element prints its tag and attributes; childless ones add their text.
    line = f'{node.tag} {node.attrib}'
    if len(node) == 0:
        line += f' text={node.text}'
    print(line)
To create a row in a CSV file from the element text, you can do something like this. If, for example, "test2" marks the beginning of a new record, then it can be used to write the record to a new row and clear the dictionary for the next record.
If you want to output all or some attributes, you need to add a few lines of code for that. If an attribute has the same name as an element, or multiple elements share the same attribute (e.g. ID), you need to address that in your code.
import xml.etree.ElementTree as ET
import re
import csv
with open("out.csv", "w", newline='') as fout:
    # Output columns, in the order they appear in the CSV.
    header = ['test3','test4','test7','test9','test13','test14','test17','test18','test19','Comments']
    csvout = csv.DictWriter(fout, fieldnames=header)
    csvout.writeheader()
    row = {}
    for _event, node in ET.iterparse('test.xml'):
        # Remove a leading "{namespace}" from the tag, e.g. {Test.xsd}test14 -> test14.
        tag = re.sub("^{.*?}", "", node.tag)
        if tag == 'test2':
            # A completed test2 element ends the record: emit it and start afresh.
            if row:
                print(row)
                csvout.writerow(row)
            row = {}
        if not len(node):
            # Childless element: keep its text; a repeated tag overwrites the earlier value.
            row[tag] = node.text
Output:
{'test3': 'Something Something', 'test4': 'AA', 'Comments': 'BB', 'test7': '123 street', 'test9': 'test work', 'test14': '746745636', 'test13': 'Some date'}
{'test3': 'None test', 'test4': 'Someone', 'Comments': 'Some comment', 'test7': '5634643643', 'test17': 'Some Info', 'test19': 'Somewhere', 'test18': '63243333', 'test14': '456436436346', 'test13': '54234532452345'}
CSV Output:
test3,test4,test7,test9,test13,test14,test17,test18,test19,Comments
Something Something,AA,123 street,test work,Some date,746745636,,,,BB
None test,Someone,5634643643,,54234532452345,456436436346,Some Info,63243333,Somewhere,Some comment
Update:
If you want to handle duplicate tags and create a list of values, then try something like this:
if len(elem) == 0:
    # Childless element: merge its text into whatever the row already holds for this tag.
    value = elem.text
    previous = row.get(tag)
    if previous is None:
        # Nothing stored yet: keep the plain value.
        row[tag] = value
    elif isinstance(previous, str):
        # One value stored: promote it to a two-item list.
        row[tag] = [previous, value]
    else:
        # A list is already stored: extend it.
        previous.append(value)

XML Parsing Python ElementTree - Nested for loops

I'm using Jupyter Notebook and ElementTree (Python 3) to create a dataframe and save as csv from an XML file. Here is the XML format (in Estonian):
<asutused hetk="2020-04-14T03:53:33" ver="2">
<asutus>
<registrikood>10000515</registrikood>
<nimi>Osaühing B.Braun Medical</nimi>
<aadress />
<tegevusload>
<tegevusluba>
<tegevusloa_number>L04647</tegevusloa_number>
<alates>2019-12-10</alates>
<kuni />
<loaliik_kood>1</loaliik_kood>
<loaliik_nimi>Eriarstiabi</loaliik_nimi>
<haiglaliik_kood />
<haiglaliik_nimi />
<tegevuskohad>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
</tegevuskohad>
</tegevusluba>
<tegevusluba>
<tegevusloa_number>L04651</tegevusloa_number>
<alates>2019-12-11</alates>
<kuni />
<loaliik_kood>2</loaliik_kood>
<loaliik_nimi>Õendusabi</loaliik_nimi>
<haiglaliik_kood />
<haiglaliik_nimi />
<tegevuskohad>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
</tegevuskohad>
</tegevusluba>
</tegevusload>
<tootajad>
<tootaja>
<kood>D03091</kood>
<eesnimi>Evo</eesnimi>
<perenimi>Kaha</perenimi>
<kutse_kood>11</kutse_kood>
<kutse_nimi>Arst</kutse_nimi>
<erialad>
<eriala>
<kood>E420</kood>
<nimi>üldkirurgia</nimi>
</eriala>
</erialad>
</tootaja>
<tootaja>
<kood>N01146</kood>
<eesnimi>Karmen</eesnimi>
<perenimi>Mežulis</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N01153</kood>
<eesnimi>Nele</eesnimi>
<perenimi>Terras</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N02767</kood>
<eesnimi>Helena</eesnimi>
<perenimi>Tern</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N12882</kood>
<eesnimi>Hanna</eesnimi>
<perenimi>Leemet</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
</tootajad>
</asutus>
</asutused>
Each "asutus" is a hospital and I need some of the information inside. Here is my code:
# Parse the registry file, write selected fields of each <asutus> to EE.csv,
# then load that CSV into a DataFrame. (Indentation was lost in this paste.)
tree = ET.parse("od_asutused.xml")
root = tree.getroot()
# open a file for writing
# NOTE(review): no newline= or encoding= argument is given, so platform defaults apply.
data = open('EE.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(data, delimiter=';')
head = []
count = 0
for member in root.findall('asutus'):
hospital = []
# First record only: collect the tag names that become the CSV header row.
if count == 0:
ident = member.find('registrikood').tag
# NOTE(review): the tag was stored in `ident`, but `id` is appended. `id` is
# not assigned in this snippet, so this is Python's built-in function, which
# matches the '<built-in function id>' column name in the rename further down.
head.append(id)
name = member.find('nimi').tag
head.append(name)
address = member.find('aadress').tag
head.append(address)
facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').tag
head.append(facility_type)
site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').tag
head.append(site_address)
# find() returns only the first matching element, so one tag name is added
# per <tegevusload> child of this record.
for elem in member.findall('tegevusload'):
list_specs = elem.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').tag
head.append(list_specs)
csvwriter.writerow(head)
count = count + 1
# Every record: collect the element texts in the same order as the header.
ident = member.find('registrikood').text
hospital.append(ident)
name = member.find('nimi').text
hospital.append(name)
address = member.find('aadress').text
hospital.append(address)
facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').text
hospital.append(facility_type)
site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').text
hospital.append(site_address)
# NOTE(review): this searches `elem` (the <tegevusload> element left over from
# the header loop above), not `member`, for child <tegevusload> elements. In
# the sample XML <tegevusload> has no such children, so nothing is appended
# here and the list_specs column stays empty.
for spec in elem.findall('tegevusload'):
list_specs = spec.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').text
hospital.append(list_specs)
csvwriter.writerow(hospital)
data.close()
#Upload csv for geocoding
df = pd.read_csv(r'EE.csv', na_filter= False, delimiter=';')
#Rename columns
# The '.1' suffixes are the names pandas gives to repeated header names.
# NOTE(review): 'haiglaliik_kood' is never written to the CSV above.
df.rename(columns = {'<built-in function id>':'id',
'nimi':'name',
'aadress':'address',
'haiglaliik_nimi':'facility_type',
'haiglaliik_kood':'facility_type_c',
'aadress.1':'site_address',
'nimi.1':'list_specs'},
inplace = True)
#Add columns
df['country'] = 'Estonia'
df['cc'] = 'EE'
df.head(10)
And the result of the df.head(10):
Result of dataframe
The "list_specs" is blank no matter what I do. How can I populate this field with a list of each 'nimi' for each site address? Thank you.
I found in your code the following points to change:
At least on my computer, writing through csv.writer causes the newline
characters to be doubled. The remedy I found is to open the output file with
additional parameters:
data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
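There is no point in writing head with Estonian column names and then
renaming the columns. Note also that head.append(id) appends Python's built-in
id function (the tag was stored in ident, not id), which is why the first
column ends up named '<built-in function id>'.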
But this is not so important, as I changed this whole section with writing
target column names (see below).
As you write the CSV file to be read by read_csv, it should contain a
fixed number of columns. So it is a bad practice to use a loop to write
one element.
Your loop for spec in elem.findall('tegevusload') was wrong, because
elem is not set in the current loop (it is left over from the header loop).
Instead you should use member (but I solved this detail another way).
There is no sense to create a variable only in order to use it once.
More concise and readable code is e.g. hospital.append(member.findtext('nimi')).
To avoid long XPath expressions, with repeated initial part, I decided
to set a temporary variable "in the middle" of this path, e.g.
tgvLb = member.find('tegevusload/tegevusluba') and then use a relative
XPath starting from this node.
Your rename instruction contains one not needed column, namely facility_type_c. You read only 6 columns, not 7.
So change the middle part of your code to:
# Write one CSV row per <asutus> element, then load the file into a DataFrame.
# The `with` block guarantees the file is flushed and closed before read_csv
# runs, even if a row raises part-way through.
with open('EE.csv', 'w', newline='\n', encoding='utf-8') as data:
    csvwriter = csv.writer(data, delimiter=';')
    head = ['id', 'name', 'address', 'facility_type', 'site_address', 'list_specs']
    csvwriter.writerow(head)
    for member in root.findall('asutus'):
        hospital = []
        hospital.append(member.findtext('registrikood'))
        hospital.append(member.findtext('nimi'))
        hospital.append(member.findtext('aadress'))
        # First licence of the institution and, below it, the first site.
        # Either may be absent; find() then returns None and the dependent
        # columns are written as empty fields instead of raising AttributeError.
        tgvLb = member.find('tegevusload/tegevusluba')
        tgvKoht = tgvLb.find('tegevuskohad/tegevuskoht') if tgvLb is not None else None
        hospital.append(tgvLb.findtext('haiglaliik_nimi') if tgvLb is not None else None)
        hospital.append(tgvKoht.findtext('aadress') if tgvKoht is not None else None)
        # findtext() returns the text of the first matching <nimi> only.
        hospital.append(tgvKoht.findtext('teenused/teenus/nimi') if tgvKoht is not None else None)
        csvwriter.writerow(hospital)
df = pd.read_csv(r'EE.csv', na_filter= False, delimiter=';')
and drop df.rename from your code.

Building XML from excel data with Python

I am trying to build an xml file from an excel spreadsheet using python but am having trouble building the structure. The xml schema is unique to a software so the opening few tags and ending few would be easier to be written to the xml file just as variables, shown below. They are constant so are pulled from the "
I believe the script needs to loop through another sheet, the ".XML Framework" sheet, to build the .xml structure, as these are the values which will ultimately be changing. The structure of this sheet is provided below.
here is the .xml structure, from which the python is outputting well up to the unique values, and the changing values are shown in bold. This shows just one row of the data from the workbook. When the workbook has a second row, the .xml structure repeats again where it starts with .
The data structure in the excel sheet ".XML Framework" is:
col 1 = **equals**
col 2 = **74**
col 3 = **Data**"
col 4 = col 3
col 5 = **Name 07**
col 6 = col 5
col 7 = **wstring**
col 8 = /**SM15-HVAC-SUPP-TM-37250-ST**
THIS IS THE DESIRED XML STRUCTURE
<?xml version="1.0" encoding="UTF-8" ?>
<exchange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://download.autodesk.com/us/navisworks/schemas/nw-exchange-12.0.xsd" units="m" filename="" filepath="">
<selectionsets>
<selectionset name="Dev_1">
<findspec mode="all" disjoint="0">
<conditions>
<condition test="**equals**" flags="**74**">
<category>
<name internal="**Data**">**Data**</name>
</category>
<property>
<name internal="**Name 07**">**Name 07**</name>
</property>
<value>
<data type="**wstring**">/**SM15-HVAC-SUPP-TM-37250-ST**</data>
</value>
</condition>
</conditions>
<locator>/</locator>
</findspec>
</selectionset>
</selectionsets>
</exchange>
Here is my attempt from the python:
# Attempt to build the exchange XML from the workbook. (Indentation was lost
# in this paste, and the snippet is not runnable as posted; see the notes.)
# In a raw string the doubled backslashes are kept as two characters each.
path = (r"C:\\Users\\ciara\\desktop\\")
# os.path.join receives one already-concatenated argument and returns it unchanged.
book = os.path.join(path + "Search_Set.xlsm")
wb = openpyxl.load_workbook(book)
# NOTE(review): `sh` is not used anywhere below in this snippet.
sh = wb.get_sheet_by_name('.XML Framework')
# The constant opening lines are read from the sheet named "<CLEAN>", first column.
df1 = pd.read_excel(book, "<CLEAN>", header=None)
#opening 5 lines of .xml search
print(df1)
cV1 = df1.iloc[0,0] #xml header
print (cV1)
cV2 = df1.iloc[1,0] #<exchange>
print (cV2)
cV3 = df1.iloc[2,0] #<selectionsets>
print (cV3)
cV4 = df1.iloc[3,0] #<selection set name>
print (cV4)
cV5 = df1.iloc[4,0] #<findspec mode>
print (cV5)
# NOTE(review): same label as the previous cell; going by the desired XML this
# row is presumably <conditions> — TODO confirm against the sheet.
cV6 = df1.iloc[5,0] #<findspec mode>
print (cV6)
# NOTE(review): `E` is created but never used below.
E = lxml.builder.ElementMaker()
# Each cell value is used as an element tag name, nested one inside the other.
# NOTE(review): if the cells hold complete markup such as '<?xml ...?>' or
# '<exchange ...>', they are not valid tag names — TODO confirm cell contents.
root = ET.Element(cV1)
doc0 = ET.SubElement(root, cV2)
doc1 = ET.SubElement(doc0, cV3)
doc2 = ET.SubElement(doc1, cV4)
doc3 = ET.SubElement(doc2, cV5)
doc4 = ET.SubElement(doc3, cV6)
# NOTE(review): root and doc0..doc3 are the Element objects created above, yet
# they are called like functions here, and FIELD1/FIELD2 are not defined in
# this snippet. The nested-call shape resembles the lxml.builder ElementMaker
# pattern (E.tag(child, ...)) — TODO confirm the intent.
the_doc = root(
doc0(
doc1(
doc2(
doc3(
FIELD1('condition test=', name='blah'),
FIELD2('some value2', name='asdfasd'),
)
)
)
)
)
print (lxml.etree.tostring(the_doc, pretty_print=True))
# Independently of the_doc, the tree built with ET.Element is written to disk.
tree = ET.ElementTree(root)
tree.write("filename.xml")

parsing repeating child elements python

I am trying to parse an XML document that contains repeating child elements using Python. When I attempt to parse the data, it creates an empty file. If I comment out the repeating child elements code (see bolded section in python script below), the document generates correctly. Can someone help?
XML:
<?xml version="1.0" encoding="ISO-8859-1" standalone="no"?>
<FRPerformance xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<FRPerformanceShareClassCurrency>
<FundCode>00190</FundCode>
<CurrencyID>USD</CurrencyID>
<FundShareClassCode>A</FundShareClassCode>
<ReportPeriodFrequency>Quarterly</ReportPeriodFrequency>
<ReportPeriodEndDate>06/30/2012</ReportPeriodEndDate>
<Net>
<Annualized>
<Year1>-4.909000000</Year1>
<Year3>10.140000000</Year3>
<Year5>-22.250000000</Year5>
<Year10>-7.570000000</Year10>
<Year15>-4.730000000</Year15>
<Year20>-0.900000000</Year20>
<SI>1.900000000</SI>
</Annualized>
</Net>
<Gross>
<Annualized>
<Month3>1.279000000</Month3>
<YTD>7.294000000</YTD>
<Year1>-0.167000000</Year1>
<Year3>11.940000000</Year3>
<Year5>-21.490000000</Year5>
<Year10>-7.120000000</Year10>
<Year15>-4.420000000</Year15>
<Year20>-0.660000000</Year20>
<SI>2.110000000</SI>
</Annualized>
<Cumulative>
<Month1Back>2.288000000</Month1Back>
<Month2Back>-1.587000000</Month2Back>
<Month3Back>0.610000000</Month3Back>
<CurrentYear>7.294000000</CurrentYear>
<Year1Back>-2.409000000</Year1Back>
<Year2Back>13.804000000</Year2Back>
<Year3Back>20.287000000</Year3Back>
<Year4Back>-78.528000000</Year4Back>
<Year5Back>-0.101000000</Year5Back>
<Year6Back>9.193000000</Year6Back>
<Year7Back>2.659000000</Year7Back>
<Year8Back>9.208000000</Year8Back>
<Year9Back>25.916000000</Year9Back>
<Year10Back>-3.612000000</Year10Back>
</Cumulative>
<HistoricReturns>
<HistoricReturns_Item>
<Date>Fri, 28 Feb 1997 00:00:00 -0600</Date>
<Return>32058.090000000</Return>
</HistoricReturns_Item>
<HistoricReturns_Item>
<Date>Fri, 28 Feb 2003 00:00:00 -0600</Date>
<Return>36415.110000000</Return>
</HistoricReturns_Item>
<HistoricReturns_Item>
<Date>Fri, 29 Feb 2008 00:00:00 -0600</Date>
<Return>49529.290000000</Return>
</HistoricReturns_Item>
<HistoricReturns_Item>
<Date>Fri, 30 Apr 1993 00:00:00 -0600</Date>
<Return>21621.500000000</Return>
</HistoricReturns_Item>
</HistoricReturns>
Python script
## Create command line arguments for XML file and tagName
xmlFile = sys.argv[1]
tagName = sys.argv[2]
tree = ET.parse(xmlFile)
root = tree.getroot()
## Setup the file for output
# print() output is redirected into parsedXML.csv by rebinding sys.stdout.
# NOTE(review): saveout keeps the original stdout but is never restored, and
# output_file is never closed in this snippet.
saveout = sys.stdout
output_file = open('parsedXML.csv', 'w')
sys.stdout = output_file
## Parse XML
# (Indentation was lost in this paste; the loops below are nested inside the
# loop over the elements matching tagName.)
for node in root.findall(tagName):
fundCode = node.find('FundCode').text
curr = node.find('CurrencyID').text
shareClass = node.find('FundShareClassCode').text
# Net annualized returns.
for node2 in node.findall('./Net/Annualized'):
year1 = node2.findtext('Year1')
year3 = node2.findtext('Year3')
year5 = node2.findtext('Year5')
year10 = node2.findtext('Year10')
year15 = node2.findtext('Year15')
year20 = node2.findtext('Year20')
SI = node2.findtext('SI')
# Gross annualized and cumulative returns.
for node3 in node.findall('./Gross'):
for node4 in node3.findall('./Annualized'):
month3 = node4.findtext('Month3')
ytd = node4.findtext('YTD')
year1g = node4.findtext('Year1')
year3g = node4.findtext('Year3')
year5g = node4.findtext('Year5')
year10g = node4.findtext('Year10')
year15g = node4.findtext('Year15')
# NOTE(review): looks up 'Year2', but the variable is year20g and the sample
# XML has <Year20> and no <Year2> here — probably a typo; TODO confirm.
year20g = node4.findtext('Year2')
SIg = node4.findtext('SI')
for node5 in node3.findall('./Cumulative'):
month1b = node5.findtext('Month1Back')
month2b = node5.findtext('Month2Back')
month3b = node5.findtext('Month3Back')
curYear = node5.findtext('CurrentYear')
year1b = node5.findtext('Year1Back')
year2b = node5.findtext('Year2Back')
year3b = node5.findtext('Year3Back')
year4b = node5.findtext('Year4Back')
year5b = node5.findtext('Year5Back')
year6b = node5.findtext('Year6Back')
year7b = node5.findtext('Year7Back')
year8b = node5.findtext('Year8Back')
year9b = node5.findtext('Year9Back')
year10b = node5.findtext('Year10Back')
# The leading and trailing ** are the post's bold markers, not Python syntax.
# This section searches `node` (not `node3`) for <HistoricReturns>, and it uses
# findall(), which returns lists of elements, where the code above uses findtext().
# hDate and hReturn are only assigned if the loops below find a match.
**for node6 in node.findall('./HistoricReturns'):
for node7 in node6.findall('./HistoricReturns_Item'):
hDate = node7.findall('Date')
hReturn = node7.findall('Return')**
# NOTE(review): year15 is passed twice in a row in the argument list below.
print(fundCode, curr, shareClass,year1, year3, year5, year10, year15, year15, year20, SI,month3, ytd, year1g, year3g, year5g, year10g, year15g, year20g, SIg, month1b, month2b, month3b, curYear, year1b, year2b, year3b, year4b, year5b, year6b, year7b, year8b,year9b,year10b, hDate, hReturn)
The sample XML and the python code don't match up in terms of structure. Either
you're missing a closing </Gross> tag from the XML (which should be before the <HistoricReturns> section starts) - in which case the code is correct or
the code should be for node6 in node3.findall('./HistoricReturns'): i.e. node3 instead of node
N.B. The XML sample isn't complete (it isn't well-formed XML) because it's missing closing tags for Gross, FRPerformanceShareClassCurrency and FRPerformance so this makes it impossible to answer the question definitively. Hope this helps though.

Categories