parsing repeating child elements in python

I am trying to parse an XML document that contains repeating child elements using Python. When I attempt to parse the data, the script produces an empty file. If I comment out the code that handles the repeating child elements (see the bolded section in the Python script below), the output is generated correctly. Can someone help?
XML:
<?xml version="1.0" encoding="ISO-8859-1" standalone="no"?>
<FRPerformance xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<FRPerformanceShareClassCurrency>
<FundCode>00190</FundCode>
<CurrencyID>USD</CurrencyID>
<FundShareClassCode>A</FundShareClassCode>
<ReportPeriodFrequency>Quarterly</ReportPeriodFrequency>
<ReportPeriodEndDate>06/30/2012</ReportPeriodEndDate>
<Net>
<Annualized>
<Year1>-4.909000000</Year1>
<Year3>10.140000000</Year3>
<Year5>-22.250000000</Year5>
<Year10>-7.570000000</Year10>
<Year15>-4.730000000</Year15>
<Year20>-0.900000000</Year20>
<SI>1.900000000</SI>
</Annualized>
</Net>
<Gross>
<Annualized>
<Month3>1.279000000</Month3>
<YTD>7.294000000</YTD>
<Year1>-0.167000000</Year1>
<Year3>11.940000000</Year3>
<Year5>-21.490000000</Year5>
<Year10>-7.120000000</Year10>
<Year15>-4.420000000</Year15>
<Year20>-0.660000000</Year20>
<SI>2.110000000</SI>
</Annualized>
<Cumulative>
<Month1Back>2.288000000</Month1Back>
<Month2Back>-1.587000000</Month2Back>
<Month3Back>0.610000000</Month3Back>
<CurrentYear>7.294000000</CurrentYear>
<Year1Back>-2.409000000</Year1Back>
<Year2Back>13.804000000</Year2Back>
<Year3Back>20.287000000</Year3Back>
<Year4Back>-78.528000000</Year4Back>
<Year5Back>-0.101000000</Year5Back>
<Year6Back>9.193000000</Year6Back>
<Year7Back>2.659000000</Year7Back>
<Year8Back>9.208000000</Year8Back>
<Year9Back>25.916000000</Year9Back>
<Year10Back>-3.612000000</Year10Back>
</Cumulative>
<HistoricReturns>
<HistoricReturns_Item>
<Date>Fri, 28 Feb 1997 00:00:00 -0600</Date>
<Return>32058.090000000</Return>
</HistoricReturns_Item>
<HistoricReturns_Item>
<Date>Fri, 28 Feb 2003 00:00:00 -0600</Date>
<Return>36415.110000000</Return>
</HistoricReturns_Item>
<HistoricReturns_Item>
<Date>Fri, 29 Feb 2008 00:00:00 -0600</Date>
<Return>49529.290000000</Return>
</HistoricReturns_Item>
<HistoricReturns_Item>
<Date>Fri, 30 Apr 1993 00:00:00 -0600</Date>
<Return>21621.500000000</Return>
</HistoricReturns_Item>
</HistoricReturns>
Python script
## Create command line arguments for XML file and tagName
import sys
import xml.etree.ElementTree as ET

xmlFile = sys.argv[1]
tagName = sys.argv[2]
tree = ET.parse(xmlFile)
root = tree.getroot()

## Set up the file for output
saveout = sys.stdout
output_file = open('parsedXML.csv', 'w')
sys.stdout = output_file

## Parse XML
for node in root.findall(tagName):
    fundCode = node.find('FundCode').text
    curr = node.find('CurrencyID').text
    shareClass = node.find('FundShareClassCode').text
    for node2 in node.findall('./Net/Annualized'):
        year1 = node2.findtext('Year1')
        year3 = node2.findtext('Year3')
        year5 = node2.findtext('Year5')
        year10 = node2.findtext('Year10')
        year15 = node2.findtext('Year15')
        year20 = node2.findtext('Year20')
        SI = node2.findtext('SI')
    for node3 in node.findall('./Gross'):
        for node4 in node3.findall('./Annualized'):
            month3 = node4.findtext('Month3')
            ytd = node4.findtext('YTD')
            year1g = node4.findtext('Year1')
            year3g = node4.findtext('Year3')
            year5g = node4.findtext('Year5')
            year10g = node4.findtext('Year10')
            year15g = node4.findtext('Year15')
            year20g = node4.findtext('Year20')
            SIg = node4.findtext('SI')
        for node5 in node3.findall('./Cumulative'):
            month1b = node5.findtext('Month1Back')
            month2b = node5.findtext('Month2Back')
            month3b = node5.findtext('Month3Back')
            curYear = node5.findtext('CurrentYear')
            year1b = node5.findtext('Year1Back')
            year2b = node5.findtext('Year2Back')
            year3b = node5.findtext('Year3Back')
            year4b = node5.findtext('Year4Back')
            year5b = node5.findtext('Year5Back')
            year6b = node5.findtext('Year6Back')
            year7b = node5.findtext('Year7Back')
            year8b = node5.findtext('Year8Back')
            year9b = node5.findtext('Year9Back')
            year10b = node5.findtext('Year10Back')
    ## Bolded section: repeating child elements
    for node6 in node.findall('./HistoricReturns'):
        for node7 in node6.findall('./HistoricReturns_Item'):
            hDate = node7.findall('Date')
            hReturn = node7.findall('Return')
    print(fundCode, curr, shareClass, year1, year3, year5, year10, year15, year20, SI, month3, ytd, year1g, year3g, year5g, year10g, year15g, year20g, SIg, month1b, month2b, month3b, curYear, year1b, year2b, year3b, year4b, year5b, year6b, year7b, year8b, year9b, year10b, hDate, hReturn)

The sample XML and the Python code don't match up in terms of structure. Either:
you're missing a closing </Gross> tag in the XML (it should appear before the <HistoricReturns> section starts), in which case the code is correct, or
the code should read for node6 in node3.findall('./HistoricReturns'):, i.e. node3 instead of node.
N.B. The XML sample isn't complete (it isn't well-formed XML), because it's missing closing tags for Gross, FRPerformanceShareClassCurrency and FRPerformance, so it's impossible to answer the question definitively. Hope this helps though.
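For illustration, here is a minimal sketch of the second option, i.e. searching from node3 inside the existing for node3 in node.findall('./Gross'): loop. Collecting the values into lists is an addition for the sketch, so the items aren't overwritten on each iteration:
    hDates, hReturns = [], []
    for node6 in node3.findall('./HistoricReturns'):
        for node7 in node6.findall('./HistoricReturns_Item'):
            # findtext returns the element's text, e.g. "Fri, 28 Feb 1997 00:00:00 -0600"
            hDates.append(node7.findtext('Date'))
            hReturns.append(node7.findtext('Return'))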

Related

encoding Lithuanian character in xml using python

I have this code:
import xml.etree.ElementTree as ET
from xml.dom import minidom

def convert_df_to_xml(df, fd, ld):
    # create the main (root) element named Invoices.
    root = ET.Element("Invoices")
    root.set("from", str(fd))
    root.set("till", str(ld))
    for i in range(len(df['partner_id'])):
        # add a sub-element.
        invoices = ET.SubElement(root, "Invoice")
        invoices.set('clientid', df['company_registry'][i])
        invoices.set('imones_pavadinimas', df['partner_id'][i])
        # add a sub-sub-element.
        quebec = ET.SubElement(invoices, "Product")
        # load the row info from the dataframe.
        sectin_1 = ET.SubElement(quebec, "Name")
        sectin_1.text = str(df["Name"][i])
        sectin_2 = ET.SubElement(quebec, 'Quantity')
        sectin_2.text = str(df["time_dif"][i])
        sectin_3 = ET.SubElement(quebec, 'Price')
        sectin_3.text = str(df["price_unit"][i])
    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ", encoding="UTF-8").decode("UTF-8")
    with open("bandomasis_itp_xml_failas_V_1.1.xml", "w") as f:
        f.write(xmlstr)
I'm creating an XML file from a pandas DataFrame. The problem is that in the XML file I get "?" marks instead of the "ė" character.
In the dataframe I have strings with the characters "ė, ą, š, ų" and I need them to appear in the XML file.
My dataframe:
df1 = pd.DataFrame({'partner_id': ['MED GRUPĖ, UAB'], 'Name':['Pirmas'], 'company_registry': ['3432543'],
'time_dif':['2'],'price_unit':['23']})
What is the problem with the encoding here?
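A hedged guess at the cause: open(..., "w") without an encoding argument writes the file with the platform's default encoding rather than UTF-8, even though the XML declaration promises UTF-8. A minimal sketch of the usual fix is to pass the encoding explicitly:
# Sketch: write the pretty-printed XML as UTF-8 so "ė", "ą", "š", "ų" survive intact.
with open("bandomasis_itp_xml_failas_V_1.1.xml", "w", encoding="utf-8") as f:
    f.write(xmlstr)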

python regex gives 1 instead of 01

Taking reference from "Why doesn't [01-12] range work as expected?", I tried:
m = re.search(r"(\w+)\[([0-9]+)\:([0-9]+)\]", DUNESX[01:44])
or
m = re.search(r"(\w+)\[(0[1-9]|1[0-9]|2[0-9]|3[0-9]|4[0-9])\:([0-9]+)\]", DUNESX[01:44])
or
m = re.search(r"(\w+)\[(0?[1-9]|1[0-9]|2[0-9]|3[0-9]|4[0-9])\:([0-9]+)\]", DUNESX[01:44])
or
m = re.search(r"(\w+)\[([0-1][0-9]+)\:([0-9]+)\]", DUNESX[01:44])
But the output from the above expressions is
['DUNESX1', 'DUNESX2', 'DUNESX3', 'DUNESX4', 'DUNESX5', 'DUNESX6', 'DUNESX7', 'DUNESX8', 'DUNESX9', 'DUNESX10', 'DUNESX11', 'DUNESX12', 'DUNESX13', 'DUNESX14', 'DUNESX15', 'DUNESX16', 'DUNESX17', 'DUNESX18', 'DUNESX19', 'DUNESX20', 'DUNESX21', 'DUNESX22', 'DUNESX23', 'DUNESX24', 'DUNESX25', 'DUNESX26', 'DUNESX27', 'DUNESX28', 'DUNESX29', 'DUNESX30', 'DUNESX31', 'DUNESX32', 'DUNESX33', 'DUNESX34', 'DUNESX35', 'DUNESX36', 'DUNESX37', 'DUNESX38', 'DUNESX39', 'DUNESX40', 'DUNESX41', 'DUNESX42', 'DUNESX43', 'DUNESX44']
It doesn't produce the desired output, which would be
['DUNESX01', 'DUNESX02', 'DUNESX03', 'DUNESX04', 'DUNESX05', 'DUNESX06', 'DUNESX07', 'DUNESX08', 'DUNESX09', 'DUNESX10', 'DUNESX11', 'DUNESX12', 'DUNESX13', 'DUNESX14', 'DUNESX15', 'DUNESX16', 'DUNESX17', 'DUNESX18', 'DUNESX19', 'DUNESX20', 'DUNESX21', 'DUNESX22', 'DUNESX23', 'DUNESX24', 'DUNESX25', 'DUNESX26', 'DUNESX27', 'DUNESX28', 'DUNESX29', 'DUNESX30', 'DUNESX31', 'DUNESX32', 'DUNESX33', 'DUNESX34', 'DUNESX35', 'DUNESX36', 'DUNESX37', 'DUNESX38', 'DUNESX39', 'DUNESX40', 'DUNESX41', 'DUNESX42', 'DUNESX43', 'DUNESX44']
The complete code is
import re

def getgrandchild(child):
    nodelist = []
    if child is None:
        return
    for nodes in child:
        print(nodes)
        if re.match(r".*(\[[0-1][0-9]+\:[0-9]+\])", nodes):
            m = re.search(r"(\w+)\[([0-9]+)\:([0-9]+)\]", nodes)
            lb = int(m.group(2))
            ub = int(m.group(3))
            for i in range(lb, ub+1):
                nodelist.append(m.group(1)+str(i))
        elif re.match(r"(\w+)", nodes):
            m = re.search(r"(\w+)", nodes)
            nodelist.append(m.group(1))
    return nodelist

group_list = ['DUNESX[01:44]']
host_list = getgrandchild(group_list)
I think I understand what you are trying to do. Here is some code that helps:
def getgrandchild(child):
    nodelist = []
    for nodes in child:
        m = re.search(r"(\w+)\[([0-9]+)\:([0-9]+)\]", nodes)
        for i in range(int(m.group(2)), int(m.group(3))+1):
            nodelist.append(m.group(1)+str(i).zfill(len(m.group(2))))
    return nodelist
You can see I've skipped some steps, but you can keep them; I've focused on the main part.
We use zfill to pad the numbers to forms like '001' or '01', as explained here.
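As a quick illustration of the padding (not part of the original answer):
str(7).zfill(2)   # '07'
str(7).zfill(3)   # '007'
str(12).zfill(2)  # '12' (already wide enough, left unchanged)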
So, for this code, if you give:
getgrandchild(['DUNESX[01:44]'])
you get:
['DUNESX01', 'DUNESX02', 'DUNESX03', 'DUNESX04', 'DUNESX05', 'DUNESX06', 'DUNESX07', 'DUNESX08', 'DUNESX09', 'DUNESX10', 'DUNESX11', 'DUNESX12', 'DUNESX13', 'DUNESX14', 'DUNESX15', 'DUNESX16', 'DUNESX17', 'DUNESX18', 'DUNESX19', 'DUNESX20', 'DUNESX21', 'DUNESX22', 'DUNESX23', 'DUNESX24', 'DUNESX25', 'DUNESX26', 'DUNESX27', 'DUNESX28', 'DUNESX29', 'DUNESX30', 'DUNESX31', 'DUNESX32', 'DUNESX33', 'DUNESX34', 'DUNESX35', 'DUNESX36', 'DUNESX37', 'DUNESX38', 'DUNESX39', 'DUNESX40', 'DUNESX41', 'DUNESX42', 'DUNESX43', 'DUNESX44']
Also, if you give:
getgrandchild(['PYTHON[001:025]'])
you get:
['PYTHON001', 'PYTHON002', 'PYTHON003', 'PYTHON004', 'PYTHON005', 'PYTHON006', 'PYTHON007', 'PYTHON008', 'PYTHON009', 'PYTHON010', 'PYTHON011', 'PYTHON012', 'PYTHON013', 'PYTHON014', 'PYTHON015', 'PYTHON016', 'PYTHON017', 'PYTHON018', 'PYTHON019', 'PYTHON020', 'PYTHON021', 'PYTHON022', 'PYTHON023', 'PYTHON024', 'PYTHON025']

XML Parsing Python ElementTree - Nested for loops

I'm using Jupyter Notebook and ElementTree (Python 3) to create a dataframe from an XML file and save it as CSV. Here is the XML format (in Estonian):
<asutused hetk="2020-04-14T03:53:33" ver="2">
<asutus>
<registrikood>10000515</registrikood>
<nimi>Osaühing B.Braun Medical</nimi>
<aadress />
<tegevusload>
<tegevusluba>
<tegevusloa_number>L04647</tegevusloa_number>
<alates>2019-12-10</alates>
<kuni />
<loaliik_kood>1</loaliik_kood>
<loaliik_nimi>Eriarstiabi</loaliik_nimi>
<haiglaliik_kood />
<haiglaliik_nimi />
<tegevuskohad>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
</tegevuskohad>
</tegevusluba>
<tegevusluba>
<tegevusloa_number>L04651</tegevusloa_number>
<alates>2019-12-11</alates>
<kuni />
<loaliik_kood>2</loaliik_kood>
<loaliik_nimi>Õendusabi</loaliik_nimi>
<haiglaliik_kood />
<haiglaliik_nimi />
<tegevuskohad>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
</tegevuskohad>
</tegevusluba>
</tegevusload>
<tootajad>
<tootaja>
<kood>D03091</kood>
<eesnimi>Evo</eesnimi>
<perenimi>Kaha</perenimi>
<kutse_kood>11</kutse_kood>
<kutse_nimi>Arst</kutse_nimi>
<erialad>
<eriala>
<kood>E420</kood>
<nimi>üldkirurgia</nimi>
</eriala>
</erialad>
</tootaja>
<tootaja>
<kood>N01146</kood>
<eesnimi>Karmen</eesnimi>
<perenimi>Mežulis</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N01153</kood>
<eesnimi>Nele</eesnimi>
<perenimi>Terras</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N02767</kood>
<eesnimi>Helena</eesnimi>
<perenimi>Tern</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N12882</kood>
<eesnimi>Hanna</eesnimi>
<perenimi>Leemet</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
</tootajad>
</asutus>
</asutused>
Each "asutus" is a hospital and I need some of the information inside. Here is my code:
tree = ET.parse("od_asutused.xml")
root = tree.getroot()
# open a file for writing
data = open('EE.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(data, delimiter=';')
head = []
count = 0
for member in root.findall('asutus'):
    hospital = []
    if count == 0:
        ident = member.find('registrikood').tag
        head.append(id)
        name = member.find('nimi').tag
        head.append(name)
        address = member.find('aadress').tag
        head.append(address)
        facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').tag
        head.append(facility_type)
        site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').tag
        head.append(site_address)
        for elem in member.findall('tegevusload'):
            list_specs = elem.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').tag
            head.append(list_specs)
        csvwriter.writerow(head)
        count = count + 1
    ident = member.find('registrikood').text
    hospital.append(ident)
    name = member.find('nimi').text
    hospital.append(name)
    address = member.find('aadress').text
    hospital.append(address)
    facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').text
    hospital.append(facility_type)
    site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').text
    hospital.append(site_address)
    for spec in elem.findall('tegevusload'):
        list_specs = spec.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').text
        hospital.append(list_specs)
    csvwriter.writerow(hospital)
data.close()
#Upload csv for geocoding
df = pd.read_csv(r'EE.csv', na_filter= False, delimiter=';')
#Rename columns
df.rename(columns = {'<built-in function id>':'id',
'nimi':'name',
'aadress':'address',
'haiglaliik_nimi':'facility_type',
'haiglaliik_kood':'facility_type_c',
'aadress.1':'site_address',
'nimi.1':'list_specs'},
inplace = True)
#Add columns
df['country'] = 'Estonia'
df['cc'] = 'EE'
df.head(10)
And the result of the df.head(10):
(image: result of the dataframe)
The "list_specs" column is blank no matter what I do. How can I populate this field with a list of each 'nimi' for each site address? Thank you.
I found the following points to change in your code:
At least on my computer, calling csv.writer causes newline characters to be doubled. The remedy I found is to open the output file with additional parameters:
data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
There is no point in writing head with Estonian column names and then renaming the columns. Note also that in head.append(id) you append the built-in function id rather than your ident variable.
But this is not so important, as I replaced this whole section by writing the target column names directly (see below).
Since the CSV file will be read back with read_csv, it should contain a fixed number of columns, so it is bad practice to write a variable number of elements in a loop.
Your loop for spec in elem.findall('tegevusload') was wrong, because elem is not set in the current iteration; you should have used member instead (but I solved this detail another way).
There is no point in creating a variable only to use it once. More concise and readable code is e.g. hospital.append(member.findtext('nimi')).
To avoid long XPath expressions with a repeated initial part, I set a temporary variable "in the middle" of the path, e.g. tgvLb = member.find('tegevusload/tegevusluba'), and then used a relative XPath starting from this node.
Your rename instruction contains one unneeded column, namely facility_type_c. You read only 6 columns, not 7.
So change the middle part of your code to:
data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
csvwriter = csv.writer(data, delimiter=';')
head = ['id', 'name', 'address', 'facility_type', 'site_address', 'list_specs']
csvwriter.writerow(head)
for member in root.findall('asutus'):
    hospital = []
    hospital.append(member.findtext('registrikood'))
    hospital.append(member.findtext('nimi'))
    hospital.append(member.findtext('aadress'))
    tgvLb = member.find('tegevusload/tegevusluba')
    hospital.append(tgvLb.findtext('haiglaliik_nimi'))
    tgvKoht = tgvLb.find('tegevuskohad/tegevuskoht')
    hospital.append(tgvKoht.findtext('aadress'))
    hospital.append(tgvKoht.findtext('teenused/teenus/nimi'))
    csvwriter.writerow(hospital)
data.close()
df = pd.read_csv(r'EE.csv', na_filter= False, delimiter=';')
and drop df.rename from your code.
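If you actually want every teenus name for the site rather than just the first, one hedged variant (an addition for illustration, not part of the answer above) joins them into a single cell, replacing the last hospital.append line:
    # Sketch: collect all <teenus>/<nimi> values under the first tegevuskoht into one field.
    hospital.append('; '.join(t.findtext('nimi') for t in tgvKoht.findall('teenused/teenus')))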

Building XML from excel data with Python

I am trying to build an XML file from an Excel spreadsheet using Python but am having trouble building the structure. The XML schema is unique to a piece of software, so the opening few tags and the closing few are easier to write to the XML file as plain variables, shown below. They are constant, so they are pulled from the "<CLEAN>" sheet.
I believe the script needs to loop through another sheet, the ".XML Framework" sheet, to build the .xml structure, as these are the values which will ultimately be changing. The structure of this sheet is provided below.
Here is the .xml structure; the Python outputs it correctly up to the unique values, and the changing values are shown in bold. This shows just one row of data from the workbook. When the workbook has a second row, the .xml structure repeats again.
The data structure in the excel sheet ".XML Framework" is:
col 1 = **equals**
col 2 = **74**
col 3 = **Data**
col 4 = col 3
col 5 = **Name 07**
col 6 = col 5
col 7 = **wstring**
col 8 = /**SM15-HVAC-SUPP-TM-37250-ST**
THIS IS THE DESIRED XML STRUCTURE
<?xml version="1.0" encoding="UTF-8" ?>
<exchange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://download.autodesk.com/us/navisworks/schemas/nw-exchange-12.0.xsd" units="m" filename="" filepath="">
<selectionsets>
<selectionset name="Dev_1">
<findspec mode="all" disjoint="0">
<conditions>
<condition test="**equals**" flags="**74**">
<category>
<name internal="**Data**">**Data**</name>
</category>
<property>
<name internal="**Name 07**">**Name 07**</name>
</property>
<value>
<data type="**wstring**">/**SM15-HVAC-SUPP-TM-37250-ST**</data>
</value>
</condition>
</conditions>
<locator>/</locator>
</findspec>
</selectionset>
</selectionsets>
</exchange>
Here is my attempt in Python:
path = (r"C:\\Users\\ciara\\desktop\\")
book = os.path.join(path + "Search_Set.xlsm")
wb = openpyxl.load_workbook(book)
sh = wb.get_sheet_by_name('.XML Framework')
df1 = pd.read_excel(book, "<CLEAN>", header=None)
#opening 5 lines of .xml search
print(df1)
cV1 = df1.iloc[0,0] #xml header
print (cV1)
cV2 = df1.iloc[1,0] #<exchange>
print (cV2)
cV3 = df1.iloc[2,0] #<selectionsets>
print (cV3)
cV4 = df1.iloc[3,0] #<selection set name>
print (cV4)
cV5 = df1.iloc[4,0] #<findspec mode>
print (cV5)
cV6 = df1.iloc[5,0] #<findspec mode>
print (cV6)
E = lxml.builder.ElementMaker()
root = ET.Element(cV1)
doc0 = ET.SubElement(root, cV2)
doc1 = ET.SubElement(doc0, cV3)
doc2 = ET.SubElement(doc1, cV4)
doc3 = ET.SubElement(doc2, cV5)
doc4 = ET.SubElement(doc3, cV6)
the_doc = root(
    doc0(
        doc1(
            doc2(
                doc3(
                    FIELD1('condition test=', name='blah'),
                    FIELD2('some value2', name='asdfasd'),
                )
            )
        )
    )
)
print (lxml.etree.tostring(the_doc, pretty_print=True))
tree = ET.ElementTree(root)
tree.write("filename.xml")
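The attempt above mixes lxml's ElementMaker with ElementTree calls. For illustration, here is a minimal sketch of one way to emit the repeating <condition> block per spreadsheet row using plain ElementTree; the hard-coded row tuple and the omission of the xsi namespace attributes are assumptions for the sketch, not part of the original post:
import xml.etree.ElementTree as ET

# Sketch only: fixed outer structure, then one <condition> per data row.
root = ET.Element('exchange', units='m', filename='', filepath='')  # xsi attributes omitted here
selectionsets = ET.SubElement(root, 'selectionsets')
selectionset = ET.SubElement(selectionsets, 'selectionset', name='Dev_1')
findspec = ET.SubElement(selectionset, 'findspec', mode='all', disjoint='0')
conditions = ET.SubElement(findspec, 'conditions')

# Assumed per-row values: (test, flags, category, property name, data type, value)
rows = [('equals', '74', 'Data', 'Name 07', 'wstring', '/SM15-HVAC-SUPP-TM-37250-ST')]
for test, flags, category, prop, dtype, value in rows:
    condition = ET.SubElement(conditions, 'condition', test=test, flags=flags)
    cat_name = ET.SubElement(ET.SubElement(condition, 'category'), 'name', internal=category)
    cat_name.text = category
    prop_name = ET.SubElement(ET.SubElement(condition, 'property'), 'name', internal=prop)
    prop_name.text = prop
    data = ET.SubElement(ET.SubElement(condition, 'value'), 'data', type=dtype)
    data.text = value

ET.SubElement(findspec, 'locator').text = '/'
ET.ElementTree(root).write('filename.xml', encoding='UTF-8', xml_declaration=True)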

XML to CSV in PYTHON: Extract series of subnodes for every node

My goal is to convert an .XML file into a .CSV file.
This part of the code is already functional.
However, I also want to extract the sub-sub-nodes of one of the "father" nodes.
Maybe an example will be more self-explanatory; here is the structure of my XML:
Here is the structure of my XML:
<nedisCatalogue>
<headerInfo>
<feedVersion>1-0</feedVersion>
<dateCreated>2018-01-22T23:37:01+0100</dateCreated>
<supplier>Nedis_BENED</supplier>
<locale>nl_BE</locale>
</headerInfo>
<productList>
<product>
<nedisPartnr><![CDATA[VS-150/63BA]]></nedisPartnr>
<nedisArtlid>17005</nedisArtlid>
<vendorPartnr><![CDATA[TONFREQ-ELKOS / BIPOL 150, 5390]]></vendorPartnr>
<brand><![CDATA[Visaton]]></brand>
<EAN>4007540053905</EAN>
<intrastatCode>8532220000</intrastatCode>
<UNSPSC>52161514</UNSPSC>
<headerText><![CDATA[Crossover Foil capacitor]]></headerText>
<internetText><![CDATA[Bipolaire elco met een ruwe folie en een zeer goede prijs/kwaliteits-verhouding voor de bouw van cross-overs. 63 Vdc, 10% tolerantie.]]></internetText>
<generalText><![CDATA[Dimensions 16 x 35 mm
]]></generalText>
<images>
<image type="2" order="15">767736.JPG</image>
</images>
<attachments>
</attachments>
<categories>
<tree name="Internet_Tree_ISHP">
<entry depth="001" id="1067858"><![CDATA[Audio]]></entry>
<entry depth="002" id="1067945"><![CDATA[Speakers]]></entry>
<entry depth="003" id="1068470"><![CDATA[Accessoires]]></entry>
</tree>
</categories>
<properties>
<property id="360" multiplierID="" unitID="" valueID=""><![CDATA[...]]></property>
</properties>
<status>
<code status="NORMAL"></code>
</status>
<packaging quantity="1" weight="8"></packaging>
<introductionDate>2015-10-26</introductionDate>
<serialnumberKeeping>N</serialnumberKeeping>
<priceLevels>
<normalPricing from="2017-02-13" to="2018-01-23">
<price level="1" moq="1" currency="EUR">2.48</price>
</normalPricing>
<specialOfferPricing></specialOfferPricing>
<goingPriceInclVAT currency="EUR" quantity="1">3.99</goingPriceInclVAT>
</priceLevels>
<tax>
</tax>
<stock>
<inStockLocal>25</inStockLocal>
<inStockCentral>25</inStockCentral>
<ATP>
<nextExpectedStockDateLocal></nextExpectedStockDateLocal>
<nextExpectedStockDateCentral></nextExpectedStockDateCentral>
</ATP>
</stock>
</product>
....
</nedisCatalogue>
And here is the code that I have now:
import xml.etree.ElementTree as ET
import csv
tree = ET.parse("/Users/BE07861/Documents/nedis_catalog_2018-01-23_nl_BE_32191_v1-0_xml")
root = tree.getroot()
f = open('/Users/BE07861/Documents/test2.csv', 'w')
csvwriter = csv.writer(f, delimiter='ç')
count = 0
head = ['Nedis Part Number', 'Nedis Article ID', 'Vendor Part Number', 'Brand', 'EAN', 'Header text', 'Internet Text', 'General Text', 'categories']
prdlist = root[1]
prdct = prdlist[5]
cat = prdct[12]
tree1=cat[0]
csvwriter.writerow(head)
for time in prdlist.findall('product'):
    row = []
    nedis_number = time.find('nedisPartnr').text
    row.append(nedis_number)
    nedis_art_id = time.find('nedisArtlid').text
    row.append(nedis_art_id)
    vendor_part_nbr = time.find('vendorPartnr').text
    row.append(vendor_part_nbr)
    Brand = time.find('brand').text
    row.append(Brand)
    ean = time.find('EAN').text
    row.append(ean)
    header_text = time.find('headerText').text
    row.append(header_text)
    internet_text = time.find('internetText').text
    row.append(internet_text)
    general_text = time.find('generalText').text
    row.append(general_text)
    categ = time.find('categories').find('tree').find('entry').text
    row.append(categ)
    csvwriter.writerow(row)
f.close()
If you run the code, you'll see that I only retrieve the first "entry" of categories/tree, which is expected. However, I don't know how to create a loop that, for every "categories" node, creates new columns such as Category1, Category2 and Category3 holding the "entry" values.
My result should look like this:
Nedis Part Number: VS-150/63BA
Nedis Article ID: 17005
Vendor Part Number: TONFREQ-ELKOS / BIPOL 150, 5390
Brand: Visaton
EAN: 4,00754E+12
Header text: Crossover Foil capacitor
Internet Text: Bipolaire elco …
General Text: Dimensions 16 x 35 mm
Category1: Audio
Category2: Speakers
Category3: Accessoires
I've really tried my best but didn't manage to find the solution.
Any help would be very much appreciated!!! :)
Thanks a lot,
Allan
I think this is what you're looking for:
for child in time.find('categories').find('tree'):
    categ = child.text
    row.append(categ)
Here's a solution that loops through the xml once to figure out how many headers to add, adds the headers, and then loops through each product's category list:
Updated to iterate through images in addition to categories. This is the biggest difference:
for child in time.find('categories').find('tree'):
    categ = child.text
    row.append(categ)
    curcat += 1
while curcat < maxcat:
    row.append('')
    curcat += 1
It figures out the maximum number of categories on a single record and then adds that many columns. If a particular record has fewer categories, the code inserts blank values as placeholders so the column headers always line up with the data.
For instance:
Cat1 Cat2 Cat3 Img1 Img2 Img3
A B C 1 2 3
D E <blank> 4 <blank> <blank>
Here's the full solution:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse("c:\\python\\xml.xml")
root = tree.getroot()
f = open('c:\\python\\xml.csv', 'w')
csvwriter = csv.writer(f, delimiter=',')
count = 0
head = ['Nedis Part Number', 'Nedis Article ID', 'Vendor Part Number', 'Brand', 'EAN', 'Header text', 'Internet Text', 'General Text']
prdlist = root[1]

# First pass: find the maximum number of categories on any product and add that many headers
maxcat = 0
for time in prdlist.findall('product'):
    cur = 0
    for child in time.find('categories').find('tree'):
        cur += 1
    if cur > maxcat:
        maxcat = cur
for cnt in range(0, maxcat):
    head.append('Category ' + str(cnt + 1))

# Same again for images
maximg = 0
for time in prdlist.findall('product'):
    cur = 0
    for child in time.find('images'):
        cur += 1
    if cur > maximg:
        maximg = cur
for cnt in range(0, maximg):
    head.append('Image ' + str(cnt + 1))

csvwriter.writerow(head)

# Second pass: write one row per product, padding categories/images with blanks
for time in prdlist.findall('product'):
    row = []
    nedis_number = time.find('nedisPartnr').text
    row.append(nedis_number)
    nedis_art_id = time.find('nedisArtlid').text
    row.append(nedis_art_id)
    vendor_part_nbr = time.find('vendorPartnr').text
    row.append(vendor_part_nbr)
    Brand = time.find('brand').text
    row.append(Brand)
    ean = time.find('EAN').text
    row.append(ean)
    header_text = time.find('headerText').text
    row.append(header_text)
    internet_text = time.find('internetText').text
    row.append(internet_text)
    general_text = time.find('generalText').text
    row.append(general_text)
    curcat = 0
    for child in time.find('categories').find('tree'):
        categ = child.text
        row.append(categ)
        curcat += 1
    while curcat < maxcat:
        row.append('')
        curcat += 1
    curimg = 0
    for img in time.find('images'):
        image = img.text
        row.append(image)
        curimg += 1
    while curimg < maximg:
        row.append('')
        curimg += 1
    csvwriter.writerow(row)
f.close()
