I have a Python script that parses a number of XML files with the same structure, finds relevant elements and prints all tags and attributes (it also writes them to a file, but I would like to create some structured data instead).
This works perfectly fine, but I would like to create a new XML file mirroring the structure of the original, but only with the elements matching the patterns I specified.
Here's the function that searches through the files:
import xml.etree.cElementTree as ET
import glob
filename = "media_code2_output.txt"
def find_mediacode2(inputfile, outputfile):
    """Print and record every field of each matching ``<musa>`` element.

    A ``<musa>`` element matches when its ``dr-production`` attribute is the
    string ``"true"``.  For every ``<media-code>`` descendant of a matching
    element whose text is ``"2"``, all elements of that ``<musa>`` subtree
    (the ``<musa>`` element itself included) are printed and written to
    *outputfile* as ``tag attrib text`` lines, followed by a spacer line.

    NOTE(review): the original listing had lost its indentation; the nesting
    below (spacer emitted once per matching ``<media-code>``) is the most
    plausible reading of it.

    Args:
        inputfile: despite its name, the element to search (the caller passes
            the parsed root element).  Anything with an ElementTree-style
            ``iter(tag)`` method works.
        outputfile: writable text file object.
    """
    spacer = "\n" + "-" * 80 + "\n"
    # Search the element that was passed in rather than a module-level
    # ``root``, so the function does not depend on the caller's globals.
    for parent in inputfile.iter("musa"):
        # .get() avoids a KeyError on <musa> elements lacking the attribute.
        if parent.attrib.get("dr-production") != "true":
            continue
        for mediekode in parent.iter("media-code"):
            if mediekode.text != "2":
                continue
            # Print all fields and write them to the file.
            for field in parent.iter():
                print(field.tag, field.attrib, field.text)
                outputfile.write(
                    str(field.tag) + " " + str(field.attrib) + " " + str(field.text) + "\n"
                )
            # Spacer line between matches.
            outputfile.write(spacer)
            print(spacer)
# Driver: the pattern '*/*.xml' matches XML files exactly one directory below
# the current working directory.
# NOTE(review): indentation was lost in this listing; presumably every line
# below belongs to the loop body and the call sits inside the `with` block.
for inputfile in glob.glob('*/*.xml'):
tree = ET.parse(inputfile)
root = tree.getroot()
# Mode "a+" appends, so output from previously processed files (and from
# earlier runs of the script) is kept in the same report file.
with open(filename, "a+") as outputfile:
find_mediacode2(root, outputfile)
Here's a sample of the data from the files:
https://pastebin.com/AHEcDv36
Ideally, I would like to represent the data in an Access database.
I have a real (and maybe pretty stupid) problem converting an XML file into a pandas DataFrame. I'm new to Python and need some help. I tried code from another thread and modified it, but it does not work.
I want to iterate through this file:
<objects>
<object id="123" name="some_string">
<object>
<id>123</id>
<site id="456" name="somename" query="some_query_as_string"/>
<create-date>some_date</create-date>
<update-date>some_date</update-date>
<update-user id="567" name="User:xyz" query="some_query_as_string"/>
<delete-date/>
<delete-user/>
<deleted>false</deleted>
<system-object>false</system-object>
<to-string>some_string_notifications</to-string>
</object>
<workflow>
<workflow-type id="12345" name="WorkflowType_some_workflow" query="some_query_as_string"/>
<validated>true</validated>
<name>somestring</name>
<exported>false</exported>
</workflow>
Here is my code:
import xml.etree.ElementTree as ET
import pandas as pd
path = "C:/Users/User/Desktop/test.xml"
# Read the file as raw bytes; decoding is left to the XML parser.
with open(path, 'rb') as fp:
content = fp.read()
# An explicit encoding overrides whatever the XML declaration states.
# NOTE(review): assumes the file really is UTF-8 — confirm.
parser = ET.XMLParser(encoding="utf-8")
# ET.fromstring() returns the root Element (not an ElementTree), despite the
# variable being called `tree`.
tree = ET.fromstring(content, parser=parser)
def xml2df(tree):
    """Flatten the children of an XML root element into a DataFrame.

    Each direct child of the root becomes one row; each of that child's own
    direct children contributes a column named after its tag and holding its
    text (``None`` for empty elements).  Deeper levels and attributes are not
    read.  If a child has several sub-elements with the same tag, the last
    one wins.

    Args:
        tree: the XML document as ``str``/``bytes``, an already-parsed
            ``Element``, or an ``ElementTree``.  The original version only
            accepted markup and raised ``TypeError`` when handed the element
            returned by ``ET.fromstring``.

    Returns:
        pandas.DataFrame with one row per child of the root element.
    """
    if isinstance(tree, (str, bytes)):
        # Raw markup: parse it here.
        root = ET.XML(tree)
    elif hasattr(tree, "getroot"):
        # ElementTree wrapper: unwrap to its root element.
        root = tree.getroot()
    else:
        # Already an Element.
        root = tree

    all_records = [
        {subchild.tag: subchild.text for subchild in child}
        for child in root
    ]
    return pd.DataFrame(all_records)
Where is the problem? Please help :O
You are passing the file location string to ET.fromstring(), which is not the actual contents of the file. You need to read the contents of the file first, then pass that to ET.fromstring().
path = "C:/Users/User/Desktop/test.xml"
# Open in binary mode and read the whole file; the parser does the decoding.
with open(path, 'rb') as fp:
content = fp.read()
# The encoding given here takes precedence over the one declared in the file.
parser = ET.XMLParser(encoding="utf-8")
# The result is the document's root Element.
tree = ET.fromstring(content, parser=parser)
I've got a piece of code that extracts coordinates from a KML file. It works beautifully and prints to the screen the way I'd want it to print to a CSV file. However, when I attempt to write it to a CSV file, the resulting file is empty.
I've tried both the method below and the standard text output method using .write and .writerows. All have the same result.
Here is the KML I'm using:
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
<Document>
<name>Test3.kml</name>
<Style id="s_ylw-pushpin">
<IconStyle>
<scale>1.1</scale>
<Icon>
<href>http://maps.google.com/mapfiles/kml/pushpin/ylw-pushpin.png</href>
</Icon>
<hotSpot x="20" y="2" xunits="pixels" yunits="pixels"/>
</IconStyle>
</Style>
<Style id="s_ylw-pushpin_hl">
<IconStyle>
<scale>1.3</scale>
<Icon>
<href>http://maps.google.com/mapfiles/kml/pushpin/ylw-pushpin.png</href>
</Icon>
<hotSpot x="20" y="2" xunits="pixels" yunits="pixels"/>
</IconStyle>
</Style>
<StyleMap id="m_ylw-pushpin">
<Pair>
<key>normal</key>
<styleUrl>#s_ylw-pushpin</styleUrl>
</Pair>
<Pair>
<key>highlight</key>
<styleUrl>#s_ylw-pushpin_hl</styleUrl>
</Pair>
</StyleMap>
<Placemark>
<name>Untitled</name>
<styleUrl>#m_ylw-pushpin</styleUrl>
<LineString>
<tessellate>1</tessellate>
<coordinates>
-117.2983479390361,33.27144940863937,0 -117.2979479084534,33.27158154479859,0 -117.2974695164833,33.27172038778199,0 -117.2975027748323,33.27194103134417,0 -117.297514618297,33.27194834552386,0 -117.2979065026131,33.27210103585357,0 -117.2980671096438,33.27197757139673,0 -117.2980506390891,33.27176546338881,0 -117.2983889177018,33.27174732829762,0 -117.2985056013534,33.27196820309105,0 -117.2984607071796,33.27217535203514,0 -117.2982982520078,33.2722451382993,0 -117.2982714656408,33.2722496045722,0 -117.297926137081,33.27225329696987,0 -117.2979181624345,33.27225324047765,0 -117.297660871735,33.27222714260547,0 -117.2976362532899,33.2722186164706,0 -117.2974159727989,33.27218328409937,0 -117.2974081729552,33.27218350960742,0 -117.2970860609136,33.27208829299941,0 -117.2968393500826,33.27207716108421,0 -117.2967459496535,33.27216774204006,0 -117.2966603938058,33.27233920748802,0 -117.2969907889174,33.27237357387524,0 -117.2970232333844,33.27237306198914,0 -117.2973444433226,33.27239693646774,0 -117.297751764355,33.27242613992279,0 -117.2981731050047,33.27243373303686,0 -117.2981813185804,33.27243372905114,0 -117.2985617246156,33.2723816290589,0 -117.2987498163436,33.27248971415388,0 -117.2987694564539,33.27262188734785,0 -117.2985436721398,33.27267540671544,0 -117.2985270445518,33.27267612619851,0 -117.2981490803383,33.27268345629938,0 -117.2981145841072,33.2726829556605,0 -117.2977420026904,33.27265933276826,0 -117.2977334907908,33.27265936075214,0 -117.2977079525845,33.27265943947727,0 -117.297690884793,33.27265933069783,0 -117.2973143742666,33.2726410594433,0 -117.2972972842265,33.27263660852098,0 -117.2972803621663,33.27263663588342,0 -117.2969673713573,33.27262125275644,0 -117.296756583612,33.27260864705382,0 -117.2965634725893,33.27264899681126,0 -117.2965301429721,33.27279607660442,0 -117.296929900768,33.27282274189361,0 -117.2972917056901,33.27281884120617,0 -117.2975482260676,33.27280094439733,0 -117.2979485409129,33.27281652227333,0 
-117.2983940432828,33.2728392485114,0 -117.2987809571886,33.27284381722371,0
</coordinates>
</LineString>
</Placemark>
</Document>
</kml>
And the code:
from xml.dom import minidom
import csv
# Load the KML file and walk down to the <Placemark> elements of the first
# <Document> inside the first <kml> element.
xmldoc = minidom.parse("Test.kml")
kml = xmldoc.getElementsByTagName("kml")[0]
document = kml.getElementsByTagName("Document")[0]
placemarks = document.getElementsByTagName("Placemark")
for placemark in placemarks:
# Text content of the first <coordinates> element of this placemark.
coords = placemark.getElementsByTagName("coordinates")[0].firstChild.data
# NOTE(review): `list` shadows the built-in of the same name.
list = coords.split(",")
for items in list:
# Each piece came from a split on ",", so it contains no comma and this
# second split always yields a one-element list.
item = items.split(",")
for allitems in item:
# Drop every "0 " substring, then trim surrounding whitespace.
# NOTE(review): presumably "0 " is the altitude preceding the next longitude.
latlon = allitems.replace("0 ","")
latlon = latlon.strip()
print(latlon) # <-- Printing to the screen works fine
# Mode "w" truncates Output.csv each time this open() is executed.
with open("Output.csv", "w") as output:
writer = csv.writer(output, delimiter='\n')
# writerow() iterates over its argument; a str is therefore written as one
# field per character.
writer.writerow(latlon)
****SOLVED****
Final working solution is this:
# Output.csv is opened (and truncated) once here and stays open while the
# placemarks are processed.
with open("Output.csv", "w") as text_file: # open the file first
#writer = csv.writer(output, delimiter='\n') # and get ready to write
for placemark in placemarks:
# Text content of the first <coordinates> element of this placemark.
coords = placemark.getElementsByTagName("coordinates")[0].firstChild.data
# NOTE(review): `list` shadows the built-in of the same name.
list = coords.split(",")
for items in list:
# The piece contains no comma any more, so this yields a one-element list.
item = items.split(",")
for allitems in item:
# Drop every "0 " substring, then trim surrounding whitespace.
latlon = allitems.replace("0 ","")
latlon = latlon.strip()
print(latlon) # <-- Printing to the screen works fine
# One value per line, written as plain text (no csv module involved).
text_file.write(latlon + '\n') # Write the row to the already-open file
I abandoned the csv method and went with a text file output, just renaming to csv. I end up with the result I need. Thanks to all that contributed.
The with and writer= should be happening once, before your loop starts. As it is now, you are re-creating the file for each item, throwing away the previous item.
# The file and the csv writer are created once, before any placemark is read.
with open("Output.csv", "w") as output: # open the file first
# delimiter='\n' only separates fields within a row; every row written below
# has a single field.
writer = csv.writer(output, delimiter='\n') # and get ready to write
for placemark in placemarks:
# Text content of the first <coordinates> element of this placemark.
coords = placemark.getElementsByTagName("coordinates")[0].firstChild.data
# NOTE(review): `list` shadows the built-in of the same name.
list = coords.split(",")
for items in list:
# The piece contains no comma any more, so this yields a one-element list.
item = items.split(",")
for allitems in item:
# Drop every "0 " substring, then trim surrounding whitespace.
latlon = allitems.replace("0 ","")
latlon = latlon.strip()
print(latlon) # <-- Printing to the screen works fine
# Wrapping the string in a list makes it one field instead of one field per
# character.
writer.writerow([latlon]) # Write the row to the already-open file
# EDIT 2 ^ ^
Edit Now there may be another issue: it looks like latlon is a string, but writerow expects a list of items, and fills in the commas between the items automatically. You might want print(latlon + ',', file=output) instead of writer.writerow depending on your specific use case.
Edit 2 Use [latlon] instead of latlon to get the whole line on one row instead of one character per row. The brackets make it a list of one item rather than a string, which behaves in this context like a list of its characters, one at a time.
I have this XML but am having an issue parsing it into CSV. I tried a simple print statement but still get no value:
<?xml version="1.0" encoding="UTF-8"?>
<Document xmlns="urn:iso:std:iso:20022:tech:xsd:pain.008.001.02" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CstmrDrctDbtInitn>
<GrpHdr>
<MsgId>1820</MsgId>
<CreDtTm>2016-05-17T11:56:12</CreDtTm>
<NbOfTxs>197</NbOfTxs>
<CtrlSum>136661.81</CtrlSum>
<InitgPty>
<Nm>GS Netherlands CDZ C.V.</Nm>
</InitgPty>
</GrpHdr>
</CstmrDrctDbtInitn>
<CstmrDrctDbtInitn>
<GrpHdr>
<CreDtTm>2016-05-18T10:34:51</CreDtTm>
<NbOfTxs>1</NbOfTxs>
<CtrlSum>758.99</CtrlSum>
<InitgPty>
<Nm>GS Netherlands CDZ C.V.</Nm>
</InitgPty></GrpHdr></CstmrDrctDbtInitn>
</Document>
and I want to iterate over the value of each node.
So far i have written code as below:
import xml.etree.ElementTree as ET
import csv
# NOTE(review): "\P" and "\D" are not recognised escape sequences, so the
# backslashes survive, but a raw string (r"...") would state that explicitly.
with open("D:\Python\Dave\\17_05_16_1820_DD201606B10_Base.xml") as myFile:
tree = ET.parse(myFile)
# Prefix map for the document's default namespace, used in the search path
# passed to findall() below.
ns = {'d': 'urn:iso:std:iso:20022:tech:xsd:pain.008.001.02'}
# open a file for writing
# NOTE(review): opened without a `with` block; it is closed explicitly on the
# last line of the script.
Resident_data = open('Bank.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(Resident_data)
resident_head = []
#write Header
MsgId = 'MsgId'
resident_head.append(MsgId)
CreDtTm = 'CreDtTm'
resident_head.append(CreDtTm)
NbOfTxs = 'NbOfTxs'
resident_head.append(NbOfTxs)
CtrlSum = 'CtrlSum'
resident_head.append(CtrlSum)
csvwriter.writerow(resident_head)
# NOTE(review): in the sample document <Document> is the root element, and
# './/d:Document' only searches beneath the element the search starts from,
# so this path appears to match nothing and the loop body never runs.
# NOTE(review): the path also ends at d:MsgId, so `member` would be a MsgId
# element rather than the GrpHdr that holds the four values; and the find()
# calls below use unqualified tag names without the `ns` map — verify.
for member in tree.findall('.//d:Document/d:CstmrDrctDbtInitn/d:GrpHdr/d:MsgId', ns):
resident = []
#write values
MsgId = member.find('MsgId').text
resident.append(MsgId)
CreDtTm = member.find('CreDtTm').text
resident.append(CreDtTm)
NbOfTxs = member.find('NbOfTxs').text
resident.append(NbOfTxs)
CtrlSum = member.find('CtrlSum').text
resident.append(CtrlSum)
csvwriter.writerow(resident)
Resident_data.close()
I get no error, and my Bank.csv has only the header but no data. Please help.
I am attempting to create a txt file that includes XML files in a dir and the text within each XML files when a tag is present.
I am having trouble reading a csv row in as a variable using the command below. I have attempted to pull the required values multiple ways but continue to run into a brick wall.
Here is the code:
# Ask for a base name and create "<name>.zip" for writing.
# raw_input() exists only in Python 2.
container = raw_input("Choose a filename for your container:")
epub = zipfile.ZipFile( container + ".zip", 'w')
# XML files located directly in the current directory (no recursion).
xmlinput = glob.glob('./*.xml')
def xmldrop(dir):
    """Append a ``filename,title`` line to catalog.csv for each XML file.

    The directory tree rooted at *dir* is walked recursively.  Every file
    whose name ends in ".xml" is parsed, and the value of the first child
    node of its first ``<title>`` element is recorded next to the bare file
    name (without its directory).

    Files without a ``<title>`` element, or whose first ``<title>`` has no
    text to record, are skipped instead of aborting the walk with an
    IndexError/AttributeError.

    catalog.csv is opened once, in append mode, in the current working
    directory; it is therefore created even when no XML file is found.

    Args:
        dir: root directory to scan.  The name shadows the ``dir`` built-in
            but is kept so existing keyword callers continue to work.
    """
    # One handle for the whole walk; the original reopened the file per entry
    # and rebound `f`, the very list of file names being iterated.
    with open('catalog.csv', 'a') as catalog:
        for folder, _subdirs, filenames in os.walk(dir):
            for filename in filenames:
                if not filename.endswith(".xml"):
                    continue
                dom = parse(os.path.join(folder, filename))
                titles = dom.getElementsByTagName('title')
                first_child = titles[0].firstChild if titles else None
                title = first_child.nodeValue if first_child is not None else None
                if title is None:
                    # No usable title text: nothing to catalogue for this file.
                    continue
                catalog.write(filename + "," + title + "\n")
# Populate catalog.csv from every XML file below the current directory.
xmldrop("./")
line_number = 0
# Load the whole catalogue into memory as a list of rows.
with open('catalog.csv', 'rb') as f:
mycsv = csv.reader(f)
mycsv = list(mycsv)
# Second column of row line_number + 1, i.e. of the second row.
# `text` is assigned only here in the lines shown.
text = mycsv[line_number+1][1]
list_tpl = '''
<Container>
<FileName>
%(FileName)s
</FileName>
</Container>'''
FileName = ""
# One <Fileid ...> fragment per globbed XML file, numbered from 1.
# `text` is not reassigned anywhere in this loop, so every fragment receives
# the same title value.
for i, xml in enumerate(xmlinput):
basename = os.path.basename(xml)
FileName += ('<Fileid="%i" filename="%s"> <title>%s</title> </Fileid>' %
(i+1, basename, text))
# Fill the template and store it in the zip archive as list.txt.
epub.writestr('list.txt', list_tpl % {
'FileName': FileName
})
I am able to successfully pull the information into a csv file as seen with this output:
file_1.xml,Intro
file_2.xml,Assessment
file_3.xml,Review
file_4.xml,Catalog
but the list.txt file that gets generated looks like:
<Container>
<FileName>
<Fileid="1" filename="file_1.xml"> <title>Assessment</title></p> </Fileid>
<Fileid="2" filename="file_2.xml"> <title>Assessment</title></p> </Fileid>
<Fileid="3" filename="file_3.xml"> <title>Assessment</title></p> </Fileid>
<Fileid="4" filename="file_4.xml"> <title>Assessment</title></p> </Fileid>
</FileName>
</Container>
Desired output would be:
<Container>
<FileName>
<Fileid="1" filename="file_1.xml"> <title>Intro</title> </Fileid>
<Fileid="2" filename="file_2.xml"> <title>Assessment</title> </Fileid>
<Fileid="3" filename="file_3.xml"> <title>Review</title> </Fileid>
<Fileid="4" filename="file_4.xml"> <title>Catalog</title> </Fileid>
</FileName>
</Container>
Any assistance is greatly appreciated. I have been trying to pair the two up for over a week now with no success.
You aren't updating the text variable when you are printing out your xml.
You set it once
text = mycsv[line_number+1][1]
but you never update it again, so it keeps outputting Assessment