My program is parsing values from an XML file and then puts them into a dictionary.
Here I've used a for loop to iterate all tags from the file and attributes and also the text
But when there is a subtag like [250][155] which is <name>, it will overwrite the [4] <name>
And all of this is running under the for loop
Now, I want to hinder the loop from overwriting the values once it has been entered into the loop
import pprint as p # Importing pprint/pretty print for formatting dict
import xml.etree.ElementTree as ETT # Importing xml.etree for xml parsing
import csv # Importing csv to write in CSV file
def fetch_data():
# Asking input from user for the file path
xml_file_path = input(
"Enter the path to the file. \n Note* use 'Double back slash' instead of 'single back slash': \n \t \t \t \t \t")
# Initiating variable parser which will parse from the file
parser = ETT.parse(xml_file_path)
# Initiating variable root to get root element which will be useful in further queries
root = parser.getroot()
# Initiating variable main_d which is the dictionary in which the parsed data will be stored
main_d = {}
for w in root.iter(): # Making a for loop which will iter all tags out of the file
value = w.attrib # Initiating variable value for storing attributes where attributes are in the form of dictionaries
value['value'] = w.text # Hence, appending the text/value of the tag in the value dict
if w not in main_d:
main_d[w.tag] = value # Writing all the keys and values in main_d
else:
main_d.pop(w)
p.pprint(main_d, sort_dicts=False, width=200, depth=100)
fetch_data()
This is what the XML would look like
<?xml version="1.0" encoding="UTF-8"?>
<Data data_version="1">
<modified_on_date>some_time</modified_on_date>
<file_version>some version</file_version>
<name>h</name>
<class>Hot</class>
<fct>
<fc_tem di="value1" un="value2" unn="value3">some integer</fc_tem>
<fc_str di="value1" un="value2" unn="value3">some integer</fc_str>
<DataTable name="namee" type="0" columns="2" rows="2" version="some version">
<name>this will be overwritten on the first one up there</name>
<type>0</type>
</DataTable>
</fct>
</Data>
This is my progress so far
Taking into account the confidentiality of the program, that's all I can share
First of all, thanks to #PatrickArtner, his way worked
so you just have to do w.tag instead of w
the full snippet is:
# This program is to fetch/parse data(tags, attribs, text) from the XML/XMT
# file provided
# Importing required libraries
import pprint as p # Importing pprint/pretty print for formatting dict
import xml.etree.ElementTree as ETT # Importing xml.etree for xml parsing
import csv # Importing csv to write in CSV file
# Creating a method/function fetch_data() to fetch/parse data from the given XML/XMT file
def fetch_data():
# Asking input from user for the file path
xml_file_path = input(
"Enter the path to the file \n \t \t :")
# Asking input from user for the name of the csv file which will be created
file_name = input(str("Enter the file name with extension you want as output \n \t \t : "))
# Initiating variable parser which will parse from the file
parser = ETT.parse(xml_file_path)
# Initiating variable root to get root element which will be useful in further queries
root = parser.getroot()
# Initiating variable main_d which is the dictionary in which the parsed data will be stored
main_d = {}
for w in root.iter(): # Making a for loop which will iter all tags out of the file
value = w.attrib # Initiating variable value for storing attributes where attributes are in the form of dictionaries
value['value'] = w.text # Hence, appending the text/value of the tag in the value dict
if w.tag not in main_d: # Checking if the tag exists or not, this will help to avoid overwriting of tag values
main_d[w.tag] = value # Writing all the keys and values in main_d
else:
pass
p.pprint(main_d, sort_dicts=False, width=200) # This is just to check the output
with open(file_name, 'w+', buffering=True) as file: # Opening a file with the filename provided by the user
csvwriter = csv.writer(file, quoting=csv.QUOTE_ALL) # Initiating a variable csvwriter for the file and passing QUOTE_ALL agr.
for x in main_d.keys(): # Creating a loop to write the tags
csvwriter.writerow({x}) # Writing the tags
fetch_data()
Related
I have Python script that parses a number of XML files with the same structure, finds relevant elements and prints all tags and attibutes (and writes to file, but I would like to create some structured data instead).
This works perfectly fine, but I would like create a new XML file mirroring the structure of the original, but only with the elements matching the patterns I specified.
Here's the function that searches through the files:
import xml.etree.cElementTree as ET
import glob
filename = "media_code2_output.txt"
def find_mediacode2(inputfile, outputfile):
#find parent node
for parent in root.iter("musa"):
#parent node attribute "dr-production" must be true (as string)
if parent.attrib["dr-production"] == "true":
#each child element must have media-code element be 2.
for mediekode in parent.iter("media-code"):
if mediekode.text == "2":
#pint all fields
for field in parent.iter():
print(field.tag, field.attrib, field.text)
#write all fields to file
outputfile.write(str(field.tag) + " " + str(field.attrib) + " " + str(field.text) + "\n")
#print spacer line
outputfile.write("\n"+"-"*80+"\n")
print("\n"+"-"*80+"\n")
for inputfile in glob.glob('*/*.xml'):
tree = ET.parse(inputfile)
root = tree.getroot()
with open(filename, "a+") as outputfile:
find_mediacode2(root, outputfile)
Here's a sample of the data from the files:
https://pastebin.com/AHEcDv36
Ideally, I would like to represent the data in an Access database.
I am trying to find product names in an xml file i downloaded. I have figured out how to display every result using a while loop. My problem is, i want to only display the first 10 results. Also, i need be able to call each result individually.
For example: print(read_xml_code.start_tag_5) would print the 5th product in the XML file.
print(read_xml_code.start_tag_10) would print the 10th
here is my code so far:
# Define the Static webpage XML file
static_webpage_1 = 'StaticStock/acoustic_guitar.html'
def Find_static_webpage_product_name():
# Open and read the contents of the first XML file
read_xml_code = open(static_webpage_1, encoding="utf8").read()
# Find and print the static page title.
start_tag = '<title><![CDATA['
end_tag = ']]></title>'
end_position = 0
starting_position = read_xml_code.find(start_tag, end_position)
end_position = read_xml_code.find(end_tag, starting_position)
while starting_position != -1 and end_position!= -1:
print(read_xml_code[starting_position + len(start_tag) : end_position]+ '\n')
starting_position = read_xml_code.find(start_tag, end_position)
end_position = read_xml_code.find(end_tag, starting_position)
#call function
Find_static_webpage_product_name()
There is an HTML parser in the python standard library (python 3):
https://docs.python.org/3/library/html.parser.html
You can easily wait for the tag event and do some counting with a member variable for instance.
Also, do not forget to close your resources (with open(static_webpage_1, encoding="utf8") as f:...)
Below is the sample XML file consisting of 3 data-sources. In each data-source there is a tag having an attribute .
Now, out of 3 data-sources, 2 of them didn't have the attribute and one of the data-source have but the value is false.
I want to add the attribute in the missing one and modify its values to true in data-source where its present.
SAMPLE XML snippets:
Using DOM
# import minidom
import xml.dom.minidom as mdom
# open with minidom parser
DOMtree = mdom.parse("Input.xml")
data_set = DOMtree.documentElement
# get all validation elements from data_Set
validations = data_set.getElementsByTagName("validation")
# read all validation from validations
for validation in validations:
# get the element by tag anme
use-fast-fail = validation.getElementsByTagName('use-fast-fail')
# if the tag exist
if use-fast-fail:
if use-fast-fail[0].firstChild.nodeValue == "false":
# if tag value is false replace with true
use-fast-fail[0].firstChild.nodeValue = "true"
# if tag does not exist add tag
else:
newTag = DOMtree.createElement("use-fast-fail")
newTag.appendChild(DOMtree.createTextNode("true"))
validation.appendChild(newTag)
# write into output file
with open("Output.xml", 'w') as output_xml:
output_xml.write(DOMtree.toprettyxml())
Using simple file read and string search with regex
# import regex
import re
#open the input file with "read" option
input_file = open("intput.xml", "r")
#put content into a list
contents = input_file.readlines()
#close the file
input_file.close()
#loop to check the file line by line
#for every entry - get the index and value
for index, value in enumerate(contents):
#searches the "value" contains attribute with false value
if (re.search('<background-validation>false<background-validation/>',value)):
#if condition true True - changes to desired value
contents[index] = "<background-validation>true<background-validation/>\n"
#searches the "value" contains attribute, which always comes just before the desired attribute
if (re.search('validate-on-match',value)):
#searches the "value" of next element in the list contains attribute
if not (re.search('<background-validation>"',contents[index + 1])):
#if not adding the attribute
contents.insert(index + 1, "<background-validation>true<background-validation/>\n")
#open the file with "write" option
output_file = open("Output.xml", "w")
#joining all contents
contents = "".join(contents)
#write into output file
output_file.write(contents)
output_file.close()
Note: in the second options, addition of line if does not exist is given in an assumption that all data-source block is in same structure and order, else we may need to check multiple conditions.
I am very new to python and SO. The script opens xml files inside of a folder. Using os.walk I iterate over the collection and open the file and then calls the function to iterate over the xml file and update the xml file rewriting the updated file over the original using .writexml. the problem is when i run this program from the command line the it says there is an error
Traceback (most recent call last):
File "./XMLParser.py", line 67, in <module>
xmldoc = minidom.parse(xml)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/minidom.py", line 1918, in parse
return expatbuilder.parse(file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.py", line 928, in parse
result = builder.parseFile(file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.py", line 207, in parseFile
parser.Parse(buffer, 0)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 5614: ordinal not in range(128)
CODE:
from xml.dom import minidom
import os
import codecs
'''
Function to iterate over the directory that contains the work items
params:
CoreID of new author,
x is the path to the workItem.xml file,
p is the path to the workItem.xml that will be overwritten with new data
'''
def changeauthor(coreid, x, p):
# Gets the content of the xml based within the work item tag.
testcase = x.getElementsByTagName("work-item")[0]
# All fields are stored as a <field> tag with the id attribute being the
# differentiators between them. Fields is a list of all the field tags in the
# document.
fields = testcase.getElementsByTagName("field")
# Loop iterates over the field tags and looks for the one tag where the id
# attribute has a value of author. when this tag is found the tags value is
# updated to the core id passed to the function.
for field in fields:
attribute = field.attributes['id'].value
if attribute == "author":
# print the current author.
print("Previous Author: ", field.firstChild.data)
# Set the author to the core id entered into the script
field.firstChild.data = coreid
# Print the updated author field
print("New Author: ", field.firstChild.data)
# Create a temp file with the same path as the source
tmp_config = p
# Open the new temp file with the write mode set.
with codecs.open(tmp_config, 'w', "utf-8") as f:
# f = open(tmp_config, 'w')
# Write the xml into the file at the same location as the orginal
x.writexml(f)
# Close the file
# f.close()
return
while True:
core = str(input("Enter Core ID of the new author: "))
core = core.upper()
spath = str(input("Please enter the full path to the directory of test cases: "))
count = 0
confirm = str(input("Confirm path and core id (Y/N or Exit to leave script): "))
confirm = confirm.upper()
if confirm == "Y":
'''Hard code path here and comment out line above asking for input either will work.'''
# spath = "/Users/Evan/Desktop/workitems-r583233"
# Loop iterates over the directory. Whenever a workitem.xml file is found the path is stored and the file is
# parsed. the core ID entered and the path as well as the parsed xml doc are passed to the change author
# function.
for roots, dirs, files in os.walk(spath):
for file in files:
title = file.title()
if title == "Workitem.Xml":
path = os.path.join(roots, file)
with codecs.open(path, 'r+', "utf-8") as xml:
xmldoc = minidom.parse(xml)
lst = path.split('/')
wi = lst[5]
print("Updating: ", wi)
changeauthor(core, xmldoc, path)
count += 1
print(wi, "updated succesfully.")
print("-------------------------------")
if count > 0:
# Print how many test cases were updated.
print("All Done", count, "workItems updated!")
else:
print("Please double check path and try again no workItems found to update!")
elif confirm == "N":
continue
elif confirm == "EXIT":
break
I currently have a program in Python that opens an XML file, iterates through the tags and prints out the contents to a text file.
import xml.etree.ElementTree as etree
import xml.etree.ElementTree as ET
import tkFileDialog
#asks for the location of the file to open
file = tkFileDialog.askopenfilename()
#parse in the file
tree = ET.parse(file)
root = tree.getroot()
#asks for the location/name of court file to save as
file = tkFileDialog.asksaveasfilename()
f = open(file, 'w')
f.write("\t\t\tReport: \n\n")
#this prints out all tags and text in the right order
iter_ = tree.getiterator()
for elem in iter_:
#print (elem.tag, elem.text)
f.write("\t")
f.write(elem.tag)
f.write(": \t\t")
f.write(str(elem.text))
f.write("\n")
f.close()
The problem is that some of the tags in the XML file don't have any user input, so when the program iterates through, it places NONE in the space, instead of leaving it blank.
Text File output:
Crime Report:
crime_report: None
case_number: 090
victim_details: None
victim_first_name: j
victim_surname: j
officer_details: None
officer_name: j
officer_ID_number: j
witness_details: None
witness_first_name: j
witness_address: j
Current XML file:
<crime_report>
<case_number caseno="unique identifier associated to the case">090
<victim_details>
<victim_first_name>j</victim_first_name>
<victim_surname>j</victim_surname>
</victim_details>
<officer_details>
<officer_name>j</officer_name>
<officer_ID_number>j</officer_ID_number>
</officer_details>
<witness_details>
<witness_first_name>j</witness_first_name>
<witness_address>j</witness_address>
</witness_details>
</case_number>
</crime_report>
I can't figure out how to iterate through, so that these will just be blank, instead of displaying NONE?