I have Python script that parses a number of XML files with the same structure, finds relevant elements and prints all tags and attibutes (and writes to file, but I would like to create some structured data instead).
This works perfectly fine, but I would like create a new XML file mirroring the structure of the original, but only with the elements matching the patterns I specified.
Here's the function that searches through the files:
import xml.etree.cElementTree as ET
import glob
filename = "media_code2_output.txt"
def find_mediacode2(inputfile, outputfile):
#find parent node
for parent in root.iter("musa"):
#parent node attribute "dr-production" must be true (as string)
if parent.attrib["dr-production"] == "true":
#each child element must have media-code element be 2.
for mediekode in parent.iter("media-code"):
if mediekode.text == "2":
#pint all fields
for field in parent.iter():
print(field.tag, field.attrib, field.text)
#write all fields to file
outputfile.write(str(field.tag) + " " + str(field.attrib) + " " + str(field.text) + "\n")
#print spacer line
outputfile.write("\n"+"-"*80+"\n")
print("\n"+"-"*80+"\n")
for inputfile in glob.glob('*/*.xml'):
tree = ET.parse(inputfile)
root = tree.getroot()
with open(filename, "a+") as outputfile:
find_mediacode2(root, outputfile)
Here's a sample of the data from the files:
https://pastebin.com/AHEcDv36
Ideally, I would like to represent the data in an Access database.
Related
My program is parsing values from an XML file and then puts them into a dictionary.
Here I've used a for loop to iterate all tags from the file and attributes and also the text
But when there is a subtag like [250][155] which is <name>, it will overwrite the [4] <name>
And all of this is running under the for loop
Now, I want to hinder the loop from overwriting the values once it has been entered into the loop
import pprint as p # Importing pprint/pretty print for formatting dict
import xml.etree.ElementTree as ETT # Importing xml.etree for xml parsing
import csv # Importing csv to write in CSV file
def fetch_data():
# Asking input from user for the file path
xml_file_path = input(
"Enter the path to the file. \n Note* use 'Double back slash' instead of 'single back slash': \n \t \t \t \t \t")
# Initiating variable parser which will parse from the file
parser = ETT.parse(xml_file_path)
# Initiating variable root to get root element which will be useful in further queries
root = parser.getroot()
# Initiating variable main_d which is the dictionary in which the parsed data will be stored
main_d = {}
for w in root.iter(): # Making a for loop which will iter all tags out of the file
value = w.attrib # Initiating variable value for storing attributes where attributes are in the form of dictionaries
value['value'] = w.text # Hence, appending the text/value of the tag in the value dict
if w not in main_d:
main_d[w.tag] = value # Writing all the keys and values in main_d
else:
main_d.pop(w)
p.pprint(main_d, sort_dicts=False, width=200, depth=100)
fetch_data()
This is what the XML would look like
<?xml version="1.0" encoding="UTF-8"?>
<Data data_version="1">
<modified_on_date>some_time</modified_on_date>
<file_version>some version</file_version>
<name>h</name>
<class>Hot</class>
<fct>
<fc_tem di="value1" un="value2" unn="value3">some integer</fc_tem>
<fc_str di="value1" un="value2" unn="value3">some integer</fc_str>
<DataTable name="namee" type="0" columns="2" rows="2" version="some version">
<name>this will be overwritten on the first one up there</name>
<type>0</type>
</DataTable>
</fct>
</Data>
This is my progress so far
Taking into account the confidentiality of the program, that's all I can share
First of all, thanks to #PatrickArtner, his way worked
so you just have to do w.tag instead of w
the full snippet is:
# This program is to fetch/parse data(tags, attribs, text) from the XML/XMT
# file provided
# Importing required libraries
import pprint as p # Importing pprint/pretty print for formatting dict
import xml.etree.ElementTree as ETT # Importing xml.etree for xml parsing
import csv # Importing csv to write in CSV file
# Creating a method/function fetch_data() to fetch/parse data from the given XML/XMT file
def fetch_data():
# Asking input from user for the file path
xml_file_path = input(
"Enter the path to the file \n \t \t :")
# Asking input from user for the name of the csv file which will be created
file_name = input(str("Enter the file name with extension you want as output \n \t \t : "))
# Initiating variable parser which will parse from the file
parser = ETT.parse(xml_file_path)
# Initiating variable root to get root element which will be useful in further queries
root = parser.getroot()
# Initiating variable main_d which is the dictionary in which the parsed data will be stored
main_d = {}
for w in root.iter(): # Making a for loop which will iter all tags out of the file
value = w.attrib # Initiating variable value for storing attributes where attributes are in the form of dictionaries
value['value'] = w.text # Hence, appending the text/value of the tag in the value dict
if w.tag not in main_d: # Checking if the tag exists or not, this will help to avoid overwriting of tag values
main_d[w.tag] = value # Writing all the keys and values in main_d
else:
pass
p.pprint(main_d, sort_dicts=False, width=200) # This is just to check the output
with open(file_name, 'w+', buffering=True) as file: # Opening a file with the filename provided by the user
csvwriter = csv.writer(file, quoting=csv.QUOTE_ALL) # Initiating a variable csvwriter for the file and passing QUOTE_ALL agr.
for x in main_d.keys(): # Creating a loop to write the tags
csvwriter.writerow({x}) # Writing the tags
fetch_data()
Below is the sample XML file consisting of 3 data-sources. In each data-source there is a tag having an attribute .
Now, out of 3 data-sources, 2 of them didn't have the attribute and one of the data-source have but the value is false.
I want to add the attribute in the missing one and modify its values to true in data-source where its present.
SAMPLE XML snippets:
Using DOM
# import minidom
import xml.dom.minidom as mdom
# open with minidom parser
DOMtree = mdom.parse("Input.xml")
data_set = DOMtree.documentElement
# get all validation elements from data_Set
validations = data_set.getElementsByTagName("validation")
# read all validation from validations
for validation in validations:
# get the element by tag anme
use-fast-fail = validation.getElementsByTagName('use-fast-fail')
# if the tag exist
if use-fast-fail:
if use-fast-fail[0].firstChild.nodeValue == "false":
# if tag value is false replace with true
use-fast-fail[0].firstChild.nodeValue = "true"
# if tag does not exist add tag
else:
newTag = DOMtree.createElement("use-fast-fail")
newTag.appendChild(DOMtree.createTextNode("true"))
validation.appendChild(newTag)
# write into output file
with open("Output.xml", 'w') as output_xml:
output_xml.write(DOMtree.toprettyxml())
Using simple file read and string search with regex
# import regex
import re
#open the input file with "read" option
input_file = open("intput.xml", "r")
#put content into a list
contents = input_file.readlines()
#close the file
input_file.close()
#loop to check the file line by line
#for every entry - get the index and value
for index, value in enumerate(contents):
#searches the "value" contains attribute with false value
if (re.search('<background-validation>false<background-validation/>',value)):
#if condition true True - changes to desired value
contents[index] = "<background-validation>true<background-validation/>\n"
#searches the "value" contains attribute, which always comes just before the desired attribute
if (re.search('validate-on-match',value)):
#searches the "value" of next element in the list contains attribute
if not (re.search('<background-validation>"',contents[index + 1])):
#if not adding the attribute
contents.insert(index + 1, "<background-validation>true<background-validation/>\n")
#open the file with "write" option
output_file = open("Output.xml", "w")
#joining all contents
contents = "".join(contents)
#write into output file
output_file.write(contents)
output_file.close()
Note: in the second options, addition of line if does not exist is given in an assumption that all data-source block is in same structure and order, else we may need to check multiple conditions.
i'm looking to solve this problem.
When i try to write into the xml file , it writes twice the same thing.
It's the code:
def writeIntoXml(fileName, tagElement, textElement):
tree = ET.ElementTree(file = fileName)
root = tree.getroot()
newElement = ET.SubElement(root, tagElement)
newElement.text =textElement;
newElement.tail ="\n"
root.append(newElement)
tree.write(fileName, encoding='utf-8')
If i have this xml file, with this tags, if i write a new tag( es "Question-3" Example3 "/Question-3") i get a problem
XmlFile before being written:
<Questions>
<Question-1>Example1</Question-1>
<Question-2>Example2</Question-2>
</Questions>
XmlFile after being written:
<Questions>
<Question-1>Example1</Question-1>
<Question-2>Example2</Question-2>
<Question-3>Example3</Question-3>
<Question-3>Example3</Question-3>
</Questions>
Sorry for grammatical errors
Note that ET.SubElement() appends the element automatically. You are adding the element twice, first in SubElement(), next in append().
You should use either just
newElement = ET.SubElement(root, tagElement)
newElement.text = textElement;
newElement.tail = "\n"
or
newElement = ET.Element(tagElement)
newElement.text = textElement;
newElement.tail = "\n"
root.append(newElement)
I currently have a program in Python that opens an XML file, iterates through the tags and prints out the contents to a text file.
import xml.etree.ElementTree as etree
import xml.etree.ElementTree as ET
import tkFileDialog
#asks for the location of the file to open
file = tkFileDialog.askopenfilename()
#parse in the file
tree = ET.parse(file)
root = tree.getroot()
#asks for the location/name of court file to save as
file = tkFileDialog.asksaveasfilename()
f = open(file, 'w')
f.write("\t\t\tReport: \n\n")
#this prints out all tags and text in the right order
iter_ = tree.getiterator()
for elem in iter_:
#print (elem.tag, elem.text)
f.write("\t")
f.write(elem.tag)
f.write(": \t\t")
f.write(str(elem.text))
f.write("\n")
f.close()
The problem is that some of the tags in the XML file don't have any user input, so when the program iterates through, it places NONE in the space, instead of leaving it blank.
Text File output:
Crime Report:
crime_report: None
case_number: 090
victim_details: None
victim_first_name: j
victim_surname: j
officer_details: None
officer_name: j
officer_ID_number: j
witness_details: None
witness_first_name: j
witness_address: j
Current XML file:
<crime_report>
<case_number caseno="unique identifier associated to the case">090
<victim_details>
<victim_first_name>j</victim_first_name>
<victim_surname>j</victim_surname>
</victim_details>
<officer_details>
<officer_name>j</officer_name>
<officer_ID_number>j</officer_ID_number>
</officer_details>
<witness_details>
<witness_first_name>j</witness_first_name>
<witness_address>j</witness_address>
</witness_details>
</case_number>
</crime_report>
I can't figure out how to iterate through, so that these will just be blank, instead of displaying NONE?
I have written a piece of code which lets me extract the table from a file named 195775.html. I save the output in a text file. Now I need to iterate this code for all the 20,000 files which I have in the same directory. In addition, I also want the files to be tagged with their respective file names. i.e. each file should have a column (in the table) which takes the filename as its value. Also, I want the output text files to be named as per the input files (i.e. the names should match).
Here is my code:
import urllib2
import os
import time
import traceback
from bs4 import BeautifulSoup
outfile= open('C:/Users/Manvendra/Dropbox/Python/195775.txt','wb')
rfile = open('C:/Users/Manvendra/Dropbox/PRI/Data/AP/195775.html')
rsoup = BeautifulSoup(rfile)
nodes = rsoup.find('div',{'class':'frmhdtitle'})
if nodes!= None:
#print "div present"
x = nodes.findNext('table')
if x!= None:
#print "table present"
y = x.find('tbody')
if y!= None:
#print "tbody present"
z= y.findAll('tr')
if z!= None:
#print "tr present"
for wx in z[1:]:
num= wx.find('td').get_text()
print num
name= wx.find('td').findNext('td').get_text()
print name
age = wx.find('td').findNext('td').findNext('td').get_text()
print age
caste= wx.find('td').findNext('td').findNext('td').findNext('td').get_text()
print caste
gender= wx.find('td').findNext('td').findNext('td').findNext('td').findNext('td').get_text()
print gender
quali = wx.find('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').get_text()
print quali
occu = wx.find('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').get_text()
print occu
#email = wx.find('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').get_text()
#print email
#ward = wx.find('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').get_text()
#print ward
resr = wx.find('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').findNext('td').get_text()
print resr
outfile.write(str(num) +"\t" + str(name) +"\t" + str(age) +"\t" + str (caste) +"\t" + str(quali) +"\t" + str(occu) + "\t" + str(resr) + str(infile) +"\n")
outfile.close()
Put your code into a separate function and call it for each html file in the directory:
#!/usr/bin/env python2
import os
from glob import glob
dest_dir = 'C:/Users/Manvendra/Dropbox/Python'
for html_filename in glob('C:/Users/Manvendra/Dropbox/PRI/Data/AP/*.html'):
basename = os.path.splitext(os.path.basename(html_filename))[0]
with open(html_filename, 'rb') as html_file, \
open(os.path.join(dest_dir, basename + '.txt'), 'wb') as csv_file:
html2csv(html_file, csv_file)
where html2csv() is:
import logging
from bs4 import BeautifulSoup
log = logging.getLogger(__name__)
def html2csv(html_file, csv_file):
writerow = csv.writer(csv_file, dialect=csv.excel_tab).writerow
div = BeautifulSoup(html_file).find('div', 'frmhdtitle')
try:
rows = div.find_next('table').tbody.find_all('tr')[1:]
except AttributeError:
log.warning("No info in %s file", html_file.name)
else:
for tr in rows:
writerow([td.get_text().encode('utf-8')
for td in tr.find_all('td')[:8]] + [html_file.name])
Note: findNext('td') method in your code searches the html document without any regard for elements boundaries i.e., it may find td that belongs to a different row or even a different table as long as it is further in the document. I rewrote the loop assuming that you want to find eight adjacent <td> elements in each row.
Do something like this:
files = os.listdir(directoryPath)
for file in files:
*your code*
Note that if you want to open the files you need to open the path: directoryPath + "/" + file.
Regarding all the tags and filenames you want to name the files, "file" is now a variable which contains the name of the file you are now processing so do with it what you want.