Write XML filename based on CSV cell Python - python

Trying to save output from this script to a file based on a cell within the csv. I am able to call the variable {file_root_name} to write into the xml file but not as a variable to write the file name. How can I use the variable file_root_name as a variable to generate a file name?
import csv
import sys
from xml.etree import ElementTree
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom
def prettify(elem):
    """Return a pretty-printed, UTF-8 encoded XML document for *elem*.

    The element is serialized with ElementTree and re-parsed with minidom,
    whose ``toprettyxml`` puts each element on its own line and indents it
    by one space per nesting level.  Because an ``encoding`` is passed, the
    result is an encoded byte string that starts with the XML declaration.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent=" ", encoding='utf-8')
doctype = '<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">'
video_data = ((256, 336000),
(512, 592000),
(768, 848000),
(1128, 1208000))
with open(sys.argv[1], 'rU') as f:
reader = csv.DictReader(f)
for row in reader:
root = Element('smil')
root.set('xmlns', 'http://www.w3.org/2001/SMIL20/Language')
head = SubElement(root, 'head')
meta = SubElement(head, 'meta base="rtmp://cp23636.edgefcs.net/ondemand"')
body = SubElement(root, 'body')
switch_tag = ElementTree.SubElement(body, 'switch')
for suffix, bitrate in video_data:
attrs = {'src': ("mp4:soundcheck/{year}/{id}/{file_root_name}_{suffix}.mp4"
.format(suffix=str(suffix), **row)),
'system-bitrate': str(bitrate),
}
ElementTree.SubElement(switch_tag, 'video', attrs)
xml, doc = prettify(root).split('\n', 1)
output = open('file_root_name'+'.smil', 'w')
output.write(xml + doctype + doc)
output.close

I'm not sure that I follow, but if the line
attrs = {'src': ("mp4:soundcheck/{year}/{id}/{file_root_name}_{suffix}.mp4"
.format(suffix=str(suffix), **row)),
'system-bitrate': str(bitrate),
}
works then "file_root_name" must be a string key of the dictlike object row. The line
output = open('file_root_name'+'.smil', 'w')
actually combines the string 'file_root_name' with '.smil'. So you'd really want something like
output = open(row['file_root_name']+'.smil', 'w')
BTW, the line
output.close
won't do anything-- you want output.close() instead, or simply
# The with-block closes the file automatically, so no explicit close() is needed.
with open(row['file_root_name'] + '.smil', 'w') as output:
    output.write(xml + doctype + doc)

Related

Conversion of JSON to XML errors out when I try to write to file

I am in the process of doing a conversion of JSON to XML using Python.
I'm giving a presentation of how by starting with one file, CSV, you can convert it through multiple formats in a chain. So, CSV to JSON, that JSON to XML, XML to the next file type in the chain, etc, back to CSV.
I obtained a public domain CSV file from Kaggle (https://www.kaggle.com/canggih/anime-data-score-staff-synopsis-and-genre), then converted it to JSON.
From JSON, I am trying to convert to XML and write to an outfile.
I converted the CSV to JSON using this (no formatting, just a straight conversion):
#This should convert CSV to JSON
import json, os
import pandas as pd
import csv
df = pd.read_csv('dataanime.csv')
df.to_json(r'sassyg_data_Anime.json')
Then, I created my JSON to XML file:
#With help from instructor and CodeSpeedy
#https://www.codespeedy.com/how-to-convert-json-to-xml-using-python/
#Import libraries
import json as j
import xml.etree.ElementTree as et
#load in the json file
with open("sassyg_data_Anime.json") as json_file_format:
d = j.load(json_file_format)
#create the main container element for the entire XML file
r = et.Element("Work")
#creates the subelements for each part of the json file
et.SubElement(r,"Title").text = d["Title"]
et.SubElement(r,"Type").text = d["Type"]
et.SubElement(r,"Episodes").text = d["Episodes"]
et.SubElement(r,"Status").text = d["Status"]
et.SubElement(r,"Start airing").text = str(d["Start airing"])
et.SubElement(r,"End airing").text = str(d["End airing"])
et.SubElement(r,"Starting season").text = d["Starting season"]
et.SubElement(r,"Broadcast time").text = d["Broadcast time"]
et.SubElement(r,"Producers").text = d["Producers"]
et.SubElement(r,"Licensors").text = d["Licensors"]
et.SubElement(r,"Studios").text = d["Studios"]
et.SubElement(r,"Sources").text = d["Sources"]
et.SubElement(r,"Genres").text = d["Genres"]
et.SubElement(r,"Duration").text = str(d["Duration"])
et.SubElement(r,"Rating").text = d["Rating"]
et.SubElement(r,"Score").text = str(d["Score"])
et.SubElement(r,"Scored by").text = str(d["Scored by"])
et.SubElement(r,"Members").text = str(d["Members"])
et.SubElement(r,"Favorites").text = str(d["Favorites"])
et.SubElement(r,"Description").text = d["Description"]
#create the element tree/info for the write file
a = et.ElementTree(r)
#ERROR ERROR
#structure the output for xml via tostring rather than str
#Cannot write an ElementTree to file, errors out
#This was one solution I came up with, still errors out
a_xml_str = et.tostring(a)
print(a_xml_str)
#This might error out as well, I can't get the program to get to this point
#write file it should go to
outfile = open("json_to_xml.xml", 'w', encoding='utf-8')
outfile.write(a_xml_str)
outfile.close()
The error I get is:
Traceback (most recent call last):
File "F:\Data_Int_Final\Gardner_json_to_xml\convert_json_to_xml.py", line 44, in <module>
a_xml_str = et.tostring(a)
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\xml\etree\ElementTree.py", line 1109, in tostring
ElementTree(element).write(stream, encoding,
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\xml\etree\ElementTree.py", line 748, in write
serialize(write, self._root, qnames, namespaces,
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\xml\etree\ElementTree.py", line 873, in _serialize_xml
tag = elem.tag
AttributeError: 'ElementTree' object has no attribute 'tag'
This is the latest version of the code I've tried. Can anyone see a solution?
Update:
I have two other ways to convert to the starting JSON file, would one of these be a better approach?
import json
import csv
def make_json(csvFilePath, jsonFilePath):
    """Convert a CSV file into a JSON object keyed by each row's 'Title'.

    Args:
        csvFilePath: Path of the CSV file to read.  Its header row must
            contain a 'Title' column.
        jsonFilePath: Path of the JSON file to write.

    Raises:
        KeyError: If a row has no 'Title' field.

    Rows that share a title overwrite one another, so only the last row
    for each title ends up in the output.
    """
    data = {}
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        csvReader = csv.DictReader(csvf)
        for rows in csvReader:
            key = rows['Title']
            data[key] = rows
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(data, indent=4))
csvFilePath = r'dataanime.csv'
jsonFilePath = r'dataAnime.json'
make_json(csvFilePath, jsonFilePath)
which errors out my XML conversion when I use this JSON file with it:
Traceback (most recent call last):
File "F:\Data_Int_Final\convert_json_to_xml.py", line 16, in <module>
et.SubElement(r,"Title").text = d["Title"]
KeyError: 'Title'
or:
import csv
import json
import time
def csv_to_json(csvFilePath, jsonFilePath):
    """Convert a CSV file into a JSON array with one object per row.

    Args:
        csvFilePath: Path of the CSV file to read; the first row supplies
            the keys of every object.
        jsonFilePath: Path of the JSON file to write.
    """
    jsonArray = []

    # read csv file; newline='' lets the csv module handle line endings
    # itself, so quoted fields containing line breaks are kept intact
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        # load csv file data using csv library's dictionary reader
        csvReader = csv.DictReader(csvf)

        # convert each csv row into python dict
        for row in csvReader:
            # add this python dict to json array
            jsonArray.append(row)

    # convert python jsonArray to JSON String and write to file
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonString = json.dumps(jsonArray, indent=4)
        jsonf.write(jsonString)
csvFilePath = r'dataanime.csv'
jsonFilePath = r'g_d_anime.json'
start = time.perf_counter()
csv_to_json(csvFilePath, jsonFilePath)
finish = time.perf_counter()
print(f"Conversion of all rows completed successfully in {finish - start:0.4f} seconds")
which errors out my XML conversion when I use this created JSON file with it:
Traceback (most recent call last):
File "F:\Data_Int_Final\convert_json_to_xml.py", line 16, in <module>
et.SubElement(r,"Title").text = d["Title"]
TypeError: list indices must be integers or slices, not str
It's simpler to work with the CSV file and generate an XML file from that directly.
Try something like this:
import csv
import xml.etree.ElementTree as et
# Build one <Work> element per CSV row under a single <WorksXML> root.
root = et.Element('WorksXML')
tree = et.ElementTree(root)

# newline='' lets the csv module handle line endings inside quoted fields
with open("dataanime.csv", "r", encoding="utf-8", newline="") as fin:
    reader = csv.DictReader(fin)
    for row in reader:
        r = et.SubElement(root, "Work")
        # iterate over each of the fields and add to the XML element
        for field in reader.fieldnames:
            # XML tag names cannot contain spaces, so use underscores
            et.SubElement(r, field.replace(' ', '_')).text = row[field]

# The tree is written once, after every row has been added.
with open("csv_to_xml.xml", 'wb') as fout:
    tree.write(fout, xml_declaration=True, encoding='utf-8')
This generates an XML file with each "work" as a separate sub-element under the root element.
<?xml version="1.0" encoding="utf-8"?>
<WorksXML>
<Work>
<Title>Fullmetal Alchemist: Brotherhood</Title>
<Type>TV</Type>
<Episodes>64</Episodes>
<Status>Finished Airing</Status>
<Start_airing>4/5/2009</Start_airing>
<End_airing>7/4/2010</End_airing>
<Starting_season>Spring</Starting_season>
...
For the CSV to JSON conversion, the first approach creates a dictionary with titles as keys and the second approach creates an array with each item an object with all the attributes.
If any of the works have a duplicate title then the first approach will overwrite the duplicate entries. If not then it's just a matter of how you want to access the data in the JSON file as a dictionary or a list. If you want to generate XML from the JSON file then the second approach with an array will be the better option.
To convert the array-based JSON file to XML, this will do the job:
import json
import xml.etree.ElementTree as ET
def json_to_xml(jsonFilePath, xmlFilePath):
    """Convert a JSON array of flat objects into an XML document.

    Every object in the array becomes a <Work> element under a single
    <WorksXML> root, and every key/value pair becomes a child element
    whose tag is the key (spaces replaced by underscores) and whose text
    is the value.

    Args:
        jsonFilePath: Path of the JSON file to read; it must contain an
            array of objects.
        xmlFilePath: Path of the XML file to write (UTF-8, with an XML
            declaration).
    """
    root = ET.Element('WorksXML')
    tree = ET.ElementTree(root)
    with open(jsonFilePath, "r", encoding="utf-8") as fin:
        jdata = json.load(fin)
    for obj in jdata:
        r = ET.SubElement(root, "Work")
        for key, value in obj.items():
            child = ET.SubElement(r, key.replace(' ', '_'))
            # ElementTree can only serialize string text: a JSON number or
            # boolean would raise TypeError in tree.write().  A JSON null
            # is left as an empty element.
            if value is not None:
                child.text = value if isinstance(value, str) else str(value)
    with open(xmlFilePath, 'wb') as fout:
        tree.write(fout, xml_declaration=True, encoding='utf-8')
jsonFilePath = 'g_d_anime.json'
xmlFilePath = 'g_d_anime.xml'
json_to_xml(jsonFilePath, xmlFilePath)

Group branches in an XML tree with Python on a common field

I have a list of order details in a CSV, and want to join all items from the lines together on one order.
Example data is:
Order|Line|Item|Price
123456789|1|IK123456|199.99
654987321|1|MASGE12385|29.95
654987321|2|KLEAN458792|9.99
654987321|3|LP12489|1959.95
I want everything to be listed in an XML with the root as the Order Number, Child as the Line Number and Sub-Children as Item and Price.
I want the output to look like:
<Order number = "123456789">
<Line number = "1">
<Item>IK123456</Item>
<Price>199.99</Price>
</Line>
</Order>
<Order number = "654987321">
<Line number = "1">
<Item>MASGE12385</Item>
<Price>29.95</Price>
</Line>
<Line number = "2">
<Item>KLEAN458792</Item>
<Price>9.99</Price>
</Line>
<Line number = "3">
<Item>LP12489</Item>
<Price>1959.95</Price>
</Line>
</Order>
Here is my code:
import csv
import xml.etree.ElementTree as ET
file = 'C:/github.txt'
with open (file, 'r') as f:
reader = csv.reader(f, delimiter = '|')
header = next(reader)
order_num = reader[0]
root = ET.Element("Order") #BUILD A ROOT FOR THE XML TREE
root.set('number', order_num) #ADD ATTRIBUTE
for row in reader: #ITERATE THROUGH EACH ROW AND POPULATE DATA IN BRANCHES OF XML TREE
line = ET.SubElement(root, 'line', number= reader[1])
item = ET.SubElement(line, 'item code')
item.text = reader[2]
price = ET.SubElement(line, 'price')
price.text = reader[3]
tree = ET.ElementTree(root)
tree.write('C:/github.xml', encoding = 'utf-8', xml_declaration = True)
(NOTE: I moved something and got an error, but not sure what happened)
During the loop, consider keeping a tracker on the order number to decide conditionally when to create a new <Order> element, so that related underlying items are kept together. Additionally, consider csv.DictReader to iterate over the CSV rows as dictionaries, which takes the first-row headers as keys. Finally, use the built-in minidom to pretty-print the output. The code below incorporates all XML items under a single <Orders> root:
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom as mn
file = 'C:/github.txt'
curr_order = None

# newline='' lets the csv module handle line endings itself
with open(file, 'r', newline='') as f:
    reader = csv.DictReader(f, delimiter='|')

    # BUILD A ROOT FOR THE XML TREE
    root = ET.Element("Orders")

    # ITERATE THROUGH EACH ROW AS DICTIONARY
    for d in reader:
        # CONDITIONALLY BUILD ORDER ELEMENT
        # A new <Order> starts whenever the order number changes, so the
        # lines of one order must be on consecutive rows to be grouped.
        if curr_order != str(d['Order']):
            orderElem = ET.SubElement(root, "Order")
            curr_order = str(d['Order'])
            # the attribute only needs to be set once per order
            orderElem.set('number', curr_order)

        # CREATE DESCENDANTS OF ORDER
        line = ET.SubElement(orderElem, 'line', number=str(d['Line']))
        ET.SubElement(line, 'item_code').text = str(d['Item'])
        ET.SubElement(line, 'price').text = str(d['Price'])

# PRETTY PRINT OUTPUT
dom = mn.parseString(ET.tostring(root, encoding='utf-8'))

# toprettyxml() returns bytes when an encoding is given, hence 'wb'
with open('C:/github.xml', 'wb') as f:
    f.write(dom.toprettyxml(indent=" ", encoding='utf-8'))
Online Demo

Closing the xml tag in one line

The loop goes through the list
for file in files:
if id == file['param_id']:
resources_dict = {'fileNo': str(i), 'startPageNo': str(i), 'endPageNo': str(i),
'format': 'cpk:JPEG'}
ET.SubElement(cpf_resources, 'cpf:ContentFile', resources_dict).text = 'cid:{}'.format(str(file['filename']))
i = i + 1
then the data is written to the file as follows:
tree = ET.ElementTree(jobticket)
filename = '{}\\{}.xml'.format(os.getcwd(), get_af_value(project_data, id, 'filename'))
tree.write(filename, encoding="UTF-8", xml_declaration=True)
In the end file, the data are displayed as follows:
<cpf:Resources>
<cpf:ContentFile endPageNo="1" fileNo="1" format="cpk:JPEG" startPageNo="1">cid:page_0005.jpg
</cpf:ContentFile>
<cpf:ContentFile endPageNo="2" fileNo="2" format="cpk:JPEG" startPageNo="2">cid:page_0009.jpg
</cpf:ContentFile>
</cpf:Resources>
Is there a way to display the closing tag </cpf:ContentFile> on the same line?
<cpf:ContentFile endPageNo="2" fileNo="2" format="cpk:JPEG" startPageNo="2">cid:page_0009.jpg</cpf:ContentFile>
After a few curses I managed to create something like this:
# Serialize the tree, then let minidom re-indent it.
root = tree.getroot()
rough = ET.tostring(root)
xmlstr = minidom.parseString(rough).toprettyxml(indent=" ", encoding='UTF-8')
# toprettyxml() returns bytes when an encoding is given, so the file must
# be opened in binary mode; text mode "w" raises TypeError on Python 3.
with open("filename.xml", "wb") as f:
    f.write(xmlstr)
Maybe somebody could use it.

Outputting child nodes to CSV with Python

Edit: I've replaced the example XML with real data and provided my code at the bottom.
I have several xml-files containing from 1 to 10+ lines of the following data:
<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2" xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2" xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2" xmlns:cec="urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2" xmlns:soapenv="http://www.w3.org/2003/05/soap-envelope" xmlns:wsu="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd" xmlns:xenc="http://www.w3.org/2001/04/xmlenc#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2 UBL-Invoice-2.0.xsd">
<cac:LegalMonetaryTotal>
<cbc:PayableAmount currencyID="DKK">2586.61</cbc:PayableAmount>
</cac:LegalMonetaryTotal>
<cac:InvoiceLine>
<cbc:ID>1</cbc:ID>
<cbc:InvoicedQuantity unitCode="HUR">1.50</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="DKK">1633.65</cbc:LineExtensionAmount>
</cac:InvoiceLine>
<cac:InvoiceLine>
<cbc:ID>2</cbc:ID>
<cbc:InvoicedQuantity unitCode="HUR">1.00</cbc:InvoicedQuantity>
<cbc:LineExtensionAmount currencyID="DKK">952.96</cbc:LineExtensionAmount>
</cac:InvoiceLine>
</Invoice>
And I want to output the data to a CSV-file in the following structure:
filename,lineId,lineQuantity,lineAmount,payableAmount
file1,1,1.50,1633.65,2586.61
file1,2,1.00,952.96,2586.61
file2,.,.,.
...where there's a row for each line per file coupled with the filename and total amount.
This is my code:
from os import listdir, path, walk
import xml.etree.ElementTree as ET
import csv
def invoicelines(self):
filename = path.splitext(path.split(file)[1])[0]
lineId = root.find('./InvoiceLine/ID').text
lineQuantity = root.find('./InvoiceLine/InvoicedQuantity').text
lineAmount = root.find('./InvoiceLine/LineExtensionAmount').text
payableAmount = root.find('./LegalMonetaryTotal/PayableAmount').text
row = [
filename,
lineId,
lineQuantity,
lineAmount,
payableAmount
]
return row
csvfile = 'output.csv'
def csv_write_header(csvfile):
    """Create (or truncate) *csvfile* and write the CSV header row to it.

    Args:
        csvfile: Path of the CSV file to create.
    """
    # newline='' leaves line-ending handling to the csv writer
    with open(csvfile, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            'filename',
            'lineId',
            'lineQuantity',
            'lineAmount',
            'payableAmount'
        ])
xml_files = []
for root, dirs, files in walk('mypath'):
for file in files:
if file.endswith('.xml'):
xml_files.append(path.join(root, file))
csv_write_header(csvfile)
for file in xml_files:
tree = ET.iterparse(file)
for _, el in tree:
el.tag = el.tag.split('}', 1)[1] # ignores namespaces
root = tree.root
if 'Invoice' in root.tag: # only invoice files
for e in root.iter('InvoiceLine'):
with open(csvfile, 'a', newline='') as outfile:
writer = csv.writer(outfile)
writer.writerow(invoicelines(e))
And the output I get if I just parse the above file is:
filename,lineId,lineQuantity,lineAmount,payableAmount
file1,1,1.50,1633.65,2586.61
file1,1,1.50,1633.65,2586.61
...so I'm guessing it's something with my iteration.
The following code achieves your desired result.
import os
import xml.etree.ElementTree as ET
def extract_line_id_data(line_element):
    """Return the text of the first three children of a line element.

    The children are read by position, so the element must hold its ID,
    quantity and line amount as its first three children, in that order.

    Returns:
        A ``(line_id, quantity, line_amount)`` tuple of element texts.

    Raises:
        IndexError: If the element has fewer than three children.
    """
    line_id = line_element[0].text
    quantity = line_element[1].text
    line_amount = line_element[2].text
    return line_id, quantity, line_amount
# Open the output once and write the header once; 'w' discards the
# output of any previous run instead of appending to it.
with open('output.csv', 'w') as output:
    output.write('Filename,LineID,Quantity,LineAmount,TotalAmount\n')  # Headers
    # Iterate over all files in a directory (and its sub-directories)
    for dirpath, dirs, files in os.walk('/path/to_folder/with/xml_files/'):
        for xml_file in files:
            # Skip anything that is not an XML file
            if not xml_file.endswith('.xml'):
                continue
            # os.walk yields bare file names, so join them with their folder
            tree = ET.parse(os.path.join(dirpath, xml_file))
            root = tree.getroot()
            total_amount = root[0][0].text  # Get total amount value
            # Iterate over all "Line" elements
            for e in root[1:]:
                output.write('{},{},{},{},{}\n'.format(xml_file, *extract_line_id_data(e), total_amount))
Tested with your file and a "file2.xml" with a TotalAmount of 350, output looks like this:
Filename,LineID,Quantity,LineAmount,TotalAmount
file.xml,1,4,132,407
file.xml,2,1,72,407
file.xml,3,7,203,407
file2.xml,1,4,132,350
file2.xml,2,1,72,350
file2.xml,3,7,203,350
I hope this works for you. I have used ElementTree as preferred, although I would have used lxml myself.
Try the following code (C#):
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Data;
using System.Xml;
using System.Xml.Linq;
using System.IO;
namespace ConsoleApp2
{
    class Program
    {
        // Path of the CSV file that receives one row per invoice line.
        // '@' starts a verbatim string, so the backslashes are not escapes.
        const string FILENAME = @"c:\temp\text.csv";

        static void Main()
        {
            string[] filenames = Directory.GetFiles(@"c:\temp", "*.xml");

            // 'using' flushes and closes the writer even if loading a file throws.
            using (StreamWriter writer = new StreamWriter(FILENAME))
            {
                foreach (string filename in filenames)
                {
                    XDocument doc = XDocument.Load(filename);

                    // NOTE(review): the element names below are un-namespaced
                    // ("TotalAmount", "Line", ...) and do not match the
                    // namespaced UBL invoice shown in the question; adjust
                    // them to the real document before use.
                    string amount = (string)doc.Descendants("TotalAmount").FirstOrDefault();

                    foreach (XElement line in doc.Descendants("Line"))
                    {
                        writer.WriteLine(string.Join(",",
                            filename,
                            (string)line.Element("LineID"),
                            (string)line.Element("Quantity"),
                            (string)line.Element("LineAmount"),
                            amount));
                    }
                }
            }
        }
    }
}

How to get xml output in a file with new line using python xml.etree?

I am generating an XML file using "from xml.etree import ElementTree" and writing the generated output to a new file, "test.xml". The output does get written to test.xml, but there are no newlines; it is all one very long line. What should I do to get line breaks inside "test.xml"? The following is the script:
from xml.etree import ElementTree
from xml.dom import minidom
from lxml import etree
def prettify(elem):
    """Return a pretty-printed XML string for the Element.

    The element is serialized with ElementTree and re-parsed with minidom,
    whose ``toprettyxml`` puts each element on its own line and indents it
    by one space per nesting level.  No encoding is passed to
    ``toprettyxml``, so the result is a text string.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent=" ")
top = Element('my_document')
comment = Comment('Practising')
top.append(comment)
child = SubElement(top, 'my_information')
childs = SubElement(child,'my_name')
childs.text = 'This child contains text.'
print prettify(top)
file = open("test.xml", 'w')
xml.ElementTree(top).write(file)
file.close()
Why don't you use the return value of the prettify? Writing the return value of the function will solve your problem.
...
top = Element('my_document')
comment = Comment('Practising')
top.append(comment)
child = SubElement(top, 'my_information')
childs = SubElement(child,'my_name')
childs.text = 'This child contains text.'
# Write the prettified string itself; the with-block closes the file.
with open("test.xml", 'w') as f:  # <--
    f.write(prettify(top))  # <--

Categories