Index out of range while converting file from csv to xml - python

While running the following code for converting csv to xml I'm getting index out of range error.
I used the code below a small subset of file with 16 columns it works fine but when I try it on more than 30 its giving following error
Traceback (most recent call last):
File "csv2xml.py", line 40, in <module>
+ rowData[i] + '</' + tags[i] + '>' + "\n")
IndexError: list index out of range
#!/usr/bin/python
import sys
import os
import glob
delimiter = "," # "\t" "|" # delimiter used in the CSV file(s)
# the optional command-line argument maybe a CSV file or a folder
if len(sys.argv) == 2:
arg = sys.argv[1].lower()
if arg.endswith('.csv'): # if a CSV file then convert only that file
csvFiles = [arg]
else: # if a folder path then convert all CSV files in the that folder
os.chdir(arg)
csvFiles = glob.glob('*.csv')
# if no command-line argument then convert all CSV files in the current folder
elif len(sys.argv) == 1:
csvFiles = glob.glob('*.csv')
else:
os._exit(1)
for csvFileName in csvFiles:
xmlFile = csvFileName[:-4] + '.xml'
# read the CSV file as binary data in case there are non-ASCII characters
csvFile = open(csvFileName, 'rb')
csvData = csvFile.readlines()
csvFile.close()
tags = csvData.pop(0).strip().replace(' ', '_').split(delimiter)
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="UTF-8" ?>' + "\n")
# there must be only one top-level tag
xmlData.write('<CTS>' + "\n")
for row in csvData:
rowData = row.strip().split(delimiter)
xmlData.write('<Product>' + "\n")
for i in range(len(tags)):
xmlData.write(' ' + '<' + tags[i] + '>'
+ rowData[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</Product>' + "\n")
xmlData.write('</CTS>' + "\n")
xmlData.close()

It sounds like your for loop over the lines of data should check the length of rowData like this:
tags_length = len(tags)
for row in csvData:
rowData = row.strip().split(delimiter)
xmlData.write('<Product>' + "\n")
if len(rowData) >= tags_length:
for i in range(tags_length):
xmlData.write(' ' + '<' + tags[i] + '>'
+ rowData[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</Product>' + "\n")

Related

Python for loop incomplete

I am trying to convert all csv files in a folder to XML but having issues, code as follows:
import glob
import csv
fileCount = 0
path = "csvs\*.csv"
for fname in glob.glob(path):
print(fname)
for fname in glob.glob(path):
csvFile = fname
xmlFile = "csvs\myData" + str(fileCount) + ".xml"
print (xmlFile)
print (csvFile)
print (fileCount)
fileCount +=1
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="UTF-8"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<userforms>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', ' ')
else:
xmlData.write('<userform>' + "\n")
for i in range(len(tags)):
xmlData.write('<response>' + "\n" + ' <field>' + tags[i] + '</field>' + "\n" + ' <value>' + row[i] + '</value>'+ "\n" + '</response>' + "\n")
xmlData.write('</userform>' + "\n")
rowNum +=1
#print (fileCount)
#print (xmlFile)
#print (csvFile)
xmlData.write('</userforms>' + "\n")
#xmlData.write('</csv_data>' + "\n")
xmlData.close()
There are 4 csv files in the original folder, the content of each is the same but the names are 1.csv, 2.csv, 3.csv and 4.csv. This code does generate 4 xml files but the first three are incomplete with just the xml header created.
Is there abyway to add a delay/check to a for loop to ensure it completes?
Console output is clean with only print info available:
csvs\myData0.xml
csvs\1.csv
0
csvs\myData1.xml
csvs\2.csv
1
csvs\myData2.xml
csvs\3.csv
2
csvs\myData3.xml
csvs\4.csv
3

Opening a text file to read in Windows from idLE (Python)

I want to read a file which is on my drive at location C:\Users\PITA SHIVAYA\Desktop\BIGDATA\test.txt. How can I run so that I can use this .txt file as input for above code.
import sys
for line in sys.stdin:
line = line.strip()
items = line.split(' ')
print((str)(items[2] + "\t" + items[4] + "\t" + items[6] + "\t" + items[9] + "\t1"))
Can you change the code ? You can open your file instead of using stdin
file_path = r'C:\Users\PITA SHIVAYA\Desktop\BIGDATA\test.txt'
with open(file_path, 'r') as file:
for line in file:
line = line.strip()
items = line.split(' ')
print((str)(items[2] + "\t" + items[4] + "\t" + items[6] + "\t" + items[9] + "\t1"))
Not that I use a raw string for file_path (a rbefore the opening quote), to avoid backslashes to be interpreted as special characters
Another solution is to redirect the file to stdin when you execute python:
python my_file.py < C:\Users\PITA SHIVAYA\Desktop\BIGDATA\test.txt

python script to convert csv to xml

Please help to correct the python script to get the required output
I have written below code to convert csv to xml.
In input file have column from 1 to 278. In output file need to have tag from A1 to A278,
Code :
#!/usr/bin/python
import sys
import os
import csv
if len(sys.argv) != 2:
os._exit(1)
path=sys.argv[1] # get folder as a command line argument
os.chdir(path)
csvFiles = [f for f in os.listdir('.') if f.endswith('.csv') or f.endswith('.CSV')]
for csvFile in csvFiles:
xmlFile = csvFile[:-4] + '.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<TariffRecords>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = Tariff
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', '_')
else:
xmlData.write('<Tariff>' + "\n")
for i in range(len(tags)):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</Tariff>' + "\n")
rowNum +=1
xmlData.write('</TariffRecords>' + "\n")
xmlData.close()
Getting below error from script:
Traceback (most recent call last):
File "ctox.py", line 20, in ?
tags = Tariff
NameError: name 'Tariff' is not defined
Sample Input file.(this is a sample record in actual input file will contain 278 columns).
If input file has two or three records, same needs to be appended in one XML file.
name,Tariff Summary,Record ID No.,Operator Name,Circle (Service Area),list
Prepaid Plan Voucher,test_All calls 2p/s,TT07PMPV0188,Ta Te,Gu,
Prepaid Plan Voucher,test_All calls 3p/s,TT07PMPV0189,Ta Te,HR,
Sample output file
The above two TariffRecords, tariff will be hard coded at the beginning and end of xml file.
<TariffRecords>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 2p/s</A2>
<A3>TT07PMPV0188</A3>
<A4>Ta Te</A4>
<A5>Gu</A5>
<A6></A6>
</Tariff>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 3p/s</A2>
<A3>TT07PMPV0189</A3>
<A4>Ta Te</A4>
<A5>HR</A5>
<A6></A6>
</Tariff>
</TariffRecords>
First off you need to replace
tags = Tariff
with
tags = row
Secondly you want to replace the write line to not write tags name but write A1, A2 etc..
Complete code:
import sys
import os
import csv
if len(sys.argv) != 2:
os._exit(1)
path=sys.argv[1] # get folder as a command line argument
os.chdir(path)
csvFiles = [f for f in os.listdir('.') if f.endswith('.csv') or f.endswith('.CSV')]
for csvFile in csvFiles:
xmlFile = csvFile[:-4] + '.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<TariffRecords>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', '_')
else:
xmlData.write('<Tariff>' + "\n")
for i, index in enumerate(range(len(tags))):
xmlData.write(' ' + '<' + 'A%s' % (index+1) + '>' \
+ row[i] + '</' + 'A%s' % (index+1) + '>' + "\n")
xmlData.write('</Tariff>' + "\n")
rowNum +=1
xmlData.write('</TariffRecords>' + "\n")
xmlData.close()
Output:
<?xml version="1.0"?>
<TariffRecords>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 2p/s</A2>
<A3>TT07PMPV0188</A3>
<A4>Ta Te</A4>
<A5>Gu</A5>
<A6></A6>
</Tariff>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 3p/s</A2>
<A3>TT07PMPV0189</A3>
<A4>Ta Te</A4>
<A5>HR</A5>
<A6></A6>
</Tariff>
</TariffRecords>
import pandas as pd
from xml.etree import ElementTree as xml
df = pd.read_csv("file_path")
csv_data = df.values
root = xml.Element("TariffRecords")
tariff = xml.subelement("Tariff", root)
for index, data in enumarate(csv_data):
row = xml.Element("A"+str(index), tariff)
row.set(str(data))

Why does my Python XML parser break after the first file?

I am working on a Python (3) XML parser that should extract the text content of specific nodes from every xml file within a folder. Then, the script should write the collected data into a tab-separated text file. So far, all the functions seem to be working. The script returns all the information that I want from the first file, but it always breaks, I believe, when it starts to parse the second file.
When it breaks, it returns "TypeError: 'str' object is not callable." I've checked the second file and found that the functions work just as well on that as the first file when I remove the first file from the folder. I'm very new to Python/XML. Any advice, help, or useful links would be greatly appreciated. Thanks!
import xml.etree.ElementTree as ET
import re
import glob
import csv
import sys
content_file = open('WWP Project/WWP_texts.txt','wt')
quotes_file = open('WWP Project/WWP_quotes.txt', 'wt')
list_of_files = glob.glob("../../../Documents/WWPtextbase/distribution/*.xml")
ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}
def content(tree):
lines = ''.join(ET.tostring(tree.getroot(),encoding='unicode',method='text')).replace('\n',' ').replace('\t',' ').strip()
clean_lines = re.sub(' +',' ', lines)
return clean_lines.lower()
def quotes(tree):
quotes_list = []
for node in tree.findall('.//wwp:quote', namespaces=ns):
quote = ET.tostring(node,encoding='unicode',method='text')
clean_quote = re.sub(' +',' ', quote)
quotes_list.append(clean_quote)
return ' '.join(str(v) for v in quotes_list).replace('\t','').replace('\n','').lower()
def pid(tree):
for node in tree.findall('.//wwp:sourceDesc//wwp:author/wwp:persName[1]', namespaces=ns):
pid = node.attrib.get('ref')
return pid.replace('personography.xml#','') # will need to replace 'p:'
def trid(tree): # this function will eventually need to call OT (.//wwp:publicationStmt//wwp:idno)
for node in tree.findall('.//wwp:sourceDesc',namespaces=ns):
trid = node.attrib.get('n')
return trid
content_file.write('pid' + '\t' + 'trid' + '\t' +'text' + '\n')
quotes_file.write('pid' + '\t' + 'trid' + '\t' + 'quotes' + '\n')
for file_name in list_of_files:
file = open(file_name, 'rt')
tree = ET.parse(file)
file.close()
pid = pid(tree)
trid = trid(tree)
content = content(tree)
quotes = quotes(tree)
content_file.write(pid + '\t' + trid + '\t' + content + '\n')
quotes_file.write(pid + '\t' + trid + '\t' + quotes + '\n')
content_file.close()
quotes_file.close()
You are overwriting your function calls with the values they returned. changing the function names should fix it.
import xml.etree.ElementTree as ET
import re
import glob
import csv
import sys
content_file = open('WWP Project/WWP_texts.txt','wt')
quotes_file = open('WWP Project/WWP_quotes.txt', 'wt')
list_of_files = glob.glob("../../../Documents/WWPtextbase/distribution/*.xml")
ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}
def get_content(tree):
lines = ''.join(ET.tostring(tree.getroot(),encoding='unicode',method='text')).replace('\n',' ').replace('\t',' ').strip()
clean_lines = re.sub(' +',' ', lines)
return clean_lines.lower()
def get_quotes(tree):
quotes_list = []
for node in tree.findall('.//wwp:quote', namespaces=ns):
quote = ET.tostring(node,encoding='unicode',method='text')
clean_quote = re.sub(' +',' ', quote)
quotes_list.append(clean_quote)
return ' '.join(str(v) for v in quotes_list).replace('\t','').replace('\n','').lower()
def get_pid(tree):
for node in tree.findall('.//wwp:sourceDesc//wwp:author/wwp:persName[1]', namespaces=ns):
pid = node.attrib.get('ref')
return pid.replace('personography.xml#','') # will need to replace 'p:'
def get_trid(tree): # this function will eventually need to call OT (.//wwp:publicationStmt//wwp:idno)
for node in tree.findall('.//wwp:sourceDesc',namespaces=ns):
trid = node.attrib.get('n')
return trid
content_file.write('pid' + '\t' + 'trid' + '\t' +'text' + '\n')
quotes_file.write('pid' + '\t' + 'trid' + '\t' + 'quotes' + '\n')
for file_name in list_of_files:
file = open(file_name, 'rt')
tree = ET.parse(file)
file.close()
pid = get_pid(tree)
trid = get_trid(tree)
content = get_content(tree)
quotes = get_quotes(tree)
content_file.write(pid + '\t' + trid + '\t' + content + '\n')
quotes_file.write(pid + '\t' + trid + '\t' + quotes + '\n')
content_file.close()
quotes_file.close()

Converting CSV to XML

I'm currently trying to make the input file for a hydrologic model (HBV-light) compatible with external calibration software (PEST). HBV-light requires that it's input files be in XML format, while PEST can only read text files. My issue relates to writing a script that will automatically convert a parameter set written by PEST (in CSV format) to an XML file that can be read by HBV-light.
Here's a short example of a text file that can be written by PEST:
W,X,Y,Z
1,2,3,4
and this is how I'm attempting to organize the XML file:
<Parameters>
<GroupA>
<W>1</W>
<X>2</X>
</GroupA>
<GroupB>
<Y>3</Y>
<Z>4</Z>
</GroupB>
</Parameters>
I don't have very much programming experience whatsoever, but here is a python code that I wrote so far:
import csv
csvFile = 'myCSVfile.csv'
xmlFile = 'myXMLfile.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="utf-8"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<Catchment xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">' + "\n")
xmlData.write('<CatchmentParamters>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(0, 2):
tags[i] = tags[i].replace(' ', '_')
else:
for i in range(0, 2):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
rowNum +=1
xmlData.write('</CatchmentParameters>' + "\n")
xmlData.write('<VegetationZone>' + "\n")
xmlData.write('<VegetationZoneParameters>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(3, 5):
tags[i] = tags[i].replace(' ', '_')
else:
for i in range(3, 5):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
rowNum +=1
xmlData.write('</VegetationZoneParameters>' + "\n")
xmlData.write('</VegetationZone>' + "\n")
xmlData.write('</Catchment>' + "\n")
xmlData.close()
I can get the Group A (or CathmentParameters specifically) to be written, but the Group B section is NOT being written. Not sure what to do!
I think that the loop is wrong.
Try if this works for you
#! /usr/bin/env python
# coding= utf-8
import csv
csvFile = 'myCSVfile.csv'
xmlFile = 'myXMLfile.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="utf-8"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<Catchment xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">' + "\n")
xmlData.write('<CatchmentParamters>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(0, 2):
tags[i] = tags[i].replace(' ', '_')
else:
for i in range(0, 2):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</CatchmentParameters>' + "\n")
xmlData.write('<VegetationZone>' + "\n")
xmlData.write('<VegetationZoneParameters>' + "\n")
for i in range(2, 4):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</VegetationZoneParameters>' + "\n")
xmlData.write('</VegetationZone>' + "\n")
rowNum +=1
xmlData.write('</Catchment>' + "\n")
xmlData.close()
I think the issue is in your range definition in the second part... range(3, 5) means elements 4 and 5, what you want is probably range(2,4) meaning elements 3 and 4.
The problem is that you iterate over the contents of the csv file twice - it appears that you need to "rewind" after your first loop. There is also a minor indexing issue, with the second range needing to be range(2,4) and not range(3,5) as was already pointed out.
I created a piece of code that appears to work. It can probably be improved upon by people who understand Python properly. Note - I added a couple of print statements to convince myself I understood what is happening. If you don't open the csvFile a second time (at "starting the second for loop"), then no rows get printed. That's your clue that this is the problem.
import csv
csvFile = 'myCSVfile.csv'
xmlFile = 'myXMLfile.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0" encoding="utf-8"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<Catchment xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">' + "\n")
xmlData.write('<CatchmentParamters>' + "\n")
rowNum = 0
for row in csvData:
print "row is ", row
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(0, 2):
tags[i] = tags[i].replace(' ', '_')
else:
for i in range(0, 2):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
rowNum +=1
xmlData.write('</CatchmentParameters>' + "\n")
xmlData.write('<VegetationZone>' + "\n")
xmlData.write('<VegetationZoneParameters>' + "\n")
rowNum = 0
print "starting the second for loop"
csvData = csv.reader(open(csvFile))
for row in csvData:
print "row is now ", row
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(2, 4):
tags[i] = tags[i].replace(' ', '_')
else:
for i in range(2, 4):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
rowNum +=1
xmlData.write('</VegetationZoneParameters>' + "\n")
xmlData.write('</VegetationZone>' + "\n")
xmlData.write('</Catchment>' + "\n")
xmlData.close()
Using the above with the little test file you had given resulted in the following XML file:
<?xml version="1.0" encoding="utf-8"?>
<Catchment xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<CatchmentParamters>
<W>1</W>
<X>2</X>
</CatchmentParameters>
<VegetationZone>
<VegetationZoneParameters>
<Y>3</Y>
<Z>4</Z>
</VegetationZoneParameters>
</VegetationZone>
</Catchment>
Problem solved?

Categories