I am working on a Python (3) XML parser that should extract the text content of specific nodes from every xml file within a folder. Then, the script should write the collected data into a tab-separated text file. So far, all the functions seem to be working. The script returns all the information that I want from the first file, but it always breaks, I believe, when it starts to parse the second file.
When it breaks, it returns "TypeError: 'str' object is not callable." I've checked the second file and found that the functions work just as well on that as the first file when I remove the first file from the folder. I'm very new to Python/XML. Any advice, help, or useful links would be greatly appreciated. Thanks!
import xml.etree.ElementTree as ET
import re
import glob
import csv
import sys
content_file = open('WWP Project/WWP_texts.txt','wt')
quotes_file = open('WWP Project/WWP_quotes.txt', 'wt')
list_of_files = glob.glob("../../../Documents/WWPtextbase/distribution/*.xml")
ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}
def content(tree):
lines = ''.join(ET.tostring(tree.getroot(),encoding='unicode',method='text')).replace('\n',' ').replace('\t',' ').strip()
clean_lines = re.sub(' +',' ', lines)
return clean_lines.lower()
def quotes(tree):
quotes_list = []
for node in tree.findall('.//wwp:quote', namespaces=ns):
quote = ET.tostring(node,encoding='unicode',method='text')
clean_quote = re.sub(' +',' ', quote)
quotes_list.append(clean_quote)
return ' '.join(str(v) for v in quotes_list).replace('\t','').replace('\n','').lower()
def pid(tree):
for node in tree.findall('.//wwp:sourceDesc//wwp:author/wwp:persName[1]', namespaces=ns):
pid = node.attrib.get('ref')
return pid.replace('personography.xml#','') # will need to replace 'p:'
def trid(tree): # this function will eventually need to call OT (.//wwp:publicationStmt//wwp:idno)
for node in tree.findall('.//wwp:sourceDesc',namespaces=ns):
trid = node.attrib.get('n')
return trid
content_file.write('pid' + '\t' + 'trid' + '\t' +'text' + '\n')
quotes_file.write('pid' + '\t' + 'trid' + '\t' + 'quotes' + '\n')
for file_name in list_of_files:
file = open(file_name, 'rt')
tree = ET.parse(file)
file.close()
pid = pid(tree)
trid = trid(tree)
content = content(tree)
quotes = quotes(tree)
content_file.write(pid + '\t' + trid + '\t' + content + '\n')
quotes_file.write(pid + '\t' + trid + '\t' + quotes + '\n')
content_file.close()
quotes_file.close()
You are overwriting your function calls with the values they returned. changing the function names should fix it.
import xml.etree.ElementTree as ET
import re
import glob
import csv
import sys
content_file = open('WWP Project/WWP_texts.txt','wt')
quotes_file = open('WWP Project/WWP_quotes.txt', 'wt')
list_of_files = glob.glob("../../../Documents/WWPtextbase/distribution/*.xml")
ns = {'wwp':'http://www.wwp.northeastern.edu/ns/textbase'}
def get_content(tree):
lines = ''.join(ET.tostring(tree.getroot(),encoding='unicode',method='text')).replace('\n',' ').replace('\t',' ').strip()
clean_lines = re.sub(' +',' ', lines)
return clean_lines.lower()
def get_quotes(tree):
quotes_list = []
for node in tree.findall('.//wwp:quote', namespaces=ns):
quote = ET.tostring(node,encoding='unicode',method='text')
clean_quote = re.sub(' +',' ', quote)
quotes_list.append(clean_quote)
return ' '.join(str(v) for v in quotes_list).replace('\t','').replace('\n','').lower()
def get_pid(tree):
for node in tree.findall('.//wwp:sourceDesc//wwp:author/wwp:persName[1]', namespaces=ns):
pid = node.attrib.get('ref')
return pid.replace('personography.xml#','') # will need to replace 'p:'
def get_trid(tree): # this function will eventually need to call OT (.//wwp:publicationStmt//wwp:idno)
for node in tree.findall('.//wwp:sourceDesc',namespaces=ns):
trid = node.attrib.get('n')
return trid
content_file.write('pid' + '\t' + 'trid' + '\t' +'text' + '\n')
quotes_file.write('pid' + '\t' + 'trid' + '\t' + 'quotes' + '\n')
for file_name in list_of_files:
file = open(file_name, 'rt')
tree = ET.parse(file)
file.close()
pid = get_pid(tree)
trid = get_trid(tree)
content = get_content(tree)
quotes = get_quotes(tree)
content_file.write(pid + '\t' + trid + '\t' + content + '\n')
quotes_file.write(pid + '\t' + trid + '\t' + quotes + '\n')
content_file.close()
quotes_file.close()
Related
I have output that I can write into a CSV. However, because of how i setup my XML to text, the output iterates itself incorrectly. I've tried a lot to fix my XML output, but I don't see any way to fix it.
I've tried a lot, including modifying my XML statements to trying to write to CSV in different ways, but I can't seem to get the rows to match up the way I need them to be, because of the the for in statements that have different depths.
I don't really care how it's done, so long as it matches up, because the data is ultimately fed into my SQL database.
Below is my code,
import os
import sys
import glob
import xml.etree.ElementTree as ET
firstFile = open("myfile.csv", "a")
firstFile.write("V-ID,")
firstFile.write("HostName,")
firstFile.write("Status,")
firstFile.write("Comments,")
firstFile.write("Finding Details,")
firstFile.write("STIG Name,")
basePath = os.path.dirname(os.path.realpath(__file__))
xmlFile = os.path.join(basePath, "C:\\Users\\myUserName\\Desktop\\Scripts\\Python\\XMLtest.xml")
tree = ET.parse(xmlFile)
root = tree.getroot()
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}title'):
d = child.text
for child in root:
for children in child.findall('{http://checklists.nist.gov/xccdf/1.2}target'):
b = children.text
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}Group'):
x = (str(child.attrib))
x = (x.split('_')[6])
a = x[:-2]
firstFile.write("\n" + a + ',')
for child in root:
for children in child:
for childrens in children.findall('{http://checklists.nist.gov/xccdf/1.2}result'):
x = childrens.text
if ('pass' in x):
c = 'Completed'
else:
c = 'Ongoing'
firstFile.write('\t' + '\n' + ',' + b + ',' + c + ',' + ',' + ',' + d)
firstFile.close()
below is my CSV current output,
below is the output I need,
try to change this
x = (x.split('_')[0])
Think of what your CSV output looks like, then think of what it SHOULD look like.
Your CSV is generated by these two loops:
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}Group'):
x = (str(child.attrib))
x = (x.split('_')[6])
a = x[:-2]
firstFile.write("\n" + a + ',')
for child in root:
for children in child:
for childrens in children.findall('{http://checklists.nist.gov/xccdf/1.2}result'):
x = childrens.text
if ('pass' in x):
c = 'Completed'
else:
c = 'Ongoing'
firstFile.write('\t' + '\n' + ',' + b + ',' + c + ',' + ',' + ',' + d)
That means that anything you add to your CSV file in the second loop will be written AFTER what was added in the first loop.
I can think of two ideas to solve this from the top of my head:
Somehow fuse the loops into one, thus generating each row in one loop iteration. I don't know if that works with your parameters, though.
Don't add to the end of the CSV file, but add to the row you want to add by specifically telling the "write" function where to write.
Disclaimer: I'm not familiar with Python at all, this is just the logical source of the problem and two solutions that should work in theory. I do not know if either of them is practicable.
This fixed it for me. I basically left it as is, put it into a CSV, then read the CSV, stored the columns as a list, removed blank spaces and exported it back out.
import os
import sys
import glob
import csv
import xml.etree.ElementTree as ET
firstFile = open("myfile.csv", "a")
path = 'C:\\Users\\JT\\Desktop\\Scripts\\Python\\xccdf\\'
for fileName in glob.glob(os.path.join(path, '*.xml')):
with open('C:\\Users\\JT\\Desktop\\Scripts\\Python\\myfile1.csv', 'w', newline='') as csvFile1:
csvWriter = csv.writer(csvFile1, delimiter=',')
# do your stuff
tree = ET.parse(fileName)
root = tree.getroot()
# Stig Title
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}title'):
d = child.text
# hostName
for child in root:
for children in child.findall('{http://checklists.nist.gov/xccdf/1.2}target'):
b = children.text
# V-ID
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}Group'):
x = (str(child.attrib))
x = (x.split('_')[6])
a = x[:-2]
firstFile.write(a + '\n')
# Status
for child in root:
for children in child:
for childrens in children.findall('{http://checklists.nist.gov/xccdf/1.2}result'):
x = childrens.text
firstFile.write(',' + b + ',' + x + ',' + ',' + ',' + d + '\n')
with open('C:\\Users\\JT\\Desktop\\Scripts\\Python\\myfile.csv', 'r') as csvFile:
csvReader = csv.reader(csvFile, delimiter=',')
vIDs = []
hostNames = []
status = []
stigTitles = []
for line in csvReader:
vID = line[0]
vIDs.append(vID)
try:
hostName = line[1]
hostNames.append(hostName)
except:
pass
try:
state = line[2]
status.append(state)
except:
pass
try:
stigTitle = line[5]
stigTitles.append(stigTitle)
except:
pass
with open('C:\\Users\\JT\\Desktop\\Scripts\\Python\\myfile1.csv', 'a', newline='') as csvFile1:
csvWriter = csv.writer(csvFile1, delimiter=',')
vIDMod = list(filter(None, vIDs))
hostNameMod = list(filter(None, hostNames))
statusMod = list(filter(None, status))
stigTitlesMod = list(filter(None, stigTitles))
csvWriter.writerows(zip(vIDMod, hostNameMod, statusMod, stigTitlesMod))
firstFile.close()
I able to write hostname in the /tmp/filter.log but any hint how can i write all three values[hostname, owner, seats] in the file?
def list_hosts(nc):
resp = nc.send_service_request('ListHosts', json.dumps({}))
result = resp['result']
l = []
f=open("/tmp/filter.log", "w+")
for r in result:
if "team-prod" in r['owner']:
print r['owner'], r['hostname'], r['seats']
f.write(r['hostname'] + "\n")
f.close()
l.append(r['hostname'])
return l
nc = create_client('zone', 'team_PROD_USERNAME', 'team_PROD_PASSWORD')
l = list_hosts(nc)
print l
The file should have entries as below:
team-prod\*, np-team-052, [u'123123123-18d1-483d-9af8-169ac66b26e4']
Current entry is:
np-team-052
f.write(str(r['owner']) + ', ' + str(r['hostname']) + ', ' + str(r['seats']) + '\n')
Please help to correct the python script to get the required output
I have written below code to convert csv to xml.
In input file have column from 1 to 278. In output file need to have tag from A1 to A278,
Code :
#!/usr/bin/python
import sys
import os
import csv
if len(sys.argv) != 2:
os._exit(1)
path=sys.argv[1] # get folder as a command line argument
os.chdir(path)
csvFiles = [f for f in os.listdir('.') if f.endswith('.csv') or f.endswith('.CSV')]
for csvFile in csvFiles:
xmlFile = csvFile[:-4] + '.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<TariffRecords>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = Tariff
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', '_')
else:
xmlData.write('<Tariff>' + "\n")
for i in range(len(tags)):
xmlData.write(' ' + '<' + tags[i] + '>' \
+ row[i] + '</' + tags[i] + '>' + "\n")
xmlData.write('</Tariff>' + "\n")
rowNum +=1
xmlData.write('</TariffRecords>' + "\n")
xmlData.close()
Getting below error from script:
Traceback (most recent call last):
File "ctox.py", line 20, in ?
tags = Tariff
NameError: name 'Tariff' is not defined
Sample Input file.(this is a sample record in actual input file will contain 278 columns).
If input file has two or three records, same needs to be appended in one XML file.
name,Tariff Summary,Record ID No.,Operator Name,Circle (Service Area),list
Prepaid Plan Voucher,test_All calls 2p/s,TT07PMPV0188,Ta Te,Gu,
Prepaid Plan Voucher,test_All calls 3p/s,TT07PMPV0189,Ta Te,HR,
Sample output file
The above two TariffRecords, tariff will be hard coded at the beginning and end of xml file.
<TariffRecords>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 2p/s</A2>
<A3>TT07PMPV0188</A3>
<A4>Ta Te</A4>
<A5>Gu</A5>
<A6></A6>
</Tariff>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 3p/s</A2>
<A3>TT07PMPV0189</A3>
<A4>Ta Te</A4>
<A5>HR</A5>
<A6></A6>
</Tariff>
</TariffRecords>
First off you need to replace
tags = Tariff
with
tags = row
Secondly you want to replace the write line to not write tags name but write A1, A2 etc..
Complete code:
import sys
import os
import csv
if len(sys.argv) != 2:
os._exit(1)
path=sys.argv[1] # get folder as a command line argument
os.chdir(path)
csvFiles = [f for f in os.listdir('.') if f.endswith('.csv') or f.endswith('.CSV')]
for csvFile in csvFiles:
xmlFile = csvFile[:-4] + '.xml'
csvData = csv.reader(open(csvFile))
xmlData = open(xmlFile, 'w')
xmlData.write('<?xml version="1.0"?>' + "\n")
# there must be only one top-level tag
xmlData.write('<TariffRecords>' + "\n")
rowNum = 0
for row in csvData:
if rowNum == 0:
tags = row
# replace spaces w/ underscores in tag names
for i in range(len(tags)):
tags[i] = tags[i].replace(' ', '_')
else:
xmlData.write('<Tariff>' + "\n")
for i, index in enumerate(range(len(tags))):
xmlData.write(' ' + '<' + 'A%s' % (index+1) + '>' \
+ row[i] + '</' + 'A%s' % (index+1) + '>' + "\n")
xmlData.write('</Tariff>' + "\n")
rowNum +=1
xmlData.write('</TariffRecords>' + "\n")
xmlData.close()
Output:
<?xml version="1.0"?>
<TariffRecords>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 2p/s</A2>
<A3>TT07PMPV0188</A3>
<A4>Ta Te</A4>
<A5>Gu</A5>
<A6></A6>
</Tariff>
<Tariff>
<A1>Prepaid Plan Voucher</A1>
<A2>test_All calls 3p/s</A2>
<A3>TT07PMPV0189</A3>
<A4>Ta Te</A4>
<A5>HR</A5>
<A6></A6>
</Tariff>
</TariffRecords>
import pandas as pd
from xml.etree import ElementTree as xml
df = pd.read_csv("file_path")
csv_data = df.values
root = xml.Element("TariffRecords")
tariff = xml.subelement("Tariff", root)
for index, data in enumarate(csv_data):
row = xml.Element("A"+str(index), tariff)
row.set(str(data))
This is the code. However, this code can only parse 4 characters of Arabian only. I want it to parse dynamically. So, the number of characters does not matter. Therefore, it can parse 1 character, 2 character or more based on the number of existing characters.
import xml.etree.ElementTree as ET
import os, glob
import csv
from time import time
#read xml path
xml_path = glob.glob('D:\1. Thesis FINISH!!!\*.xml')
#create file declaration for saving the result
file = open("parsing.csv","w")
#file = open("./%s" % ('parsing.csv'), 'w')
#create variable of starting time
t0=time()
#create file header
file.write('wordImage_id'+'|'+'paw1'+'|'+'paw2'+'|' + 'paw3' + '|' + 'paw4' + '|'+'font_size'+'|'+'font_style'+
'|'+'font_name'+'|'+'specs_effect'+'|'+'specs_height'+'|'+'specs_height'
+'|'+'specs_width'+'|'+'specs_encoding'+'|'+'generation_filtering'+
'|'+'generation_renderer'+'|'+'generation_type' + '\n')
for doc in xml_path:
print 'Reading file - ', os.path.basename(doc)
tree = ET.parse(doc)
#tree = ET.parse('D:\1. Thesis FINISH!!!\Image_14_AdvertisingBold_13.xml')
root = tree.getroot()
#get wordimage id
image_id = root.attrib['id']
#get paw 1 and paw 2
paw1 = root[0][0].text
paw2 = root[0][1].text
paw3 = root[0][2].text
paw4 = root[0][3].text
#get properties of font
for font in root.findall('font'):
size = font.get('size')
style = font.get('fontStyle')
name = font.get('name')
#get properties of specs
for specs in root.findall('specs'):
effect = specs.get('effect')
height = specs.get('height')
width = specs.get('width')
encoding = specs.get('encoding')
#get properties for generation
for generation in root.findall('generation'):
filtering = generation.get('filtering')
renderer = generation.get('renderer')
types = generation.get('type')
#save the result in csv
file.write(image_id + '|' + paw1 + '|' + paw2 + '|' + paw3 + '|' + paw4 + '|' + size + '|' +
style + '|' + name + '|' + effect + '|' + height + '|'
+ width + '|' + encoding + '|' + filtering + '|' + renderer + '|' + types + '\n')
#close the file
file.close()
#print time execution
print("process done in %0.3fs." % (time() - t0))
I've put together a tkinter form and python script for downloading files from an ftp site. The filenames are in the attribute table of a shapefile, as well as an overall Name that the filenames correspond too. In other words I look up a Name such as "CABOT" and download the filename 34092_18.tif. However, if a Name has an apostrophe, such as "O'KEAN", it's giving me trouble. I try to replace the apostrophe, like I've done in previous scripts, but it doesn't download anything....
whereExp = quadField + " = " + "'" + quadName.replace("'", '"') + "'"
quadFields = ["FILENAME"]
c = arcpy.da.SearchCursor(collarlessQuad, quadFields, whereExp)
for row in c:
tifFile = row[0]
tifName = quadName.replace("'", '') + '_' + tifFile
#fullUrl = ftpUrl + tifFile
local_filename = os.path.join(downloadDir, tifName)
lf = open(local_filename, "wb")
ftp.retrbinary('RETR ' + tifFile, lf.write)
lf.close()
Here is an example of a portion of a script that works fine by replacing the apostrophe....
where_clause = quadField + " = " + "'" + quad.replace("'", '"') + "'"
#out_quad = quad.replace("'", "") + ".shp"
arcpy.MakeFeatureLayer_management(quadTable, "quadLayer")
select_out_feature_class = arcpy.SelectLayerByAttribute_management("quadLayer", "NEW_SELECTION", where_clause)