Need help figuring out why my csv output is blank? - python

I wrote this code to create a .csv report from an .xml file, but when I open the .csv that's generated it's blank. Feel free to rip my code apart, by the way, I'm super new to this and want to learn!
There are multiple "Subjectkeys" in the xml, but only some have an "AuditRecord". I only want to pull ones with an audit record, and then for those, I want to pull their info from "SubjectData", "FormData" and "AuditRecord"
# Script from the question: meant to turn response.xml into query.csv.
# NOTE(review): the paste lost its indentation; the statements are kept verbatim.
import csv
import xml.etree.cElementTree as ET
tree = ET.parse("response.xml")
root = tree.getroot()
xml_data_to_csv =open("query.csv", 'w')
# Header accumulators, one per section of the XML.
AuditRecord_head = []
SubjectData_head = []
FormData_head = []
csvwriter=csv.writer(xml_data_to_csv)
# count is tested against 0 below, presumably so the header is written only once.
count=0
# findall() with a bare tag only matches direct children of root; if AuditRecord
# is nested deeper, nothing is iterated and no row is ever written.
for member in root.findall("AuditRecord"):
AuditRecord = []
Subjectdata = []
FormData = []
if count == 0:
# NOTE(review): in the sample XML SubjectKey is an attribute of SubjectData, not a
# child element, so find("SubjectKey") would return None here.
Subject = member.find("SubjectKey").tag
# NOTE(review): Subjectdata_head is never defined (the list above is SubjectData_head).
Subjectdata_head.append(Subject)
Form = member.find("p1Name").tag
FormData_head.append(Form)
Action = member.find("Action").tag
AuditRecord_head.append(Action)
# NOTE(review): Auditrecord_head is never defined (the list above is AuditRecord_head).
csvwriter.writerow(Auditrecord_head)
count = count + 1
Subject = member.find('SubjectKey').text
Subjectdata.append(Subject)
Form = member.find('p1Name').text
FormData.append(Form)
Action = member.find("Action").text
AuditRecord.append(Action)
# Only the Subjectdata list is written; FormData and AuditRecord are collected but unused.
csvwriter.writerow(Subjectdata)
xml_data_to_csv.close()
I expect the output to be a table with column headings: Subject, Form, Action.
Here is sample .xml:
</ClinicalData>
<ClinicalData StudyOID="SMK-869-002" MetaDataVersionOID="2.0">
<SubjectData SubjectKey="865-015">
</AuditRecord>
</FormData>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="0"/>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="1">
<p1:QueryAction InitialComment="Please enter start date for condition" UserType="User" UserOID="bailey#protocolfirst.com" Action="query" DateTimeStamp="2019-07-12T14:08:43.893Z"/>
</AuditRecord>

First of all, your XML file is not well-formed. I think it should look like this:
<?xml version="1.0"?>
<root xmlns:p1="http://some-url.com">
<ClinicalData StudyOID="SMK-869-002" MetaDataVersionOID="2.0"></ClinicalData>
<SubjectData SubjectKey="865-015"></SubjectData>
<AuditRecord>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="0"/>
<FormData p1:Name="Medical History" p1:Started="Y" FormOID="mh" FormRepeatKey="1"/>
<p1:QueryAction InitialComment="Please enter start date for condition" UserType="User" UserOID="bailey#protocolfirst.com" Action="query" DateTimeStamp="2019-07-12T14:08:43.893Z"/>
</AuditRecord>
</root>
ElementTree always expects only a single root node, and a well-formed document.
I don't fully understand what you're trying to do, but I hope this helps:
# Write one CSV row per AuditRecord that sits directly under the document root:
# SubjectKey, every FormData name in that record, then the QueryAction's Action.
import csv
import xml.etree.ElementTree as ET  # cElementTree was removed in Python 3.9

# Namespace URI bound to the "p1" prefix in the sample document, in the
# "{uri}" form that ElementTree uses for qualified names.
P1_NS = "{http://some-url.com}"

tree = ET.parse("response.xml")
root = tree.getroot()

# SubjectData is a sibling of AuditRecord in the sample layout, so read it once.
subject_key = root.find("./SubjectData").attrib["SubjectKey"]

# newline="" lets the csv module control the line endings itself.
with open("query.csv", "w", newline="") as xml_data_to_csv:
    csvwriter = csv.writer(xml_data_to_csv)
    for member in root.findall("AuditRecord"):
        # Look inside this record only, and keep every FormData it contains
        # instead of assuming there are exactly two.
        form_names = [
            form.attrib[P1_NS + "Name"] for form in member.findall("FormData")
        ]
        action = member.find(P1_NS + "QueryAction").attrib["Action"]
        csvwriter.writerow([subject_key] + form_names + [action])
This will produce a csv file with the following content:
865-015,Medical History,Medical History,query

Related

Adding \n after every element in an XML python

I'm making an automation that takes an existing XML file, changes one of the values and overwrites the file.
My main problem is that it's important for me to keep the formatting of the original file, and I can't manage to do that: the new data has no line breaks and it's all just one long line.
<StationConfig StationId="8706" SportType="null" StationType="null" UseMetricSystem="US" LocalTempStorageDrive="C:\" LocalStorageDrive="C:\">
<ClubManagementEnable ClubManagementStaticHours="">false</ClubManagementEnable>
</StationConfig>
My code is:
# Load the existing station config and take its root element.
parser = etree.XMLParser()
read = etree.parse("C:\StationConfig.xml", parser=parser).getroot()
# Overwrite the StationId attribute on the root element.
read.set("StationId", "8706")
# Wrap the modified root in a tree and write it to a new file.
# NOTE(review): assuming etree is lxml.etree, its FAQ says pretty_print only
# re-indents when the parsed tree holds no whitespace-only text nodes
# (e.g. XMLParser(remove_blank_text=True)) — confirm.
tree = etree.ElementTree(read)
tree.write("C:\devtree\Station.xml", pretty_print=True)
How can I add a \n after each element?
Thanks!
As far as I understand, the below is what you are looking for.
import xml.etree.ElementTree as ET
# Sample document, inlined so the example does not need the file on disk.
# NOTE: this is not a raw string, so Python reads each \" below as a plain ",
# which is why the drive attributes come out as "C:" in out.xml.
xml = '''<?xml version="1.0" encoding="UTF-8"?>
<StationConfig StationId="8706" SportType="null" StationType="null" UseMetricSystem="US" LocalTempStorageDrive="C:\" LocalStorageDrive="C:\">
<ClubManagementEnable ClubManagementStaticHours="">false</ClubManagementEnable>
</StationConfig>'''
def _pretty_print(current, parent=None, index=-1, depth=0):
    """Indent an element subtree in place using newlines and tabs.

    The whitespace is stored in the ``text`` of every element that has
    children and in the ``tail`` of every element below the starting one,
    overwriting whatever those attributes held before.

    Args:
        current: Element whose subtree is being indented.
        parent: Parent of ``current``; ``None`` for the top-level call.
        index: Position of ``current`` among ``parent``'s children.
        depth: Nesting depth of ``current`` (0 for the starting element).
    """
    # Children first, so their whitespace is in place before this node's.
    for i, node in enumerate(current):
        _pretty_print(node, current, i, depth + 1)
    if parent is None:
        return
    indent = '\n' + '\t' * depth
    if index == 0:
        # First child: the indentation goes right after the parent's start tag.
        parent.text = indent
    else:
        # Later children: the indentation follows the previous sibling.
        parent[index - 1].tail = indent
    if index == len(parent) - 1:
        # Last child: step back out so the parent's end tag lines up.
        current.tail = '\n' + '\t' * (depth - 1)
# Parse the sample document into its root element.
root = ET.fromstring(xml)
# change something in the xml
root.attrib['StationId'] = '123'
# Add the newline/tab whitespace in place, then save the tree to disk.
_pretty_print(root)
tree = ET.ElementTree(root)
tree.write("out.xml")
out.xml below
<StationConfig StationId="123" SportType="null" StationType="null" UseMetricSystem="US" LocalTempStorageDrive="C:" LocalStorageDrive="C:">
<ClubManagementEnable ClubManagementStaticHours="">false</ClubManagementEnable>
</StationConfig>

split() issues with pdf extractText()

I'm working on a small content-analysis program that I hope to run over several PDF files, returning how often certain words are mentioned in the text. The words to search for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when it is run on a .pdf file. To illustrate, the test text that I run the program on is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words grouped in categories are the following (marked in .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following:
Whereas the output from running it with a .pdf is the following:
As you can see, my issue is with how the words are split: in the .pdf output a string like "world" can be split into 'w','o','rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to where I can find an answer to why this happens, should you know of a source.
Thanks
The code for the .txt is as follows:
# Script from the question (.txt variant): counts, per category, how many
# whitespace-separated tokens of a report match the patterns listed in list.txt.
# NOTE(review): the paste lost its indentation; the statements are kept verbatim.
import string, re, os
import PyPDF2
# Read every line of the category/pattern list up front.
dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}  # compiled pattern -> category name
scores = {}  # category name -> match count
# The loop bound admits only i == 2011, i.e. a single report.
i = 2011
while i < 2012:
f = 'annual_report_' + str(i) +'.txt'
textfile = open(f)
text = textfile.read().split() # split on whitespace into tokens; nothing is lowercased here
print (text)
textfile.close()
i = i + 1
# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0
# import the dictionary
for line in lines:
# A line starting with ">>" opens a new category; any other non-empty line
# is compiled as a case-insensitive regular expression in the current category.
if line[0:2] == '>>':
current_category = line[2:].strip()
scores[current_category] = 0
else:
line = line.strip()
if len(line) > 0:
pattern = re.compile(line, re.IGNORECASE)
dic[pattern] = current_category
# examine the text
for token in text:
for pattern in dic.keys():
# match() only matches at the start of the token.
if pattern.match( token ):
categ = dic[pattern]
scores[categ] = scores[categ] + 1
print (os.path.basename(f))
for key in scores.keys():
print (key, ":", scores[key])
While the code for the .pdf is as follows:
# Script from the question (.pdf variant): same scoring as the .txt version, but
# the tokens come from PyPDF2's extractText().
# NOTE(review): the paste lost its indentation; the statements are kept verbatim.
import string, re, os
import PyPDF2
# Read every line of the category/pattern list up front.
dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}  # compiled pattern -> category name
scores = {}  # category name -> match count
# The loop bound admits only i == 2011, i.e. a single report.
i = 2011
while i < 2012:
f = 'annual_report_' + str(i) +'.pdf'
textfile = open(f, 'rb')
text = PyPDF2.PdfFileReader(textfile)# PDF reader object; nothing is lowercased here
for pageNum in range(0, text.numPages):
texts = text.getPage(pageNum)
# NOTE(review): textfile is rebound from the open file handle to this page's token
# list, so the handle is never closed in the visible code, and each page replaces
# the previous page's tokens unless the scoring below is nested in this loop — confirm.
textfile = texts.extractText().split()
print (textfile)
i = i + 1
# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0
# import the dictionary
for line in lines:
# A line starting with ">>" opens a new category; any other non-empty line
# is compiled as a case-insensitive regular expression in the current category.
if line[0:2] == '>>':
current_category = line[2:].strip()
scores[current_category] = 0
else:
line = line.strip()
if len(line) > 0:
pattern = re.compile(line, re.IGNORECASE)
dic[pattern] = current_category
# examine the text
for token in textfile:
for pattern in dic.keys():
# match() only matches at the start of the token.
if pattern.match( token ):
categ = dic[pattern]
scores[categ] = scores[categ] + 1
print (os.path.basename(f))
for key in scores.keys():
print (key, ":", scores[key])

Define an XML schema from a txt file With Python script and store it in a single XML file

I am trying to write an xml schema using python from a .txt file. I tried the following code but it didn't read the values in lines of the text.
The data is like:
# File format is Team:Player:Salary:Position
New York Yankees :"Acevedo Juan" : 900000: Pitcher
New York Yankees :"Anderson Jason": 300000: Pitcher
............
and the code:
# Script from the question: builds one <playerdata> element per input line.
# NOTE(review): the paste lost its indentation; the statements are kept verbatim.
import re
import xml.etree.ElementTree as ET
root = ET.Element('root')
root.text = '\n' # newline before the celldata element
f = open("C:/baseball.txt")
lines = f.readlines()
for l in lines:
# Fields are colon-separated; only lines with exactly four fields are used.
# NOTE(review): the "# File format is Team:Player:Salary:Position" line shown in
# the sample data also splits into four fields — confirm whether it is in the file.
elems = l.split(":")
if len(elems) == 4:
# Strip surrounding whitespace from each field.
elems = map(lambda x: x.strip(), elems)
playerdata = ET.SubElement(root, "playerdata")
playerdata.text = '\n'
playerdata.tail = '\n\n'
# The four sub-elements are created but never given a .text value, which is
# why the output shows empty <team />, <player />, <salary />, <position /> tags.
team = ET.SubElement(playerdata, "team")
player = ET.SubElement(playerdata, "player")
salary = ET.SubElement(playerdata, "salary")
position = ET.SubElement(playerdata, "position")
ET.dump(root)
tree = ET.ElementTree(root)
tree.write("test1.xml", encoding='utf-8', xml_declaration=True)
The output I got looks like this:
<root>
<playerdata>
<team /><player /><salary /><position /></playerdata>
<playerdata>
<team /><player /><salary /><position /></playerdata>
<playerdata>
<team /><player /><salary /><position /></playerdata>
.
.
.
<playerdata>
<team /><player /><salary /><position /></playerdata>
</root>
A few changes, plus the fix. The changes are:
Using a context manager to ensure the input file gets closed
correctly.
Using the fact that iterating a file yields each line in
turn (rather than f.readlines()).
Combining the splitting and stripping of the parts of the line into one statement.
The fix is to set the text value for the subelements just after they are created.
# Build <root> with one <playerdata> element per "Team:Player:Salary:Position"
# line of the input file, then print it and save it as test1.xml.
import xml.etree.ElementTree as ET

root = ET.Element('root')
root.text = '\n'  # newline before the first playerdata element

with open("C:/baseball.txt") as f:
    for line in f:
        # The "# File format is Team:Player:Salary:Position" header also splits
        # into four fields, so comment lines must be skipped explicitly.
        if line.lstrip().startswith('#'):
            continue
        elems = [part.strip() for part in line.split(":")]
        if len(elems) != 4:
            continue
        playerdata = ET.SubElement(root, "playerdata")
        playerdata.text = '\n'
        playerdata.tail = '\n\n'
        # Setting .text right after creation is what fills the tags.
        for tag, value in zip(("team", "player", "salary", "position"), elems):
            ET.SubElement(playerdata, tag).text = value

ET.dump(root)
tree = ET.ElementTree(root)
tree.write("test1.xml", encoding='utf-8', xml_declaration=True)

python: getting multiple results for getElementsByTagName

I'm trying to get each instance of an XML tag but I can only seem to return one or none.
#!/usr/software/bin/python
# Script from the question (Python 2): fetches an XML report over HTTP and tries
# to print the asup_gen_date and sys_version values.
# import libraries
import urllib
from xml.dom.minidom import parseString
# variables
startdate = "2014-01-01"
enddate = "2014-05-01"
rest_client = "test"
rest_host = "restprd.test.com"
rest_port = "80"
rest_base_url = "asup-rest-interface/ASUP_DATA"
rest_date = "/start_date/%s/end_date/%s/limit/5000" % (startdate,enddate)
# Base URL: http://<host>:<port>/<base>/client_id/<client>
rest_api = "http://" + rest_host + ":" + rest_port + "/" + rest_base_url + "/" + "client_id" + "/" + rest_client
response = urllib.urlopen(rest_api + rest_date + '/sys_serial_no/700000667725')
data = response.read()
response.close()
dom = parseString(data)
# getElementsByTagName() returns a NodeList; calling .toxml() on the list itself
# raises the AttributeError shown in the traceback.
xmlVer = dom.getElementsByTagName('sys_version').toxml()
xmlDate = dom.getElementsByTagName('asup_gen_date').toxml()
# Strip the surrounding tags by plain string replacement.
xmlVerTag=xmlVer.replace('<sys_version>','').replace('</sys_version>','')
# Also swaps the 'T' for a space and drops the last six characters
# (presumably the UTC offset such as "-04:00").
xmlDateTag=xmlDate.replace('<asup_gen_date>','').replace('</asup_gen_date>','').replace('T',' ')[0:-6]
print xmlDateTag , xmlVerTag
The above code generates the following error:
Traceback (most recent call last):
File "./test.py", line 23, in <module>
xmlVer = dom.getElementsByTagName('sys_version').toxml()
AttributeError: 'NodeList' object has no attribute 'toxml'
If I change the .toxml() to [0].toxml() I can get the first element, but I need to get all the elements. Any ideas?
Also, if I try something like this I get no output at all:
response = urllib.urlopen(rest_api + rest_date + '/sys_serial_no/700000667725')
# Parse straight from the open response object.
DOMTree = xml.dom.minidom.parse(response)
collection = DOMTree.documentElement
# hasAttribute() tests for an XML attribute on the document element; in the sample
# data "results" is a child element, so this condition is presumably never true.
if collection.hasAttribute("results"):
print collection.getAttribute("sys_version")
The original data looks like this.
There are repeating sections of XML like this:
<xml><status request_id="58f39198-2c76-4e87-8e00-f7dd7e69519f1416354337206" response_time="00:00:00:833"></status><results start="1" limit="1000" total_results_count="1" results_count="1"><br/><system><tests start="1" limit="50" total_results_count="18" results_count="18"><test> <biz_key>C|BF02F1A3-3C4E-11DC-8AAE-0015171BBD90|8594169899|700000667725</biz_key><test_id>2014071922090465</test_id><test_subject>HA Group Notification (WEEKLY_LOG) INFO</test_subject><test_type>DOT-REGULAR</test_type><asup_gen_date>2014-07-20T00:21:40-04:00</asup_gen_date><test_received_date>Sat Jul 19 22:09:19 PDT 2014</test_received_date><test_gen_zone>EDT</test_gen_zone><test_is_minimal>false</test_is_minimal><sys_version>9.2.2X22</sys_version><sys_operating_mode>Cluster-Mode</sys_operating_mode><hostname>rerfdsgt</hostname><sys_domain>test.com</sys_domain><cluster_name>bbrtp</cluster_name> ... etc
<xml>
<results>
<system>
-<sys_version>
<asup>
-<asup_gen_date>
I simply want to extract the sys_version and asup_gen_date
9.2.2X22 2014-07-20 00:21:40
9.2.2X21 2014-06-31 12:51:40
8.5.2X1 2014-07-20 04:33:22
You need to loop over the results of getElementsByTagName():
# Each item of the NodeList is an Element, and an Element does have toxml().
for version in dom.getElementsByTagName('sys_version'):
    markup = version.toxml()
    # Strip the surrounding tags to leave only the text content.
    print(markup.replace('<sys_version>', '').replace('</sys_version>', ''))
Also, instead of replacing the opening and closing tags, you probably want to use getText():
def getText(nodelist):
    """Return the concatenated data of the text nodes in *nodelist*.

    Only nodes whose ``nodeType`` is ``TEXT_NODE`` contribute; element
    children are skipped rather than descended into. An empty *nodelist*
    yields an empty string.
    """
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(node.data)
    return ''.join(rc)
# Print the text content of every sys_version element in the document.
for version in dom.getElementsByTagName('sys_version'):
    print(getText(version.childNodes))
Another point: it is much easier and more pleasant to parse XML with xml.etree.ElementTree, for example:
import xml.etree.ElementTree as ET

tree = ET.parse(response)
root = tree.getroot()
# findall('sys_version') would only look at direct children of the root; the
# sample nests sys_version several levels down, so walk the whole tree instead.
for version in root.iter('sys_version'):
    print(version.text)

python, fetch sequence from DAS by coordinates

I'm using the UCSC DAS server, which returns DNA sequences by coordinate.
URL: http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr20:30037432,30038060
sample file:
<DASDNA>
<SEQUENCE id="chr20" start="30037832" stop="30038060" version="1.00">
<DNA length="229">
gtggcacccaaagatgctggaatctttatggcaaatgccgttacagatgc
tccaagaaggaaagagtctatgtttactgcataaataataaaatgtgctg
cgtgaagcccaagtaccagccaaaagaaaggtggtggccattttaactgc
tttgaagcctgaagccatgaaaatgcagatgaagctcccagtggattccc
acactctatcaataaacacctctggctga
</DNA>
</SEQUENCE>
</DASDNA>
what I want is this part:
gtggcacccaaagatgctggaatctttatggcaaatgccgttacagatgc
tccaagaaggaaagagtctatgtttactgcataaataataaaatgtgctg
cgtgaagcccaagtaccagccaaaagaaaggtggtggccattttaactgc
tttgaagcctgaagccatgaaaatgcagatgaagctcccagtggattccc
acactctatcaataaacacctctggctga
I want to get the sequence part from thousands of URLs of this kind. How should I do it?
I tried writing the data to a file and parsing the file, which worked OK, but is there any way to parse the XML-like string directly? I tried some examples from other posts, but they didn't work.
Here, I added my solution. Thanks to the 2 answers below.
Solution 1:
def getSequence2(chromosome, start, end):
    """Fetch a DNA sequence from the UCSC DAS server via etree and XPath.

    Args:
        chromosome: Chromosome name as used in the segment, e.g. ``'chr20'``.
        start: Start coordinate of the segment.
        end: End coordinate of the segment.

    Returns:
        The text of the first ``SEQUENCE/DNA`` node with newlines removed,
        or a fixed "does not exist" message when no such text is present.
    """
    base = 'http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment='
    url = base + chromosome + ':' + str(start) + ',' + str(end)
    doc = etree.parse(url, parser=etree.XMLParser())
    # A parsed tree never equals '', so test the XPath result itself; this
    # also avoids an IndexError when the response carries no DNA text.
    fragments = doc.xpath('SEQUENCE/DNA/text()')
    if fragments:
        sequence = fragments[0].replace('\n', '')
    else:
        sequence = 'THE SEQUENCE DOES NOT EXIST FOR GIVEN COORDINATES'
    return sequence
Solution 2:
def getSequence1(chromosome, start, end):
    """Fetch a DNA sequence from the UCSC DAS server via urllib2 and minidom.

    The response is parsed in memory rather than through a temporary file.

    Args:
        chromosome: Chromosome name as used in the segment, e.g. ``'chr20'``.
        start: Start coordinate of the segment.
        end: End coordinate of the segment.

    Returns:
        The text of the first ``DNA`` element with newlines removed, or a
        fixed "does not exist" message when the response is empty or has no
        ``DNA`` text.
    """
    # Local import keeps the function self-contained.
    from xml.dom.minidom import parseString

    base = 'http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment='
    url = base + chromosome + ':' + str(start) + ',' + str(end)
    payload = urllib2.urlopen(url).read()
    sequence = 'THE SEQUENCE DOES NOT EXIST FOR GIVEN COORDINATES'
    if payload:
        dna_nodes = parseString(payload).getElementsByTagName('DNA')
        # Guard against a response without a DNA element or with an empty one.
        if dna_nodes and dna_nodes[0].firstChild is not None:
            sequence = dna_nodes[0].firstChild.nodeValue.replace('\n', '')
    return sequence
Of course they will need to import some necessary libraries.
>>> from lxml import etree
>>> doc = etree.parse("http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr20:30037432,30038060",parser=etree.XMLParser())
>>> doc.xpath('SEQUENCE/DNA/text()')
['\natagtggcacatgtctgttgtcctagctcctcggggaaactcaggtggga\ngagtcccttgaactgggaggaggaggtttgcagtgagccagaatcattcc\nactgtactccagcctaggtgacagagcaagactcatctcaaaaaaaaaaa\naaaaaaaaaaaaaagacaatccgcacacataaaggctttattcagctgat\ngtaccaaggtcactctctcagtcaaaggtgggaagcaaaaaaacagagta\naaggaaaaacagtgatagatgaaaagagtcaaaggcaagggaaacaaggg\naccttctatctcatctgtttccattcttttacagacctttcaaatccgga\ngcctacttgttaggactgatactgtctcccttctttctgctttgtgtcag\ngtggcacccaaagatgctggaatctttatggcaaatgccgttacagatgc\ntccaagaaggaaagagtctatgtttactgcataaataataaaatgtgctg\ncgtgaagcccaagtaccagccaaaagaaaggtggtggccattttaactgc\ntttgaagcctgaagccatgaaaatgcagatgaagctcccagtggattccc\nacactctatcaataaacacctctggctga\n']
Use a Python XML parsing library like lxml, load the XML file with that parser, and then use a selector (e.g. using XPath) to grab the node/element that you need.

Categories