I'm trying to convert a JSON file to XML using a small python script, but for some reason the loop only seems to read the first line of the JSON file.
from xml.dom import minidom
from json import JSONDecoder
import json
import sys
import csv
import os
import re
import dicttoxml
from datetime import datetime, timedelta
from functools import partial
reload(sys)
sys.setdefaultencoding('utf-8')
nav = 'navigation_items.bson.json'
df = 'testxmloutput.txt'
def json2xml(json_obj, line_padding=""):
result_list = list()
json_obj_type = type(json_obj)
if json_obj_type is list:
for sub_elem in json_obj:
result_list.append(json2xml(sub_elem, line_padding))
return "\n".join(result_list)
if json_obj_type is dict:
for tag_name in json_obj:
sub_obj = json_obj[tag_name]
result_list.append("%s<%s>" % (line_padding, tag_name))
result_list.append(json2xml(sub_obj, "\t" + line_padding))
result_list.append("%s</%s>" % (line_padding, tag_name))
return "\n".join(result_list)
return "%s%s" % (line_padding, json_obj)
def json_parse(fileobj, decoder=JSONDecoder(), buffersize=2048):
buffer = ''
for chunk in iter(partial(fileobj.read, buffersize), ''):
buffer += chunk
while buffer:
try:
result, index = decoder.raw_decode(buffer)
yield result
buffer = buffer[index:]
except ValueError:
# Not enough data to decode, read more
break
def converter(data):
f = open(df,'w')
data = open(nav)
for line in json_parse(data):
f.write(dicttoxml.dicttoxml(line, attr_type=False))
f.close()
converter(nav)
I was under the assumption that the iter would read the first line to memory and move on to the next. The converted output looks great but im too sure where to look on how to get it to loop through to the next line in file.
Try json.load to load the file into a dict and then iterate the dict for your output.
import sys
import json
json_file = sys.argv[1]
data = {}
with open(json_file) as data_file:
data = json.load(data_file)
for key in data:
#do your things
Related
I am trying to get the reverse sequences orientated correctly in a file. This is the code:
import os
import sys import pysam
from Bio import SeqIO, Seq, SeqRecord
def main(in_file):
out_file = "%s.fa" % os.path.splitext(in_file)[0]
with open(out_file, "w") as out_handle:
# Write records from the BAM file one at a time to the output file.
# Works lazily as BAM sequences are read so will handle large files.
SeqIO.write(bam_to_rec(in_file), out_handle, "fasta")
def bam_to_rec(in_file):
"""Generator to convert BAM files into Biopython SeqRecords.
"""
bam_file = pysam.Samfile(in_file, "rb")
for read in bam_file:
seq = Seq.Seq(read.seq)
if read.is_reverse:
seq = seq.reverse_complement()
rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
yield rec
if __name__ == "__main__":
main(*sys.argv[1:])`
When I print out the reverse sequences, the code works. But when in the file it is printed out as a reverse sequence. Can anyone help me to find out what is going wrong?
Here is the link to my infile:
https://www.dropbox.com/sh/68ui8l7nh5fxatm/AABUr82l01qT1nL8I_XgJaeTa?dl=0
Note the ugly counter is just to print 10000 sequences, not more.
comparing one without ever reversing with one that reverses if needed
Here's the output on a couple of seqs, feel free to test it, I think your issue is that yield returns an iterator but you are not iterating it, unless I am missunderstanding what you are doing:
Original:
SOLEXA-1GA-2:2:93:1281:961#0
GGGTTAGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG
Becomes:
SOLEXA-1GA-2:2:93:1281:961#0
CTAACCCTAACCCTAACCCTAACCCTAACCTAACCC
And if not reverse:
Original:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Becomes:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Here's my code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord
def main(in_file):
out_file = "%s.fa" % os.path.splitext(in_file)[0]
with open('test_non_reverse.txt', 'w') as non_reverse:
with open(out_file, "w") as out_handle:
# Write records from the BAM file one at a time to the output file.
# Works lazily as BAM sequences are read so will handle large files.
i = 0
for s in bam_to_rec(in_file):
if i == 10000:
break
i +=1
SeqIO.write(s, out_handle, "fasta")
i = 0
for s in convert_to_seq(in_file):
if i == 10000:
break
i +=1
SeqIO.write(s, non_reverse, 'fasta')
def convert_to_seq(in_file):
bam_file = pysam.Samfile(in_file, "rb")
for read in bam_file:
seq = Seq.Seq(read.seq)
rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
yield rec
def bam_to_rec(in_file):
"""Generator to convert BAM files into Biopython SeqRecords.
"""
bam_file = pysam.Samfile(in_file, "rb")
for read in bam_file:
seq = Seq.Seq(read.seq)
if read.is_reverse:
seq = seq.reverse_complement()
rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
yield rec
if __name__ == "__main__":
main(*sys.argv[1:])
I have below code to write a list items as csv file. But while doing that, I see special character  is occurring in the output csv file. For testing in the local, I have defined a list with character '\xao' included in the list items and tested various ways to remove that and replace with space. But, I still get the special character in the csv output. Can anyone help?
import csv
from flask import make_response
import StringIO
csv_list = [['hfhf\xa0 fsdg','dsf'],['fsdgs fsdfs','fsdfsd'],['dsf\xa0 sf','asfg']]
def download_csv_summary(csv_list):
si = StringIO.StringIO()
cw = csv.writer(si)
filename = 'Test'
cw.writerows(csv_list)
output = make_response(si.getvalue())
output.headers['Content-Disposition'] = \
'attachment; filename={filename}.csv'.format(filename=filename)
output.headers['Content-Type'] = 'text/csv'
return output
The writerows line needs to be changed to replace the \xa0s:
import csv
from flask import make_response
import StringIO
csv_list = [
['hfhf\xa0 fsdg','dsf'],
['fsdgs fsdfs','fsdfsd'],
['dsf\xa0 sf','asfg']
]
def download_csv_summary(csv_list):
si = StringIO.StringIO()
cw = csv.writer(si)
filename = 'Test'
cw.writerows([[str(x).replace('\xa0', '') for x in l] for l in csv_list])
output = make_response(si.getvalue())
output.headers['Content-Disposition'] = \
'attachment; filename={filename}.csv'.format(filename=filename)
output.headers['Content-Type'] = 'text/csv'
return output
I'm have a csv file which contains hundred thousands of rows and below are some sample lines..,
1,Ni,23,28-02-2015 12:22:33.2212-02
2,Fi,21,28-02-2015 12:22:34.3212-02
3,Us,33,30-03-2015 12:23:35-01
4,Uk,34,31-03-2015 12:24:36.332211-02
I need to get the last column of csv data which is in wrong datetime format. So I need to get default datetimeformat("YYYY-MM-DD hh:mm:ss[.nnn]") from last column of the data.
I have tried the following script to get lines from it and write into flow file.
import json
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
class PyStreamCallback(StreamCallback):
def __init__(self):
pass
def process(self, inputStream, outputStream):
text = IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
for line in text[1:]:
outputStream.write(line + "\n")
flowFile = session.get()
if (flowFile != None):
flowFile = session.write(flowFile,PyStreamCallback())
flowFile = session.putAttribute(flowFile, "filename", flowFile.getAttribute('filename'))
session.transfer(flowFile, REL_SUCCESS)
but I am not able to find a way to convert it like below output.
1,Ni,23,28-02-2015 12:22:33.221
2,Fi,21,29-02-2015 12:22:34.321
3,Us,33,30-03-2015 12:23:35
4,Uk,34,31-03-2015 12:24:36.332
I have checked solutions with my friend(google) and was still not able to find solution.
Can anyone guide me to convert those input data into my required output?
In this transformation the unnecessary data located at the end of each line, so it's really easy to manage transform task with regular expression.
^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?
Check the regular expression and explanation here:
https://regex101.com/r/sAB4SA/2
As soon as you have a large file - better not to load it into the memory. The following code loads whole the file into the memory:
IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
Better to iterate line by line.
So this code is for ExecuteScript nifi processor with python (Jython) language:
import sys
import re
import traceback
from org.apache.commons.io import IOUtils
from org.apache.nifi.processor.io import StreamCallback
from org.python.core.util import StringUtil
from java.lang import Class
from java.io import BufferedReader
from java.io import InputStreamReader
from java.io import OutputStreamWriter
class TransformCallback(StreamCallback):
def __init__(self):
pass
def process(self, inputStream, outputStream):
try:
writer = OutputStreamWriter(outputStream,"UTF-8")
reader = BufferedReader(InputStreamReader(inputStream,"UTF-8"))
line = reader.readLine()
p = re.compile('^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?')
while line!= None:
# print line
match = p.search(line)
writer.write( match.group(1) + (match.group(3) if match.group(3)!=None else '') )
writer.write('\n')
line = reader.readLine()
writer.flush()
writer.close()
reader.close()
except:
traceback.print_exc(file=sys.stdout)
raise
flowFile = session.get()
if flowFile != None:
flowFile = session.write(flowFile, TransformCallback())
# Finish by transferring the FlowFile to an output relationship
session.transfer(flowFile, REL_SUCCESS)
And as soon as question is about nifi, here are alternatives that seems to be easier
the same code as above but in groovy for nifi ExecuteScript processor:
def ff = session.get()
if(!ff)return
ff = session.write(ff, {rawIn, rawOut->
// ## transform streams into reader and writer
rawIn.withReader("UTF-8"){reader->
rawOut.withWriter("UTF-8"){writer->
reader.eachLine{line, lineNum->
if(lineNum>1) { // # skip the first line
// ## let use regular expression to transform each line
writer << line.replaceAll( /^(.*:\d\d)((\.\d{1,3})(\d*))?(-\d\d)?/ , '$1$3' ) << '\n'
}
}
}
}
} as StreamCallback)
session.transfer(ff, REL_SUCCESS)
ReplaceText processor
And if regular expression is ok - the easiest way in nifi is a ReplaceText processor that could do regular expression replace line-by-line.
In this case you don't need to write any code, just build the regular expression and configure your processor correctly.
Just using pure jython. It is an example that can be adapted to OP's needs.
Define a datetime parser for this csv file
from datetime import datetime
def parse_datetime(dtstr):
mydatestr='-'.join(dtstr.split('-')[:-1])
try:
return datetime.strptime(mydatestr,'%d-%m-%Y %H:%M:%S.%f').strftime('%d-%m-%Y %H:%M:%S.%f')[:-3]
except ValueError:
return datetime.strptime(mydatestr,'%d-%m-%Y %H:%M:%S').strftime('%d-%m-%Y %H:%M:%S')
my test.csv includes data like this: ( 2015 didnt have 29 Feb had to change OP's example ).
1,Ni,23,27-02-2015 12:22:33.2212-02
2,Fi,21,28-02-2015 12:22:34.3212-02
3,Us,33,30-03-2015 12:23:35-01
4,Uk,34,31-03-2015 12:24:36.332211-02
now the solution
with open('test.csv') as fi:
for line in fi:
line_split=line.split(',')
out_line = ', '.join(word if i<3 else parse_datetime(word) for i,word in enumerate(line_split))
#print(out_line)
#you can write this out_line to a file here.
printing out_line looks like this
1, Ni, 23, 27-02-2015 12:22:33.221
2, Fi, 21, 28-02-2015 12:22:34.321
3, Us, 33, 30-03-2015 12:23:35
4, Uk, 34, 31-03-2015 12:24:36.332
You can get them with regex :
(\d\d-\d\d-\d\d\d\d\ \d\d:\d\d:)(\d+(?:\.\d+)*)(-\d\d)$
Then just replace #2 with a rounded version of #2
See regex example at regexr.com
You could even do it "nicer" by getting every single value with a capturing group and then put them into a datetime.datetime object and print it from there, but imho that would be an overkill in maintainability and loose you too much performance.
Code had no possibility to test
import re
...
pattern = '^(.{25})(\d+(?:\.\d+)*)(-\d\d)$' //used offset for simplicity
....
for line in text[1:]:
match = re.search(pattern, line)
line = match.group(1) + round(match.group(2),3) + match.group(3)
outputStream.write(line + "\n")
My specs:
Python 3.4.3
Windows 7
IDE is Jupyter Notebooks
What I have referenced:
how-to-properly-escape-single-and-double-quotes
python-escaping-strings-for-use-in-xml
escaping-characters-in-a-xml-file-with-python
Here is the data and script, respectively, below (I have tried variations on serializing Column 'E' using both Sax and ElementTree):
Data
A,B,C,D,E,F,G,H,I,J
"3","8","1","<Request TransactionID="3" RequestType="FOO"><InstitutionISO /><CallID>23</CallID><MemberID>12</MemberID><MemberPassword /><RequestData><AccountNumber>2</AccountNumber><AccountSuffix>85</AccountSuffix><AccountType>S</AccountType><MPIAcctType>Checking</MPIAcctType><TransactionCount>10</TransactionCount></RequestData></Request>","<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>","1967-12-25 22:18:13.471000","2005-12-25 22:18:13.768000","2","70","0"
Script
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os.path
import sys
import csv
from io import StringIO
import xml.etree.cElementTree as ElementTree
from xml.etree.ElementTree import XMLParser
import xml
import xml.sax
from xml.sax import ContentHandler
class MyHandler(xml.sax.handler.ContentHandler):
def __init__(self):
self._charBuffer = []
self._result = []
def _getCharacterData(self):
data = ''.join(self._charBuffer).strip()
self._charBuffer = []
return data.strip() #remove strip() if whitespace is important
def parse(self, f):
xml.sax.parse(f, self)
return self._result
def characters(self, data):
self._charBuffer.append(data)
def startElement(self, name, attrs):
if name == 'Response':
self._result.append({})
def endElement(self, name):
if not name == 'Response': self._result[-1][name] = self._getCharacterData()
def read_data(path):
with open(path, 'rU', encoding='utf-8') as data:
reader = csv.DictReader(data, delimiter =',', quotechar="'", skipinitialspace=True)
for row in reader:
yield row
if __name__ == "__main__":
empty = ''
Response = 'sample.csv'
for idx, row in enumerate(read_data(Response)):
if idx > 10: break
data = row['E']
print(data) # The before
data = data[1:-1]
data = ""'{}'"".format(data)
print(data) # Sanity check
# data = '<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>'
try:
root = ElementTree.XML(data)
# print(root)
except StopIteration:
raise
pass
# xmlstring = StringIO(data)
# print(xmlstring)
# Handler = MyHandler().parse(xmlstring)
Specifically, due to the quoting in the CSV file (which is beyond my control), I have had to resort to slicing the string (line 51) and then formatting it (line 52).
However the print out from the above attempt is as follows:
"<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000'
<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000
File "<string>", line unknown
ParseError: no element found: line 1, column 69
Interestingly - if I assign the variable "data" (as in line 54) I receive this:
File "<ipython-input-80-7357c9272b92>", line 56
data = '<Response TransactionID="2" RequestType="HoldInquiry"><ShareList>0000',0001,0070,</ShareList></Response>'
^
SyntaxError: invalid token
I seek feedback and information on how to address utilizing the most Pythonic means to do so. Ideally, is there a method that can leverage ElementTree. Thank you, in advance, for your feedback and guidance.
It seems that You have badly formatted (well, badly quoted) csv data.
If csv file is beyond Your control I suggest not using csv reader to read them,
instead - if You can rely on each field being properly quoted - split them yourself.
with open(Response, 'rU', encoding='utf-8') as data:
separated = data.read().split('","')
try:
x = ElementTree.XML(separated[3])
print(x)
xml.etree.ElementTree.dump(x)
y = ElementTree.XML(separated[4])
xml.etree.ElementTree.dump(y)
except Exception as e:
print(e)
outputs
<Element 'Request' at 0xb6d973b0>
<Request RequestType="FOO" TransactionID="3"><InstitutionISO /><CallID>23</CallID><MemberID>12</MemberID><MemberPassword /><RequestData><AccountNumber>2</AccountNumber><AccountSuffix>85</AccountSuffix><AccountType>S</AccountType><MPIAcctType>Checking</MPIAcctType><TransactionCount>10</TransactionCount></RequestData></Request>
<Response RequestType="HoldInquiry" TransactionID="2"><ShareList>0000',0001,0070,</ShareList></Response>
I am working on pulling logs through an web API and so far when pulling the logs they return in the following format (3 events below starting with and ending with . My question is what would be the best way to loop through each line and concatenate them so that the result event looks like below.
Current output
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.126</client_ip>
<date>1363735940</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8980</reference_id>
</attack_headline>
</attack_headlines>
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.136</client_ip>
<date>1363735971</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8981</reference_id>
</attack_headline>
</attack_headlines>
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.156</client_ip>
<date>1363735975</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8982</reference_id>
</attack_headline>
</attack_headlines>
Expected output
<attack_headlines version="1.0.1"><attack_headline><site_id>1</site_id<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category<subcategory>SUlTIEhlbHA=</subcategory><client_ip>172.17.1.156</client_ip<date>1363735975</date><gmt_diff>0</gmt_diff<reference_id>6D13-DE3D-9539-8982</reference_id></attack_headline</attack_headlines>
Thanks in advance!
import json
import os
from suds.transport.https import WindowsHttpAuthenticated
class Helpers:
def set_connection(self,conf):
#SUDS BUG FIXER(doctor)
protocol=conf['protocol']
hostname=conf['hostname']
port=conf['port']
path=conf['path']
file=conf['file']
u_name=conf['login']
passwrd=conf['password']
auth_type = conf['authType']
from suds.xsd.doctor import ImportDoctor, Import
from suds.client import Client
url = '{0}://{1}:{2}/{3}/{4}?wsdl'.format(protocol,
hostname,port, path, file)
imp = Import('http://schemas.xmlsoap.org/soap/encoding/')
d = ImportDoctor(imp)
if(auth_type == 'ntlm'):
ntlm = WindowsHttpAuthenticated(username=u_name, password=passwrd)
client = Client(url, transport=ntlm, doctor=d)
else:
client = Client(url, username=u_name, password=passwrd, doctor=d)
return client
def read_from_file(self, filename):
try:
fo = open(filename, "r")
try:
result = fo.read()
finally:
fo.close()
return result
except IOError:
print "##Error opening/reading file {0}".format(filename)
exit(-1)
def read_json(self,filename):
string=self.read_from_file(filename)
return json.loads(string)
def get_recent_attacks(self, client):
import time
import base64
from xml.dom.minidom import parseString
epoch_time_now = int(time.time())
epochtimeread = open('epoch_last', 'r')
epoch_time_last_read = epochtimeread.read()
epochtimeread.close()
epoch_time_last = int(float(epoch_time_last_read))
print client.service.get_recent_attacks("",epoch_time_last,epoch_time_now,1,"",15)
If this is just a single, large string object with line-breaks, you can simply delete them:
import re
text = re.sub('\s*\n\s*', '', text)
To leave the line breaks in that follow the </attack_headline> delimiter, try:
re.sub('(?<!<\/attack_headline>)\s*\n\s*', '', x)
You could use:
oneline = "".join(multiline.split())
Edit 1 (I've just seen your edit) - I will change your code like this:
with open(filename, "r") as fo:
result = []
for line in fo.readlines():
result.append(line.strip())
return result
Edit 2 (I've read your comment on the other answer) - You could do like this:
with open(filename, "r") as fo:
partial = []
for line in fo.readlines():
if line.startswith("<"):
yield "".join(partial)
partial = []
else:
clean = line.strip()
if clean:
partial.append(clean)
import re
# remove all newline whitespace stuff as in answer given before:
text = re.sub(r'\s*\n\s*', '', text)
# break again at desired points:
text = re.sub(r'</attack_headlines>', '</attack_headlines>\n', text)