It's a small thing, really: I have this function that converts dict objects to xml.
Here's the function:
def dictToXml(d):
from xml.sax.saxutils import escape
def unicodify(o):
if o is None:
return u'';
return unicode(o)
lines = []
def addDict(node, offset):
for name, value in node.iteritems():
if isinstance(value, dict):
lines.append(offset + u"<%s>" % name)
addDict(value, offset + u" " * 4)
lines.append(offset + u"</%s>" % name)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
lines.append(offset + u"<%s>" % name)
addDict(item, offset + u" " * 4)
lines.append(offset + u"</%s>" % name)
else:
lines.append(offset + u"<%s>%s</%s>" % (name, escape(unicodify(item)), name))
else:
lines.append(offset + u"<%s>%s</%s>" % (name, escape(unicodify(value)), name))
addDict(d, u"")
lines.append(u"")
return u"\n".join(lines)
For example, it converts this dictionary
{ 'site': { 'name': 'stackoverflow', 'blogger': [ 'Jeff', 'Joel' ] } }
to:
<site>
<name>stackoverflow</name>
<blogger>jeff</blogger>
<blogger>joel</blogger>
</site>
It works, but the addDict function looks a little too repetitive. I'm sure there's a way to refactor it into 3 co-recursive functions named addDict, addList and addElse, but my brain is stuck. Any help?
Also, any way to get rid of the offset + thing in every line would be nice.
NOTE: I chose these semantics because I'm trying to match the behavior of the json-to-xml converter in org.json, which I use in a different part of my project. If you got to this page just looking for a dictionary to xml converter, there are some really good options in some of the answers. (Especially pyfo).
>>> from pyfo import pyfo
>>> d = ('site', { 'name': 'stackoverflow', 'blogger': [ 'Jeff', 'Joel' ] } )
>>> result = pyfo(d, pretty=True, prolog=True, encoding='ascii')
>>> print result.encode('ascii', 'xmlcharrefreplace')
<?xml version="1.0" encoding="ascii"?>
<site>
<blogger>
Jeff
Joel
</blogger>
<name>stackoverflow</name>
</site>
To install pyfo:
$ easy_install pyfo
I noticed you have commonality in adding items. Using this commonality I would refactor adding an item to a separate function.
def addItem(item, name, offset):
if isinstance(item, dict):
lines.append(offset + u"<%s>" % name)
addDict(item, offset + u" " * 4)
lines.append(offset + u"</%s>" % name)
else:
lines.append(offset + u"<%s>%s</%s>" % (name, escape(unicodify(item)), name))
def addList(value,name, offset):
for item in value:
addItem(item, name, offset)
def addDict(node, offset):
for name, value in node.iteritems():
if isinstance(value, list):
addList(value, name, offset)
else:
addItem(value, name, offset)
Advisory warning: this code is not tested or written by anybody who actually uses Python.
To get rid of repeated "offset+":
offset = 0
def addLine(str):
lines.append(u" " * (offset * 4) + str
then
...
addLine(u"<%s>" % name)
offset = offset + 1
addDict(value)
offset = offset - 1
addLine(u"</%s>" % name)
Don't have access to an interpreter here, so take this with a grain of salt :(
Your original code produce malformed XML and can produce the same XML for two different dictionaries (is not injective, speaking mathematically).
For example, if you have a list as a value of the only key in a dictionary:
d = { 'list': [1,2,3] }
I expect that your code would produce
<list>1</list><list>2</list><list>3</list>
and there is no root element. Any XML should have one and only one root element.
Then given the XML produced by your code, it is impossible to say if this XML
<tag>1</tag>
was produced from { 'tag': 1 } or from { 'tag': [1] }.
So, I suggest
always start from the root element
represent lists with either two special tags (e.g. <list/> and <item/>) or mark them as such in attributes
Then, after decisions about these conceptual shortcomings we can generate correct and unambiguous XML. I chose to use attributes to markup lists, and used ElementTree to construct the XML tree automatically. Also, recursion helps (add_value_to_xml is called recursively):
from xml.etree.ElementTree import Element, SubElement, tostring
def is_scalar(v):
return isinstance(v,basestring) or isinstance(v,float) \
or isinstance(v,int) or isinstance(v,bool)
def add_value_to_xml(root,v):
if type(v) == type({}):
for k,kv in v.iteritems():
vx = SubElement(root,unicode(k))
vx = add_value_to_xml(vx,kv)
elif type(v) == list:
root.set('type','list')
for e in v:
li = SubElement(root,root.tag)
li = add_value_to_xml(li,e)
li.set('type','item')
elif is_scalar(v):
root.text = unicode(v)
else:
raise Exception("add_value_to_xml: unsuppoted type (%s)"%type(v))
return root
def dict_to_xml(d,root='dict'):
x = Element(root)
x = add_value_to_xml(x,d)
return x
d = { 'float': 5194.177, 'str': 'eggs', 'int': 42,
'list': [1,2], 'dict': { 'recursion': True } }
x = dict_to_xml(d)
print tostring(x)
The result of the conversion of the test dict is:
<dict><int>42</int><dict><recursion>True</recursion></dict><float>5194.177</float><list type="list"><list type="item">1</list><list type="item">2</list></list><str>eggs</str></dict>
Here is my short sketch for a solution:
have a general addSomething() function that dispatches based on the type of the value to addDict(), addList() or addElse(). Those functions recursively call addSomething() again.
Basically you are factoring out the parts in the if clause and add a recursive call.
Here's what I find helpful when working with XML. Actually create the XML node structure first, then render this into text second.
This separates two unrelated concerns.
How do I transform my Python structure into an XML object model?
How to I format that XML object model?
It's hard when you put these two things together into one function. If, on the other hand, you separate them, then you have two things. First, you have a considerably simpler function to "walk" your Python structure and return an XML node. Your XML Nodes can be rendered into text with some preferred encoding and formatting rules applied.
from xml.sax.saxutils import escape
class Node( object ):
def __init__( self, name, *children ):
self.name= name
self.children= children
def toXml( self, indent ):
if len(self.children) == 0:
return u"%s<%s/>" % ( indent*4*u' ', self.name )
elif len(self.children) == 1:
child= self.children[0].toXml(0)
return u"%s<%s>%s</%s>" % ( indent*4*u' ', self.name, child, self.name )
else:
items = [ u"%s<%s>" % ( indent*4*u' ', self.name ) ]
items.extend( [ c.toXml(indent+1) for c in self.children ] )
items.append( u"%s</%s>" % ( indent*4*u' ', self.name ) )
return u"\n".join( items )
class Text( Node ):
def __init__( self, value ):
self.value= value
def toXml( self, indent ):
def unicodify(o):
if o is None:
return u'';
return unicode(o)
return "%s%s" % ( indent*4*u' ', escape( unicodify(self.value) ), )
def dictToXml(d):
def dictToNodeList(node):
nodes= []
for name, value in node.iteritems():
if isinstance(value, dict):
n= Node( name, *dictToNodeList( value ) )
nodes.append( n )
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
n= Node( name, *dictToNodeList( value ) )
nodes.append( n )
else:
n= Node( name, Text( item ) )
nodes.append( n )
else:
n= Node( name, Text( value ) )
nodes.append( n )
return nodes
return u"\n".join( [ n.toXml(0) for n in dictToNodeList(d) ] )
Related
Say that I have a large dictionary full of nested values such as this:
large_dic ={
...
"key":{"sub-key1" :{"sub-key2": "Test"}},
"0key":{"0sub-key1": "0Test"},
"1key":{"1sub-key1":{"1sub-key2":{"1sub-key3":"1Test"}}}
...
}
What I would like to do is to be able to get for example from the final value:
"1Test"
the key(s) to access it, such as in this case:
large_dic["1key"]["1sub-key1"]["1sub-key2"]["1sub-key3"]
Thanks for the support.
Edit to add more infos: The dictionary trees I'm talking about are linear(YAML files converted into a python dictionary structure), there is never more than one key, the ending leaf values may not be unique.
Since OP is looking for hierarchical keys instead
I made this class :
class PointingSlice:
def __init__(self, obj, *slices) -> None:
self.obj = obj
self.slices = slices
def __str__(self):
return f"{str(self.obj)}{''.join(map(self._repr_slice, self.slices))}"
def _repr_slice(self, sliced: slice):
sqbrackets = "[{}]"
if not isinstance(sliced, slice):
return sqbrackets.format(repr(sliced))
items = [sliced.start, sliced.stop, sliced.step]
fn = lambda x: str() if x is None else str(x)
return sqbrackets.format(":".join(map(fn, items)))
def resolve(self):
obj = self.obj
for sliced in self.slices:
obj = obj.__getitem__(sliced)
return obj
and this function for instantiation :
def find_longest(mapping, key):
keys = [key]
value = mapping[key]
while isinstance(value, dict):
((k, value),) = value.items()
keys.append(k)
return PointingSlice(mapping, *keys)
Example use:
print(find_longest(large_dic, "1key"))
# output:
# {'key': {'sub-key1': {'sub-key2': 'Test'}}, '0key': {'0sub-key1': '0Test'}, '1key': {'1sub-key1': {'1sub-key2': {'1sub-key3': '1Test'}}}}['1key']['1sub-key1']['1sub-key2']['1sub-key3']
# do note that it is the same thing as large_dic['1key']['1sub-key1']['1sub-key2']['1sub-key3']
print(find_longest(large_dic, "1key").resolve()) # 1Test
So I made some changes and now it supports additional repr options matching your exact use case :
class PointingSlice:
def __init__(self, obj, *slices, object_name=None) -> None:
self.obj = obj
self.slices = slices
self.object_name = object_name
def __str__(self):
return f"{self.object_name or str(self.obj)}{''.join(map(self._repr_slice, self.slices))}"
def _repr_slice(self, sliced: slice):
sqbrackets = "[{}]"
if not isinstance(sliced, slice):
return sqbrackets.format(repr(sliced))
items = [sliced.start, sliced.stop, sliced.step]
fn = lambda x: str() if x is None else str(x)
return sqbrackets.format(":".join(map(fn, items)))
def resolve(self):
obj = self.obj
for sliced in self.slices:
obj = obj.__getitem__(sliced)
return obj
large_dic = {
"key": {"sub-key1": {"sub-key2": "Test"}},
"0key": {"0sub-key1": "0Test"},
"1key": {"1sub-key1": {"1sub-key2": {"1sub-key3": "1Test"}}},
}
def find_longest(mapping, key):
keys = [key]
value = mapping[key]
while isinstance(value, dict):
((k, value),) = value.items()
keys.append(k)
return PointingSlice(mapping, *keys)
f = find_longest(large_dic, "1key")
f.object_name = "large_dic" # for representational purposes, it works without this
print(f) # large_dic['1key']['1sub-key1']['1sub-key2']['1sub-key3']
print(f.resolve()) # 1Test
There are numerous ways to achieve this. You might want to look up "prefix tree traversal" (or "trie traversal").
A simple recursive solution with poor memory efficiency could look like this:
def find_trie_leaf_path(trie: dict, leaf_value, trie_path: list[str] = []):
for key, value in trie.items():
if isinstance(value, dict):
yield from find_trie_leaf_path(value, leaf_value, trie_path + [key])
elif value == leaf_value:
yield trie_path + [key]
large_dic = {
"key": {"sub-key1": {"sub-key2": "Test"}},
"0key": {"0sub-key1": "0Test"},
"1key": {"1sub-key1": {"1sub-key2": {"1sub-key3": "Test"}}},
}
first_match = next(find_trie_leaf_path(large_dic, "Test"))
all_matches = list(find_trie_leaf_path(large_dic, "Test"))
This should work even if your trie is very wide. If it is very high, I'd rather use an iterative algorithm.
I want to point out, though, that prefix trees are usually used the other way round. If you find yourself needing this search a lot, you should consider a different data structure.
Yes, it's totally possible. Here's the function to get the deeply nested value:
def get_final_value(mapping, key):
value = mapping[key]
while isinstance(value, dict):
(value,) = value.values()
return value
Example use:
>>> get_final_value(large_dic, "key")
'Test'
>>> get_final_value(large_dic, "0key")
'0Test'
>>> get_final_value(large_dic, "1key")
'1Test'
>>>
Can the parent keys be deduced from your final value in any way or is the tree structure rather random? If latter is the case then you'll probably just end up searching your tree until you find your value, what path search algorithm you choose for that again depends on the tree structure you have. As already asked in the comments, does each node only have one other node or is it binary or can it have many child nodes?
I'm writing python code that tests a REST Endpoint to get Scientific numbers from a DB and validate that the scientific format is returned from the database in correct JSON scientific number format.
The issue that I'm having is that some scientific numbers are converted. For instance the JSON loader will convert the e to upper case and some values are converted into integers. Here is some example code. The code isn't doing exactly what I'm doing since you won't have the DB back end.
import json
import decimal
class DecimalEncoder(json.JSONEncoder):
def default(self, o):
if isinstance(o, decimal.Decimal):
print 'In here: ' + str( o )
return str(o)
return super(DecimalEncoder, self).default(o)
class JSONUtils:
def __init__( self, response ):
self.response = response
self.jsonData = None
self.LoadData( )
# print 'jsonData: ' + json.dumps( self.jsonData, cls=DecimalEncoder, indent=2 )
def GetData( self ):
return self.jsonData
def GetAsStr( self ):
return json.dumps(self.GetData(), cls=DecimalEncoder )
def LoadData ( self ):
if ( self.jsonData == None ):
if ( type( self.response ) == str or type( self.response ) == unicode ):
print '****type1'
self.jsonData = json.loads(self.response, parse_float=decimal.Decimal )
elif ( type( self.response ) == dict ):
print '****type2'
dump = json.dumps( self.response, cls=DecimalEncoder )
self.jsonData = json.loads(dump, parse_float=decimal.Decimal)
def GetJSONChunk( self, path ):
returnValue = ''
curPath = ''
try:
if ( type( path ) == str ):
returnValue = self.jsonData[path]
elif (type( path ) == list):
temp = ''
firstTime = True
for curPath in path:
if firstTime == True:
temp = self.jsonData[curPath]
firstTime = False
else:
temp = temp[curPath]
returnValue = temp
else:
print 'Unknown type in GetJSONChunk: ' + unicode( type( path ))
except KeyError as err:
ti.DBG_OUT( 'JSON chunk doesn\'t have value: ' + unicode( path ))
returnValue = self.kNoNode
except IndexError as err:
ti.DBG_OUT( 'Index does not exist: ' + unicode( curPath ))
returnValue = self.kInvalidIndex
return returnValue
info = { "fldName":1.47e-10 } # converts to 1.47e-10 (e to E)
# info = { "fldName":1.47e10 } # convers to 14700000000.0
# info = { "fldName":1.47e+10 } # convers to 14700000000.0
# info = { "fldName":1.47E+10 } # convers to 14700000000.0
# info = { "fldName":12345678901234567890123 } # example shows larger # support
print 'info: ' + str ( info )
myJSON = JSONUtils( info )
print 'type: ' + str( myJSON.jsonData )
print 'myJSON: ' + myJSON.GetAsStr( )
value = myJSON.GetJSONChunk ( 'fldName' )
print 'Type: ' + str( type( value ))
print value
What I need to do is compare the DB result to an expected value. Is there a way to identify ONLY scientific numbers? NOT doubles / decimal values and return those as strings. As you can see I'm already trying to protect Doubles that are being returned to be sure that meet the criteria / capabilities of the back end database. Which can be 20+ digits to the left or right of the decimal place.
The actual results are documented by each of the lines of code that start with # info.
I'm not entirely clear on your question, so if this is way off base let me know.
I think there may be some confusion between how Python displays a number vs. the actual value of the number.
For example, I can write the number 1000 as:
>>> 1000
1000.0
>>> 1E3
1000.0
>>> 10E2
1000.0
>>> 1e3
1000.0
>>> 1e+3
1000.0
Those are all different representations of the number, but they are all numerically equivalent. JSON is similarly flexible; all of the above representations are also valid in JSON.
Similarly, I can write:
10000000000000000000000000000.0
But the print statement will display it as:
1e+28
But it's still the same number. It hasn't been "converted" in any way. Python will use E notation once your number is >= 1E16.
So if I receive JSON that looks like this:
{
"val1": 1e+3,
"val2": 1e+20
}
The output of the following:
values = json.loads('{"val1": 1e+3, "val2": 1e+20}')
for k, v in values.items():
print(k, '=', v)
Would be:
val1 = 1000.0
val2 = 1e+20
I have the input file :
sun vehicle
one number
two number
reduce command
one speed
five speed
zero speed
speed command
kmh command
I used the following code:
from collections import OrderedDict
output = OrderedDict()
with open('final') as in_file:
for line in in_file:
columns = line.split(' ')
if len(columns) >= 2:
word,tag = line.strip().split()
if output.has_key(tag) == False:
output[tag] = [];
output[tag].append(word)
else:
print ""
for k, v in output.items():
print '<{}> {} </{}>'.format(k, ' '.join(v), k)
output = OrderedDict()
I am getting the output as:
<vehicle> sun </vehicle>
<number> one two </number>
<command> reduce speed kmh </command>
<speed> one five zero </speed>
But my expected output should be:
<vehicle> sun </vehicle>
<number> one two </number>
<command> reduce
<speed> one five zero </speed>
speed kmh </command>
Can someone help me in solving this?
It looks like the output you want to achieve is underspecified!
You presumably want the code to "know in advance" that speed is a part of command, before you get to the line speed command.
To do what you want, you will need a recursive function.
How about
for k, v in output.items():
print expandElements(k, v,output)
and somewhere you define
def expandElements(k,v, dic):
out = '<' +k + '>'
for i in v:
# check each item of v for matches in dic.
# if no match, then out=out+i
# otherwise expand using a recursive call of expandElements()
# and out=out+expandElements
out = out + '<' +k + '>'
It looks like you want some kind of tree structure for your output?
You are printing out with print '<{}> {} </{}>'.format(k, ' '.join(v), k) so all of your output is going to have the form of '<{}> {} </{}>'.
If you want to nest things you are going to need a nested structure to represent them.
For recursivly parsing the input file I would make a class representing the tag. Each tag can have its children. Every children is first a string added manually with tag.children.append("value") or by calling tag.add_value(tag.name, "value").
class Tag:
def __init__(self, name, parent=None):
self.name = name
self.children = []
self.has_root = True
self.parent = parent
def __str__(self):
""" compose string for this tag (recursivly) """
if not self.children:
return self.name
children_str = ' '.join([str(child) for child in self.children])
if not self.parent:
return children_str
return '<%s>%s</%s>' % (self.name, children_str, self.name)
#classmethod
def from_file(cls, file):
""" create root tag from file """
obj = cls('root')
columns = []
with open(file) as in_file:
for line in in_file:
value, tag = line.strip().split(' ')
obj.add_tag(tag, value)
return obj
def search_tag(self, tag):
""" search for a tag in the children """
if self.name == tag:
return self
for i, c in enumerate(self.children):
if isinstance(c, Tag) and c.name == tag:
return c
elif isinstance(c, str):
if c.strip() == tag.strip():
self.children[i] = Tag(tag, self)
return self.children[i]
else:
result = c.search_tag(tag)
if result:
return result
def add_tag(self, tag, value):
"""
add a value, tag pair to the children
Firstly this searches if the value is an child. If this is the
case it moves the children to the new location
Afterwards it searches the tag in the children. When found
the value is added to this tag. If not a new tag object
is created and added to this Tag. The flag has_root
is set to False so the element can be moved later.
"""
value_tag = self.search_tag(value)
if value_tag and not value_tag.has_root:
print("Found value: %s" % value)
if value_tag.parent:
i = value_tag.parent.children.index(value_tag)
value = value_tag.parent.children.pop(i)
value.has_root = True
else:
print("not %s" % value)
found = self.search_tag(tag)
if found:
found.children.append(value)
else:
# no root
tag_obj = Tag(tag, self)
self.children.append(tag_obj)
tag_obj.add_tag(tag, value)
tag_obj.has_root = False
tags = Tag.from_file('final')
print(tags)
I know in this example the speed-Tag is not added twice. I hope that's ok.
Sorry for the long code.
I have a project that has a non-standard file format something like:
var foo = 5
load 'filename.txt'
var bar = 6
list baz = [1, 2, 3, 4]
And I want to parse this into a data structure much like BeautifulSoup does. But this format isn't supported by BeautifulSoup. What is the pythonic way to build a parse tree so that I can modify the values and re-write it out? In the end I would like to do something like:
data = parse_file('file.txt')
data.foo = data.foo * 2
data.write_file('file_new.txt')
Here is a solution using pyparsing... it works in your case. Beware that i'm not an expert therefore depending on your standards the code could be ugly... cheers
class ConfigFile (dict):
"""
Configuration file data
"""
def __init__ (self, filename):
"""
Parses config file.
"""
from pyparsing import Suppress, Word, alphas, alphanums, nums, \
delimitedList, restOfLine, printables, ZeroOrMore, Group, \
Combine
equal = Suppress ("=")
lbrack = Suppress ("[")
rbrack = Suppress ("]")
delim = Suppress ("'")
string = Word (printables, excludeChars = "'")
identifier = Word (alphas, alphanums + '_')
integer = Word (nums).setParseAction (lambda t: int (t[0]))
real = Combine( Word(nums) + '.' + Word(nums) ).setParseAction (lambda t: float(t[0]))
value = real | integer
var_kwd = Suppress ("var")
load_kwd = Suppress ("load")
list_kwd = Suppress ("list")
var_stm = Group (var_kwd + identifier + equal + value +
restOfLine.suppress ()).setParseAction (
lambda tok: tok[0].insert(len(tok[0]), 0))
load_stm = Group (load_kwd + delim + string + delim +
restOfLine.suppress ()).setParseAction (
lambda tok: tok[0].insert(len(tok[0]), 1))
list_stm = Group (list_kwd + identifier + equal + lbrack +
Group ( delimitedList (value, ",") ) +
rbrack + restOfLine.suppress ()).setParseAction (
lambda tok: tok[0].insert(len(tok[0]), 2))
cnf_file = ZeroOrMore (var_stm | load_stm | list_stm)
lines = cnf_file.parseFile (filename)
self._lines = []
for line in lines:
self._lines.append ((line[-1], line[0]))
if line[-1] != 1: dict.__setitem__(self, line[0], line[1])
self.__initialized = True
# after initialisation, setting attributes is the same as setting an item
def __getattr__ (self, key):
try:
return dict.__getitem__ (self, key)
except KeyError:
return None
def __setattr__ (self, key, value):
"""Maps attributes to values. Only if we are initialised"""
# this test allows attributes to be set in the __init__ method
if not self.__dict__.has_key ('_ConfigFile__initialized'):
return dict.__setattr__(self, key, value)
# any normal attributes are handled normally
elif self.__dict__.has_key (key):
dict.__setattr__(self, key, value)
# takes care of including new 'load' statements
elif key == 'load':
if not isinstance (value, str):
raise ValueError, "Invalid data type"
self._lines.append ((1, value))
# this is called when setting new attributes after __init__
else:
if not isinstance (value, int) and \
not isinstance (value, float) and \
not isinstance (value, list):
raise ValueError, "Invalid data type"
if dict.has_key (self, key):
if type(dict.__getitem__(self, key)) != type (value):
raise ValueError, "Cannot modify data type."
elif not isinstance (value, list): self._lines.append ((0, key))
else: self._lines.append ((2, key))
dict.__setitem__(self, key, value)
def Write (self, filename):
"""
Write config file.
"""
fid = open (filename, 'w')
for d in self._lines:
if d[0] == 0: fid.write ("var %s = %s\n" % (d[1], str(dict.__getitem__(self, d[1]))))
elif d[0] == 1: fid.write ("file '%s'\n" % (d[1]))
else: fid.write ("list %s = %s\n" % (d[1], str(dict.__getitem__(self, d[1]))))
if __name__ == "__main__":
input="""var foo = 5
load 'filename.txt'
var bar = 6
list baz = [1, 2, 3, 4]"""
file ("test.txt", 'w').write (input)
config = ConfigFile ("test.txt")
# Modify existent items
config.foo = config.foo * 2
# Add new items
config.foo2 = [4,5,6,7]
config.foo3 = 12.3456
config.load = 'filenameX.txt'
config.load = 'filenameXX.txt'
config.Write ("test_new.txt")
EDIT
I have modified the class to use
__getitem__, __setitem__
methods to mimic the 'access to member' syntax to parsed items as required by our poster. Enjoy!
PS
Overloading of the
__setitem__
method should be done with care to avoid interferences between setting of 'normal' attributes (class members) and the parsed items (that are accesses like attributes). The code is now fixed to avoid these problems. See the following reference http://code.activestate.com/recipes/389916/ for more info. It was funny to discover this!
What you have is a custom language you need to parse.
Use one of the many existing parsing libraries for Python. I personally recommend PLY. Alternatively, Pyparsing is also good and widely used & supported.
If your language is relatively simple, you can also implement a hand-written parser. Here is an example
This question already has answers here:
How to make a class JSON serializable
(41 answers)
Closed 6 months ago.
class gpagelet:
"""
Holds 1) the pagelet xpath, which is a string
2) the list of pagelet shingles, list
"""
def __init__(self, parent):
if not isinstance( parent, gwebpage):
raise Exception("Parent must be an instance of gwebpage")
self.parent = parent # This must be a gwebpage instance
self.xpath = None # String
self.visibleShingles = [] # list of tuples
self.invisibleShingles = [] # list of tuples
self.urls = [] # list of string
class gwebpage:
"""
Holds all the datastructure after the results have been parsed
holds: 1) lists of gpagelets
2) loc, string, location of the file that represents it
"""
def __init__(self, url):
self.url = url # Str
self.netloc = False # Str
self.gpagelets = [] # gpagelets instance
self.page_key = "" # str
Is there a way for me to make my class json serializable? The thing that I am worried is the recursive reference.
Write your own encoder and decoder, which can be very simple like return __dict__
e.g. here is a encoder to dump totally recursive tree structure, you can enhance it or use as it is for your own purpose
import json
class Tree(object):
def __init__(self, name, childTrees=None):
self.name = name
if childTrees is None:
childTrees = []
self.childTrees = childTrees
class MyEncoder(json.JSONEncoder):
def default(self, obj):
if not isinstance(obj, Tree):
return super(MyEncoder, self).default(obj)
return obj.__dict__
c1 = Tree("c1")
c2 = Tree("c2")
t = Tree("t",[c1,c2])
print json.dumps(t, cls=MyEncoder)
it prints
{"childTrees": [{"childTrees": [], "name": "c1"}, {"childTrees": [], "name": "c2"}], "name": "t"}
you can similarly write a decoder but there you will somehow need to identify is it is your object or not, so may be you can put a type too if needed.
Indirect answer: instead of using JSON, you could use YAML, which has no problem doing what you want. (JSON is essentially a subset of YAML.)
Example:
import yaml
o1 = gwebpage("url")
o2 = gpagelet(o1)
o1.gpagelets = [o2]
print yaml.dump(o1)
In fact, YAML nicely handles cyclic references for you.
I implemented a very simple todict method with the help of https://stackoverflow.com/a/11637457/1766716
Iterate over properties that is not starts with __
Eliminate methods
Eliminate some properties manually which is not necessary (for my case, coming from sqlalcemy)
And used getattr to build dictionary.
class User(Base):
id = Column(Integer, primary_key=True)
firstname = Column(String(50))
lastname = Column(String(50))
password = Column(String(20))
def props(self):
return filter(
lambda a:
not a.startswith('__')
and a not in ['_decl_class_registry', '_sa_instance_state', '_sa_class_manager', 'metadata']
and not callable(getattr(self, a)),
dir(self))
def todict(self):
return {k: self.__getattribute__(k) for k in self.props()}
My solution for this was to extend the 'dict' class and perform checks around required/allowed attributes by overriding init, update, and set class methods.
class StrictDict(dict):
required=set()
at_least_one_required=set()
cannot_coexist=set()
allowed=set()
def __init__(self, iterable={}, **kwargs):
super(StrictDict, self).__init__({})
keys = set(iterable.keys()).union(set(kwargs.keys()))
if not keys.issuperset(self.required):
msg = str(self.__class__.__name__) + " requires: " + str([str(key) for key in self.required])
raise AttributeError(msg)
if len(list(self.at_least_one_required)) and len(list(keys.intersection(self.at_least_one_required))) < 1:
msg = str(self.__class__.__name__) + " requires at least one: " + str([str(key) for key in self.at_least_one_required])
raise AttributeError(msg)
for key, val in iterable.iteritems():
self.__setitem__(key, val)
for key, val in kwargs.iteritems():
self.__setitem__(key, val)
def update(self, E=None, **F):
for key, val in E.iteritems():
self.__setitem__(key, val)
for key, val in F.iteritems():
self.__setitem__(key, val)
super(StrictDict, self).update({})
def __setitem__(self, key, value):
all_allowed = self.allowed.union(self.required).union(self.at_least_one_required).union(self.cannot_coexist)
if key not in list(all_allowed):
msg = str(self.__class__.__name__) + " does not allow member '" + key + "'"
raise AttributeError(msg)
if key in list(self.cannot_coexist):
for item in list(self.cannot_coexist):
if key != item and item in self.keys():
msg = str(self.__class__.__name__) + "does not allow members '" + key + "' and '" + item + "' to coexist'"
raise AttributeError(msg)
super(StrictDict, self).__setitem__(key, value)
Example usage:
class JSONDoc(StrictDict):
"""
Class corresponding to JSON API top-level document structure
http://jsonapi.org/format/#document-top-level
"""
at_least_one_required={'data', 'errors', 'meta'}
allowed={"jsonapi", "links", "included"}
cannot_coexist={"data", "errors"}
def __setitem__(self, key, value):
if key == "included" and "data" not in self.keys():
msg = str(self.__class__.__name__) + " does not allow 'included' member if 'data' member is not present"
raise AttributeError(msg)
super(JSONDoc, self).__setitem__(key, value)
json_doc = JSONDoc(
data={
"id": 5,
"type": "movies"
},
links={
"self": "http://url.com"
}
)