Configuring ruamel.yaml to allow duplicate keys - python

I'm trying to use the ruamel.yaml library to process a Yaml document that contains duplicate keys. In this case the duplicate key happens to be a merge key <<:.
This is the yaml file, dupe.yml:
foo: &ref1
a: 1
bar: &ref2
b: 2
baz:
<<: *ref1
<<: *ref2
c: 3
This is my script:
import ruamel.yaml
yml = ruamel.yaml.YAML()
yml.allow_duplicate_keys = True
doc = yml.load(open('dupe.yml'))
assert doc['baz']['a'] == 1
assert doc['baz']['b'] == 2
assert doc['baz']['c'] == 3
When run, it raises this error:
Traceback (most recent call last):
File "rua.py", line 5, in <module>
yml.load(open('dupe.yml'))
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/main.py", line 331, in load
return constructor.get_single_data()
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 111, in get_single_data
return self.construct_document(node)
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 121, in construct_document
for _dummy in generator:
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 1543, in construct_yaml_map
self.construct_mapping(node, data, deep=True)
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 1448, in construct_mapping
value = self.construct_object(value_node, deep=deep)
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 174, in construct_object
for _dummy in generator:
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 1543, in construct_yaml_map
self.construct_mapping(node, data, deep=True)
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 1399, in construct_mapping
merge_map = self.flatten_mapping(node)
File "/usr/local/lib/python3.7/site-packages/ruamel/yaml/constructor.py", line 1350, in flatten_mapping
raise DuplicateKeyError(*args)
ruamel.yaml.constructor.DuplicateKeyError: while constructing a mapping
in "dupe.yml", line 8, column 3
found duplicate key "<<"
in "dupe.yml", line 9, column 3
To suppress this check see:
http://yaml.readthedocs.io/en/latest/api.html#duplicate-keys
Duplicate keys will become an error in future releases, and are errors
by default when using the new API.
How can I make ruamel read this file without errors? The documentation says that allow_duplicate_keys = True will make the loader tolerate duplicated keys, but it doesn't seem to work.
I'm using Python 3.7 and ruamel.yaml 0.15.90.

That
yaml.allow_duplicate_keys = True
only works for non-merge keys in versions before 0.15.91.
In 0.15.91+ this works and the merge key assumes the value of the first instantiation of the key (like with non-merge keys), that means it works as if you had written:
baz:
<<: *ref1
c: 3
and not as if you had written:
baz:
<<: [*ref1, *ref2]
c: 3
If you need that you have to monkey-patch the flatten routine that handles the merge keys (and that affects loading of all following YAML files with double merge keys):
import sys
import ruamel.yaml
yaml_str = """\
foo: &ref1
a: 1
bar: &ref2
b: 2
baz:
<<: *ref1
<<: *ref2
c: 3
"""
def my_flatten_mapping(self, node):
    """Collect the mappings referenced by every merge key (``<<``) of ``node``.

    Intended as a replacement for the round-trip constructor's
    ``flatten_mapping``: when duplicate keys are allowed, each ``<<`` entry
    contributes to the result instead of only the first one.

    Side effects on ``node``: every merge entry is deleted from
    ``node.value``, and a key tagged ``tag:yaml.org,2002:value`` is re-tagged
    as a plain string.

    Returns:
        A list of ``(index, mapping)`` tuples, one per merged mapping, where
        ``index`` is the position the merge entry occupied in ``node.value``
        when it was removed.

    Raises:
        ruamel.yaml.constructor.DuplicateKeyError: a second ``<<`` key is
            found while ``self.allow_duplicate_keys`` is falsy but not
            ``None`` (``None`` only emits a ``DuplicateKeyFutureWarning``).
        ruamel.yaml.constructor.ConstructorError: a merge value is neither a
            mapping nor a sequence of mappings.
    """
    # Imported locally because the surrounding script does not import it.
    import warnings

    def constructed(value_node):
        # type: (Any) -> Any
        # If the contents of a merge are defined within the
        # merge marker, then they won't have been constructed
        # yet. But if they were already constructed, we need to use
        # the existing object.
        if value_node in self.constructed_objects:
            value = self.constructed_objects[value_node]
        else:
            value = self.construct_object(value_node, deep=False)
        return value

    merge_map_list = []
    index = 0
    while index < len(node.value):
        key_node, value_node = node.value[index]
        if key_node.tag == u'tag:yaml.org,2002:merge':
            if merge_map_list and not self.allow_duplicate_keys:  # double << key
                args = [
                    'while constructing a mapping',
                    node.start_mark,
                    'found duplicate key "{}"'.format(key_node.value),
                    key_node.start_mark,
                    '\n'
                    'To suppress this check see:\n'
                    'http://yaml.readthedocs.io/en/latest/api.html#duplicate-keys\n',
                    'Duplicate keys will become an error in future releases, and are errors\n'
                    'by default when using the new API.\n',
                ]
                # The exception/warning classes are referenced through the
                # package because the script only does ``import ruamel.yaml``.
                if self.allow_duplicate_keys is None:
                    warnings.warn(
                        ruamel.yaml.constructor.DuplicateKeyFutureWarning(*args)
                    )
                else:
                    raise ruamel.yaml.constructor.DuplicateKeyError(*args)
            # Deleting shifts the following entries down, so ``index`` is
            # intentionally not advanced in this branch.
            del node.value[index]
            # if key/values from later merge keys have preference you need
            # to insert value_node(s) at the beginning of merge_map_list
            # instead of appending
            if isinstance(value_node, ruamel.yaml.nodes.MappingNode):
                merge_map_list.append((index, constructed(value_node)))
            elif isinstance(value_node, ruamel.yaml.nodes.SequenceNode):
                for subnode in value_node.value:
                    if not isinstance(subnode, ruamel.yaml.nodes.MappingNode):
                        raise ruamel.yaml.constructor.ConstructorError(
                            'while constructing a mapping',
                            node.start_mark,
                            'expected a mapping for merging, but found %s' % subnode.id,
                            subnode.start_mark,
                        )
                    merge_map_list.append((index, constructed(subnode)))
            else:
                raise ruamel.yaml.constructor.ConstructorError(
                    'while constructing a mapping',
                    node.start_mark,
                    'expected a mapping or list of mappings for merging, '
                    'but found %s' % value_node.id,
                    value_node.start_mark,
                )
        elif key_node.tag == u'tag:yaml.org,2002:value':
            key_node.tag = u'tag:yaml.org,2002:str'
            index += 1
        else:
            index += 1
    return merge_map_list
ruamel.yaml.constructor.RoundTripConstructor.flatten_mapping = my_flatten_mapping
yaml = ruamel.yaml.YAML()
yaml.allow_duplicate_keys = True
data = yaml.load(yaml_str)
for k in data['baz']:
print(k, '>', data['baz'][k])
The above gives:
c > 3
a > 1
b > 2

After reading the library source code, I found a workaround. Setting the option to None prevents the error.
yml.allow_duplicate_keys = None
A warning is still printed to the console, but it's not fatal and the program will continue.

Related

Replace fields in yaml file by python

I need to edit the next yaml file with python code :
# This is a sample YAML file for running est.
# - Here I have the comments 1
# - Here I have comments 2
# Robust compare
# - Here I have comments 3
# - Here I have comments 4
job_name: Field1
strategy_num: &strategy_num
dir_base : &dir_base high_vals
from_date_is: &from_date 20150101
to_date_is : &to_date 20161231
# Here I have comments 5
dir: D:\Alex
run_mode : debug
analyses:
# Simulate for all dates (IS)
- kind: RunStrat
label: tr
done: false
dry_run: false
from_date: *from_date
to_date : *to_date
configs_to_test_dir: configs_temp
configs_to_run_dir: configs_to_run
I need to replace high_vals by other_high_vals and configs_temp by other_configs_temp.
Everything I tried doesn't work
Last try:
def change_yaml(path_to_yaml):
try:
with open(path_to_yaml) as yaml_file:
print(path_to_yaml)
doc = yaml.safe_load(yaml_file)
doc['dir_base'][1] = 'other_dir_base'
doc['analyses']['configs_to_test_dir'] = other_configs_temp
except EnvironmentError as e:
raise ValueError('Could not open yaml file {}:\n{}'.format(path_to_yaml, e))
try:
with open(path_to_yaml, 'w') as yaml_file:
yaml.dump(doc, yaml_file)
except EnvironmentError as e:
raise ValueError('Could not write to yaml file {}:\n{}'.format( path_to_yaml , e)
Thank you in advance
This line:
doc['dir_base'][1] = 'other_dir_base'
assumes that the value for the key dir_base can be indexed, but that value is the scalar high_vals and since that is loaded as a string in Python you are trying to do the equivalent of:
s= 'high_vals'
s[1] = 'other_dir_base'
which will give a TypeError.
You probably want to do:
doc['dir_base'] = 'other_dir_base'

task() got multiple values for keyword argument 'appdynhost'

I am simply trying to receive an input, run it through some if, elif conditions and replace certain text from a bunch of lines :
def task(appdynhost):
if (appdynhost) == "security":
appdynpass = 'xxxxxxxx'
elif (appdynhost) == "security2":
appdynpass = 'yyyyyyyy'
elif (appdynhost) == "security3":
appdynpass = 'zzzzzzzz'
elif (appdynhost) == "security4":
appdynpass = 'wwwwwwww'
replacements = {'<controller_name>':appdynhost,'<controller_password>';appdynpass}
s = r"""
"standardenv::v2_0_0_0_0":
"appgroups::v1_0_0_0_0":
definitions:
jdk: {gid: 5007}
app: {gid: 5008}
"appusers::v1_0_0_0_0":
definitions:
jdk: {group: jdk, uid: 5007}
app: {group: app, uid: 5008}
"jdk::v1_7_45_2_0::standard":
instance_owner: jdk
instance_group: jdk
"app::v1_1_2_1_0::standard":
instance_owner: app
instance_group: app
controller_host: "<controller_name>.saas.com"
account_name: "<controller_name>"
account_password: "<controller_password>" """
line=s.split()
for line in s.splitlines():
for src, target in replacements.iteritems():
line = line.replace(src, target)
lines.append(line)
with open('DC_mcp.yaml', 'w') as outfile:
for line in lines:
outfile.write(line + '\n')
print (line)
I am getting this error :
Traceback (most recent call last):
File "build/bdist.linux-x86_64/egg/bdblib/task_runner.py", line 72, in run_bdblib_task
File "build/bdist.linux-x86_64/egg/bdblib/run.py", line 38, in run
return task_module.task(env, **inputs)
TypeError: task() got multiple values for keyword argument 'appdynhost'
I am getting the input for appdynhost as a raw input. I am not sure whats going wrong.
Your definition of the "task" function has only one parameter, which you named "appdynhost".
def task(appdynhost):
...
Your call however has several:
return task_module.task(env, **inputs)
TypeError: task() got multiple values for keyword argument 'appdynhost'
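The positional argument env is bound to the only parameter, appdynhost, and **inputs then supplies appdynhost a second time as a keyword, hence "multiple values". You'll need to fix this at either the function (e.g. def task(env, appdynhost):) or the call.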

Programming error with SQlite3

i am getting the below errors
Traceback (most recent call last):
File "D:\PYTHON SUPER INP\DB test - Copy1220.py", line 29, in <module>
if __name__ == "__main__":main()
File "D:\PYTHON SUPER INP\DB test - Copy1220.py", line 17, in main
addCust("Zach")
File "D:\PYTHON SUPER INP\DB test - Copy1220.py", line 13, in addCust
VALUES(?)""",(name))
ProgrammingError: Incorrect number of bindings supplied. The current
statement uses 1, and there are 4 supplied.
with my following code
import sqlite3
createDb = sqlite3.connect(":memory:")
queryCurs = createDb.cursor()
def createTable():
queryCurs.execute(''' CREATE TABLE customers
(id INTEGER PRIMARY KEY, name TEXT)''')
def addCust(name):
queryCurs.execute("""INSERT INTO customers (name)
VALUES(?)""",(name))
def main():
createTable()
addCust("Zach")
createDb.commit()
queryCurs.execute("SELECT * FROM customers")
for i in queryCurs:
print "/n"
for j in i:
print j
queryCurs.close
if __name__ == "__main__":main()
Here:
def addCust(name):
queryCurs.execute("""INSERT INTO customers (name)
VALUES(?)""",(name))
When you write (name) it is the same as name: the parentheses alone do not create a tuple. So if name is "Zach", you are passing a string, which is iterable and has a length of 4, and sqlite3 sees four bindings. Instead, make it a one-element tuple or a list, e.g.:
(name,) # notice the trailing comma
eg:
>>> len(('Zach')) # same as len('Zach')
4
>>> len(('Zach',)) # now a one-tuple containing a string of length 4
1
Couple of other notes - see code comments:
for i in queryCurs:
print "/n" # <-- to print a new line - use `\n` - not `/n`
for j in i:
print j
queryCurs.close # this should be queryCurs.close() (eg - call the method)

Python 3 TypeError: bytes or integer address expected instead of str instance

I am trying to get Python 2 code to run on Python 3, and this line
argv = (c_char_p * len(args))(*args)
causes this error
File "/Users/hanxue/Code/Python/gsfs/src/gsfs.py", line 381, in main
fuse = FUSE(GoogleStorageFUSE(username, password, logfile=logfile), mount_point, **fuse_args)
File "/Users/hanxue/Code/Python/gsfs/src/fuse.py", line 205, in __init__
argv = (c_char_p * len(args))(*args)
TypeError: bytes or integer address expected instead of str instance
This is the full method
class FUSE(object):
"""This class is the lower level interface and should not be subclassed
under normal use. Its methods are called by fuse"""
def __init__(self, operations, mountpoint, raw_fi=False, **kwargs):
"""Setting raw_fi to True will cause FUSE to pass the fuse_file_info
class as is to Operations, instead of just the fh field.
This gives you access to direct_io, keep_cache, etc."""
self.operations = operations
self.raw_fi = raw_fi
args = ['fuse']
if kwargs.pop('foreground', False):
args.append('-f')
if kwargs.pop('debug', False):
args.append('-d')
if kwargs.pop('nothreads', False):
args.append('-s')
kwargs.setdefault('fsname', operations.__class__.__name__)
args.append('-o')
args.append(','.join(key if val == True else '%s=%s' % (key, val)
for key, val in kwargs.items()))
args.append(mountpoint)
argv = (c_char_p * len(args))(*args)
Which is invoked by this line
fuse = FUSE(GoogleStorageFUSE(username, password, logfile=logfile), mount_point, **fuse_args)
How do I avoid the error by changing the args into byte[]?
In Python 3 all string literals are, by default, unicode. So the phrases 'fuse', '-f', '-d', etc, all create str instances. In order to get bytes instances instead you will need to do both:
pass bytes into FUSE (username, password, logfile, mount_point, and each arg in fuse_args)
change all the string literals in FUSE itself to be bytes: b'fuse', b'-f', b'-d', etc.
This is not a small job.

In Python, how can you load YAML mappings as OrderedDicts?

I'd like to get PyYAML's loader to load mappings (and ordered mappings) into the Python 2.7+ OrderedDict type, instead of the vanilla dict and the list of pairs it currently uses.
What's the best way to do that?
Python >= 3.6
In python 3.6+, it seems that dict loading order is preserved by default without special dictionary types. The default Dumper, on the other hand, sorts dictionaries by key. Starting with pyyaml 5.1, you can turn this off by passing sort_keys=False:
a = dict(zip("unsorted", "unsorted"))
s = yaml.safe_dump(a, sort_keys=False)
b = yaml.safe_load(s)
assert list(a.keys()) == list(b.keys()) # True
This can work due to the new dict implementation that has been in use in pypy for some time. While still considered an implementation detail in CPython 3.6, "the insertion-order preserving nature of dicts has been declared an official part of the Python language spec" as of 3.7+, see What's New In Python 3.7.
Note that this is still undocumented from PyYAML side, so you shouldn't rely on this for safety critical applications.
Original answer (compatible with all known versions)
I like @James' solution for its simplicity. However, it changes the default global yaml.Loader class, which can lead to troublesome side effects. This is a bad idea especially when writing library code. Also, it doesn't directly work with yaml.safe_load().
Fortunately, the solution can be improved without much effort:
import yaml
from collections import OrderedDict
def ordered_load(stream, Loader=yaml.SafeLoader, object_pairs_hook=OrderedDict):
    """Load YAML from ``stream``, building mappings with ``object_pairs_hook``.

    A throwaway subclass of ``Loader`` receives the custom mapping
    constructor, so the ``Loader`` class that was passed in is not modified.
    """

    def _construct_ordered(loader, node):
        # Expand merge keys first, then hand the key/value pairs to the hook
        # in document order.
        loader.flatten_mapping(node)
        pairs = loader.construct_pairs(node)
        return object_pairs_hook(pairs)

    class OrderedLoader(Loader):
        """Private Loader subclass that carries the ordered constructor."""

    mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG
    OrderedLoader.add_constructor(mapping_tag, _construct_ordered)
    return yaml.load(stream, OrderedLoader)
# usage example:
ordered_load(stream, yaml.SafeLoader)
For serialization, you could use the following function:
def ordered_dump(data, stream=None, Dumper=yaml.SafeDumper, **kwds):
    """Dump ``data`` as YAML, emitting ``OrderedDict`` items in their own order.

    The representer is registered on a throwaway subclass of ``Dumper``, so
    the ``Dumper`` class that was passed in is not modified. Extra keyword
    arguments are forwarded to ``yaml.dump``.
    """

    def _represent_ordered(dumper, mapping):
        # Passing the items (rather than the dict itself) to the representer
        # hands it the pairs in the OrderedDict's own order.
        tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG
        return dumper.represent_mapping(tag, mapping.items())

    class OrderedDumper(Dumper):
        """Private Dumper subclass that carries the OrderedDict representer."""

    OrderedDumper.add_representer(OrderedDict, _represent_ordered)
    return yaml.dump(data, stream, OrderedDumper, **kwds)
# usage:
ordered_dump(data, Dumper=yaml.SafeDumper)
In each case, you could also make the custom subclasses global, so that they don't have to be recreated on each call.
2018 option:
oyaml is a drop-in replacement for PyYAML which preserves dict ordering. Both Python 2 and Python 3 are supported. Just pip install oyaml, and import as shown below:
import oyaml as yaml
You'll no longer be annoyed by screwed-up mappings when dumping/loading.
Note: I'm the author of oyaml.
The yaml module allows you to specify custom 'representers' to convert Python objects to text and 'constructors' to reverse the process.
_mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG
def dict_representer(dumper, data):
    """Represent an ordered mapping as a YAML mapping, keeping its key order.

    ``data.items()`` is used instead of the Python 2-only ``iteritems()`` so
    the representer works on both Python 2 and Python 3.
    """
    return dumper.represent_dict(data.items())
def dict_constructor(loader, node):
    """Construct a YAML mapping node as a ``collections.OrderedDict``.

    Merge keys (``<<``) are expanded via ``loader.flatten_mapping`` before
    the pairs are read; ``construct_pairs`` alone does not expand them.
    """
    loader.flatten_mapping(node)
    return collections.OrderedDict(loader.construct_pairs(node))
yaml.add_representer(collections.OrderedDict, dict_representer)
yaml.add_constructor(_mapping_tag, dict_constructor)
2015 (and later) option:
ruamel.yaml is a drop in replacement for PyYAML (disclaimer: I am the author of that package). Preserving the order of the mappings was one of the things added in the first version (0.1) back in 2015. Not only does it preserve the order of your dictionaries, it will also preserve comments, anchor names, tags and does support the YAML 1.2 specification (released 2009)
The specification says that the ordering is not guaranteed, but of course there is ordering in the YAML file and the appropriate parser can just hold on to that and transparently generate an object that keeps the ordering. You just need to choose the right parser, loader and dumper¹:
import sys
from ruamel.yaml import YAML
yaml_str = """\
3: abc
conf:
10: def
3: gij # h is missing
more:
- what
- else
"""
yaml = YAML()
data = yaml.load(yaml_str)
data['conf'][10] = 'klm'
data['conf'][3] = 'jig'
yaml.dump(data, sys.stdout)
will give you:
3: abc
conf:
10: klm
3: jig # h is missing
more:
- what
- else
data is of type CommentedMap which functions like a dict, but has extra information that is kept around until being dumped (including the preserved comment!)
Note: there is a library, based on the following answer, which implements also the CLoader and CDumpers: Phynix/yamlloader
I doubt very much that this is the best way to do it, but this is the way I came up with, and it does work. Also available as a gist.
import yaml
import yaml.constructor
try:
# included in standard lib from Python 2.7
from collections import OrderedDict
except ImportError:
# try importing the backported drop-in replacement
# it's available on PyPI
from ordereddict import OrderedDict
class OrderedDictYAMLLoader(yaml.Loader):
    """
    A YAML loader that loads mappings into ordered dictionaries.
    """

    def __init__(self, *args, **kwargs):
        yaml.Loader.__init__(self, *args, **kwargs)

        # Route both plain maps and ordered maps through the OrderedDict
        # constructor defined below.
        self.add_constructor(u'tag:yaml.org,2002:map', type(self).construct_yaml_map)
        self.add_constructor(u'tag:yaml.org,2002:omap', type(self).construct_yaml_map)

    def construct_yaml_map(self, node):
        """Yield an empty OrderedDict, then fill it from ``node``."""
        data = OrderedDict()
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_mapping(self, node, deep=False):
        """Build an OrderedDict from a mapping node, in document order.

        Raises ``yaml.constructor.ConstructorError`` if ``node`` is not a
        mapping node or if a key is unhashable.
        """
        if isinstance(node, yaml.MappingNode):
            self.flatten_mapping(node)
        else:
            raise yaml.constructor.ConstructorError(None, None,
                'expected a mapping node, but found %s' % node.id, node.start_mark)

        mapping = OrderedDict()
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            try:
                hash(key)
            # ``as`` form: valid on Python 2.6+ and required on Python 3.
            except TypeError as exc:
                raise yaml.constructor.ConstructorError('while constructing a mapping',
                    node.start_mark, 'found unacceptable key (%s)' % exc, key_node.start_mark)
            value = self.construct_object(value_node, deep=deep)
            mapping[key] = value
        return mapping
Update: the library was deprecated in favor of the yamlloader (which is based on the yamlordereddictloader)
I've just found a Python library (https://pypi.python.org/pypi/yamlordereddictloader/0.1.1) which was created based on answers to this question and is quite simple to use:
import yaml
import yamlordereddictloader
datas = yaml.load(open('myfile.yml'), Loader=yamlordereddictloader.Loader)
For my PyYAML installation on Python 2.7, I updated __init__.py, constructor.py, and loader.py. It now supports an object_pairs_hook option for the load commands. A diff of the changes I made is below.
__init__.py
$ diff __init__.py Original
64c64
< def load(stream, Loader=Loader, **kwds):
---
> def load(stream, Loader=Loader):
69c69
< loader = Loader(stream, **kwds)
---
> loader = Loader(stream)
75c75
< def load_all(stream, Loader=Loader, **kwds):
---
> def load_all(stream, Loader=Loader):
80c80
< loader = Loader(stream, **kwds)
---
> loader = Loader(stream)
constructor.py
$ diff constructor.py Original
20,21c20
< def __init__(self, object_pairs_hook=dict):
< self.object_pairs_hook = object_pairs_hook
---
> def __init__(self):
27,29d25
< def create_object_hook(self):
< return self.object_pairs_hook()
<
54,55c50,51
< self.constructed_objects = self.create_object_hook()
< self.recursive_objects = self.create_object_hook()
---
> self.constructed_objects = {}
> self.recursive_objects = {}
129c125
< mapping = self.create_object_hook()
---
> mapping = {}
400c396
< data = self.create_object_hook()
---
> data = {}
595c591
< dictitems = self.create_object_hook()
---
> dictitems = {}
602c598
< dictitems = value.get('dictitems', self.create_object_hook())
---
> dictitems = value.get('dictitems', {})
loader.py
$ diff loader.py Original
13c13
< def __init__(self, stream, **constructKwds):
---
> def __init__(self, stream):
18c18
< BaseConstructor.__init__(self, **constructKwds)
---
> BaseConstructor.__init__(self)
23c23
< def __init__(self, stream, **constructKwds):
---
> def __init__(self, stream):
28c28
< SafeConstructor.__init__(self, **constructKwds)
---
> SafeConstructor.__init__(self)
33c33
< def __init__(self, stream, **constructKwds):
---
> def __init__(self, stream):
38c38
< Constructor.__init__(self, **constructKwds)
---
> Constructor.__init__(self)
here's a simple solution that also checks for duplicated top level keys in your map.
import yaml
import re
from collections import OrderedDict
def yaml_load_od(fname):
    """Load a YAML file as an OrderedDict keyed by its top-level keys.

    The file is read twice. The first pass scans the raw text for top-level
    keys (an unindented run of letters, digits or underscores followed by
    ``:``) to record their order and detect duplicates. The second pass
    parses the YAML and re-assembles the result in that recorded order.

    Raises:
        Exception: if any top-level key appears more than once.
    """
    # First pass: read through the handle the ``with`` block manages, so no
    # second, unclosed file object is created.
    with open(fname, 'r') as f:
        lines = f.read().splitlines()

    top_keys = []
    duped_keys = []
    for line in lines:
        m = re.search(r'^([A-Za-z0-9_]+) *:', line)
        if not m:
            continue
        key = m.group(1)
        if key in top_keys:
            duped_keys.append(key)
        else:
            top_keys.append(key)
    if duped_keys:
        raise Exception('ERROR: duplicate keys: {}'.format(duped_keys))

    # Second pass to set up the OrderedDict.
    # NOTE: an explicit Loader is required by PyYAML >= 6.0; SafeLoader is
    # used so that an untrusted file cannot construct arbitrary objects.
    with open(fname, 'r') as f:
        d_tmp = yaml.load(f, Loader=yaml.SafeLoader)
    return OrderedDict([(key, d_tmp[key]) for key in top_keys])

Categories