Memory overflow when using numpy load in a loop - Python

Looping over np.load of npz files causes a memory overflow (depending on the length of the file list).
None of the following seems to help:
Deleting the variable which stores the data in the file.
Using mmap.
Calling gc.collect() (garbage collection).
The following code should reproduce the phenomenon:
import numpy as np

# generate a file for the demo
X = np.random.randn(1000, 1000)
np.savez('tmp.npz', X=X)

# here comes the overflow:
for i in xrange(1000000):
    data = np.load('tmp.npz')
    data.close()  # avoid the "too many files are open" error
In my real application the loop is over a list of files, and the overflow exceeds 24 GB of RAM!
Please note that this was tried on Ubuntu 11.10, with both NumPy 1.5.1 and 1.6.0.
I have filed a report in numpy ticket 2048, but this may be of wider interest, so I am posting it here as well (moreover, I am not sure that this is a bug; it may be the result of my bad programming).
SOLUTION (by HYRY):
The command
del data.f
should precede the command
data.close()
For more information, and a method for finding the solution, please read HYRY's kind answer below.

I think this is a bug, and maybe I found the solution: call "del data.f".
for i in xrange(10000000):
    data = np.load('tmp.npz')
    del data.f
    data.close()  # avoid the "too many files are open" error
To find this kind of memory leak, you can use the following code:
import numpy as np
import gc

# here comes the overflow:
for i in xrange(10000):
    data = np.load('tmp.npz')
    data.close()  # avoid the "too many files are open" error

d = dict()
for o in gc.get_objects():
    name = type(o).__name__
    if name not in d:
        d[name] = 1
    else:
        d[name] += 1

items = d.items()
items.sort(key=lambda x: x[1])
for key, value in items:
    print key, value
After the test loop, I created a dict and counted the objects in gc.get_objects(). Here is the output:
...
wrapper_descriptor 1382
function 2330
tuple 9117
BagObj 10000
NpzFile 10000
list 20288
dict 21001
From the result we know that something is wrong with BagObj and NpzFile. Here is the relevant code:
class NpzFile(object):
    def __init__(self, fid, own_fid=False):
        ...
        self.zip = _zip
        self.f = BagObj(self)
        if own_fid:
            self.fid = fid
        else:
            self.fid = None

    def close(self):
        """
        Close the file.
        """
        if self.zip is not None:
            self.zip.close()
            self.zip = None
        if self.fid is not None:
            self.fid.close()
            self.fid = None

    def __del__(self):
        self.close()

class BagObj(object):
    def __init__(self, obj):
        self._obj = obj

    def __getattribute__(self, key):
        try:
            return object.__getattribute__(self, '_obj')[key]
        except KeyError:
            raise AttributeError, key
NpzFile has __del__(), NpzFile.f is a BagObj, and BagObj._obj is the NpzFile: this is a reference cycle, and it makes both the NpzFile and the BagObj uncollectable. Here is some explanation in the Python documentation: http://docs.python.org/library/gc.html#gc.garbage
So, to break the reference cycle, you need to call "del data.f".
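To see the mechanism in isolation, here is a minimal sketch (not NumPy's code; the Bag and Owner names are made up) of the same kind of cycle, and of how breaking it by hand lets plain reference counting free the objects:
import gc

class Bag(object):
    def __init__(self, obj):
        self._obj = obj          # Bag -> Owner (like BagObj._obj)

class Owner(object):
    def __init__(self):
        self.f = Bag(self)       # Owner -> Bag (like NpzFile.f)
    def __del__(self):           # a finalizer, like NpzFile.__del__
        pass

o = Owner()
del o                            # the cycle keeps both objects alive
gc.collect()                     # on Python 2 the __del__ makes the cycle
                                 # uncollectable and it lands in gc.garbage;
                                 # Python 3.4+ can collect it, but only lazily

o = Owner()
del o.f                          # break the cycle first, like "del data.f"
del o                            # now reference counting frees both at once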

What I found as the solution (Python 3.8 and NumPy 1.18.5):
import numpy as np
import gc  # import garbage collector interface

for i in range(1000):
    data = np.load('tmp.npy')
    # process data
    del data
    gc.collect()
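With a recent NumPy, the NpzFile object returned by np.load for .npz archives is also documented to support the context manager protocol, so a with block closes the archive for you; a minimal sketch:
import numpy as np

for i in range(1000):
    with np.load('tmp.npz') as data:
        X = data['X']   # the archive is closed automatically when the block exits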

Related

Use with statement to close GDAL datasets

Consider this basic example for the use of a with statement from Jeff Knupp's blog:
class File():
    def __init__(self, filename, mode):
        self.filename = filename
        self.mode = mode

    def __enter__(self):
        self.open_file = open(self.filename, self.mode)
        return self.open_file

    def __exit__(self, *args):
        self.open_file.close()
I have a testfile which contains two lines, 'ABC' and 'DEF'. Everything works as expected:
with File('/tmp/testfile', 'r') as f:
    txt = [x.strip() for x in f.readlines()]

print(txt)
# ['ABC', 'DEF']
Calling a class method outside the with block gives me the expected error:
f.readlines()
ValueError Traceback (most recent call last)
in ()
----> 1 f.readlines()
ValueError: I/O operation on closed file.
Now to my question:
How can I achieve the same behavior with a gdal object instead of a file?
I have a class method which should read data from disk and put it into a gdal raster for further processing. Once this is done, I would like to close the generated gdal raster appropriately. Normally this is done by setting it to None and by using gdal.Unlink.
However, when I put everything into a context structure as in the previous example, I can still interact with the dataset outside of the with block.
Here's a reproducible example:
class Raster:
    '''Raster class with sidelength s'''
    def __init__(self, s):
        self.sidelength = s

    def __enter__(self):
        # create raster in memory
        driver = gdal.GetDriverByName('GTiff')
        self.raster = driver.Create('/vsimem/inmem.tif', self.sidelength, self.sidelength, 1, gdal.GDT_Float32)
        self.raster.GetRasterBand(1).WriteArray(np.random.rand(self.sidelength, self.sidelength))
        return self.raster

    def __exit__(self, *args):
        # close file and unlink
        self.raster = None
        gdal.Unlink('/vsimem/inmem.tif')
The with block works as expected:
with Raster(5) as r:
    print(r)
# <osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x7f8078862ae0> >
But after the block the object is still there and I can still read the values:
print(r)
#<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x7f8078044a20> >
print(r.ReadAsArray())
#[[0.2549882 0.80292517 0.23358545 0.6284887 0.7294142 ]
# [0.9310723 0.21535267 0.9054575 0.60967094 0.9937953 ]
# [0.69144976 0.01727938 0.16800325 0.61249655 0.1785022 ]
# [0.16179436 0.43245795 0.7042811 0.4809799 0.85534436]
# [0.67751276 0.7560658 0.9594516 0.6294476 0.3539126 ]]
You cannot really close a gdal dataset while keeping a reference to it. You can keep a reference to the Raster instance instead and use r.raster to access the dataset inside the with block.
class Raster:
    '''Raster class with sidelength s'''
    def __init__(self, s):
        self.sidelength = s

    def __enter__(self):
        # create raster in memory
        driver = gdal.GetDriverByName('GTiff')
        self.raster = driver.Create('/vsimem/inmem.tif', self.sidelength, self.sidelength, 1, gdal.GDT_Float32)
        self.raster.GetRasterBand(1).WriteArray(np.random.rand(self.sidelength, self.sidelength))
        return self

    def __exit__(self, *args):
        # close file and unlink
        self.raster = None
        gdal.Unlink('/vsimem/inmem.tif')

with Raster(5) as r:
    print(r.raster)
Output:
<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x04564728> >
Outside the with block the dataset is unreachable:
r.raster is None
Output:
True
There will be problems again if you bind another variable to r.raster so you might want to completely encapsulate it inside the Raster instance, along with any functionality you need. At this point you would be more or less reinventing Rasterio but if your needs are simple that might be better than depending on it.
As suggested in the comment, Rasterio via rasterio.open implements the behaviour you are looking for.
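A minimal sketch of that (assuming rasterio is installed and an 'example.tif' exists; the file name is only an illustration):
import rasterio

with rasterio.open('example.tif') as src:
    band = src.read(1)    # works inside the block

print(src.closed)         # True: the dataset was closed by __exit__
src.read(1)               # should now raise, since the dataset is closed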

How to properly get table-row indexes and named values from trapped var-binds in pysnmp

I'm trying to keep my code as clean as possible but I'm not completely satisfied with what I achieved so far.
I built an SNMP manager which receives traps from another device using a custom MIB, which I will refer to as MY-MIB.
I am not sure this is the cleanest way, but essentially I have:
from pysnmp.entity import engine, config
from pysnmp.carrier.asynsock.dgram import udp
from pysnmp.entity.rfc3413 import ntfrcv, context
from pysnmp.smi import builder, rfc1902
from pysnmp.smi.view import MibViewController
from pysnmp.entity.rfc3413 import mibvar

_snmp_engine = engine.SnmpEngine()
_snmpContext = context.SnmpContext(_snmp_engine)
_mibBuilder = _snmpContext.getMibInstrum().getMibBuilder()

# Add local path where MY-MIB is located
_mibSources = _mibBuilder.getMibSources() + (builder.DirMibSource('.'),)
_mibBuilder.setMibSources(*_mibSources)
_mibBuilder.loadModules('MY-MIB')
_view_controller = MibViewController(_mibBuilder)

def my_callback_trap_processor(snmp_engine, state_reference,
                               context_id, context_name, var_binds, ctx):
    # ...CALLBACK CODE...

config.addV1System(_snmp_engine, 'my-area', 'MYCOMMUNITY')
config.addTargetParams(_snmp_engine, 'my-creds', 'my-area',
                       'noAuthNoPriv', 1)
config.addSocketTransport(_snmp_engine,
                          udp.domainName + (1,),
                          udp.UdpTransport().openServerMode((IP_ADDRESS,
                                                             PORT)))
ntfrcv.NotificationReceiver(_snmp_engine, my_callback_trap_processor)
_snmp_engine.transportDispatcher.jobStarted(1)

try:
    _snmp_engine.transportDispatcher.runDispatcher()
except:
    _snmp_engine.transportDispatcher.closeDispatcher()
    raise
In the callback function above I can get a pretty intelligible print by just using the following code:
varBinds = [rfc1902.ObjectType(rfc1902.ObjectIdentity(x[0]), x[1]).resolveWithMib(_view_controller)
            for x in var_binds]
for varBind in varBinds:
    print(varBind.prettyPrint())
which, from a given trap that I receive, gives me:
SNMPv2-MIB::sysUpTime.0 = 0
SNMPv2-MIB::snmpTrapOID.0 = MY-MIB::myNotificationType
MY-MIB::myReplyKey.47746."ABC" = 0x00000000000000000000000000000000000
MY-MIB::myTime.0 = 20171115131544Z
MY-MIB::myOperationMode.0 = 'standalone'
Nice. But I want to manipulate/dissect each bit of information from the given var-binds, preferably in a higher-level way.
Looking at the innards of the library I was able to gather this code up:
for varBind in var_binds:
    objct = rfc1902.ObjectIdentity(varBind[0]).resolveWithMib(self._view_controller)
    (symName, modName), indices = mibvar.oidToMibName(
        self._view_controller, objct.getOid()
    )
    print(symName, modName, indices, varBind[1])
that gives me:
sysUpTime SNMPv2-MIB (Integer(0),) 0
snmpTrapOID SNMPv2-MIB (Integer(0),) 1.3.6.1.X.Y.Z.A.B.C.D
myReplyKey MY-MIB (myTimeStamp(47746), myName(b'X00080')) 0x00000000000000000000000000000000000
myTime MY-MIB (Integer(0),) 20171115131544Z
myOperationMode MY-MIB (Integer(0),) 1
and in the case of the myReplyKey indices I can just do:
for idx in indices:
    try:
        print(idx.getValue())
    except AttributeError:
        print(int(idx))
But in the case of the myOperationMode var-bind, how do I get the named-value 'standalone' instead of 1? And how to get the names of the indexes (myTimeStamp and myName)?
Update:
After Ilya's suggestions I researched the library a bit more to get the namedValues, and I also used some Python hacking to get what I was looking for on the indices.
resolved_var_binds = [rfc1902.ObjectType(rfc1902.ObjectIdentity(x[0]), x[1]).resolveWithMib(_view_controller)
                      for x in var_binds]
processed_var_binds = []
for var_bind in resolved_var_binds:
    object_identity, object_value = var_bind
    mod_name, var_name, indices = object_identity.getMibSymbol()
    var_bind_dict = {'mib': mod_name, 'name': var_name, 'indices': {}}
    for idx in indices:
        try:
            value = idx.getValue()
        except AttributeError:
            var_bind_dict['indices'] = int(idx.prettyPrint())
        else:
            var_bind_dict['indices'][type(value).__name__] = str(value)
    try:
        var_bind_dict['value'] = object_value.namedValues[object_value]
    except (AttributeError, KeyError):
        try:
            var_bind_dict['value'] = int(object_value.prettyPrint())
        except ValueError:
            var_bind_dict['value'] = object_value.prettyPrint()
    processed_var_binds.append(var_bind_dict)
To resolve SNMP PDU var-bindings against a MIB you can use this snippet, which I think you have done already:
from pysnmp.smi.rfc1902 import *

var_binds = [ObjectType(ObjectIdentity(x[0]), x[1]).resolveWithMib(mibViewController)
             for x in var_binds]
By this point you have a list of rfc1902.ObjectType objects. The ObjectType instance mimics a two-element tuple: ObjectIdentity and SNMP value object.
var_bind = var_binds[0]
object_identity, object_value = var_bind
Now, getMibSymbol() will give you MIB name, MIB object name and the tuple of indices made up from the trailing part of the OID. Index elements are SNMP value objects just as object_value:
>>> object_identity.getMibSymbol()
('SNMPv2-MIB', 'sysDescr', (0,))
The enumeration, should it be present, is reported by .prettyPrint():
>>> from pysnmp.proto.rfc1902 import *
>>> Error = Integer.withNamedValues(**{'disk-full': 1, 'no-disk': -1})
>>> error = Error(1)
>>> error.prettyPrint()
'disk-full'
>>> int(error)
1

Pickling issue while using Pool to check the count of files

I have code that is spawning multiple processes to check the count of files and maintain the records in the database. The working code is below:
import multiprocessing as mp
from multiprocessing import Pool
import os
import time
import mysql.connector

"""Function to check the count of the file"""
def file_wc(fname):
    with open('/home/vaibhav/Desktop/Input_python/' + fname) as f:
        count = sum(1 for line in f)
    return (fname, count)

class file_audit:
    def __init__(self):
        """Initialising the constructor for getting the names of files
        and referencing the outside-class function"""
        folder = '/home/vaibhav/Desktop/Input_python'
        self.fnames = (name for name in os.listdir(folder))
        self.file_wc = file_wc

    def count_check(self):
        "Creating 4 worker processes to check the counts of the files in parallel"
        pool = Pool(4)
        self.m = list(pool.map(self.file_wc, list(self.fnames), 4))
        pool.close()
        pool.join()

    def database_updation(self):
        """To maintain an entry in the database with details
        like filename and records present in the file"""
        self.db = mysql.connector.connect(host="localhost", user="root", password="root", database="python_showtime")
        # prepare a cursor object using cursor() method
        self.cursor = self.db.cursor()
        query_string = ("INSERT INTO python_showtime.audit_capture"
                        "(name,records)"
                        "VALUES(%s,%s)")
        # data_user = (name,records)
        for each in self.m:
            self.cursor.execute(query_string, each)
            self.db.commit()
        self.cursor.close()

start_time = time.time()
print("My program took", time.time() - start_time, "to run")
#if __name__ == '__main__':
x = file_audit()
x.count_check()        # To check the count by spawning multiple processes
x.database_updation()  # To maintain the entry in the database
Point to be considered:
Now if I put my function inside the class and comment out self.file_wc=file_wc in the constructor section, I get the error "can't pickle generator objects". I have a fair understanding that we cannot pickle some objects, so I want to know what exactly is happening in the background, in very simple terms. I got the reference from here or here to make the code work.
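The core limitation can be seen in isolation with a small sketch (illustrative only): generators cannot be pickled at all, so anything that drags a generator attribute along to the worker processes, such as a bound method of an object holding one, will fail.
import pickle

gen = (name for name in ['a.txt', 'b.txt'])   # a generator, like self.fnames
try:
    pickle.dumps(gen)
except TypeError as err:
    print(err)   # e.g. "cannot pickle 'generator' object" on Python 3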

Python 2.7 ProcessPoolExecutor throwing IOError: [Errno 32] Broken pipe

I am streaming data into a class in chunks. For each chunk of data, two different types of np.convolve() are executed on the same ProcessPoolExecutor. The type of convolve that was called is determined by a return variable.
The order of the data must be maintained, so each future has an associated sequence number. The output function enforces that only data from contiguous futures is returned (not shown below). From what I understand I am properly calling the ProcessPoolExecutor.shutdown() function, but I am still getting an IOError.
The error is:
$ python processpoolerror.py
ran 5000000 samples in 3.70395112038 sec: 1.34990982265 Msps
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
send(obj)
IOError: [Errno 32] Broken pipe
Sorry it's a bit long, but I have pruned this class down as much as possible while keeping the error. On my machine, Ubuntu 16.04.2 with an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz, the pared-down code always gives this error. In the non-pruned version of this code, the broken pipe occurs 25% of the time.
If you edit line 78 to True, and print during the execution, the error is not thrown. If you reduce the amount of data on line 100, the error is not thrown. What am I doing wrong here? Thanks.
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import time

def _do_xcorr3(rev_header, packet_chunk, seq):
    r1 = np.convolve(rev_header, packet_chunk, 'full')
    return 0, seq, r1

def _do_power3(power_kernel, packet_chunk, seq):
    cp = np.convolve(power_kernel, np.abs(packet_chunk) ** 2, 'full')
    return 1, seq, cp

class ProcessPoolIssues():
    ## Constructor
    # #param chunk_size how many samples to feed in during input() stage
    def __init__(self, header, chunk_size=500, poolsize=5):
        self.chunk_size = chunk_size  ##! How many samples to feed
        # ProcessPool stuff
        self.poolsize = poolsize
        self.pool = ProcessPoolExecutor(poolsize)
        self.futures = []
        # xcr stage stuff
        self.results0 = []
        self.results0.append((0, -1, np.zeros(chunk_size)))
        # power stage stuff
        self.results1 = []
        self.results1.append((1, -1, np.zeros(chunk_size)))
        self.countin = 0
        self.countout = -1

    def shutdown(self):
        self.pool.shutdown(wait=True)

    ## Returns True if all data has been extracted for given inputs
    def all_done(self):
        return self.countin == self.countout + 1

    ## main function
    # #param packet_chunk an array of chunk_size samples to be computed
    def input(self, packet_chunk):
        assert len(packet_chunk) == self.chunk_size
        fut0 = self.pool.submit(_do_xcorr3, packet_chunk, packet_chunk, self.countin)
        self.futures.append(fut0)
        fut1 = self.pool.submit(_do_power3, packet_chunk, packet_chunk, self.countin)
        self.futures.append(fut1)
        self.countin += 1

    # loops through thread pool, copying any results from done threads into results0/1 (and then terminating them)
    def cultivate_pool(self):
        todel = []
        for i, f in enumerate(self.futures):
            # print "checking", f
            if f.done():
                a, b, c = f.result()
                if a == 0:
                    self.results0.append((a, b, c))  # results from one type of future
                elif a == 1:
                    self.results1.append((a, b, c))  # results from another type of future
                todel.append(i)
        # now we need to remove items from futures that are done
        # we need do it in reverse order so we remove items from the end first (thereby not affecting indices as we go)
        for i in sorted(todel, reverse=True):
            del self.futures[i]
            if False:  # change this to true and error goes away
                print "deleting future #", i

    # may return None
    def output(self):
        self.cultivate_pool()  # modifies self.results list
        # wait for both results to be done before clearing
        if len(self.results0) and len(self.results1):
            del self.results0[0]
            del self.results1[0]
            self.countout += 1
        return None

def testRate():
    chunk = 500
    # a value of 10000 will throw: IOError: [Errno 32] Broken pipe
    # smaller values like 1000 do not
    din = chunk * 10000
    np.random.seed(666)
    search = np.random.random(233) + np.random.random(233) * 1j
    input = np.random.random(din) + np.random.random(din) * 1j
    pct = ProcessPoolIssues(search, chunk, poolsize=8)
    st = time.time()
    for x in range(0, len(input), chunk):
        slice = input[x:x + chunk]
        if len(slice) != chunk:
            break
        pct.input(slice)
        pct.output()
    while not pct.all_done():
        pct.output()
    ed = time.time()
    dt = ed - st
    print "ran", din, "samples in", dt, "sec:", din / dt / 1E6, "Msps"
    pct.shutdown()

if __name__ == '__main__':
    testRate()
This is probably happening because you're exceeding the buffer size of the pipe when you try sending in larger chunks at once.
def _do_xcorr3(rev_header, packet_chunk, seq):
    r1 = np.convolve(rev_header, packet_chunk, 'full')
    return 0, seq, r1

def _do_power3(power_kernel, packet_chunk, seq):
    cp = np.convolve(power_kernel, np.abs(packet_chunk) ** 2, 'full')
    return 1, seq, cp
The values r1 and cp are very large because you are convolving with the square of the chunks.
Hence, when you try to run this with larger chunk sizes, the buffer of the IO pipe can't handle it. Refer to this for a clearer understanding.
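As a rough, hedged illustration (the numbers assume the testRate() parameters above: a 233-sample kernel, 500-sample chunks and 10000 chunks), every future's result has to be pickled and pushed back through a pipe, so the total volume in flight grows quickly:
import numpy as np
import pickle

chunk, kernel_len, n_chunks = 500, 233, 10000   # values from testRate() above
packet = np.random.random(chunk) + np.random.random(chunk) * 1j
kernel = np.random.random(kernel_len) + np.random.random(kernel_len) * 1j

one_result = np.convolve(kernel, packet, 'full')   # length chunk + kernel_len - 1
per_result_bytes = len(pickle.dumps(one_result))
print("%d bytes per result, %d futures submitted in total"
      % (per_result_bytes, 2 * n_chunks))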
As for the second part of the question,
if False:  # change this to true and error goes away
    print "deleting future #", i
Found this in the py3 docs:
16.2.4.4. Reentrancy
Binary buffered objects (instances of BufferedReader, BufferedWriter, BufferedRandom and BufferedRWPair) are not reentrant. While reentrant calls will not happen in normal situations, they can arise from doing I/O in a signal handler. If a thread tries to re-enter a buffered object which it is already accessing, a RuntimeError is raised. Note this doesn’t prohibit a different thread from entering the buffered object.
The above implicitly extends to text files, since the open() function will wrap a buffered object inside a TextIOWrapper. This includes standard streams and therefore affects the built-in function print() as well.

In Python, how can you load YAML mappings as OrderedDicts?

I'd like to get PyYAML's loader to load mappings (and ordered mappings) into the Python 2.7+ OrderedDict type, instead of the vanilla dict and the list of pairs it currently uses.
What's the best way to do that?
Python >= 3.6
In Python 3.6+, it seems that dict loading order is preserved by default without special dictionary types. The default Dumper, on the other hand, sorts dictionaries by key. Starting with PyYAML 5.1, you can turn this off by passing sort_keys=False:
import yaml

a = dict(zip("unsorted", "unsorted"))
s = yaml.safe_dump(a, sort_keys=False)
b = yaml.safe_load(s)
assert list(a.keys()) == list(b.keys())  # True
This can work due to the new dict implementation that has been in use in pypy for some time. While still considered an implementation detail in CPython 3.6, "the insertion-order preserving nature of dicts has been declared an official part of the Python language spec" as of 3.7+, see What's New In Python 3.7.
Note that this is still undocumented from PyYAML side, so you shouldn't rely on this for safety critical applications.
Original answer (compatible with all known versions)
I like @James' solution for its simplicity. However, it changes the default global yaml.Loader class, which can lead to troublesome side effects. Especially, when writing library code this is a bad idea. Also, it doesn't directly work with yaml.safe_load().
Fortunately, the solution can be improved without much effort:
import yaml
from collections import OrderedDict

def ordered_load(stream, Loader=yaml.SafeLoader, object_pairs_hook=OrderedDict):
    class OrderedLoader(Loader):
        pass
    def construct_mapping(loader, node):
        loader.flatten_mapping(node)
        return object_pairs_hook(loader.construct_pairs(node))
    OrderedLoader.add_constructor(
        yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
        construct_mapping)
    return yaml.load(stream, OrderedLoader)

# usage example:
ordered_load(stream, yaml.SafeLoader)
For serialization, you could use the following function:
def ordered_dump(data, stream=None, Dumper=yaml.SafeDumper, **kwds):
    class OrderedDumper(Dumper):
        pass
    def _dict_representer(dumper, data):
        return dumper.represent_mapping(
            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
            data.items())
    OrderedDumper.add_representer(OrderedDict, _dict_representer)
    return yaml.dump(data, stream, OrderedDumper, **kwds)

# usage:
ordered_dump(data, Dumper=yaml.SafeDumper)
In each case, you could also make the custom subclasses global, so that they don't have to be recreated on each call.
2018 option:
oyaml is a drop-in replacement for PyYAML which preserves dict ordering. Both Python 2 and Python 3 are supported. Just pip install oyaml, and import as shown below:
import oyaml as yaml
You'll no longer be annoyed by screwed-up mappings when dumping/loading.
Note: I'm the author of oyaml.
The yaml module allows you to specify custom 'representers' to convert Python objects to text and 'constructors' to reverse the process.
import collections
import yaml

_mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG

def dict_representer(dumper, data):
    return dumper.represent_dict(data.iteritems())

def dict_constructor(loader, node):
    return collections.OrderedDict(loader.construct_pairs(node))

yaml.add_representer(collections.OrderedDict, dict_representer)
yaml.add_constructor(_mapping_tag, dict_constructor)
2015 (and later) option:
ruamel.yaml is a drop-in replacement for PyYAML (disclaimer: I am the author of that package). Preserving the order of the mappings was one of the things added in the first version (0.1) back in 2015. Not only does it preserve the order of your dictionaries, it also preserves comments, anchor names, and tags, and it supports the YAML 1.2 specification (released 2009).
The specification says that the ordering is not guaranteed, but of course there is ordering in the YAML file and the appropriate parser can just hold on to that and transparently generate an object that keeps the ordering. You just need to choose the right parser, loader and dumper¹:
import sys
from ruamel.yaml import YAML

yaml_str = """\
3: abc
conf:
    10: def
    3: gij     # h is missing
more:
- what
- else
"""

yaml = YAML()
data = yaml.load(yaml_str)
data['conf'][10] = 'klm'
data['conf'][3] = 'jig'
yaml.dump(data, sys.stdout)
will give you:
3: abc
conf:
    10: klm
    3: jig     # h is missing
more:
- what
- else
data is of type CommentedMap which functions like a dict, but has extra information that is kept around until being dumped (including the preserved comment!)
Note: there is a library, based on the following answer, which implements also the CLoader and CDumpers: Phynix/yamlloader
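A hedged usage sketch of that yamlloader package (assuming pip install yamlloader; the exact module path is taken from its README and may differ between versions):
import yaml
import yamlloader

with open('myfile.yml') as f:
    # loads mappings into an OrderedDict using the C-accelerated loader
    data = yaml.load(f, Loader=yamlloader.ordereddict.CSafeLoader)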
I doubt very much that this is the best way to do it, but this is the way I came up with, and it does work. Also available as a gist.
import yaml
import yaml.constructor

try:
    # included in standard lib from Python 2.7
    from collections import OrderedDict
except ImportError:
    # try importing the backported drop-in replacement
    # it's available on PyPI
    from ordereddict import OrderedDict

class OrderedDictYAMLLoader(yaml.Loader):
    """
    A YAML loader that loads mappings into ordered dictionaries.
    """

    def __init__(self, *args, **kwargs):
        yaml.Loader.__init__(self, *args, **kwargs)
        self.add_constructor(u'tag:yaml.org,2002:map', type(self).construct_yaml_map)
        self.add_constructor(u'tag:yaml.org,2002:omap', type(self).construct_yaml_map)

    def construct_yaml_map(self, node):
        data = OrderedDict()
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_mapping(self, node, deep=False):
        if isinstance(node, yaml.MappingNode):
            self.flatten_mapping(node)
        else:
            raise yaml.constructor.ConstructorError(None, None,
                'expected a mapping node, but found %s' % node.id, node.start_mark)

        mapping = OrderedDict()
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            try:
                hash(key)
            except TypeError, exc:
                raise yaml.constructor.ConstructorError('while constructing a mapping',
                    node.start_mark, 'found unacceptable key (%s)' % exc, key_node.start_mark)
            value = self.construct_object(value_node, deep=deep)
            mapping[key] = value
        return mapping
Update: the library was deprecated in favor of the yamlloader (which is based on the yamlordereddictloader)
I've just found a Python library (https://pypi.python.org/pypi/yamlordereddictloader/0.1.1) which was created based on answers to this question and is quite simple to use:
import yaml
import yamlordereddictloader
datas = yaml.load(open('myfile.yml'), Loader=yamlordereddictloader.Loader)
For my PyYAML installation for Python 2.7 I updated __init__.py, constructor.py, and loader.py. It now supports an object_pairs_hook option for the load commands. A diff of the changes I made is below.
__init__.py
$ diff __init__.py Original
64c64
< def load(stream, Loader=Loader, **kwds):
---
> def load(stream, Loader=Loader):
69c69
< loader = Loader(stream, **kwds)
---
> loader = Loader(stream)
75c75
< def load_all(stream, Loader=Loader, **kwds):
---
> def load_all(stream, Loader=Loader):
80c80
< loader = Loader(stream, **kwds)
---
> loader = Loader(stream)
constructor.py
$ diff constructor.py Original
20,21c20
< def __init__(self, object_pairs_hook=dict):
< self.object_pairs_hook = object_pairs_hook
---
> def __init__(self):
27,29d25
< def create_object_hook(self):
< return self.object_pairs_hook()
<
54,55c50,51
< self.constructed_objects = self.create_object_hook()
< self.recursive_objects = self.create_object_hook()
---
> self.constructed_objects = {}
> self.recursive_objects = {}
129c125
< mapping = self.create_object_hook()
---
> mapping = {}
400c396
< data = self.create_object_hook()
---
> data = {}
595c591
< dictitems = self.create_object_hook()
---
> dictitems = {}
602c598
< dictitems = value.get('dictitems', self.create_object_hook())
---
> dictitems = value.get('dictitems', {})
loader.py
$ diff loader.py Original
13c13
< def __init__(self, stream, **constructKwds):
---
> def __init__(self, stream):
18c18
< BaseConstructor.__init__(self, **constructKwds)
---
> BaseConstructor.__init__(self)
23c23
< def __init__(self, stream, **constructKwds):
---
> def __init__(self, stream):
28c28
< SafeConstructor.__init__(self, **constructKwds)
---
> SafeConstructor.__init__(self)
33c33
< def __init__(self, stream, **constructKwds):
---
> def __init__(self, stream):
38c38
< Constructor.__init__(self, **constructKwds)
---
> Constructor.__init__(self)
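A hedged usage sketch of the patched API described by that diff (it only works with the modified PyYAML above; stock PyYAML's load() does not accept object_pairs_hook):
from collections import OrderedDict
import yaml  # the locally patched PyYAML from the diff above

with open('myfile.yml') as f:
    data = yaml.load(f, object_pairs_hook=OrderedDict)

print(type(data))  # OrderedDict for a top-level mapping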
Here's a simple solution that also checks for duplicated top-level keys in your map.
import yaml
import re
from collections import OrderedDict

def yaml_load_od(fname):
    "load a yaml file as an OrderedDict"
    # detects any duped keys (fail on this) and preserves order of top level keys
    with open(fname, 'r') as f:
        lines = f.read().splitlines()
    top_keys = []
    duped_keys = []
    for line in lines:
        m = re.search(r'^([A-Za-z0-9_]+) *:', line)
        if m:
            if m.group(1) in top_keys:
                duped_keys.append(m.group(1))
            else:
                top_keys.append(m.group(1))
    if duped_keys:
        raise Exception('ERROR: duplicate keys: {}'.format(duped_keys))
    # 2nd pass to set up the OrderedDict
    with open(fname, 'r') as f:
        d_tmp = yaml.load(f)
    return OrderedDict([(key, d_tmp[key]) for key in top_keys])
