Python: Accessing YAML values using "dot notation"

I'm using a YAML configuration file. So this is the code to load my config in Python:
import os
import yaml

with open('./config.yml') as file:
    config = yaml.safe_load(file)
This code actually creates a dictionary. Now the problem is that in order to access the values I need to use tons of brackets.
YAML:
mysql:
    user:
        pass: secret
Python:
import os
import yaml

with open('./config.yml') as file:
    config = yaml.safe_load(file)

print(config['mysql']['user']['pass'])  # <--
I'd prefer something like this (dot notation):
config('mysql.user.pass')
So, my idea is to utilize the PyStache render() interface.
import os
import yaml

with open('./config.yml') as file:
    config = yaml.safe_load(file)

import pystache

def get_config_value(yml_path, config):
    return pystache.render('{{' + yml_path + '}}', config)

get_config_value('mysql.user.pass', config)
Would that be a "good" solution? If not, what would be a better alternative?
Additional question [Solved]
I've decided to use Ilja Everilä's solution. But now I've got an additional question: How would you create a wrapper Config class around DotConf?
The following code doesn't work but I hope you get the idea what I'm trying to do:
class Config(DotDict):
    def __init__(self):
        with open('./config.yml') as file:
            DotDict.__init__(yaml.safe_load(file))

config = Config()
print(config.django.admin.user)
Error:
AttributeError: 'super' object has no attribute '__getattr__'
Solution
You just need to pass self to the constructor of the super class.
DotDict.__init__(self, yaml.safe_load(file))
Even better solution (Ilja Everilä)
super().__init__(yaml.safe_load(file))
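Putting the pieces together, a minimal sketch of the wrapper (assuming the DotDict class from the answer below and the YAML example above):

import yaml

class Config(DotDict):
    def __init__(self, path='./config.yml'):
        with open(path) as file:
            super().__init__(yaml.safe_load(file))

config = Config()
print(config.mysql.user['pass'])  # item access for 'pass', since it is a Python keyword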

The Simple
You could use reduce to extract the value from the config:
In [41]: config = {'asdf': {'asdf': {'qwer': 1}}}

In [42]: from functools import reduce
    ...:
    ...: def get_config_value(key, cfg):
    ...:     return reduce(lambda c, k: c[k], key.split('.'), cfg)
    ...:

In [43]: get_config_value('asdf.asdf.qwer', config)
Out[43]: 1
If your YAML uses a very limited subset of the language, this solution is easy to maintain and introduces very few new edge cases.
The Correct
Use a proper YAML parser and tools, such as in this answer.
The Convoluted
On a lighter note (not to be taken too seriously), you could create a wrapper that allows using attribute access:
In [47]: class DotConfig:
    ...:
    ...:     def __init__(self, cfg):
    ...:         self._cfg = cfg
    ...:     def __getattr__(self, k):
    ...:         v = self._cfg[k]
    ...:         if isinstance(v, dict):
    ...:             return DotConfig(v)
    ...:         return v
    ...:

In [48]: DotConfig(config).asdf.asdf.qwer
Out[48]: 1
Do note that this fails for keywords, such as "as", "pass", "if" and the like.
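For such keyword keys you can still fall back to getattr() (or plain item access on the wrapped dict); a small illustration using the DotConfig above:

cfg = {'mysql': {'user': {'pass': 'secret'}}}
# DotConfig(cfg).mysql.user.pass would be a SyntaxError, but getattr() works:
getattr(DotConfig(cfg).mysql.user, 'pass')  # -> 'secret'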
Finally, you could get really crazy (read: probably not a good idea) and customize dict to handle dotted string and tuple keys as a special case, with attribute access to items thrown in the mix (with its limitations):
In [58]: class DotDict(dict):
    ...:
    ...:     # update, __setitem__ etc. omitted, but required if
    ...:     # one tries to set items using dot notation. Essentially
    ...:     # this is a read-only view.
    ...:
    ...:     def __getattr__(self, k):
    ...:         try:
    ...:             v = self[k]
    ...:         except KeyError:
    ...:             return super().__getattr__(k)
    ...:         if isinstance(v, dict):
    ...:             return DotDict(v)
    ...:         return v
    ...:
    ...:     def __getitem__(self, k):
    ...:         if isinstance(k, str) and '.' in k:
    ...:             k = k.split('.')
    ...:         if isinstance(k, (list, tuple)):
    ...:             return reduce(lambda d, kk: d[kk], k, self)
    ...:         return super().__getitem__(k)
    ...:
    ...:     def get(self, k, default=None):
    ...:         if isinstance(k, str) and '.' in k:
    ...:             try:
    ...:                 return self[k]
    ...:             except KeyError:
    ...:                 return default
    ...:         return super().get(k, default)  # dict.get() does not accept keyword arguments
    ...:

In [59]: dotconf = DotDict(config)

In [60]: dotconf['asdf.asdf.qwer']
Out[60]: 1

In [61]: dotconf['asdf', 'asdf', 'qwer']
Out[61]: 1

In [62]: dotconf.asdf.asdf.qwer
Out[62]: 1

In [63]: dotconf.get('asdf.asdf.qwer')
Out[63]: 1

In [64]: dotconf.get('asdf.asdf.asdf')

In [65]: dotconf.get('asdf.asdf.asdf', 'Nope')
Out[65]: 'Nope'

On the one hand your example takes the right approach by using get_config_value('mysql.user.pass', config) instead of solving the dotted access with attributes. I am not sure whether you realised that you were, perhaps deliberately, not trying the more intuitive:
print(config.mysql.user.pass)
which you cannot get to work, even by overloading __getattr__, because pass is a Python keyword.
However, your example covers only a very restricted subset of YAML files, as it doesn't involve any sequence collections or any complex keys.
If you want to cover more than that tiny subset, you can e.g. extend the powerful round-trip-capable objects of ruamel.yaml:¹
import ruamel.yaml


def mapping_string_access(self, s, delimiter=None, key_delim=None):
    def p(v):
        try:
            v = int(v)
        except:
            pass
        return v
        # possibly extend for primitives like float, datetime, booleans, etc.

    if delimiter is None:
        delimiter = '.'
    if key_delim is None:
        key_delim = ','
    try:
        key, rest = s.split(delimiter, 1)
    except ValueError:
        key, rest = s, None
    if key_delim in key:
        key = tuple((p(key) for key in key.split(key_delim)))
    else:
        key = p(key)
    if rest is None:
        return self[key]
    return self[key].string_access(rest, delimiter, key_delim)

ruamel.yaml.comments.CommentedMap.string_access = mapping_string_access


def sequence_string_access(self, s, delimiter=None, key_delim=None):
    if delimiter is None:
        delimiter = '.'
    try:
        key, rest = s.split(delimiter, 1)
    except ValueError:
        key, rest = s, None
    key = int(key)
    if rest is None:
        return self[key]
    return self[key].string_access(rest, delimiter, key_delim)

ruamel.yaml.comments.CommentedSeq.string_access = sequence_string_access
Once that is set up you can run the following:
yaml_str = """\
mysql:
    user:
        pass: secret
    list: [a: 1, b: 2, c: 3]
    [2016, 9, 14]: some date
    42: some answer
"""

yaml = ruamel.yaml.YAML()
config = yaml.load(yaml_str)

def get_config_value(path, data, **kw):
    return data.string_access(path, **kw)

print(get_config_value('mysql.user.pass', config))
print(get_config_value('mysql:user:pass', config, delimiter=":"))
print(get_config_value('mysql.list.1.b', config))
print(get_config_value('mysql.2016,9,14', config))
print(config.string_access('mysql.42'))
giving:
secret
secret
2
some date
some answer
showing that with a bit more forethought and very little extra work you can have flexible dotted access to a vast range of YAML files, not just those consisting of recursive mappings with string scalars as keys.
As shown, you can directly call config.string_access('mysql.42') instead of defining and using get_config_value().
This works with strings and integers as mapping keys, but can easily be extended to support other key types (boolean, date, date-time).
¹ This was done using ruamel.yaml, a YAML 1.2 parser, of which I am the author.

I ended up using python-box.
This package provides multiple ways to read config files (yaml, csv, json, ...).
Not only that, it also allows you to pass dicts or strings directly:
from box import Box
import yaml  # Only required for different loaders

# Pass a dict directly
movie_box = Box({"Robin Hood: Men in Tights": {"imdb stars": 6.7, "length": 104}})

# Load from a yaml file.
# Here it is also possible to use PyYAML arguments,
# for example to specify different loaders, e.g. SafeLoader or FullLoader.
conf = Box.from_yaml(filename="./config.yaml", Loader=yaml.FullLoader)

conf.mysql.user['pass']  # 'pass' is a keyword, so item access (or getattr) is needed for this key
A lot more examples are available in the Wiki.

It's quite an old question, but I came here hunting for an answer while looking for a simpler solution. I finally came up with my own solution using the easydict library (installed with pip install easydict):
def yaml_load(fileName):
    import yaml
    from easydict import EasyDict as edict

    fc = None
    with open(fileName, 'r') as f:
        fc = edict(yaml.load(f))
        # or use safe_load:
        # fc = edict(yaml.safe_load(f))
    return fc
Now, simply call yaml_load with the valid yaml filename:
config = yaml_load('./config.yml')

# assuming config["mysql"]["user"]["pass"] is a valid key in config.yml
# ('pass' is a Python keyword, so item access is used for that last key)
print("{}".format(config.mysql.user['pass']))

I had the same problem a while ago and built this getter:
def get(self, key):
    """Tries to find the configuration value for a given key.

    :param str key: Key in dot-notation (e.g. 'foo.lol').
    :return: The configuration value. None if no value was found.
    """
    try:
        return self.__lookup(self.config, key)
    except KeyError:
        return None

def __lookup(self, dct, key):
    """Checks dct recursively to find the value for key.

    Is used by get() internally.

    :param dict dct: The configuration dict.
    :param str key: The key we are looking for.
    :return: The configuration value.
    :raise KeyError: If the given key is not in the configuration dict.
    """
    if '.' in key:
        key, node = key.split('.', 1)
        return self.__lookup(dct[key], node)
    else:
        return dct[key]
The getter looks up a config value from self.config recursively (using __lookup).
If you have trouble adjusting this for your case, feel free to ask for further help.
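For reference, a minimal sketch of how this getter could be hooked up to the asker's YAML (the Settings class name and loading code are assumptions, not part of the original):

import yaml

class Settings:
    def __init__(self, path='./config.yml'):
        with open(path) as f:
            self.config = yaml.safe_load(f)

    def get(self, key):
        try:
            return self.__lookup(self.config, key)
        except KeyError:
            return None

    def __lookup(self, dct, key):
        if '.' in key:
            key, node = key.split('.', 1)
            return self.__lookup(dct[key], node)
        return dct[key]

settings = Settings()
print(settings.get('mysql.user.pass'))     # -> 'secret'
print(settings.get('mysql.user.missing'))  # -> None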

I generally follow the practice of converting the config (any kind, not just YAML) into an in-memory object.
This way the text-based config is unwrapped by one function and the text is then thrown away, giving a clean object to work with instead of every function having to deal with the internals of the config. All other functions then only know about that one internal object interface. If a parameter is added, renamed, or deleted in the config file, the only function to change is the loader that reads the config into the in-memory object.
Below is an example I wrote for loading the FloydHub config YAML file into an in-memory object. I feel it is a very good design pattern.
First define a config representative class like below:
class FloydYamlConfig(object):
    class Input:
        def __init__(self, destination, source):
            self.destination = destination
            self.source = source

    def __init__(self, floyd_yaml_dict):
        self.machine = floyd_yaml_dict['machine']
        self.env = floyd_yaml_dict['env']
        self.description = floyd_yaml_dict['description']
        self.max_runtime = floyd_yaml_dict['max_runtime']
        self.command = floyd_yaml_dict['command']
        self.input = []
        for input_conf in floyd_yaml_dict['input']:
            input_obj = self.Input(destination=input_conf['destination'], source=input_conf['source'])
            self.input.append(input_obj)

    def __str__(self):
        input_str = ''
        for input_obj in self.input:
            input_str += '\ndestination: {}\n source: {}'.format(input_obj.destination, input_obj.source)
        print_str = ('machine: {}\n'
                     'env: {}\n'
                     'input: {}\n'
                     'description: {}\n'
                     'max_runtime: {}\n'
                     'command: {}\n').format(
            self.machine, self.env, input_str, self.description, self.max_runtime, self.command)
        return print_str
Then load the yaml into the object for further use:
floyd_conf = read_floyd_yaml_config(args.floyd_yaml_path)

def read_floyd_yaml_config(floyd_yaml_path) -> FloydYamlConfig:
    with open(floyd_yaml_path) as f:
        yaml_conf_dict = yaml.safe_load(f)
    floyd_conf = FloydYamlConfig(yaml_conf_dict)
    # print(floyd_conf)
    return floyd_conf
Sample yaml:
# see: https://docs.floydhub.com/floyd_config
machine: gpu2
env: tensorflow-1.0
input:
  - destination: data
    source: abc/datasets/my-data/6
  - destination: config
    source: abc/datasets/my-config/1
description: this is a test
max_runtime: 3600
command: >-
    echo 'hello world'

Related

Python dataclass attributes monkeypatching

I have a problem with my class Config, which works as a proxy between the user and an ini file: it can load parameters from ini files and set them on their name equivalents in a dataclass. I've realised that if I access an attribute with a dot, like Config()._BASE_DIR, it returns a str value, because ConfigParser reads values as str. My idea is to create a method that patches all my attributes with property and property.setter so that dataclass attributes stay accessible with a dot, but wraps them with their annotated types, so that, for example, Config()._minAR returns 4.0 as a float rather than as a string.
Is my idea acceptable, or should I do it differently?
Config code parts:
import configparser
import pathlib
from dataclasses import dataclass
from itertools import zip_longest


@dataclass
class Config:
    _IGNORE_FIELDS = {'_IGNORE_FIELDS', '_CONF_PARSER'}

    _CONF_PARSER: configparser.ConfigParser = configparser.ConfigParser()
    _BASE_TABLE_FILE_SUFFIX: str = '.csv'
    _BASE_DIR: pathlib.Path = pathlib.Path().absolute()
    _CONF_PATH: pathlib.Path = _BASE_DIR / 'conf'
    _CONF_FILE_PATH: pathlib.Path = _CONF_PATH / 'settings.ini'
    _DATA_TABLE_PATH: pathlib.Path = _CONF_PATH / ('_data_table' + _BASE_TABLE_FILE_SUFFIX)
    _minAR: float = 4.0
    _maxAR: float = 5.0
    CATCH_TIME: int = 6

    def __init__(self) -> None:
        self.prepare()

    def check_synchronized(self) -> tuple[bool, str]:
        if not self.CONF_PARSER.has_section('settings'):
            return False, 'ini'
        parser_config = self.CONF_PARSER['settings'].items()
        python_config = {
            k: v
            for k, v in self.__dataclass_fields__.items()
            if k not in self._IGNORE_FIELDS
        }.items()
        for pair_1, pair_2 in zip_longest(python_config, parser_config, fillvalue=(None, None)):
            key_1, val_1 = pair_1
            if key_1 is None:
                return False, 'script'
            key_2, val_2 = pair_2
            if key_2 is None:
                return False, 'ini'
            if key_2 in self._IGNORE_FIELDS:
                continue
            if key_1.lower() != key_2.lower() or (default := str(val_1.default)) != val_2:
                mode = 'ini' if default != str(getattr(self, key_1)) else 'script'
                return False, mode
        return True, 'both'

    def updateFromIni(self):
        for key, value in self.CONF_PARSER['settings'].items():
            upper_key = key.upper()
            if str(getattr(self, upper_key)) == value:
                continue
            setattr(self, upper_key, value)

    def prepare(self):
        self._createConfDir()
        is_sync, mode = self.check_synchronized()
        if is_sync:
            return
        if mode == 'ini' or mode == 'both':
            self._writeAll()
        elif mode == 'script':
            self.updateFromIni()

    def _writeAll(self):
        if not self.CONF_PARSER.has_section('settings'):
            self.CONF_PARSER.add_section('settings')
        for key, field in self.__dataclass_fields__.items():
            if key in self._IGNORE_FIELDS:
                continue
            self.CONF_PARSER.set('settings', key, str(field.default))
        self._writeInFile()

    def _writeInFile(self):
        with open(self.CONF_FILE_PATH, 'w') as file:
            self.CONF_PARSER.write(file)

    def _createConfDir(self) -> None:
        if not self.CONF_PATH.exists():
            self.CONF_PATH.mkdir(parents=True, exist_ok=True)

    def setValue(self, field, value):
        if not hasattr(self, field) or field in self._IGNORE_FIELDS:
            return
        setattr(self, field, value)
        if not isinstance(value, str):
            value = str(value)
        self.CONF_PARSER.set('settings', field, value)
        self._writeInFile()
More context: I use a dataclass together with ConfigParser so that my Config class can do the following:
- Sync attributes with the ini file (if there is no ini file, create it from the Config structure with default values; if Config is not synchronized with the ini file, load from the ini; and write to the ini if the ini file has a wrong structure or some values are incorrect), to avoid the situation where the user accidentally deletes the ini file;
- Set and get all existing config values from any part of my program (it is a PyQt6 application);
- Save its state from one session (application run) to another.
So I had no idea what other structure of config class I should have used except this one. If you have a better idea for a synchronizable config, tell me.
I've discovered that the only change I need to give my Config class custom dot access to attributes is to write a custom __getattribute__ magic method in my class.
Result:
import configparser
import pathlib
from dataclasses import dataclass
from itertools import zip_longest
from typing import Any

ACCESS_FIELDS = {
    'BASE_TABLE_FILE_SUFFIX', 'BASE_DIR', 'CONF_PATH', 'CONF_FILE_PATH',
    'DATA_TABLE_PATH', 'minAR', 'CATCH_TIME'
}


class Config:
    # some code ...

    def __getattribute__(self, __name: str) -> Any:
        if __name == 'ACCESS_FIELDS':
            return ACCESS_FIELDS
        attr = super().__getattribute__(__name)
        if __name in ACCESS_FIELDS:
            _type = self.__annotations__[__name]
            return _type(attr)
        return attr

    # other code ...
I created the variable with the accessible fields outside the class body because otherwise, getting ACCESS_FIELDS via Config.ACCESS_FIELDS or self.ACCESS_FIELDS would call the __getattribute__ method again and cause a recursion error.
Basically, I got everything I need with this solution, but I still have a problem with the setValue method. I've discovered that an overridden __setattr__ does not play well with the overridden __getattribute__ in my class (it causes a recursion error too). I'll probably restructure my Config class, but not now.
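One way around that recursion (a hedged sketch, not the asker's full class): route both hooks through the base object implementation so neither re-enters the other.

from typing import Any

class Config:
    _minAR: float = 4.0
    CATCH_TIME: int = 6

    def __getattribute__(self, name: str) -> Any:
        attr = object.__getattribute__(self, name)
        annotations = object.__getattribute__(self, '__annotations__')
        if name in annotations:
            return annotations[name](attr)  # coerce to the annotated type
        return attr

    def __setattr__(self, name: str, value: Any) -> None:
        # object.__setattr__ bypasses the overridden hooks, so no recursion
        object.__setattr__(self, name, value)

cfg = Config()
cfg._minAR = '5.5'                    # e.g. a raw string read from the ini file
print(cfg._minAR, type(cfg._minAR))   # 5.5 <class 'float'>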

pickle, dill and cloudpickle returning field as empty dict on custom class after process termination

I have an object of a custom class that I am trying to serialize and permanently store.
When I serialize it, store it, load it and use it in the same run, it works fine. It only messes up when I've ended the process and then try to load it again from the pickle file. This is the code that works fine:
first_model = NgramModel(3, name="debug")
for paragraph in text:
    first_model.train(paragraph_to_sentences(text))
    # paragraph to sentences just uses regex to do the equivalent of splitting by punctuation

print(first_model.context_options)
# context_options is a dict (counter)

first_model = NgramModel.load_existing_model("debug")
# load_existing_model loads the pickle file. Look in the class code
print(first_model.context_options)
However, when I run this alone, it prints an empty counter:
first_model = NgramModel.load_existing_model("debug")
print(first_model.context_options)
This is a shortened version of the class file (the only two methods that touch the pickle/dill are update_pickle_state and load_existing_model):
import os
import dill
from itertools import count
from collections import Counter
from os import path


class NgramModel:
    context_options: dict[tuple, set[str]] = {}
    ngram_count: Counter[tuple] = Counter()
    n = 0
    pickle_path: str = None
    num_paragraphs = 0
    num_sentences = 0

    def __init__(self, n: int, **kwargs):
        self.n = n
        self.pickle_path = NgramModel.pathify(kwargs.get('name', NgramModel.gen_pickle_name()))  # use name if exists else generate random name

    def train(self, paragraph_as_list: list[str]):
        '''really the central method that coordinates everything else. Takes a list of sentences, generates data (n-grams) from each, updates the fields, and saves the instance (self) to a pickle file'''
        self.num_paragraphs += 1
        for sentence in paragraph_as_list:
            self.num_sentences += 1
            generated = self.generate_Ngrams(sentence)
            self.ngram_count.update(generated)
            for ngram in generated:
                self.add_to_set(ngram)
        self.update_pickle_state()

    def update_pickle_state(self):
        '''saves instance to pickle file'''
        file = open(self.pickle_path, "wb")
        dill.dump(self, file)
        file.close()

    @staticmethod
    def load_existing_model(name: str):
        '''returns object from pickle file'''
        path = NgramModel.pathify(name)
        file = open(path, "rb")
        obj: NgramModel = dill.load(file)
        return obj

    def generate_Ngrams(self, string: str):
        '''ref: https://www.analyticsvidhya.com/blog/2021/09/what-are-n-grams-and-how-to-implement-them-in-python/'''
        words = string.split(" ")
        words = ["<start>"] * (self.n - 1) + words + ["<end>"] * (self.n - 1)
        list_of_tup = []
        for i in range(len(words) + 1 - self.n):
            list_of_tup.append((tuple(words[i + j] for j in range(self.n - 1)), words[i + self.n - 1]))
        return list_of_tup

    def add_to_set(self, ngram: tuple[tuple[str, ...], str]):
        if ngram[0] not in self.context_options:
            self.context_options[ngram[0]] = set()
        self.context_options[ngram[0]].add(ngram[1])

    @staticmethod
    def pathify(name):
        '''converts name to path'''
        return f"models/{name}.pickle"

    @staticmethod
    def gen_pickle_name():
        for i in count():
            new_name = f"unnamed-pickle-{i}"
            if not path.exists(NgramModel.pathify(new_name)):
                return new_name
All the other fields print properly and are complete and correct, except the two dicts.
The problem is that context_options is a mutable class member, not an instance member. If I had to guess, dill is only pickling instance members, since the class definition holds the class members. That would account for why you see a "filled-out" context_options when you're working in the same shell but not when you load fresh: you're using the dirtied class member in the former case.
It's for stuff like this that you generally don't want to use mutable class members (or, similarly, mutable default values in function signatures). More typical is to use something like context_options: dict[tuple, set[str]] = None and then check whether it's None in __init__ to set it to a default value, e.g. an empty dict. Alternatively, you could use a @dataclass and provide a field initializer, i.e.
@dataclasses.dataclass
class NgramModel:
    context_options: dict[tuple, set[str]] = dataclasses.field(default_factory=dict)
    ...
You can observe what I mean about it being a mutable class member with, for instance...
if __name__ == '__main__':
    ng = NgramModel(3, name="debug")
    print(ng.context_options)   # {}
    ng.context_options[("foo", "bar")] = {"baz", "qux"}
    print(ng.context_options)   # {('foo', 'bar'): {'baz', 'qux'}}
    ng2 = NgramModel(3, name="debug")
    print(ng2.context_options)  # {('foo', 'bar'): {'baz', 'qux'}}
I would expect a brand new ng2 to have the same context that a brand new ng had: empty (or whatever the appropriate default is).
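For completeness, a minimal sketch of the None-default approach mentioned above (the field names are taken from the question; this is an illustration, not the asker's full class):

from collections import Counter

class NgramModel:
    # class-level annotations only; the real containers are created per instance
    context_options: dict[tuple, set[str]] = None
    ngram_count: Counter = None

    def __init__(self, n: int, **kwargs):
        self.n = n
        if self.context_options is None:
            self.context_options = {}      # each instance gets its own dict
        if self.ngram_count is None:
            self.ngram_count = Counter()   # and its own counter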

the best way to parse and validate YAML configuration file

We have a project which stores its settings in YAML (the settings file is generated by Ansible scripts). Now we are using PyYAML to parse the YAML format and marshmallow to validate the settings. I'm pretty happy with storing settings in YAML, but I don't think marshmallow is the tool I need (the schemas are hard to read, I do not need serialization for settings, and I want something like XSD). So what are the best practices for validating settings in a project? Maybe there is a language-independent way? (We are using Python 2.7.)
YAML settings:
successive:
    worker:
        cds_process_number: 0  # positive integer or zero
        spider_interval: 10  # positive integer
        run_worker_sh: /home/lmakeev/CDS/releases/master/scripts/run_worker.sh  # OS path
        allow:
            - "*"  # regular expression
        deny:
            - "^[A-Z]{3}_.+$"  # regular expression
A schema description is a language of its own, with its own syntax and idiosyncrasies that you have to learn. And you have to maintain the "programs" written in it, against which your YAML is verified, whenever your requirements change.
If you are already working with YAML and are familiar with Python, you can use YAML's tag facility to check objects at parse time.
Assuming you have a file input.yaml:
successive:
    worker:
        cds_process_number: !nonneg 0
        spider_interval: !pos 10
        run_worker_sh: !path /home/lmakeev/CDS/releases/master/scripts/run_worker.sh
        allow:
            - !regex "*"
        deny:
            - !regex "^[A-Z]{3}_.+$"
(your example file with the comments removed and tags inserted), you can create and register four classes that check the values using the following program¹:
import sys
import os
import re
import ruamel.yaml
import pathlib


class NonNeg:
    yaml_tag = u"!nonneg"

    @classmethod
    def from_yaml(cls, constructor, node):
        val = int(node.value)  # this creates/returns an int
        assert val >= 0
        return val


class Pos(int):
    yaml_tag = u"!pos"

    @classmethod
    def from_yaml(cls, constructor, node):
        val = cls(node.value)  # this creates/returns a Pos()
        assert val > 0
        return val


class Path:
    yaml_tag = u"!path"

    @classmethod
    def from_yaml(cls, constructor, node):
        val = pathlib.Path(node.value)
        assert os.path.exists(val)
        return val


class Regex:
    yaml_tag = u"!regex"

    def __init__(self, val, comp):
        # store original string and compile() of that string
        self._val = val
        self._compiled = comp

    @classmethod
    def from_yaml(cls, constructor, node):
        val = str(node.value)
        try:
            comp = re.compile(val)
        except Exception as e:
            comp = None
            print("Incorrect regex", node.start_mark)
            print("   ", node.tag, node.value)
        return cls(val, comp)


yaml = ruamel.yaml.YAML(typ="safe")
yaml.register_class(NonNeg)
yaml.register_class(Pos)
yaml.register_class(Path)
yaml.register_class(Regex)

data = yaml.load(pathlib.Path('input.yaml'))
The actual checks in the individual from_yaml classmethods should be adapted to your needs (I had to remove the assert for the Path, as I don't have that file).
If you run the above you'll note that it prints:
Incorrect regex in "input.yaml", line 7, column 9
!regex *
because "*" is not a valid regular expression. Did you mean: ".*"?
¹ This was done using ruamel.yaml, a YAML 1.2 parser, of which I am the author. You can achieve the same results with PyYAML, e.g. by subclassing YAMLObject (which uses a non-safe loader by default, so make sure you correct that in your code).
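For illustration, a rough sketch of that PyYAML route for one of the tags, assuming the yaml.YAMLObject approach; the key point is overriding yaml_loader so the tag is registered on the safe loader (the other tags would need analogous classes before the file can be loaded):

import yaml

class NonNeg(yaml.YAMLObject):
    yaml_tag = u'!nonneg'
    yaml_loader = yaml.SafeLoader  # register the constructor on the safe loader

    @classmethod
    def from_yaml(cls, loader, node):
        val = int(loader.construct_scalar(node))
        assert val >= 0
        return val

with open('input.yaml') as f:
    data = yaml.safe_load(f)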

Static classes being initialised on import. How does Python 2 initialise static classes on import?

I am trying to introduce python 3 support for the package mime and the code is doing something I have never seen before.
There is a class Types() that is used in the package as a static class.
class Types(with_metaclass(ItemMeta, object)):  # I changed this for 2-3 compatibility
    type_variants = defaultdict(list)
    extension_index = defaultdict(list)
    # __metaclass__ = ItemMeta  # unnecessary now

    def __init__(self, data_version=None):
        self.data_version = data_version
The type_variants defaultdict is what is getting filled in Python 2 but not in 3.
It very much seems to be getting filled by this class, which is in a different file called mime_types.py.
class MIMETypes(object):
    _types = Types(VERSION)

    def __repr__(self):
        return '<MIMETypes version:%s>' % VERSION

    @classmethod
    def load_from_file(cls, type_file):
        data = open(type_file).read()
        data = data.split('\n')
        mime_types = Types()
        for index, line in enumerate(data):
            item = line.strip()
            if not item:
                continue
            try:
                ret = TEXT_FORMAT_RE.match(item).groups()
            except Exception as e:
                __parsing_error(type_file, index, line, e)

            (unregistered, obsolete, platform, mediatype, subtype, extensions,
             encoding, urls, docs, comment) = ret
            if mediatype is None:
                if comment is None:
                    __parsing_error(type_file, index, line, RuntimeError)
                continue

            extensions = extensions and extensions.split(',') or []
            urls = urls and urls.split(',') or []
            mime_type = Type('%s/%s' % (mediatype, subtype))

            mime_type.extensions = extensions
            ...
            mime_type.url = urls

            mime_types.add(mime_type)  # instance of Type() is being filled?
        return mime_types
The function startup() is being run whenever mime_types.py is imported and it does this.
def startup():
    global STARTUP
    if STARTUP:
        type_files = glob(join(DIR, 'types', '*'))
        type_files.sort()
        for type_file in type_files:
            MIMETypes.load_from_file(type_file)  # class method is filling Types?
        STARTUP = False
This all seems pretty weird to me. The MIMETypes class first creates an instance of Types() on its first line: _types = Types(VERSION). It then seems to do nothing with this instance and only uses the mime_types instance created in the load_from_file() class method: mime_types = Types().
This sort of thing vaguely reminds me of JavaScript class construction. How is the instance mime_types filling Types.type_variants, so that when it is imported like this:
from mime import Type, Types
the class's type_variants defaultdict can be used? And why isn't this working in Python 3?
EDIT:
Adding extra code to show how type_variants is filled
(In "Types" Class)
@classmethod
def add_type_variant(cls, mime_type):
    cls.type_veriants[mime_type.simplified].append(mime_type)

@classmethod
def add(cls, *types):
    for mime_type in types:
        if isinstance(mime_type, Types):
            cls.add(*mime_type.defined_types())
        else:
            mts = cls.type_veriants.get(mime_type.simplified)
            if mts and mime_type in mts:
                Warning('Type %s already registered as a variant of %s.',
                        mime_type, mime_type.simplified)
            cls.add_type_variant(mime_type)
            cls.index_extensions(mime_type)
You can see that MIMETypes uses the add() classmethod.
Without posting more of your code, it's hard to say. I will say that I was able to get that package ported to Python 3 with only a few changes (print statement -> function, basestring -> str, adding a dot before same-package imports, and a really ugly hack to compensate for their love of cmp):
def cmp(x, y):
    if isinstance(x, Type): return x.__cmp__(y)
    if isinstance(y, Type): return y.__cmp__(x) * -1
    return 0 if x == y else (1 if x > y else -1)
Note, I'm not even sure this is correct.
Then
import mime
print(mime.Types.type_veriants) # sic
printed out a 1590-entry defaultdict.
Regarding your question about MIMETypes._types not being used, I agree, it's not.
Regarding your question about how the dictionary is being populated, it's quite simple, and you've identified most of it.
import mime
Imports the package's __init__.py which contains the line:
from .mime_types import MIMETypes, VERSION
And mime_types.py includes the lines:
def startup():
    global STARTUP
    if STARTUP:
        type_files = glob(join(DIR, 'types', '*'))
        type_files.sort()
        for type_file in type_files:
            MIMETypes.load_from_file(type_file)
        STARTUP = False

startup()
And MIMETypes.load_from_file() has the lines:
mime_types = Types()
# ...
for ... in ...:
    mime_types.add(mime_type)
And Types.add() has the line:
cls.add_type_variant(mime_type)
And that classmethod contains:
cls.type_veriants[mime_type.simplified].append(mime_type)
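The net effect is that a class-level defaultdict is mutated through classmethods, so the data is visible on the class itself after import. A tiny, self-contained illustration of the pattern (the names here are made up, not the real package's):

from collections import defaultdict

class Types:
    type_variants = defaultdict(list)  # class-level, shared by all instances

    @classmethod
    def add_type_variant(cls, key, value):
        cls.type_variants[key].append(value)

Types().add_type_variant('text/plain', 'txt')  # calling through an instance still mutates the class dict
print(Types.type_variants)  # defaultdict(<class 'list'>, {'text/plain': ['txt']})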

Elegant pattern for mutually exclusive keyword args?

Sometimes in my code I have a function which can take an argument in one of two ways. Something like:
def func(objname=None, objtype=None):
    if objname is not None and objtype is not None:
        raise ValueError("only 1 of the ways at a time")
    if objname is not None:
        obj = getObjByName(objname)
    elif objtype is not None:
        obj = getObjByType(objtype)
    else:
        raise ValueError("not given any of the ways")

    doStuffWithObj(obj)
Is there any more elegant way to do this? What if the arg could come in one of three ways? If the types are distinct I could do:
def func(objnameOrType):
    if type(objnameOrType) is str:
        getObjByName(objnameOrType)
    elif type(objnameOrType) is type:
        getObjByType(objnameOrType)
    else:
        raise ValueError("unk arg type: %s" % type(objnameOrType))
But what if they are not? This alternative seems silly:
def func(objnameOrType, isName=True):
    if isName:
        getObjByName(objnameOrType)
    else:
        getObjByType(objnameOrType)
because then you have to call it like func(mytype, isName=False), which is weird.
How about using something like a command dispatch pattern:
def funct(objnameOrType):
    dispatcher = {str: getObjByName,
                  type1: getObjByType1,
                  type2: getObjByType2}
    t = type(objnameOrType)
    obj = dispatcher[t](objnameOrType)
    doStuffWithObj(obj)
where type1, type2, etc. are actual Python types (e.g. int, float).
Sounds like it should go to https://codereview.stackexchange.com/
Anyway, keeping the same interface, I may try
arg_parsers = {
    'objname': getObjByName,
    'objtype': getObjByType,
    ...
}

def func(**kwargs):
    assert len(kwargs) == 1  # replace this with your favorite exception
    (argtypename, argval) = next(iter(kwargs.items()))
    obj = arg_parsers[argtypename](argval)
    doStuffWithObj(obj)
or simply create 2 functions?
def funcByName(name): ...
def funcByType(type_): ...
One way to make it slightly shorter is
def func(objname=None, objtype=None):
    if [objname, objtype].count(None) != 1:
        raise TypeError("Exactly 1 of the ways must be used.")
    if objname is not None:
        obj = getObjByName(objname)
    else:
        obj = getObjByType(objtype)
I have not yet decided if I would call this "elegant".
Note that you should raise a TypeError if the wrong number of arguments was given, not a ValueError.
For whatever it's worth, similar kinds of things happen in the Standard Libraries; see, for example, the beginning of GzipFile in gzip.py (shown here with docstrings removed):
class GzipFile:

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024  # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'
Of course this accepts both filename and fileobj keywords and defines a particular behavior in the case that it receives both; but the general approach seems pretty much identical.
I use a decorator:
from functools import wraps

def one_of(kwarg_names):
    # assert that one and only one of the given kwarg names are passed to the decorated function
    def inner(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            count = 0
            for kw in kwargs:
                if kw in kwarg_names and kwargs[kw] is not None:
                    count += 1
            assert count == 1, f'exactly one of {kwarg_names} required, got {kwargs}'
            return f(*args, **kwargs)
        return wrapped
    return inner
Used as:
@one_of(['kwarg1', 'kwarg2'])
def my_func(kwarg1='default', kwarg2='default'):
    pass
Note that this only accounts for non-None values that are passed as keyword arguments. E.g. several of the kwarg_names may still be passed if all but one of them have a value of None.
To allow for passing none of the kwargs simply assert that the count is <= 1.
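A sketch of that "at most one" variant (the decorator name here is an assumption):

from functools import wraps

def at_most_one_of(kwarg_names):
    def inner(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            count = sum(1 for kw in kwargs
                        if kw in kwarg_names and kwargs[kw] is not None)
            assert count <= 1, f'at most one of {kwarg_names} allowed, got {kwargs}'
            return f(*args, **kwargs)
        return wrapped
    return inner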
It sounds like you're looking for function overloading, which isn't implemented in Python 2. In Python 2, your solution is nearly as good as you can expect to get.
You could probably bypass the extra argument problem by allowing your function to process multiple objects and return a generator:
import types

all_types = set([getattr(types, t) for t in dir(types) if t.endswith('Type')])

def func(*args):
    for arg in args:
        if arg in all_types:
            yield getObjByType(arg)
        else:
            yield getObjByName(arg)
Test:
>>> getObjByName = lambda a: {'Name': a}
>>> getObjByType = lambda a: {'Type': a}
>>> list(func('IntType'))
[{'Name': 'IntType'}]
>>> list(func(types.IntType))
[{'Type': <type 'int'>}]
The built-in sum() can be used on a list of boolean expressions. In Python, bool is a subclass of int, and in arithmetic operations True behaves as 1 and False behaves as 0.
This means that this rather short code will test mutual exclusivity for any number of arguments:
def do_something(a=None, b=None, c=None):
    if sum([a is not None, b is not None, c is not None]) != 1:
        raise TypeError("specify exactly one of 'a', 'b', or 'c'")
Variations are also possible:
def do_something(a=None, b=None, c=None):
    if sum([a is not None, b is not None, c is not None]) > 1:
        raise TypeError("specify at most one of 'a', 'b', or 'c'")
I occasionally run into this problem as well, and it is hard to find an easily generalisable solution. Say I have more complex combinations of arguments that are delineated by a set of mutually exclusive arguments and want to support additional arguments for each (some of which may be required and some optional), as in the following signatures:
def func(mutex1: str, arg1: bool): ...
def func(mutex2: str): ...
def func(mutex3: int, arg1: Optional[bool] = None): ...
I would use object orientation to wrap the arguments in a set of descriptors (with names depending on the business meaning of the arguments), which can then be validated by something like pydantic:
from typing import Optional
from pydantic import BaseModel, Extra

# Extra.forbid ensures a validation error if superfluous arguments are provided
class BaseDescription(BaseModel, extra=Extra.forbid):
    pass  # Arguments common to all descriptions go here

class Description1(BaseDescription):
    mutex1: str
    arg1: bool

class Description2(BaseDescription):
    mutex2: str

class Description3(BaseDescription):
    mutex3: int
    arg1: Optional[bool]
You could instantiate these descriptions with a factory:
class DescriptionFactory:
    _class_map = {
        'mutex1': Description1,
        'mutex2': Description2,
        'mutex3': Description3
    }

    @classmethod
    def from_kwargs(cls, **kwargs) -> BaseDescription:
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        set_fields = kwargs.keys() & cls._class_map.keys()
        try:
            [set_field] = set_fields
        except ValueError:
            raise ValueError(f"exactly one of {list(cls._class_map.keys())} must be provided")
        return cls._class_map[set_field](**kwargs)

    @classmethod
    def validate_kwargs(cls, func):
        def wrapped(**kwargs):
            return func(cls.from_kwargs(**kwargs))
        return wrapped
Then you can wrap your actual function implementation like this and use type checking to see which arguments were provided:
@DescriptionFactory.validate_kwargs
def func(desc: BaseDescription):
    if isinstance(desc, Description1):
        ...  # use desc.mutex1 and desc.arg1
    elif isinstance(desc, Description2):
        ...  # use desc.mutex2
    ...  # etc.
and call as func(mutex1='', arg1=True), func(mutex2=''), func(mutex3=123) and so on.
This is not overall shorter code, but it performs argument validation in a very descriptive way according to your specification, raises useful pydantic errors when validation fails, and results in accurate static types in each branch of the function implementation.
Note that if you're using Python 3.10+, structural pattern matching could simplify some parts of this.
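For instance, the isinstance chain above could be written as a match statement (a sketch, assuming Python 3.10+ and the Description classes defined earlier):

def func(desc: BaseDescription):
    match desc:
        case Description1(mutex1=m, arg1=a):
            ...  # use m and a
        case Description2(mutex2=m):
            ...  # use m
        case Description3(mutex3=m, arg1=a):
            ...  # use m and the optional a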
