Deserialize class with generated field value - python

I have a class like:
import uuid

import jsonpickle


class Pathology:
    """
    Represents a pathology, which is initialized with a name and description.
    """
    def __init__(self, name: str, description: str):
        self.id = str(uuid.uuid4())
        self.name = name
        self.description = description
        self.phases = []

    def to_json(self):
        return jsonpickle.encode(self, make_refs=False, unpicklable=False)
In this class, I do not ever want a user to pass in a value for id, I always wish to generate it upon construction.
When deserializing from JSON, I wish to do something like:
with open('data/test_case_1.json', 'r') as test_case_1_file:
    test_case_1 = test_case_1_file.read()

# parse file
obj = jsonpickle.decode(test_case_1)
assert pathology == Pathology(**obj)
However, I run into an error TypeError: __init__() got an unexpected keyword argument 'id'
I suspect this is because the init constructor does not have the field id available.
What is the pythonic way to support this behavior?

In this class, I do not ever want a user to pass in a value for id, I always wish to generate it upon construction.
Based on the above desired result, my recommendation is to define id as a (read-only) property. The benefit of defining it as a property is that it won't be treated as an instance attribute, and consequently it won't accept a value via the constructor; the main drawback is that it won't show up in the class's __repr__ (assuming we use the generated one we get from dataclasses) or in the output of the dataclasses.asdict helper function.
I've also made a few additional changes to the implementation (hopefully for the better):
Re-declare the class as a dataclass, which I personally prefer as it reduces boilerplate code such as the __init__ constructor or the need to define an __eq__ method (the latter checks whether two class objects are equal via ==). The dataclasses module also provides a helpful asdict function, which we can make use of in the serialization process.
Use built-in JSON (de)serialization via the json module. Part of the reason for this decision is I have personally never used the jsonpickle module, and I only have a rudimentary understanding of how pickling works in general. I feel that converting class objects to/from JSON is more natural, and likely also performs better in any case.
Add a from_json_file helper method, which we can use to load a new class object from a local file path.
import json
import uuid
from dataclasses import dataclass, asdict, field, fields
from functools import cached_property
from typing import List


@dataclass
class Pathology:
    """
    Represents a pathology, which is initialized with a name and description.
    """
    name: str
    description: str
    phases: List[str] = field(init=False, default_factory=list)

    @cached_property
    def id(self) -> str:
        return str(uuid.uuid4())

    def to_json(self):
        return json.dumps(asdict(self))

    @classmethod
    def from_json_file(cls, file_name: str):
        # A list of only the fields that can be passed in to the constructor.
        # Note: maybe it's worth caching this for repeated runs.
        init_fields = tuple(f.name for f in fields(cls) if f.init)

        if not file_name.endswith('.json'):
            file_name += '.json'

        with open(file_name, 'r') as in_file:
            test_case_1 = json.load(in_file)

        # parse file
        return cls(**{k: v for k, v in test_case_1.items() if k in init_fields})
And here's some quick code I put together, to confirm that everything is as expected:
def main():
    p1 = Pathology('my-name', 'my test description.')
    print('P1:', p1)

    p_id = p1.id
    print('P1 -> id:', p_id)
    assert p1.id == p_id, 'expected id value to be cached'

    print('Serialized JSON:', p1.to_json())

    # Save JSON to file
    with open('my_file.json', 'w') as out_file:
        out_file.write(p1.to_json())

    # De-serialize object from file
    p2 = Pathology.from_json_file('my_file')
    print('P2:', p2)

    # assert both objects are same
    assert p2 == p1

    # IDs should be unique, since it's automatically generated each time (we
    # don't pass in an ID to the constructor or store it in JSON file)
    assert p1.id != p2.id, 'expected IDs to be unique'


if __name__ == '__main__':
    main()

Related

'value is not a valid dict' when using pydantic on data loaded from a numpy archive

I've gotten into using pydantic to keep my types in order, but I've been finding that it doesn't play nicely with numpy types. The process of saving objects wraps everything in arrays of dtype=object, and I need to manually undo it everywhere.
I've figured out how to parse this away for strings or lists. For example, for strings, the following seems to work:
import numpy as np
from pydantic import BaseModel, validators


class str_type(str):
    @classmethod
    def __get_validators__(cls):
        yield cls.validate
        yield validators.str_validator

    @classmethod
    def validate(cls, value):
        if issubclass(type(value), np.ndarray):
            value = value.item()
        return value


class MyClass(BaseModel):
    my_attribute: str_type

    def save_npz(self, filename):
        """Saves a *.npz file with all attributes"""
        np.savez(filename, **self.dict())

    @classmethod
    def load_npz(cls, filename):
        """Loads a *.npz file and creates an instance of Args"""
        data = np.load(filename, allow_pickle=True)
        data_dict = dict(data)
        return cls(**data_dict)


b = MyClass(my_attribute='hi')
b.save_npz('temp_file.npz')
c = MyClass.load_npz('temp_file.npz')
However, this trick doesn't seem to work for dicts, and I'm at a loss as to why. This is my MWE:
import numpy as np
from pydantic import BaseModel, validators


class dict_type(dict):
    @classmethod
    def __get_validators__(cls):
        yield cls.validate
        yield validators.dict_validator

    @classmethod
    def validate(cls, value):
        if issubclass(type(value), np.ndarray):
            value = value.item()
        return value


class MyClass(BaseModel):
    my_attribute: dict_type[str, float]

    def save_npz(self, filename):
        """Saves a *.npz file with all attributes"""
        np.savez(filename, **self.dict())

    @classmethod
    def load_npz(cls, filename):
        """Loads a *.npz file and creates an instance of Args"""
        data = np.load(filename, allow_pickle=True)
        data_dict = dict(data)
        return cls(**data_dict)


b = MyClass(my_attribute={'hi': 3})
b.save_npz('temp_file.npz')
c = MyClass.load_npz('temp_file.npz')

pydantic.error_wrappers.ValidationError: 1 validation error for MyClass
my_attribute
  value is not a valid dict (type=type_error.dict)
EDIT
I ended up using a solution based on @Daniil Fajnberg's suggestions below. I created a standard BaseModel and put a numpy unwrapper within a standard validator that applies to every attribute:
import numpy as np
from pydantic import BaseModel, validator


def numpy_unwrap(value):
    """A common issue when loading data from an *.npz file is that numpy wraps the
    object in a numpy array for safekeeping. For example, instead of saving "True"
    as type 'bool', it's saved as array(True, dtype=object). In most cases, it's
    easy to unwrap the object from this array by just calling the .item() method
    on the enclosing array, unless it's supposed to be a list, in which case you
    call .tolist().
    """
    if not issubclass(type(value), np.ndarray):
        return value
    try:
        return value.tolist()
    except ValueError:
        return value.item()


class BaseModel_np(BaseModel):
    @validator('*', pre=True, always=True)
    def unwrap_numpy_array(cls, value):
        return numpy_unwrap(value)

    class Config:
        arbitrary_types_allowed = True
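For illustration, a model inheriting from BaseModel_np above could then be fed numpy-wrapped values directly. This is only a sketch; the MyConfig model and its field names are made up for the example and are not part of the original code:
import numpy as np
from typing import List


class MyConfig(BaseModel_np):
    flag: bool
    scores: List[float]


cfg = MyConfig(
    flag=np.array(True, dtype=object),  # 0-d object array, unwrapped back to True
    scores=np.array([1.0, 2.0]),        # 1-d array, unwrapped to [1.0, 2.0]
)
print(cfg)  # flag=True scores=[1.0, 2.0]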
To be honest, I don't really see the point in defining your own data type here. You can accomplish what you are trying to do with "vanilla" types and a @validator-decorated method in your model. Here is my suggestion:
from typing import Union

import numpy as np
from pydantic import BaseModel, validator


class MyClass(BaseModel):
    my_attribute: dict[str, float]

    @validator("my_attribute", pre=True)
    def convert_numpy_array(cls, v: Union[dict[str, float], np.ndarray]) -> dict[str, float]:
        if isinstance(v, np.ndarray):
            v = v.item()
        assert isinstance(v, dict)
        return v

    def save_npz(self, filename: str) -> None:
        """Saves a *.npz file with all attributes"""
        np.savez(filename, **self.dict())

    @classmethod
    def load_npz(cls, filename: str) -> "MyClass":
        """Loads a *.npz file and creates an instance of Args"""
        data = np.load(filename, allow_pickle=True)
        return cls.parse_obj(dict(data))


if __name__ == '__main__':
    b = MyClass(my_attribute={'hi': 3})
    b.save_npz('temp_file.npz')
    c = MyClass.load_npz('temp_file.npz')
    print(c)
Output: my_attribute={'hi': 3.0}
Notice that the validator takes care of ensuring a dict instance and does so before other validation thanks to pre=True. If we didn't use that parameter, a ValidationError would be raised because the model's built-in dict-validator would rightly reject the np.ndarray. (The custom validator would never even be called.)
Miscellaneous side notes:
I took the liberty of adding some more type annotations since you seem to care about your types.
Your check if the type of value is a subclass of np.ndarray is equivalent to simply checking if the value is an instance of np.ndarray. I find the latter more readable.
I adjusted your load_npz method to call parse_obj on the dictionary, again because of slightly better readability (in my opinion) than dictionary unpacking.
PS - Custom Base Model:
Since you mentioned you want the solution to be universal, so that you can reuse it anywhere, I suggest defining your own base model with a universal validator. Here is a simple demonstration:
from typing import TypeVar, Union, cast

import numpy as np
from pydantic import BaseModel, validator

T = TypeVar("T")
M = TypeVar("M", bound="MyBaseModel")


class MyBaseModel(BaseModel):
    @validator("*", pre=True)
    def convert_numpy_array(cls, v: Union[T, np.ndarray]) -> T:
        if isinstance(v, np.ndarray):
            v = v.item()
        return cast(T, v)

    def save_npz(self, filename: str) -> None:
        """Saves a *.npz file with all attributes"""
        np.savez(filename, **self.dict())

    @classmethod
    def load_npz(cls: type[M], filename: str) -> M:
        """Loads a *.npz file and creates an instance of Args"""
        data = np.load(filename, allow_pickle=True)
        return cls.parse_obj(dict(data))


class MyClassA(MyBaseModel):
    some_string: str


class MyClassB(MyClassA):
    a_number: float
    my_dict: dict[str, float]


if __name__ == '__main__':
    b = MyClassB(
        some_string="foo",
        a_number=3.14,
        my_dict={'hi': 3}
    )
    b.save_npz('temp_file.npz')
    c = MyClassB.load_npz('temp_file.npz')
    print(c)
Output: some_string='foo' a_number=3.14 my_dict={'hi': 3.0}
The validator decorator can take the special string "*" instead of the names of specific fields. That way it will be called for every field of the model. The logic inside the validator is pretty much unchanged.
In this example, I put the saving and loading methods into the base model because that seemed sensible, but you can obviously do that as you need it.
That way you can just inherit from MyBaseModel (or from classes that in turn inherit from MyBaseModel) everywhere in your code, and the validator logic will remain intact for every field, as demonstrated by MyClassB in the example.
Of course you may run into problems, if the ndarray.item() method fails for some array. I don't know what your specific requirements are. But you can put in place additional checks in your universal validator, such as checking appropriate dtype of the array or what have you.
The advantage of doing it this way is that inheritance feels much more natural than specifying special types that just introduce additional validation logic. It is less error prone because all you have to do is remember to inherit from your base model. And you can even override validator methods in child models, if you want to.
(The TypeVars and cast function are just type-sugar. Maybe overkill for you.)
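As a small sketch of the "additional checks" mentioned above, the universal validator could refuse to unwrap arrays that hold more than one element before calling .item(). The class names here (StrictNumpyBaseModel, Point) are hypothetical and only illustrate the idea:
from typing import TypeVar, Union, cast

import numpy as np
from pydantic import BaseModel, validator

T = TypeVar("T")


class StrictNumpyBaseModel(BaseModel):
    # Only unwrap single-element arrays; anything bigger is reported here
    # instead of letting ndarray.item() fail later.
    @validator("*", pre=True)
    def convert_numpy_array(cls, v: Union[T, np.ndarray]) -> T:
        if isinstance(v, np.ndarray):
            if v.size != 1:
                raise ValueError(f"expected a single-element array, got shape {v.shape}")
            v = v.item()
        return cast(T, v)


class Point(StrictNumpyBaseModel):
    x: float
    y: float


print(Point(x=np.array(1.5), y=2.5))  # x=1.5 y=2.5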

Distinguishing between Pydantic Models with same fields

I'm using Pydantic to define hierarchical data in which there are models with identical attributes.
However, when I save and load these models, Pydantic can no longer distinguish which model was used and picks the first one in the field type annotation.
I understand that this is expected behavior based on the documentation.
However, the class type information is important to my application.
What is the recommended way to distinguish between different classes in Pydantic? One hack is to simply add an extraneous field to one of the models, but I'd like to find a more elegant solution.
See the simplified example below: container is initialized with data of type DataB, but after exporting and loading, the new container has data of type DataA as it's the first element in the type declaration of container.data.
Thanks for your help!
from abc import ABC
from pydantic import BaseModel  # pydantic 1.8.2
from typing import Union


class Data(BaseModel, ABC):
    """ base class for a Member """
    number: float


class DataA(Data):
    """ A type of Data """
    pass


class DataB(Data):
    """ Another type of Data """
    pass


class Container(BaseModel):
    """ container holds a subclass of Data """
    data: Union[DataA, DataB]


# initialize container with DataB
data = DataB(number=1.0)
container = Container(data=data)

# export container to string and load new container from string
string = container.json()
new_container = Container.parse_raw(string)

# look at type of container.data
print(type(new_container.data).__name__)
# >>> DataA
As correctly noted in the comments, without storing additional information models cannot be distinguished when parsing.
As of today (pydantic v1.8.2), the most canonical way to distinguish models when parsing in a Union (in case of ambiguity) is to explicitly add a type specifier Literal. It will look like this:
from abc import ABC
from pydantic import BaseModel
from typing import Union, Literal


class Data(BaseModel, ABC):
    """ base class for a Member """
    number: float


class DataA(Data):
    """ A type of Data """
    tag: Literal['A'] = 'A'


class DataB(Data):
    """ Another type of Data """
    tag: Literal['B'] = 'B'


class Container(BaseModel):
    """ container holds a subclass of Data """
    data: Union[DataA, DataB]


# initialize container with DataB
data = DataB(number=1.0)
container = Container(data=data)

# export container to string and load new container from string
string = container.json()
new_container = Container.parse_raw(string)

# look at type of container.data
print(type(new_container.data).__name__)
# >>> DataB
This method can be automated, but use it at your own risk, since it breaks static typing and relies on objects that may change in future versions:
from pydantic.fields import ModelField


class Data(BaseModel, ABC):
    """ base class for a Member """
    number: float

    def __init_subclass__(cls, **kwargs):
        name = 'tag'
        value = cls.__name__
        annotation = Literal[value]

        tag_field = ModelField.infer(
            name=name,
            value=value,
            annotation=annotation,
            class_validators=None,
            config=cls.__config__,
        )
        cls.__fields__[name] = tag_field
        cls.__annotations__[name] = annotation


class DataA(Data):
    """ A type of Data """
    pass


class DataB(Data):
    """ Another type of Data """
    pass
Just wanted to take the opportunity to list another possible alternative to pydantic here (pydantic itself already supports this use case very well, as per the answer below).
I am the creator and maintainer of a relatively new and lesser-known JSON serialization library, the Dataclass Wizard, which relies on the Python dataclasses module to perform its magic. As of the latest version, 0.14.0, dataclass-wizard supports dataclasses within Union types. Previously, it did not support dataclasses within Union types at all, which was kind of a glaring omission and something on my "to-do" list to (eventually) add support for.
The reason it did not work before is that the data being de-serialized is often a JSON object, which only knows simple types such as arrays and dictionaries. A dict would not otherwise match any of the Union[Data1, Data2] types, even if the object had all the correct dataclass fields as keys, simply because the library doesn't compare the dict object against each of the dataclass fields in the Union types, though that might change in a future release.
So in any case, here is a simple example to demonstrate the usage of dataclasses in Union types, using a class inheritance model with the JSONWizard mixin class:
With Class Inheritance
from abc import ABC
from dataclasses import dataclass
from typing import Union

from dataclass_wizard import JSONWizard


@dataclass
class Data(ABC):
    """ base class for a Member """
    number: float


class DataA(Data, JSONWizard):
    """ A type of Data """

    class _(JSONWizard.Meta):
        """
        This defines a custom tag that uniquely identifies the dataclass.
        """
        tag = 'A'


class DataB(Data, JSONWizard):
    """ Another type of Data """

    class _(JSONWizard.Meta):
        """
        This defines a custom tag that uniquely identifies the dataclass.
        """
        tag = 'B'


@dataclass
class Container(JSONWizard):
    """ container holds a subclass of Data """
    data: Union[DataA, DataB]
The usage is shown below, and is again pretty straightforward. It relies on a special __tag__ key set in a dictionary or JSON object to marshal it into the correct dataclass, based on the Meta.tag value for that class, that we have set up above.
print('== Load with DataA ==')

input_dict = {
    'data': {
        'number': '1.0',
        '__tag__': 'A'
    }
}

# De-serialize the `dict` object to a `Container` instance.
container = Container.from_dict(input_dict)

print(repr(container))
# prints:
#   Container(data=DataA(number=1.0))

# Show the prettified JSON representation of the instance.
print(container)

# Assert we load the correct dataclass from the annotated `Union` types
assert type(container.data) == DataA

print()
print('== Load with DataB ==')

# initialize container with DataB
data_b = DataB(number=2.0)
container = Container(data=data_b)

print(repr(container))
# prints:
#   Container(data=DataB(number=2.0))

# Show the prettified JSON representation of the instance.
print(container)

# Assert we load the correct dataclass from the annotated `Union` types
assert type(container.data) == DataB

# Assert we end up with the same instance when serializing and de-serializing
# our data.
string = container.to_json()
assert container == Container.from_json(string)
Without Class Inheritance
Here is the same example as above, but with relying solely on dataclasses, without using any special class inheritance model:
from abc import ABC
from dataclasses import dataclass
from typing import Union

from dataclass_wizard import asdict, fromdict, LoadMeta


@dataclass
class Data(ABC):
    """ base class for a Member """
    number: float


class DataA(Data):
    """ A type of Data """


class DataB(Data):
    """ Another type of Data """


@dataclass
class Container:
    """ container holds a subclass of Data """
    data: Union[DataA, DataB]


# Setup tags for the dataclasses. This can be passed into either
# `LoadMeta` or `DumpMeta`.
#
# Note that I'm not a fan of this syntax either, so it might change. I was
# thinking of something more explicit, like `LoadMeta(...).bind_to(class)`
LoadMeta(DataA, tag='A')
LoadMeta(DataB, tag='B')

# The rest is the same as before.

# initialize container with DataB
data = DataB(number=2.0)
container = Container(data=data)

print(repr(container))
# prints:
#   Container(data=DataB(number=2.0))

# Assert we load the correct dataclass from the annotated `Union` types
assert type(container.data) == DataB

# Assert we end up with the same data when serializing and de-serializing.
out_dict = asdict(container)
assert container == fromdict(Container, out_dict)
I'm trying to hack something together in the meantime using custom validators.
Basically the class decorator adds a class_name: str field, which is added to the json string. The validator then looks up the correct subclass based on its value.
def register_distinct_subclasses(fields: tuple):
    """ fields is tuple of subclasses that we want to be registered as distinct """
    field_map = {field.__name__: field for field in fields}

    def _register_distinct_subclasses(cls):
        """ cls is the superclass of fields, which we add a new validator to """
        orig_init = cls.__init__

        class _class:
            class_name: str

            def __init__(self, **kwargs):
                class_name = type(self).__name__
                kwargs["class_name"] = class_name
                orig_init(**kwargs)

            @classmethod
            def __get_validators__(cls):
                yield cls.validate

            @classmethod
            def validate(cls, v):
                if isinstance(v, dict):
                    class_name = v.get("class_name")
                    json_string = json.dumps(v)
                else:
                    class_name = v.class_name
                    json_string = v.json()
                cls_type = field_map[class_name]
                return cls_type.parse_raw(json_string)

        return _class

    return _register_distinct_subclasses
which is called as follows
Data = register_distinct_subclasses((DataA, DataB))(Data)

How does the sorting solution work for leetcode's question: max width ramp? [duplicate]

I have gone through most of the documentation of __getitem__ in the Python docs, but I am still unable to grasp the meaning of it.
So all I can understand is that __getitem__ is used to implement calls like self[key]. But what is the use of it?
Let's say I have a Python class defined in this way:
class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

    def __getitem__(self, key):
        print("Inside `__getitem__` method!")
        return getattr(self, key)


p = Person("Subhayan", 32)
print(p["age"])
This returns the results as expected. But why use __getitem__ in the first place? I have also heard that Python calls __getitem__ internally. But why does it do it?
Can someone please explain this in more detail?
Cong Ma does a good job of explaining what __getitem__ is used for - but I want to give you an example which might be useful.
Imagine a class which models a building. The building's data includes a number of attributes, among them descriptions of the companies that occupy each floor.
Without using __getitem__ we would have a class like this:
class Building(object):
    def __init__(self, floors):
        self._floors = [None] * floors

    def occupy(self, floor_number, data):
        self._floors[floor_number] = data

    def get_floor_data(self, floor_number):
        return self._floors[floor_number]


building1 = Building(4)  # Construct a building with 4 floors
building1.occupy(0, 'Reception')
building1.occupy(1, 'ABC Corp')
building1.occupy(2, 'DEF Inc')

print(building1.get_floor_data(2))
We could however use __getitem__ (and its counterpart __setitem__) to make the usage of the Building class 'nicer'.
class Building(object):
    def __init__(self, floors):
        self._floors = [None] * floors

    def __setitem__(self, floor_number, data):
        self._floors[floor_number] = data

    def __getitem__(self, floor_number):
        return self._floors[floor_number]


building1 = Building(4)  # Construct a building with 4 floors
building1[0] = 'Reception'
building1[1] = 'ABC Corp'
building1[2] = 'DEF Inc'

print(building1[2])
Whether you use __setitem__ like this really depends on how you plan to abstract your data - in this case we have decided to treat a building as a container of floors (you could also implement an iterator for the Building, and maybe even the ability to slice, i.e. get more than one floor's data at a time); it depends on what you need.
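A quick sketch of what that slicing could look like, assuming the Building class from above (this variant is illustrative, not part of the original answer): since __getitem__ receives the slice object itself, the underlying list can handle multi-floor access directly.
class Building(object):
    def __init__(self, floors):
        self._floors = [None] * floors

    def __setitem__(self, floor_number, data):
        self._floors[floor_number] = data

    def __getitem__(self, key):
        # key may be an int or a slice; the list handles both
        return self._floors[key]


building1 = Building(4)
building1[0] = 'Reception'
building1[1] = 'ABC Corp'
building1[2] = 'DEF Inc'
print(building1[1:3])  # ['ABC Corp', 'DEF Inc']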
The [] syntax for getting item by key or index is just syntax sugar.
When you evaluate a[i] Python calls a.__getitem__(i) (or type(a).__getitem__(a, i), but this distinction is about inheritance models and is not important here). Even if the class of a may not explicitly define this method, it is usually inherited from an ancestor class.
All the (Python 2.7) special method names and their semantics are listed here: https://docs.python.org/2.7/reference/datamodel.html#special-method-names
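A tiny demonstration of that equivalence, using a plain list:
# Subscription syntax and the explicit special-method call do the same thing.
a = ['x', 'y', 'z']
assert a[1] == a.__getitem__(1) == type(a).__getitem__(a, 1)  # all 'y'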
The magic method __getitem__ is basically used for accessing list items, dictionary entries, array elements etc. It is very useful for a quick lookup of instance attributes.
Here I am showing this with an example class Person that can be instantiated by 'name', 'age', and 'dob' (date of birth). The __getitem__ method is written in a way that one can access the indexed instance attributes, such as first or last name, day, month or year of the dob, etc.
import copy

# Constants that can be used to index date of birth's Date-Month-Year
D = 0; M = 1; Y = -1


class Person(object):
    def __init__(self, name, age, dob):
        self.name = name
        self.age = age
        self.dob = dob

    def __getitem__(self, indx):
        print("Calling __getitem__")
        p = copy.copy(self)
        p.name = p.name.split(" ")[indx]
        p.dob = p.dob[indx]  # or, p.dob = p.dob.__getitem__(indx)
        return p
Suppose one user input is as follows:
p = Person(name = 'Jonab Gutu', age = 20, dob=(1, 1, 1999))
With the help of the __getitem__ method, the user can access the indexed attributes, e.g.:
print(p[0].name)  # print first (or last) name
print(p[Y].dob)   # print (Date or Month or) Year of the 'date of birth'
As a side note, the __getitem__ method also allows you to turn your object into an iterable.
Example: if used with iter(), it can generate as many int squared values as you want:
class MyIterable:
    def __getitem__(self, index):
        return index ** 2


obj = MyIterable()
obj_iter = iter(obj)
for i in range(1000):
    print(next(obj_iter))
For readability and consistency. This question touches on why operator overloading exists, and __getitem__ is one of the methods that implement it.
If you get an unknown class, written by an unknown author, and you want to add its 3rd element to its 5th element, you can very well assume that obj[3] + obj[5] will work.
What would that line look like in a language that does not support operator overloading?? Probably something like obj.get(3).add(obj.get(5))?? Or maybe obj.index(3).plus(obj.index(5))??
The problem with the second approach is that (1) it's much less readable and (2) you can't guess, you have to look up the documentation.
A common library that uses this technique is the 'email' module. It uses the __getitem__ method in the email.message.Message class, which in turn is inherited by MIME-related classes.
Then, to get a valid MIME message with sane defaults, all you need to do is add your headers. There's a lot more going on under the hood, but the usage is simple.
from email.mime.text import MIMEText

message = MIMEText(message_text)
message['to'] = to
message['from'] = sender
message['subject'] = subject
Django core has several interesting and nifty usages for magic methods, including __getitem__. These were my recent finds:
Django HTTP Request
When you submit GET/POST data in Django, it will be stored in Django's request object as request.GET/request.POST dict. This dict is of type QueryDict which inherits from MultiValueDict.
When you submit data, say user_id=42, QueryDict will be stored/represented as:
<QueryDict: {'user_id': ['42']}>
So, the passed data becomes
'user_id': ['42']
instead of the intuitive
'user_id': '42'
MultiValueDict's docstring explains, though, why it needs to auto-convert this to list format:
This class exists to solve the irritating problem raised by cgi.parse_qs, which returns a list for every key..
Given that the QueryDict values are transformed into lists, they will then need to be accessed like this (same idea with request.GET):
request.POST['user_id'][0]
request.POST['user_id'][-1]
request.POST.get('user_id')[0]
request.POST.get('user_id')[-1]
But these are horrible ways to access the data. So Django overrides __getitem__ and get in MultiValueDict. This is a simplified version:
def __getitem__(self, key):
    """
    Accesses the list value automatically
    using the `-1` list index.
    """
    list_ = super().__getitem__(key)
    return list_[-1]

def get(self, key, default=None):
    """
    Just calls the `__getitem__` above.
    """
    return self[key]
With these, you now have more intuitive accessors:
request.POST['user_id']
request.POST.get('user_id')
Django Forms
In Django, you could declare forms like this (includes ModelForm):
class ArticleForm(...):
    title = ...
These forms inherit from BaseForm, and have these overridden magic methods (simplified version):
def __iter__(self):
    for name in self.fields:
        yield self[name]

def __getitem__(self, name):
    return self.fields[name]
resulting in these convenient patterns:
# Instead of `for field in form.fields`.
# This is a common pattern in Django templates.
for field in form:
    ...

# Instead of `title = form.fields['title']`
title = form['title']
In summary, magic methods (or their overrides) increase code readability and developer experience/convenience.
The use of __getitem__ includes implementing control flow measures that for some weird reason cannot be performed lower in the execution stack:
class HeavenlyList(list):
    """don't let caller get 666th element"""
    def __getitem__(self, key):
        """return element"""
        if isinstance(key, slice):
            # bind the parent's item getter once, outside the comprehension
            # (zero-argument super() can't be used directly inside it)
            getitem = super().__getitem__
            return [
                getitem(i)
                # slice.indices() fills in any missing start/stop/step
                for i in range(*key.indices(len(self)))
                if i != 666
            ]
        return super().__getitem__(key) if key != 666 else None
A similar, but more interesting reason is to allow slice-based access to elements in container/sequence types that ordinarily don't allow it:
class SliceDict(dict):
    """handles slices"""
    def __setitem__(self, key, value):
        """map key to value"""
        if not isinstance(key, int):
            raise TypeError("key must be an integer")
        super().__setitem__(key, value)

    def __getitem__(self, key):
        """return value(s)"""
        if not isinstance(key, slice):
            return super().__getitem__(key)
        # expects a fully-specified slice, e.g. d[0:10:2]
        getitem = super().__getitem__
        return [
            getitem(i)
            for i in range(key.start, key.stop, key.step)
        ]
Another interesting use is overriding str.__getitem__ to accept str objects as well as ints and slices, such that the str input is a regular expression, and the return value is the match object iterator returned by re.finditer:
from re import finditer


class REString(str):
    """handles regular expressions"""
    re_flags = 0

    def __getitem__(self, key):
        """return some/all of string or re.finditer"""
        if isinstance(key, str):
            return finditer(key, self, flags=self.re_flags)
        return super().__getitem__(key)
A real-world problem where overriding dict.__getitem__ in particular proves useful is when a program requires information that is distributed over the internet and available over HTTP. Because this information is remote, the process can employ some level of laziness, only retrieving data for items it doesn't have or that have changed. The specific example is having a dictionary instance lazily retrieve and store Python Enhancement Proposals. There are many of these documents, sometimes they are revised, and they all reside on hosts known by the domain name peps.python.org. Therefore the idea is to make an HTTP GET request for the PEP number passed into __getitem__, fetching it if the dictionary doesn't already contain it or if the PEP's HTTP ETag has changed.
from http import HTTPStatus, client


class PEPDict(dict):
    """lazy PEP container"""

    conn = client.HTTPSConnection("peps.python.org")

    def __getitem__(self, pep):
        """return pep pep"""
        # if lazy for too long
        if self.conn.sock is None:
            self.conn.connect()
        # build etag check in request header
        requestheaders = dict()
        if pep in self:
            requestheaders = {
                "if-none-match": super().__getitem__(pep)[0]
            }
        # make request and fetch response
        self.conn.request(
            "GET",
            "/%s/" % str(pep).zfill(4),
            headers=requestheaders
        )
        response = self.conn.getresponse()
        # (re)set the pep
        if response.status == HTTPStatus.OK:
            self.__setitem__(
                pep, (
                    response.getheader("etag"),
                    response.read()
                )
            )
        # raise if status is not ok or not modified
        elif response.status != HTTPStatus.NOT_MODIFIED:
            raise Exception("something weird happened")
        return super().__getitem__(pep)[1]
A good resource for understanding further what is the use of it is to review its associated special/dunder methods in the emulating container types section of Python's data model document.
OK, I'll just leave this here. The OP is asking about the very basics of software engineering.
This is about defining a class interface. Consistency, readability, or whatever else is secondary.
First of all, this is about how different parts of the project can talk to your object.
Imagine a function which calls [] on some object. Now you are tasked to do exactly what this function does with some new type of object that you have. But your object is not a list or dict, or tuple.
Now you don't need to implement anything but define a __getitem__ for the class of your object.
Interfaces create building blocks out of a bunch of internal implementations. Define them wisely.
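As a small illustration of that point (the names below are made up for the example): a function written against the [] interface works for any object that defines __getitem__, not just the built-in containers.
def describe_third(container):
    # only relies on the [] interface
    return f"third element: {container[2]}"


class Fibonacci:
    def __getitem__(self, index):
        a, b = 0, 1
        for _ in range(index):
            a, b = b, a + b
        return a


print(describe_third([10, 20, 30]))  # works for a list
print(describe_third(Fibonacci()))   # works for our custom object too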

When should I subclass EnumMeta instead of Enum?

In this article Nick Coghlan talks about some of the design decisions that went in to the PEP 435 Enum type, and how EnumMeta can be subclassed to provide a different Enum experience.
However, the advice I give (and I am the primary stdlib Enum author) about using a metaclass is that it should not be done without a really good reason -- such as not being able to accomplish what you need with a class decorator or a dedicated function to hide any ugliness. In my own work I've been able to do whatever I needed simply by using __new__, __init__, and/or normal class/instance methods when creating the Enum class:
Enum with attributes
Handling missing members
class constants that are not Enum members
And then there is this cautionary tale of being careful when delving into Enum, with and without metaclass subclassing:
Is it possible to override __new__ in an enum to parse strings to an instance?
Given all that, when would I need to fiddle with EnumMeta itself?
The best cases I have seen so far for subclassing EnumMeta comes from these four questions:
A more pythonic way to define an enum with dynamic members
Prevent invalid enum attribute assignment
Create an abstract Enum class
Invoke a function when an enum member is accessed
We'll examine the dynamic member case further here.
First, a look at the code needed when not subclassing EnumMeta:
The stdlib way
from enum import Enum
import json


class BaseCountry(Enum):

    def __new__(cls, record):
        member = object.__new__(cls)
        member.country_name = record['name']
        member.code = int(record['country-code'])
        member.abbr = record['alpha-2']
        member._value_ = member.abbr, member.code, member.country_name
        if not hasattr(cls, '_choices'):
            cls._choices = {}
        cls._choices[member.code] = member.country_name
        cls._choices[member.abbr] = member.country_name
        return member

    def __str__(self):
        return self.country_name


Country = BaseCountry(
    'Country',
    [(rec['alpha-2'], rec) for rec in json.load(open('slim-2.json'))],
)
The aenum way 1 2
from aenum import Enum, MultiValue
import json


class Country(Enum, init='abbr code country_name', settings=MultiValue):
    _ignore_ = 'country this'  # do not add these names as members

    # create members
    this = vars()
    for country in json.load(open('slim-2.json')):
        this[country['alpha-2']] = (
            country['alpha-2'],
            int(country['country-code']),
            country['name'],
        )

    # have str() print just the country name
    def __str__(self):
        return self.country_name
The above code is fine for a one-off enumeration -- but what if creating Enums from JSON files was common for you? Imagine if you could do this instead:
class Country(JSONEnum):
    _init_ = 'abbr code country_name'  # remove if not using aenum
    _file = 'some_file.json'
    _name = 'alpha-2'
    _value = {
        1: ('alpha-2', None),
        2: ('country-code', lambda c: int(c)),
        3: ('name', None),
    }
As you can see:
_file is the name of the json file to use
_name is the path to whatever should be used for the name
_value is a dictionary mapping paths to values3
_init_ specifies the attribute names for the different value components (if using aenum)
The JSON data is taken from https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes -- here is a short excerpt:
[{"name":"Afghanistan","alpha-2":"AF","country-code":"004"},
{"name":"Ă…land Islands","alpha-2":"AX","country-code":"248"},
{"name":"Albania","alpha-2":"AL","country-code":"008"},
{"name":"Algeria","alpha-2":"DZ","country-code":"012"}]
Here is the JSONEnumMeta class:
from enum import Enum, EnumMeta


class JSONEnumMeta(EnumMeta):

    @classmethod
    def __prepare__(metacls, cls, bases, **kwds):
        # return a standard dictionary for the initial processing
        return {}

    def __init__(cls, *args, **kwds):
        super(JSONEnumMeta, cls).__init__(*args)

    def __new__(metacls, cls, bases, clsdict, **kwds):
        import json
        members = []
        missing = [
            name
            for name in ('_file', '_name', '_value')
            if name not in clsdict
        ]
        if len(missing) in (1, 2):
            # all three must be present or absent
            raise TypeError('missing required settings: %r' % (missing, ))
        if not missing:
            # process
            name_spec = clsdict.pop('_name')
            if not isinstance(name_spec, (tuple, list)):
                name_spec = (name_spec, )
            value_spec = clsdict.pop('_value')
            file = clsdict.pop('_file')
            with open(file) as f:
                json_data = json.load(f)
            for data in json_data:
                values = []
                name = data[name_spec[0]]
                for piece in name_spec[1:]:
                    name = name[piece]
                for order, (value_path, func) in sorted(value_spec.items()):
                    if not isinstance(value_path, (list, tuple)):
                        value_path = (value_path, )
                    value = data[value_path[0]]
                    for piece in value_path[1:]:
                        value = value[piece]
                    if func is not None:
                        value = func(value)
                    values.append(value)
                values = tuple(values)
                members.append(
                    (name, values)
                )
        # get the real EnumDict
        enum_dict = super(JSONEnumMeta, metacls).__prepare__(cls, bases, **kwds)
        # transfer the original dict content, _items first
        items = list(clsdict.items())
        items.sort(key=lambda p: (0 if p[0][0] == '_' else 1, p))
        for name, value in items:
            enum_dict[name] = value
        # add the members
        for name, value in members:
            enum_dict[name] = value
        return super(JSONEnumMeta, metacls).__new__(metacls, cls, bases, enum_dict, **kwds)


# for use with both Python 2/3
JSONEnum = JSONEnumMeta('JsonEnum', (Enum, ), {})
A few notes:
JSONEnumMeta.__prepare__ returns a normal dict
EnumMeta.__prepare__ is used to get an instance of _EnumDict -- this is the proper way to get one
keys with a leading underscore are passed to the real _EnumDict first as they may be needed when processing the enum members
Enum members are in the same order as they were in the file
1 Disclosure: I am the author of the Python stdlib Enum, the enum34 backport, and the Advanced Enumeration (aenum) library.
2 This requires aenum 2.0.5+.
3 The keys are numeric to keep multiple values in order should your Enum need more than one.

How to do yaml.safe_dump() and .safe_load() of Python object without yaml.YAMLObject?

I want to serialize some object with yaml.safe_dump(). How can I serialize Python objects with add_representer() and add_constructor() ...
I cannot add yaml.YAMLObject to Thing (it comes from a third-party module), and I don't want to use it.
I do such dump:
import yaml


class Thing(object):
    def __init__(self, name):
        self.name = name


def Thing_representer(dumper, data):
    return dumper.represent_mapping('!Thing', data.__dict__)


yaml.SafeDumper.add_representer(Thing, Thing_representer)
safe_dump = yaml.safe_dump(t)
print safe_dump
It works fine, but I have no idea how to write the constructor:
def Thing_constructor(loader, data):
    thing = Thing()
    return thing.__dict__.update(loader.construct_mapping(data))


yaml.SafeLoader.add_constructor('!Thing', Thing_constructor)
yaml.safe_load(safe_dump)
It throws the exception TypeError: __init__() takes exactly 2 arguments (1 given), and it should, since the constructor requires a parameter. Maybe there is another option to construct the object, skipping the constructor?
You cannot construct Thing() without handing in the name. You can solve that in various ways, but the following should work.
def thing_constructor(self, node):
    name = None
    for x in node.value:
        if x[0].value == 'name':
            name = x[1].value
    return Thing(name)


yaml.SafeLoader.add_constructor('!Thing', thing_constructor)

res = yaml.safe_load(safe_dump)
print res.name
You can simplify the setting of the name parameter, but this way it is more extensible if Thing would have taken more parameters.
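For reference, the simpler (but less extensible) variant that last sentence alludes to could look roughly like this, using construct_mapping to pull the fields out of the node; this is a sketch of mine rather than part of the original answer:
def thing_constructor(loader, node):
    # construct_mapping gives a plain dict of the node's key/value pairs
    fields = loader.construct_mapping(node)
    return Thing(fields['name'])


yaml.SafeLoader.add_constructor('!Thing', thing_constructor)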
