class Translator(object):
    def __init__(self, tracking_col='tracking_id', coding_col='coding', qualifying_code_col='qualifying_code',
                 translation_col='translation'):
        self._results = []
        self.tracking_col = tracking_col
        self.data_col = coding_col
        self.definition_col = qualifying_code_col
        self.translation_col = translation_col
        self.__validate_parameters(self.__dict__)

    def __validate_parameters(self, variable_values):
        class_values = {}
        for key, value in variable_values.items():
            if type(value) is str:
                class_values.setdefault(value, set()).add(key)
        for key, values in class_values.items():
            # If there is more than one value, there is a duplicate
            if len(values) > 1:
                raise Exception('Duplicate column names exist in parameters. \'{}\' are set to \'{}\'. '
                                'Do not use duplicate column names.'.format(values, key))
This class cannot have duplicate values for any of the 'col' variables. If duplicates exist, logic further down in the class may not crash, but it will produce unpredictable results.
Upon instantiation, my __validate_parameters method detects duplicate values and raises an Exception. The problem is that I am dumping all the values out to a dictionary, iterating over it to create another dictionary, and finally raising an exception manually (which, from what I've been told, is the wrong thing to do in any situation). It's also rather verbose.
Is there a shorter, more concise way to validate for duplicates and propagate an error up without the complexity above?
There is nothing wrong with manually raising an exception. Collecting your cols in some collection will make validation easier:
class Translator(object):
    def __init__(self, tracking_col=..., coding_col=..., qualifying_code_col=...,
                 translation_col=...):
        self._results = []
        self.cols = [tracking_col, coding_col, qualifying_code_col, translation_col]
        self.validate_cols()

    def validate_cols(self):
        if len(self.cols) > len(set(self.cols)):
            raise ...

    @property
    def tracking_col(self):
        return self.cols[0]
    # ...
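Put together, a runnable version of that idea might look like this (a sketch; the ValueError and its message are my choices, since the snippet above leaves the exception type open):

class Translator(object):
    def __init__(self, tracking_col='tracking_id', coding_col='coding',
                 qualifying_code_col='qualifying_code', translation_col='translation'):
        self._results = []
        self.cols = [tracking_col, coding_col, qualifying_code_col, translation_col]
        self.validate_cols()

    def validate_cols(self):
        # a set drops duplicates, so a length mismatch means at least one repeat
        if len(self.cols) > len(set(self.cols)):
            raise ValueError('Duplicate column names: {}'.format(self.cols))

    @property
    def tracking_col(self):
        return self.cols[0]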
You could make the constructor take a dictionary instead of individual variables, e.g.
class Translator(object):
    def __init__(self, cols={}):
        defaults = { "tracking_col" : "tracking_id",
                     "coding_col" : "coding",
                     "qualifying_code_col" : "qualifying_code",
                     "translation_col" : "translation" }
        for d in defaults:
            if d not in cols:
                cols[d] = defaults[d]
        self.__validate_parameters(cols)

    def __validate_parameters(self, d):
        from collections import Counter
        c = Counter(d.values())
        if any(cnt > 1 for cnt in c.values()):
            raise Exception("Duplicate values found: '%s'" % str(c))
(Code not tested)
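For illustration, a hypothetical call that should trip that check (the column values here are made up):

# both columns map to 'coding', so Counter sees the value twice and the check raises
t = Translator({'tracking_col': 'coding', 'coding_col': 'coding'})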
I've set up a metaclass and base class pair for creating the line specifications of several different file types I have to parse.
I have decided to go with using enumerations because many of the individual parts of the different lines in the same file often have the same name. Enums make it easy to tell them apart. Additionally, the specification is rigid and there will be no need to add more members, or extend the line specifications later.
The specification classes work as expected. However, I am having some trouble dynamically creating them:
>>> C1 = LineMakerMeta('C1', (LineMakerBase,), dict(a = 0))
AttributeError: 'dict' object has no attribute '_member_names'
Is there a way around this? The example below works just fine:
class A1(LineMakerBase):
    Mode = 0, dict(fill=' ', align='>', type='s')
    Level = 8, dict(fill=' ', align='>', type='d')
    Method = 10, dict(fill=' ', align='>', type='d')
    _dummy = 20 # so that Method has a known length
A1.format(**dict(Mode='DESIGN', Level=3, Method=1))
# produces ' DESIGN 3 1'
The metaclass is based on enum.EnumMeta, and looks like this:
import enum

class LineMakerMeta(enum.EnumMeta):
    "Metaclass to produce formattable LineMaker child classes."
    def _iter_format(cls):
        "Iteratively generate formatters for the class members."
        for member in cls:
            yield member.formatter
    def __str__(cls):
        "Returns string line with all default values."
        return cls.format()
    def format(cls, **kwargs):
        "Create formatted version of the line populated by the kwargs members."
        # build resulting string by iterating through members
        result = ''
        for member in cls:
            # determine value to be injected into member
            try:
                try:
                    value = kwargs[member]
                except KeyError:
                    value = kwargs[member.name]
            except KeyError:
                value = member.default
            value_str = member.populate(value)
            result = result + value_str
        return result
And the base class is as follows:
class LineMakerBase(enum.Enum, metaclass=LineMakerMeta):
    """A base class for creating Enum subclasses used for populating lines of a file.

    Usage:

    class LineMaker(LineMakerBase):
        a = 0,  dict(align='>', fill=' ', type='f'), 3.14
        b = 10, dict(align='>', fill=' ', type='d'), 1
        c = 15, dict(align='>', fill=' ', type='s'), 'foo'
        #   ^-start  ^---spec dictionary             ^--default
    """
    def __init__(member, start, spec={}, default=None):
        member.start = start
        member.spec = spec
        if default is not None:
            member.default = default
        else:
            # assume value is numerical for all provided types other than 's' (string)
            default_or_set_type = member.spec.get('type','s')
            default = {'s': ''}.get(default_or_set_type, 0)
            member.default = default
    @property
    def formatter(member):
        """Produces a formatter in the form of '{0:<format>}' based on the member.spec
        dictionary. The member.spec dictionary makes use of these keys ONLY (see
        the string.format docs):

            fill align sign width grouping_option precision type"""
        try:
            # get cached value
            return '{{0:{}}}'.format(member._formatter)
        except AttributeError:
            # add width to format spec if not there
            member.spec.setdefault('width', member.length if member.length != 0 else '')
            # build formatter using the available parts in the member.spec dictionary
            # any missing parts will simply not be present in the formatter
            formatter = ''
            for part in 'fill align sign width grouping_option precision type'.split():
                try:
                    spec_value = member.spec[part]
                except KeyError:
                    # missing part
                    continue
                else:
                    # add part
                    sub_formatter = '{!s}'.format(spec_value)
                    formatter = formatter + sub_formatter
            member._formatter = formatter
            return '{{0:{}}}'.format(formatter)
    def populate(member, value=None):
        "Injects the value into the member's formatter and returns the formatted string."
        formatter = member.formatter
        if value is not None:
            value_str = formatter.format(value)
        else:
            value_str = formatter.format(member.default)
        if len(value_str) > len(member) and len(member) != 0:
            raise ValueError(
                'Length of object string {} ({}) exceeds available'
                ' field length for {} ({}).'
                .format(value_str, len(value_str), member.name, len(member)))
        return value_str
    @property
    def length(member):
        return len(member)

    def __len__(member):
        """Returns the length of the member field. The last member has no length.
        Lengths are based on simple subtraction of starting positions."""
        # get cached value
        try:
            return member._length
        # calculate member length
        except AttributeError:
            # compare by member values because member could be an alias
            members = list(type(member))
            try:
                next_index = next(
                    i+1
                    for i,m in enumerate(type(member))
                    if m.value == member.value
                    )
            except StopIteration:
                raise TypeError(
                    'The member value {} was not located in the {}.'
                    .format(member.value, type(member).__name__)
                    )
            try:
                next_member = members[next_index]
            except IndexError:
                # last member defaults to no length
                length = 0
            else:
                length = next_member.start - member.start
            member._length = length
            return length
This line:
C1 = enum.EnumMeta('C1', (), dict(a = 0))
fails with exactly the same error message. The __new__ method of EnumMeta expects an instance of enum._EnumDict as its last argument. _EnumDict is a subclass of dict and provides an instance variable named _member_names, which of course a regular dict doesn't have. When you go through the standard mechanism of enum creation, this all happens correctly behind the scenes. That's why your other example works just fine.
This line:
C1 = enum.EnumMeta('C1', (), enum._EnumDict())
runs with no error. Unfortunately, the constructor of _EnumDict is defined as taking no arguments, so you can't initialize it with keywords as you apparently want to do.
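You can still build one by hand and fill it item by item, though. A sketch (this relies on the private _EnumDict, so it is tied to the enum internals of the Python 3.3–3.5 implementations discussed here):

import enum

members = enum._EnumDict()
for name, value in dict(a=0).items():
    members[name] = value

C1 = LineMakerMeta('C1', (LineMakerBase,), members)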
In the implementation of enum that's backported to Python 3.3, the following block of code appears in the constructor of EnumMeta. You could do something similar in your LineMakerMeta class:
def __new__(metacls, cls, bases, classdict):
    if type(classdict) is dict:
        original_dict = classdict
        classdict = _EnumDict()
        for k, v in original_dict.items():
            classdict[k] = v
In the official implementation in Python 3.5, the if statement and the subsequent block of code are gone for some reason. Therefore classdict must be an honest-to-god _EnumDict, and I don't see why this was done. In any case the implementation of Enum is extremely complicated and handles a lot of corner cases.
I realize this is not a cut-and-dried answer to your question but I hope it will point you to a solution.
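Concretely, one way to apply that idea (untested, and again leaning on the private _EnumDict) is to do the conversion in your own metaclass before delegating to EnumMeta:

class LineMakerMeta(enum.EnumMeta):
    def __new__(metacls, cls, bases, classdict):
        # accept a plain dict and convert it to the _EnumDict that EnumMeta expects
        if type(classdict) is dict:
            original_dict = classdict
            classdict = enum._EnumDict()
            for k, v in original_dict.items():
                classdict[k] = v
        return super(LineMakerMeta, metacls).__new__(metacls, cls, bases, classdict)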
Create your LineMakerBase class, and then use it like so:
C1 = LineMakerBase('C1', dict(a=0))
The metaclass was not meant to be used the way you are trying to use it. Check out this answer for advice on when metaclass subclasses are needed.
Some suggestions for your code:
the double try/except in format seems clearer as:
for member in cls:
    if member in kwargs:
        value = kwargs[member]
    elif member.name in kwargs:
        value = kwargs[member.name]
    else:
        value = member.default
this code:
# compare by member values because member could be an alias
members = list(type(member))
would be clearer with list(member.__class__)
and it has a false comment: listing an Enum class will never include the aliases (unless you have overridden that part of EnumMeta)
instead of the complicated __len__ code you have now, and since you are subclassing EnumMeta anyway, you could extend __new__ to calculate the lengths once, automatically:
# untested
def __new__(metacls, cls, bases, clsdict):
    # let the main EnumMeta code do the heavy lifting
    enum_cls = super(LineMakerMeta, metacls).__new__(metacls, cls, bases, clsdict)
    # go through the members and calculate the lengths
    canonical_members = [
        member
        for name, member in enum_cls.__members__.items()
        if name == member.name
        ]
    last_member = None
    for next_member in canonical_members:
        next_member.length = 0
        if last_member is not None:
            last_member.length = next_member.start - last_member.start
        last_member = next_member
The simplest way to create Enum subclasses on the fly is using Enum itself:
>>> from enum import Enum
>>> MyEnum = Enum('MyEnum', {'a': 0})
>>> MyEnum
<enum 'MyEnum'>
>>> MyEnum.a
<MyEnum.a: 0>
>>> type(MyEnum)
<class 'enum.EnumMeta'>
As for your custom methods, it might be simpler if you used regular functions, precisely because Enum implementation is so special.
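For example, the format logic from the question could live in a plain module-level function that takes the Enum class as an argument (a sketch reusing the member attributes defined above):

def format_line(line_cls, **kwargs):
    # line_cls is a LineMakerBase-style Enum; members provide .name, .default and .populate()
    result = ''
    for member in line_cls:
        if member in kwargs:
            value = kwargs[member]
        elif member.name in kwargs:
            value = kwargs[member.name]
        else:
            value = member.default
        result += member.populate(value)
    return result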
def pnamedtuple(type_name, field_names, mutable=False):
    pass

class type_name:
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self._fields = ['x','y']
        self._mutable = False

    def get_x(self):
        return self.x

    def get_y(self):
        return self.y

    def __getitem__(self,i):
        if i > 1 or i < 0:
            raise IndexError
        if i == 0 or i == 'x':
            return self.get_x():
        if i == 1 or i == 'y':
            return self.get_y():
The __getitem__ method overloads the [] (indexing) operator for this class: an index of 0 returns the value of the first field name in the field_names list; an index of 1 returns the value of the second field name, etc. The index can also be a string naming a field. So, for p = Point(1,2), writing p.get_x(), p[0], or p['x'] returns 1. Raise an IndexError with an appropriate message if the index is an out-of-bounds int or a string that does not name a field.
I am not sure how to fix the _getitem_ function. Below is bsc.txt:
c-->t1 = Triple1(1,2,3)
c-->t2 = Triple2(1,2,3)
c-->t3 = Triple3(1,2,3)
# Test __getitem__ functions
e-->t1[0]-->1
e-->t1[1]-->2
e-->t1[2]-->3
e-->t1['a']-->1
e-->t1['b']-->2
e-->t1['c']-->3
^-->t1[4]-->IndexError
^-->t1['d']-->IndexError
^-->t1[3.2]-->IndexError
Can someone tell me how to fix my _getitem_ function to get the output shown in bsc.txt? Many thanks.
You've spelled __getitem__ incorrectly. Magic methods require two __ underscores before and after them.
So you haven't overloaded the original __getitem__ method, you've simply created a new method named _getitem_.
Python 3 does not allow strings and integers to be compared with > or <; it's best to stick with == if you don't yet know the type of i. You could use isinstance, but here you can easily convert the only two valid integer values to strings (or vice versa), then work only on strings.
def __getitem__(self, i):
    if i == 0:
        i = "x"
    elif i == 1:
        i = "y"

    if i == "x":
        return self.get_x()
    elif i == "y":
        return self.get_y()
    else:
        raise IndexError("Invalid key: {}".format(i))
Your function is interesting, but there are some issues with it:
In Python 3 you can't compare strings with numbers, so you should first check with == against known values and/or types. For example:
def __getitem__(self,i):
    if i in {0,"x"}:
        return self.x
    elif i in {1,"y"}:
        return self.y
    else:
        raise IndexError(repr(i))
But defined like that (in your code or in the example above), for an instance t1, t1[X] will always fail for any string X other than "x" or "y", because you don't handle any other value. And that is because
pnamedtuple looks like it is meant to be a factory like collections.namedtuple, but it fails to be general enough: you don't use any of your function's arguments at all. And no, type_name is not used either; whatever value it has is thrown away when you make the class declaration.
How to fix it?
1. You need another way to store the values of the fields and their respective names, for example a dictionary; let's call it self._data.
2. To remember what your fields are called, use the arguments of your function, for instance self._fields = field_names.
3. To accept an unknown number of arguments, use * as in __init__(self, *values), then verify that you have the same number of values as fields and build the data structure from point 1 (the dictionary).
Once those are ready, __getitem__ becomes something like:
def __getitem__(self, key):
    if key in self._data:
        return self._data[key]
    elif isinstance(key, int) and 0 <= key < len(self._fields):
        return self._data[ self._fields[key] ]
    else:
        raise IndexError( repr(key) )
Or you can simply inherit from an appropriate namedtuple, and then the only thing you need to do is override its __getitem__, like:
def __getitem__(self,key):
    if key in self._fields:
        return getattr(self, key)
    return super().__getitem__(key)
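Putting those pieces together, a minimal pnamedtuple-style factory might look like this (my own sketch, not a drop-in solution for the assignment; it only covers construction and indexing):

def pnamedtuple(type_name, field_names, mutable=False):
    # accept either 'x y' / 'x,y' strings or a list of names
    if isinstance(field_names, str):
        field_names = field_names.replace(',', ' ').split()

    def __init__(self, *values):
        if len(values) != len(field_names):
            raise TypeError('expected {} arguments'.format(len(field_names)))
        self._fields = list(field_names)
        self._mutable = mutable
        self._data = dict(zip(self._fields, values))

    def __getitem__(self, key):
        if key in self._data:
            return self._data[key]
        if isinstance(key, int) and 0 <= key < len(self._fields):
            return self._data[self._fields[key]]
        raise IndexError(repr(key))

    return type(type_name, (object,), {'__init__': __init__,
                                       '__getitem__': __getitem__})

# hypothetical usage
Point = pnamedtuple('Point', 'x y')
p = Point(1, 2)
print(p[0], p['y'])   # 1 2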
I have created some Python classes to use as multivariate data structures, which are then used for various tasks. In some instances I like to populate the classes with various value sets. The default parameter file, "ho2.defaults", would look something like this:
name = 'ho2'
mass_option = 'h1o16'
permutation = 'odd'
parity = 'odd'
j_total = 10
lr = 40
br = 60
jmax = 60
mass_lr = 14578.471659
mass_br = 1781.041591
length_lr = ( 1.0, 11.0, 2.65 )
length_br = ( 0.0, 11.0, 2.46 )
use_spline = True
energy_units = 'au'
pes_zpe = -7.407998138300982E-2
pes_cutoff = 0.293994
Currently, I create a dictionary by reading the desired key/value pairs from the file, and now I'd like a "Pythonic" way of making those dictionary keys be class instance variable names, i.e.
# Instantiate Molecule class
molecule = Molecule()

# Create dictionary of default values
default_dict = read_dict_from_file(filename)

# Populate class instance variables with dictionary values
for key, value in default_dict.items():
    molecule.key = value
So the class's instance variable molecule.name could be set with the dictionary key/value pair. I could do this by hand, but I'm sure there is a better way to loop through it. In actuality, the dictionary could be large, and I'd rather allow the user to choose which values they want to populate, so the dictionary could change. What am I missing here?
You would use setattr: setattr(molecule, key, value)
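For example (a sketch, assuming default_dict is a plain dict as above):

for key, value in default_dict.items():
    setattr(molecule, key, value)   # e.g. sets molecule.name = 'ho2'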
The simple way is:
vars(molecule).update(default_dict)
This will clobber any pre-existing attributes though. For a more delicate approach try:
for name, value in default_dict.items():
    if not hasattr(molecule, name):
        setattr(molecule, name, value)
I'd invert the logic so that the object dynamically answers questions:
class Settings(object):
    ATTRS = {'foo', 'bar'}

    def __init__(self, defaults):
        self.__dict__['data'] = defaults.copy()

    def __getattr__(self, key):
        if key not in self.ATTRS or key not in self.data:
            raise AttributeError("'{}' object has no attribute '{}'".format(
                self.__class__.__name__, key))
        return self.data[key]

    def __setattr__(self, key, value):
        self.data[key] = value
s = Settings({'a': 'b', 'foo': 'foo!', 'spam': 'eggs'})
print s.foo
try:
    print s.spam
except AttributeError:
    pass
else:
    raise AssertionError("That should have failed because 'spam' isn't in Settings.ATTRS")
try:
    print s.bar
except AttributeError:
    pass
else:
    raise AssertionError("That should have failed because 'bar' wasn't passed in")
class Molecule(Settings):
    ATTRS = {'name', 'mass_option', ...}

molecule = Molecule(default_dict)
I want to dynamically add values in a nested dictionary. I am trying to cache similarity score of two words with their part-of-speech-tag.
In short I want to store values as this;
synset_cache[word1][word1_tag][word2][word2_tag] = score
class MyClass(Object):

    def __init__(self):
        MyClass.synset_cache={} #dict

    def set_cache(self,word1, word1_tag, word2, word2_tag, score)
        try:
            MyClass.synset_cache[word1]
        except:
            MyClass.synset_cache[word1]={} #create new dict
        try:
            MyClass.synset_cache[word1][word1_tag]
        except:
            MyClass.synset_cache[word1][word1_tag]={} #create new dict
        try:
            MyClass.synset_cache[word1][word1_tag][word2]
        except:
            MyClass.synset_cache[word1][word1_tag][word2]={} #create new dict

        #store the value
        MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score
But I am getting this error.
TypeError: list indices must be integers, not unicode
The line number it points to is the line MyClass.synset_cache[word1][word1_tag]={} #create new dict.
How can I get this working?
EDIT:
According to @Robᵩ's comments on his answer, I was assigning a list to MyClass.synset_cache in another method (note it is at the class level), so this code part had no errors.
Use dict.setdefault.
This might work:
#UNTESTED
d = MyClass.synset_cache.setdefault(word1, {})
d = d.setdefault(word1_tag, {})
d = d.setdefault(word2, {})
d[word2_tag] = score
Alternatively, you can use this handy recursive defaultdict that springs up new levels of dict automatically. (See: here and here.)
import collections

def tree():
    return collections.defaultdict(tree)

class MyClass(Object):

    def __init__(self):
        MyClass.synset_cache=tree()

    def set_cache(self, word1, word1_tag, word2, word2_tag, score):
        MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score
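For example (illustrative words and score):

>>> cache = tree()
>>> cache['dog']['NN']['cat']['NN'] = 0.85   # intermediate dicts spring up automatically
>>> cache['dog']['NN']['cat']['NN']
0.85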
This will be data dependent, as at least for some test data (see below), the code does not produce that error. How are you calling it?
Also, note that as written above, it won't compile due to some syntax errors (i.e. no colon to end the def set_cache line).
Below is some tweaked-to-compile code with some example calling data and how that pretty-prints:
#!/usr/bin/env python

import pprint

class MyClass():

    def __init__(self):
        MyClass.synset_cache={} #dict

    def set_cache(self,word1, word1_tag, word2, word2_tag, score):
        try:
            MyClass.synset_cache[word1]
        except:
            MyClass.synset_cache[word1]={} #create new dict
        try:
            MyClass.synset_cache[word1][word1_tag]
        except:
            MyClass.synset_cache[word1][word1_tag]={} #create new dict
        try:
            MyClass.synset_cache[word1][word1_tag][word2]
        except:
            MyClass.synset_cache[word1][word1_tag][word2]={} #create new dict

        #store the value
        MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score

x = MyClass()
x.set_cache('foo', 'foo-tag', 'bar', 'bar-tag', 100)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(x.synset_cache)
Which outputs:
{ 'foo': { 'foo-tag': { 'bar': { 'bar-tag': 100}}}}
A couple other things of note...
I'd recommend using the in operator to check for key presence rather than try/except (see the sketch below these notes). It's more compact and more Pythonic.
Also, your main variable, synset_cache, is class-level (i.e. static). Did you mean for that to be the case?
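For instance, the first two levels of set_cache could be written like this (a sketch of the in-operator style, not a full replacement):

if word1 not in MyClass.synset_cache:
    MyClass.synset_cache[word1] = {}
if word1_tag not in MyClass.synset_cache[word1]:
    MyClass.synset_cache[word1][word1_tag] = {}
# ...and so on for word2 and word2_tag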
I'm implementing a caching service in Python. I'm using a simple dictionary so far. What I'd like to do is count the number of hits (the number of times a stored value was retrieved by its key). The Python built-in dict has no such capability (as far as I know). I searched for 'python dictionary count' and found Counter (also on Stack Overflow), but I don't think it satisfies my requirements. I don't need to count what already exists; I need to increment something that comes from outside. And I think that storing another dictionary with only the hit counts is not the best data structure I can get :)
Do you have any ideas how to do it efficiently?
For an alternative method, if you're using Python 3 (or are willing to add this module to your Python 2 project, which has a slightly different interface), I strongly recommend the lru_cache decorator.
See the docs here. For example, this code:
from functools import lru_cache

@lru_cache(maxsize=32)
def meth(a, b):
    print("Taking some time", a, b)
    return a + b

print(meth(2, 3))
print(meth(2, 4))
print(meth(2, 3))
...will output:
Taking some time 2 3
5
Taking some time 2 4
6
5 <--- Notice that this function result is cached
As per the documentation, you can get the number of hits and misses with meth.cache_info(), and clear the cache with meth.cache_clear().
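For the three calls above, that looks roughly like this (the CacheInfo field order is as in the functools docs):

>>> meth.cache_info()
CacheInfo(hits=1, misses=2, maxsize=32, currsize=2)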
You can subclass a built-in dict class:
class CustomDict(dict):
    def __init__(self, *args, **kwargs):
        self.hits = {}
        super(CustomDict, self).__init__(*args, **kwargs)

    def __getitem__(self, key):
        if key not in self.hits:
            self.hits[key] = 0
        self.hits[key] += 1
        return super(CustomDict, self).__getitem__(key)
usage:
>>> d = CustomDict()
>>> d["test"] = "test"
>>> d["test"]
'test'
>>> d["test"]
'test'
>>> d.hits["test"]
2
Having another dictionary to store the hit counts is probably not a bad option, but you could also do something like:
class CacheService(object):
    def __init__(self):
        self.data = {}

    def __setitem__(self, key, item):
        self.data[key] = [item, 0]

    def __getitem__(self, key):
        value = self.data[key]
        value[1] += 1
        return value[0]

    def getcount(self, key):
        return self.data[key][1]
You can use it something like this:
>>> cs = CacheService()
>>> cs[1] = 'one'
>>> cs[2] = 'two'
>>> print cs.getcount(1)
0
>>> cs[1]
'one'
>>> print cs.getcount(1)
1
It will be much easier to just overload the built-in dict data type. This will solve your problem.
class CountDict(dict):

    count = {}

    def __getitem__(self, key):
        CountDict.count[key] = CountDict.count.get(key, 0) + 1
        return super(CountDict, self).__getitem__(key)

    def __setitem__(self, key, value):
        return super(CountDict, self).__setitem__(key, value)

    def get_count(self, key):
        return CountDict.count.get(key, 0)
This will give you a lot more flexibility. For example, you could keep two counts, one for the number of reads and another for the number of writes, without much added complexity. To learn more about super, see here.
Edited to meet the OP's need of keeping a count of reads for a key. The count can be obtained by calling the get_count method.
>>> my_dict = CountDict()
>>> my_dict["a"] = 1
>>> my_dict["a"]
1
>>> my_dict["a"]
1
>>> my_dict.get_count("a")
2
You could try this approach.
class AccessCounter(object):
    '''A class that contains a value and implements an access counter.
    The counter increments each time the value is changed.'''

    def __init__(self, val):
        super(AccessCounter, self).__setattr__('counter', 0)
        super(AccessCounter, self).__setattr__('value', val)

    def __setattr__(self, name, value):
        if name == 'value':
            super(AccessCounter, self).__setattr__('counter', self.counter + 1)
        # Make this unconditional.
        # If you want to prevent other attributes to be set, raise AttributeError(name)
        super(AccessCounter, self).__setattr__(name, value)

    def __delattr__(self, name):
        if name == 'value':
            super(AccessCounter, self).__setattr__('counter', self.counter + 1)
        super(AccessCounter, self).__delattr__(name)
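Hypothetical usage: only assignments to .value bump the counter, and the initial value set in __init__ does not count.

entry = AccessCounter('initial')
entry.value = 'updated'
entry.value = 'updated again'
print(entry.counter)   # 2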