I would like to use in Python something akin to -- or better than -- R arrays. R arrays are tensor-like objects with a dimnames attribute, which allows one to straightforwardly subset tensors based on names (strings). In numpy, recarrays allow for column names, and pandas allows flexible and efficient subsetting of 2-dimensional arrays. Is there something in Python that allows operations similar to slicing and subsetting of ndarrays by using names (or better, objects that are hashable and immutable in Python)?
How about this quick and dirty mapping from lists of strings to indices? You could clean up the notation with callable classes.
def make_dimnames(names):
    """Turn per-axis name sequences into per-axis {name: index} dicts."""
    return [{label: idx for idx, label in enumerate(axis)} for axis in names]

def foo(d, *args):
    """Translate each name in *args through mapping d into an index list."""
    return [d[label] for label in args]

A = np.arange(9).reshape(3, 3)
dimnames = [('x', 'y', 'z'), ('a', 'b', 'c')]
Adims = make_dimnames(dimnames)
A[foo(Adims[0],'x','z'),foo(Adims[1],'b')] # A[[0,2],[1]]
A[foo(Adims[0],'x','z'),slice(*foo(Adims[1],'b','c'))] # A[[0,2],slice(1,2)]
Or does R do something more significant with the dimnames?
A class compresses the syntax a bit:
class bar(object):
    """Name->index translator for one axis's dimension names.

    Calling an instance with several names returns their indices as a
    list; subscripting with one name returns that single index.
    """

    def __init__(self, dimnames):
        # Build the reverse lookup table once: label -> position.
        self.dd = {label: pos for pos, label in enumerate(dimnames)}

    def __call__(self, *args):
        indices = []
        for label in args:
            indices.append(self.dd[label])
        return indices

    def __getitem__(self, key):
        return self.dd[key]
# One translator per axis; slice bounds are built from the same lookups.
d0, d1 = bar(['x','y','z']), bar(['a','b','c'])
A[d0('x','z'),slice(*d1('a','c'))]
http://docs.scipy.org/doc/numpy/user/basics.subclassing.html
subclassing ndarray, with a simple example of adding an attribute (which could be dimnames). Presumably extending the indexing to use that attribute shouldn't be hard.
Inspired by the use of __getitem__ in numpy/index_tricks, I've generalized the indexing:
class DimNames(object):
    """Translate string-labelled index expressions into integer ones.

    Built from a list of per-axis name sequences. ``dd[...]`` accepts the
    same kinds of keys numpy indexing does (ints, names, tuples, lists,
    slices, None) and returns the positional equivalent, so that
    ``A[dd['x', 'b']]`` behaves like ``A[0, 1]``.

    Fixes over the original: the tuple branch of ``__getitem__`` shadowed
    ``key`` with its own loop variable, a commented-out debug print was
    removed, and ``KeyError`` now reports the offending key.
    """

    def __init__(self, dimnames):
        # One {name: index} dict per axis, in axis order.
        self.dd = [{n: i for i, n in enumerate(names)} for names in dimnames]

    def __getitem__(self, key):
        # A top-level tuple means one sub-key per axis.
        if isinstance(key, tuple):
            return tuple(self.parse_key(k, self.dd[i])
                         for i, k in enumerate(key))
        # Anything else indexes the first axis only.
        return self.parse_key(key, self.dd[0])

    def parse_key(self, key, dd):
        """Recursively convert one axis's key using mapping dd."""
        if key is None:
            return key          # e.g. an omitted slice bound
        if isinstance(key, int):
            return key          # already a positional index
        if isinstance(key, str):
            return dd[key]      # a single name
        if isinstance(key, tuple):
            return tuple(self.parse_key(k, dd) for k in key)
        if isinstance(key, list):
            return [self.parse_key(k, dd) for k in key]
        if isinstance(key, slice):
            return slice(self.parse_key(key.start, dd),
                         self.parse_key(key.stop, dd),
                         self.parse_key(key.step, dd))
        raise KeyError(key)
# Demonstration of the generalized indexing; expected plain-integer
# equivalents are shown in the trailing comments.
dd = DimNames([['x','y','z'], ['a','b','c']])
# NOTE(review): Python 2 print statements; wrap the expressions in
# print(...) for Python 3.
print A[dd['x']] # A[0]
print A[dd['x','c']] # A[0,2]
print A[dd['x':'z':2]] # A[0:2:2]
print A[dd[['x','z'],:]] # A[[0,2],:]
print A[dd[['x','y'],'b':]] # A[[0,1], 1:]
print A[dd[:'z', :2]] # A[:2,:2]
I suppose further steps would be to subclass A, add dd as attribute, and change its __getitem__, simplifying the notation to A[['x','z'],'b':].
Related
For example we have large list of objects like this:
class KeyStatisticEntry:
    """Statistics about one stored value (sizes, encoding, overhead).

    NOTE(review): the post's formatting mangled the ``@property``
    decorators into ``#property`` comments; they are restored here.
    ``get_string_encoding``, ``get_object_overhead`` and
    ``some_func_with`` are defined elsewhere in the asker's code.
    """

    def __init__(self, value=""):
        self.usedBytes = len(value)
        self.encoding = get_string_encoding(value)

    @property
    def total(self):
        """Bytes used, including per-object overhead."""
        overhead = get_object_overhead(self.usedBytes)
        # NOTE(review): 'some value' is a placeholder from the post;
        # presumably a specific encoding stores data inside the overhead.
        if self.encoding == 'some value':
            return overhead
        else:
            return self.usedBytes + overhead

    @property
    def aligned(self):
        return some_func_with(self.usedBytes)

    # Here is lots of calculated properties on basis of existing properties
# Here is lots of calculated properties on basis of existing properties
And we need to aggregate lots of metrics about this object - min, max, sum, mean, stdev values of its properties. Currently I do it with code like this:
# Collect each attribute of interest into its own parallel list.
used_bytes = []
total_bytes = []
aligned_bytes = []
encodings = []
# NOTE(review): `keys.items()` reads like a dict, whose .items() yields
# (key, value) tuples rather than objects with .usedBytes -- presumably
# `keys` is a custom container here; verify against its definition.
for obj in keys.items():
used_bytes.append(obj.usedBytes)
total_bytes.append(obj.total)
aligned_bytes.append(obj.aligned)
encodings.append(obj.encoding)
# Reduce the collected columns.
total_elements = len(used_bytes)
used_user = sum(used_bytes)
used_real = sum(total_bytes)
aligned = sum(aligned_bytes)
mean = statistics.mean(used_bytes)
Question:
Is there a more "pythonic" way with better performance and memory usage?
You can use operator.attrgetter in order to get multiple attribute of your objects then use itertools.zip_longest (itertools.izip_longest in Python 2.X ) to attach the relative attributes together.
from operator import attrgetter
# One attrgetter call per object returns a 4-tuple of its attributes.
all_result = [attrgetter('usedBytes','total','aligned','encoding')(obj) for obj in keys.items()]
Or use a generator expression to create a generator instead of a list :
# Generator variant: no intermediate list is materialized.
all_result = (attrgetter('usedBytes','total','aligned','encoding')(obj) for obj in keys.items())
Then use zip_longest:
# Transpose the per-object 4-tuples into per-attribute columns.
# NOTE(review): fixed `all_results` -> `all_result`, the name actually
# defined above.
used_bytes, total_bytes, aligned_bytes, encodings = zip_longest(*all_result)
Then use map function to apply the sum function on iterables for which you need the sum:
# Apply sum to each numeric column in turn.
used_user, used_real, aligned = map(sum,(used_bytes, total_bytes, aligned_bytes))
And separately for len and mean:
# len and mean do not share sum's reduction, so compute them directly.
total_elements = len(used_bytes)
mean = statistics.mean(used_bytes)
And if you want to handle all the sub-lists as generators (which is more optimized in terms of memory use but slower in terms of runtime), you can use a new class in order to calculate the desired results separately using generators:
from itertools import tee

class Aggregator:
    """Compute aggregate statistics over a sequence of entry objects.

    ``all_obj`` must be a sequence (it is iterated several times and
    ``len()`` is taken of it).

    Fixes over the original: ``__init__`` stored the input as
    ``self.obj`` while every method read ``self.all_obj``, which raised
    AttributeError; the attribute is now consistently ``self.all_obj``.
    """

    def __init__(self, all_obj):
        self.all_obj = all_obj
        # usedBytes feeds both sum and mean; tee so each object's
        # attribute is read only once.
        self.used_user, self.mean = self.getTotalBytesAndMean()
        self.total_elements = len(self.all_obj)
        self.aligned = self.getAligned()

    def getTotalBytesAndMean(self):
        """Return (sum, mean) of usedBytes over all objects."""
        iter_1, iter_2 = tee(obj.usedBytes for obj in self.all_obj)
        return sum(iter_1), statistics.mean(iter_2)

    def getTotal(self):
        """Sum of the computed `total` property over all objects."""
        return sum(obj.total for obj in self.all_obj)

    def getAligned(self):
        """Sum of the `aligned` property over all objects."""
        return sum(obj.aligned for obj in self.all_obj)

    def getEncoding(self):
        """Lazy generator of each object's encoding."""
        return (obj.encoding for obj in self.all_obj)
Then you can do :
Agg = Aggregator(keys.items())
# And simply access to attributes
Agg.used_user
There is a probably better way for memory usage, using (implicit) generators instead of lists for getting all your infos. I am not sure it will be better if you are doing many computations on the same list (for example for usedBytes). Note however that you cannot use len on a generator (but the length would be the length of your input list anyway):
# Generator-based variant: lower peak memory than building lists, but
# keys.items() is traversed once per aggregate (four passes in total).
total_elements = len(keys.items())
used_user = sum(obj.usedBytes for obj in keys.items())
used_real = sum(obj.total for obj in keys.items())
aligned = sum(obj.aligned for obj in keys.items())
mean = statistics.mean(obj.usedBytes for obj in keys.items())
When Python takes the intersection of two sets, it always returns elements from the smaller one, which is reasonable in nearly all cases, but I'm trying to do the opposite.
In the piece of code below, note that the intersection yields an integer, not a float.
[in] >>> x = {1.0,2.0,3.0}
[in] >>> y = {1}
[in] >>> x.intersection(y)
[out] >>> {1}
[in] >>> y.intersection(x)
[out] >>> {1}
If I want to get a float back, I have to use some heavy copying.
[in] >>> x - y
[out] >>> {2.0,3.0}
[in] >>> x - (x - y)
[out] >>> {1.0}
I'm dealing with much larger sets than the example above. My question is whether there's any way to trick Python set.intersection method into returning elements from the larger set, or if there another method that can return the float 1.0 besides what I've done here.
The reason why I'm doing this in the first place is I'm trying to implement a frozen dictionary in pure python by sub-classing frozenset. I'm storing the key-value pairs using a subclass of tuple I call "Item" where hash returns the hash of the key only. Using the code below, I'm able to create a set with a single key-value pair inside of it. Then I extract the attribute "value" and return it.
# Dictionary lookup on a frozenset subclass, exploiting Item's key-only
# hashing: membership and set subtraction compare on the key alone.
def __getitem__(self, key):
# Wrap the bare key as an Item so it hashes/compares like a stored pair.
# NOTE(review): Item and its `flag` argument are defined elsewhere --
# flag=False presumably marks a probe-only item; confirm.
wrapped = Item((key,),flag=False)
if not frozenset.__contains__(self, wrapped):
raise KeyError(key)
# self - (self - {wrapped}) copies the whole set twice, so each hit is
# O(n) -- this is the slowness reported above.
matches = self - (self - {wrapped})
for pair in frozenset.__iter__(matches):
return pair.value
I know that the copying is the reason for the slowness because when I try to return an item whose key is not in the dictionary, I get a KeyError immediately, even for sets with 10 million items.
At the risk of answering something different than what you actually asked for (but maybe helping with the end-goal)... a Frozen dict is actually really easy to implement in python:
try:
    # Mapping lives in collections.abc since 3.3; the bare
    # `from collections import Mapping` stopped working in 3.10.
    from collections.abc import Mapping
except ImportError:  # Python 2 fallback
    from collections import Mapping

class FrozenDict(Mapping):
    """An immutable, hashable read-only view over a plain dict.

    Fixes over the original: ``__iter__`` read the nonexistent attribute
    ``self._dict`` (the data lives in ``self._data``), and ``__hash__``
    computed the hash but never returned it.
    """

    def __init__(self, *args, **kwargs):
        self._hash = None  # defer calculating hash until needed.
        self._data = dict(*args, **kwargs)

    def __getitem__(self, item):
        return self._data[item]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __repr__(self):
        # type(self).__name__ rather than type(self), so the repr reads
        # FrozenDict({...}) instead of <class '...'>({...}).
        return '{}({!r})'.format(type(self).__name__, self._data)

    def __hash__(self):
        if self._hash is not None:
            return self._hash
        # Only hashable if the items are hashable; cache the result.
        self._hash = hash(tuple(self.items()))
        return self._hash
x = FrozenDict({'a': 'b'})
# NOTE(review): Python 2 print statement; use print(x) on Python 3.
print x
# Raises TypeError: FrozenDict defines no __setitem__ -- that is the point.
x['c'] = 'Bad Bad Bad'
Of course, this isn't truly frozen (in the same sense that a frozenset is frozen). A user could reach in and modify the data on the frozendict -- But then they deserve any code breakages that they cause.
To answer your actual question, the only alternative that I can think of is to define your own intersection function:
>>> s1 = set([1])
>>> s2 = set([1., 2.])
>>> def intersection(s1, s2):
... return set(x for x in s1 if x in s2)
...
>>> intersection(s1, s2)
set([1])
>>> intersection(s2, s1)
set([1.0])
This one always returns sets, but you could easily modify to return frozenset or the type of the input if you make the assumption that the type of the first input has a constructor that accepts only an iterable:
def intersection(s1, s2):
    """Intersect two sets, always drawing the elements from s1.

    The result is built as whatever type s1 is (set, frozenset, ...),
    assuming that type's constructor accepts a single iterable.
    """
    survivors = (member for member in s1 if member in s2)
    return type(s1)(survivors)
I'm trying to write a python (2.7) matrix module. (I know about numpy, this is just for fun.)
My Code:
from numbers import Number
import itertools
# Sample fixtures: a 3x3 nested list and a 3x3x3 nested list.
test2DMat = [[1,2,3],[4,5,6],[7,8,9]]
test3DMat = [[[1,2,3],[4,5,6],[7,8,9]],[[2,3,4],[5,6,7],[8,9,0]],[[9,8,7],[6,5,4],[3,2,1]]]
# The asker's code (Python 2.7): a Dim is a list of equal-length iterables;
# __new__ dispatches to Vec when every element is a Number.
class Dim(list):
def __new__(cls,inDim):
# If every item in inDim is a number create a Vec
# NOTE(review): this all(...) scan consumes `inDim` when it is a
# one-shot iterator -- exactly the bug the question describes.
if all(isinstance(item,Number) for item in inDim):
#return Vec(inDim)
return Vec.__new__(cls,inDim)
# Otherwise create a Dim
return list.__new__(cls,inDim)
def __init__(self,inDim):
# Make sure every item in inDim is iterable
try:
for item in inDim: iter(item)
except TypeError:
raise TypeError('All items in a Dim must be iterable')
# Make sure every item in inDim has the same length
# or that there are zero items in the list
if len(set(len(item) for item in inDim)) > 1:
raise ValueError('All lists in a Dim must be the same length')
# Recursively wrap the children. NOTE(review): on Python 3 map()
# returns an iterator, so this would need list(map(Dim, inDim)).
inDim = map(Dim,inDim)
list.__init__(self,inDim)
# Innermost dimension: a flat container of numbers.
class Vec(Dim):
def __new__(cls,inDim):
# When construction was requested for some other subclass (e.g.
# Matrix) but the data turned out to be numbers, build a plain Vec
# and initialise it explicitly: Python only calls __init__
# implicitly when __new__ returns an instance of the requested cls.
if cls.__name__ not in [Vec.__name__,Dim.__name__]:
newMat = list.__new__(Vec,inDim)
newMat.__init__(inDim)
return newMat
return list.__new__(Vec,inDim)
def __init__(self,inDim):
# Skip Dim's validation: a Vec holds numbers, not iterables.
list.__init__(self,inDim)
# Public entry point; all validation and dispatch happen in Dim.
class Matrix(Dim):
def __new__(cls,inMat):
return Dim.__new__(cls,inMat)
def __init__(self,inMat):
super(Matrix,self).__init__(inMat)
Current Functionality:
So far I have written a few classes, Matrix, Dim, and Vec. Matrix and Vec are both subclasses of Dim. When creating a matrix, one would first start out with a list of lists and they would create a matrix like:
>>> startingList = [[1,2,3],[4,5,6],[7,8,9]]
>>> matrix.Matrix(startingList)
[[1,2,3],[4,5,6],[7,8,9]]
This should create a Matrix. The created Matrix should contain multiple Dims all of the same length. Each of these Dims should contain multiple Dims all of the same length, etc. The last Dim, the one that contains numbers, should contain only numbers and should be a Vec instead of a Dim.
The Problem:
All of this works, for lists. If I were however, to use an iterator object instead (such as that returned by iter()) this does not function as I want it to.
For example:
>>> startingList = [[1,2,3],[4,5,6],[7,8,9]]
>>> matrix.Matrix(iter(startingList))
[]
My Thoughts:
I'm fairly certain that this is happening because in Dim.__new__ I iterate over the input iterable which, when the same iterable is then passed to Matrix.__init__ it has already been iterated over and will therefore appear to be empty, resulting in the empty matrix that I get.
I have tried copying the iterator using itertools.tee(), but this also doesn't work because I don't actually call Matrix.__init__ it gets called implicitly when Matrix.__new__ returns and I therefore cannot call it with different parameters than those passed to Matrix.__init__. Everything I have thought of to do comes up against this same problem.
Is there any way for me to preserve the existing functionality and also allow matrix.Matrix() to be called with an iterator object?
The key is that Vec.__init__ is getting called twice; once inside your __new__ method and once when you return it from the __new__ method. So if you mark it as already initialised and return early from Vec.__init__ if it is already initialised, then you can ignore the second call:
# Answer demo: __init__ runs twice (once explicitly inside B.__new__ and
# once implicitly after A.__new__ returns), so guard with a sentinel
# attribute and return early on the second call.
class A(object):
def __new__(cls, param):
return B.__new__(cls, param + 100)
class B(A):
def __new__(cls, param):
b = object.__new__(B)
b.__init__(param)
return b
def __init__(self, param):
# Already initialised by the explicit call -- skip the implicit one.
if hasattr(self, 'param'):
# NOTE(review): Python 2 print statement.
print "skipping __init__", self
return
self.param = param
print A(5).param
What you would need to do is check if the variable that is passed in is a tuple or list. If it is then you can use it directly, otherwise you need to convert the iterator into a list/tuple.
# Normalise the input: sequences are used as-is; other iterables are
# materialised so they can be traversed more than once.
# NOTE(review): fixed the `hastattr` typo, and gave the final branch a
# real statement -- a bare comment after `else:` is a syntax error.
if isinstance(inDim, collections.Sequence):
    pass
elif hasattr(inDim, '__iter__'):  # this is better than using iter()
    inDim = tuple(inDim)
else:
    # item is not iterable
    raise TypeError('inDim is not iterable')
There is also a better way of checking that the length of all the lists are the same:
# Verify all rows have equal length by comparing each against the first,
# short-circuiting on the first mismatch instead of collecting every
# length into a set.
# NOTE(review): replaced the Python-2-only `len_iter.next()` with the
# built-in next(), which works on Python 2.6+ and 3.
if len(inDim) > 0:
    len_iter = (len(item) for item in inDim)
    first_len = next(len_iter)
    for other_len in len_iter:
        if other_len != first_len:
            raise ValueError('All lists in a Dim must be the same length')
[Sorry, I'm new in Python. Although it seems to be a very basic question, I did my share of due diligence before asking this audience, trying to avoid really stupid questions].
I'm trying to figure out the correct idiom for returning an l-value from a function. Assume I've a container of 64 objects, and I want to be able to return a reference to these objects.
# The asker's sketch -- not valid Python as written: `square` is missing
# the `def` keyword and a `self` parameter, and `self.squares(...)` CALLS
# the list instead of indexing it (should be self.squares[row*8+col]).
# Kept verbatim because the brokenness is what the question is about.
class ChessBoard:
def __init__(self):
self.squares = [None for x in range(64)]
square( row, col ):
return self.squares(row*8+col) <---- I'd like this to be l-value
Then, from outside the class I want to:
board = ChessBoard()
board.square(0,0) = Piece( Shapes.ROOK, Colors.White ) <-- I'm getting an error here
board.square(0,1) = Piece( Shapes.BISHOP, Colors.White )
... etc.
So, I would like the function 'at' to return a lvalue (Something like a reference in C++), but I can't find anything resembling a reference or a pointer in the language. If I stored a list in each square containing one Piece, it is possible I could do something like: board.square(0,0)[0] = Piece - but it seems crazy (or maybe not - as I said, I'm new to the language).
How would you approach this data structure?
In Python, everything is a reference. The only problem is that None is immutable, so you can't use the returned reference to change the value.
You also can't override the assignment operator, so you won't get this particular kind of behaviour. However, a good and very flexible solution would be to override the __setitem__ and __getitem__ methods to implement the subscription operator ([]) for the class:
class ChessBoard(object):
    """8x8 chess board: subscription with a (row, col) pair maps onto a
    flat 64-element list, so squares read and write like board[r, c]."""

    def __init__(self):
        self.squares = [None for _ in range(64)]

    @staticmethod
    def _flat(key):
        # Unpack the (row, col) subscript into a flat-list offset.
        row, col = key
        return row * 8 + col

    def __setitem__(self, key, value):
        self.squares[self._flat(key)] = value

    def __getitem__(self, key):
        return self.squares[self._flat(key)]
Usage:
>>> c = ChessBoard()
>>> c[1,2] = 5
>>> c[1,2]
5
You can try something like this, at the cost of having to put bogus [:] indexers around:
class Board:
    """Board whose square(row, col) returns a tiny proxy; reading or
    writing any subscript of the proxy (conventionally [:]) forwards to
    the underlying flat 64-slot list, giving l-value-like behaviour."""

    def __init__(self):
        self.squares = [None] * 64

    def square(self, row, col):
        cells = self.squares
        offset = row * 8 + col

        class Prox:
            # The subscript argument is ignored; it exists only so that
            # [] syntax can appear on the left of an assignment.
            def __getitem__(self, _ignored):
                return cells[offset]

            def __setitem__(self, _ignored, value):
                cells[offset] = value

        return Prox()
Then you can do
b=Board()
b.square(2,3)[:]=Piece('Knight')
if b.square(x,y)[:] == Piece('King') ...
And so on. It doesn't actually matter what you put in the []s, it just has to be something.
(Got the idea from the Proxies Perl6 uses to do this)
As Niklas points out, you can't return an l-value.
However, in addition to overriding subscription, you can also use properties (an application of descriptors: http://docs.python.org/howto/descriptor.html) to create an object attribute, which when read from, or assigned to, runs code.
(Not answering your question in the title, but your "How would you approach this data structure?" question:) A more pythonic solution for your data structure would be using a list of lists:
# define a function that generates an empty chess board
# NOTE(review): a def statement is preferred over assigning a lambda to a
# name (PEP 8), and range() replaces the Python-2-only xrange().
def make_chess_board():
    """Return an empty 8x8 board: a list of 8 rows of 8 Nones."""
    return [[None for _ in range(8)] for _ in range(8)]
# grab an instance
b = make_chess_board()
# play the game!
# NOTE(review): Piece, Shapes and Colors come from the asker's code and
# are not defined in this snippet.
b[0][0] = Piece(Shapes.ROOK, Colors.White)
b[0][1] = Piece(Shapes.BISHOP, Colors.White)
# Or use tuples:
b[0][0] = (Shapes.ROOK, Colors.White)
b[0][1] = (Shapes.BISHOP, Colors.White)
I have the following two Python functions:
# NOTE(review): restored `@classmethod`, which the post's formatting had
# mangled into the comment `#classmethod`.
@classmethod
def serialize_dict(cls, d):
    """Space-join the distinguishing-column values of dict d, rendering
    None as an empty string."""
    values = []
    for column_name in cls().distinguishing_column_names():
        value = str(d[column_name])
        if value == 'None':
            value = ''
        values.append(value)
    return ' '.join(values)
# NOTE(review): restored `@classmethod` (mangled to `#classmethod` in the
# post). Identical to serialize_dict except for how each value is fetched.
@classmethod
def serialize_row(cls, row):
    """Space-join the distinguishing-column values of a result row,
    rendering None as an empty string."""
    values = []
    for column_name in cls().distinguishing_column_names():
        value = str(row.value(cls()._meta.db_table, column_name))
        if value == 'None':
            value = ''
        values.append(value)
    return ' '.join(values)
As you can see, the two functions are identical except for the first line of the for loop. Not very DRY. How could I refactor this code to take out all the repetitions, given that row and d are of different types (dict and a custom type of mine, respectively)?
Why don't you just implement the relevant bits of the dict interface in your custom type?
So that row[column_name] results in the code you want?
You use the __getitem__ special method for this.
Add an if isinstance(arg, dict) to determine whether to treat it as a row or dict, then merge the two methods together.
You could implement the function that serialise the bit into two different lambda functions that can then used as parameters of a single serialisation method:
# Each lambda extracts one column's raw value; they are passed into the
# shared __serialize core as the fetch strategy.
ds = lambda d, cls, column_name: str(d[column_name])
rs = lambda row, cls, column_name: str(row.value(cls()._meta.db_table, column_name))

def __serialize(cls, d, ser):
    """Shared core: space-join the distinguishing-column values, with
    None rendered as ''; `ser` fetches one value from the source."""
    values = []
    for column_name in cls().distinguishing_column_names():
        value = ser(d, cls, column_name)
        if value == 'None':
            value = ''
        values.append(value)
    return ' '.join(values)

# NOTE(review): restored the `@classmethod` decorators (mangled into
# `#classmethod` comments in the post) and fixed serialize_row, which
# passed the undefined name `d` instead of its own `row` parameter.
@classmethod
def serialize_dict(cls, d):
    return __serialize(cls, d, ds)

@classmethod
def serialize_row(cls, row):
    return __serialize(cls, row, rs)
If the row instance can get hold of the name of its table, add a __ getitem __ method to make it behave like the dictionary. Otherwise, wrap it with a proxy object that knows the table name.