Group similar dict entries as a tuple of keys - python

I would like to group similar entries of a dataset.
ds = {1: 'foo',
2: 'bar',
3: 'foo',
4: 'bar',
5: 'foo'}
>>>tupelize_dict(ds)
{
(1,3,5): 'foo',
(2,4): 'bar'
}
I wrote this function, but I am sure there is something way simpler, isn't?
def tupelize_dict(data):
from itertools import chain, combinations
while True:
rounds = []
for x in combinations(data.keys(), 2):
rounds.append((x, data[x[0]], data[x[1]]))
end = True
for k, a, b in rounds:
if a == b:
k_chain = [x if isinstance(x, (tuple, list)) else [x] for x in k]
data[tuple(sorted(chain.from_iterable(k_chain)))] = a
[data.pop(r) for r in k]
end = False
break
if end:
break
return data
EDIT
I am interested in the general case where the content of the dataset can be any type of object that allows ds[i] == ds[j]:
ds = {1: {'a': {'b':'c'}},
2: 'bar',
3: {'a': {'b':'c'}},
4: 'bar',
5: {'a': {'b':'c'}}}

something like this should do the trick:
>>> from collections import defaultdict
>>> ds = {1: 'foo',
... 2: 'bar',
... 3: 'foo',
... 4: 'bar',
... 5: 'foo'}
>>>
>>> d = defaultdict(list)
>>> for k, v in ds.items():
... d[v].append(k)
...
>>> res = {tuple(v): k for k, v in d.items()}
>>> res
{(1, 3, 5): 'foo', (2, 4): 'bar'}

as well as you could do something like this.
def tupelize_dict(ds):
cache = {}
for key, value in ds.items():
cache.setdefault(value, []).append(key)
return {tuple(v): k for k, v in cache.items()}
ds = {1: 'foo',
2: 'bar',
3: 'foo',
4: 'bar',
5: 'foo'}
print(tupelize_dict(ds))

Following the answer of acushner, it is possible to make it work if I can compute a hash of the content of dataset's elements.
import pickle
from collections import defaultdict
def tupelize_dict(ds):
t = {}
d = defaultdict(list)
for k, v in ds.items():
h = dumps(ds)
t[h] = v
d[h].append(k)
return {tuple(v): t[k] for k, v in d.items()}
This solution is MUCH faster than my original proposition.
To test it I made a set of big random nested dictionary and run cProfile on both implementations:
original: 204.9 seconds
new: 6.4 seconds
EDIT:
I realized the dumps does not work with some dictionaries because the keys order can internally vary for obscure reasons (see this question)
A workaround would be to order all the dicts:
import copy
import collections
def faithfulrepr(od):
od = od.deepcopy(od)
if isinstance(od, collections.Mapping):
res = collections.OrderedDict()
for k, v in sorted(od.items()):
res[k] = faithfulrepr(v)
return repr(res)
if isinstance(od, list):
for i, v in enumerate(od):
od[i] = faithfulrepr(v)
return repr(od)
return repr(od)
def tupelize_dict(ds):
taxonomy = {}
binder = collections.defaultdict(list)
for key, value in ds.items():
signature = faithfulrepr(value)
taxonomy[signature] = value
binder[signature].append(key)
def tu(keys):
return tuple(sorted(keys)) if len(keys) > 1 else keys[0]
return {tu(keys): taxonomy[s] for s, keys in binder.items()}

Related

Remove elements from dict in for loop

How can I delete all elements in a dictionary after a modified one?
If the second one is changed, then we delete everything that is behind it or, if the first one is changed, then we delete everything after it and so on.
d = {'first': 'one', 'second': 'two', 'third': 'three'}
k = 'second'
for key in d:
if k in key:
d[key] = 'new value'
# delete third
Instead of deleting elements, just create a new dictionary.
Since dictionaries are not ordered, you should sort the items in some way.
d = {'first': 'one', 'second': 'two', 'third': 'three'}
k = 'second'
new_d = {}
for key, value in sorted(d.items()):
if key == k:
new_d[key] = 'new value'
break
new_d[key] = value
You can use the following code. it makes the dict items into a list and creates a new dict from the remaining items:
d = {'first': 'one', 'second': 'two', 'third': 'three'}
k = 'second'
d[k] = 'NewValue'
d = dict(list(d.items())[:list(d.keys()).index(k)+1])
print(d)
Out:
{'first': 'one', 'second': 'NewValue'}
Out of curiosity I timed it against #Daniel's answer those are my timeit code and result:
import timeit, functools
def test_1(d,k):
d = dict(list(d.items())[:list(d.keys()).index(k)+1])
d[k] = 'new vlaue'
return d
def test_2(d, k):
new_d = {}
for key, value in sorted(d.items()):
if key == k:
new_d[key] = 'new value'
break
new_d[key] = value
return new_d
if __name__ == '__main__':
keys = [x for x in range(100000)]
values = [x for x in range(100000)]
d = dict(zip(keys, values))
k = 9999
a = timeit.timeit(functools.partial(test_1, d, k), number = 1000)
b = timeit.timeit(functools.partial(test_2, d, k), number = 1000)
print(a, b)
Output:
5.107241655999815 6.745305094000059
If you change the sorted(...) into list(...) in Daniels answer it is the other way around:
5.046288972999946 4.211456709999993
It is a constant offset, probably due to creating the list out of the dict twice instead of once. So #Daniels answer is both faster and less memory expensive
you can delete rest of dict by keys. like below:
d = {'first': 'one', 'second': 'two', 'third': 'three'}
k = 'first' #for modify
keys = ['second', 'third'] #rest of dict to delete
list(map(d.pop, keys))
it will return {'first':'modified_value'}
You should not be using normal dictionary if you're trying to keep them in an order, at least if you're below py3.7 of course.
Please use if you're below 3.7.
We are creating a new dictionary in the below code and removing the old one, you can even put this inside a function and use it.
from collections import OrderedDict
d =OrderedDict([('first','one'), ('second', 'two'), ('third', 'three')])
k = 'second'
output_dict = OrderedDict()
for key,value in d.items():
if k in key:
output_dict[key] = value
break
else:
output_dict[key] = value
del d
print(output_dict)

Instead of read a dictionary by keys, read by values (mydict[key] = value >>> mydict[value] = key) [duplicate]

This question already has answers here:
Reverse / invert a dictionary mapping
(32 answers)
Closed 5 years ago.
This question could sound something stange, but normally you can get values by the key, e.g:
>>> mydict = {'a':1,'b':2,'c':3}
>>> mydict['a']
1
>>> mydict['b']
2
>>> mydict['c']
3
But I need to do:
>>> mydict[1]
'a'
>>> mydict[2]
'b'
>>> mydict[3]
'c'
# In this case, my dictionary have to work like
>>> mydict = {1:'a',2:'b',3:'c'}
In my program my dictionary could be open by the two ways, I mean:
>>> mydict = {'a':1,'b':2,'c':3}
# Sometimes I need the value of a letter:
>>> mydict['a']
1
# And somethimes, I need the letter of a value.
>>> mydict.REVERSAL[1]
a
I can do something like: (I don't know if this work, I don't test it)
>>> mydict = {'a':1,'b':2,'c':3}
>>> mydict['a']
1
# etc...
>>> def reversal(z):
... tmp = {}
... for x,y in z.items():
... tmp[y] = x
... return tmp
>>> mydict = reversal(mydict)
>>> mydict[1]
a
# etc
>>> mydict = reversal(mydict)
>>> mydict['c']
3
# etc, etc, etc...
Is there an easy way to do that?
FIRST: I know about chr() and ord(), my code isn't about letters... this is only and example.
SECOND: In my dictionary there won't be two same values, so there won't be any problems with duplicate keys...
You need something like this,
In [21]: mydict = {'a':1,'b':2,'c':3}
In [22]: dict(zip(mydict.values(),mydict.keys()))
Out[22]: {1: 'a', 2: 'b', 3: 'c'}
Or
In [23]: dict(i[::-1] for i in mydict.items())
Out[23]: {1: 'a', 2: 'b', 3: 'c'}
Or
In [24]: dict(map(lambda x:x[::-1],mydict.items()))
Out[24]: {1: 'a', 2: 'b', 3: 'c'}
To reverse your dictionary, you can use .items():
mydict = {'a':1,'b':2,'c':3}
new_dict = {b:a for a, b in mydict.items()}
You can use some dictionary comprehension to switch keys and values:
>>> mydict = {'a': 1, 'b': 2, 'c': 3}
>>> reversal = {v: k for k, v in mydict.items()}
>>> reversal[1]
'a'

python create dict using list of strings with length of strings as values

I'm sure this can be done, but I have thus far been unsuccessful:
I have a list of strings. I want to create a dictionary with the length of said strings (which can be expressed as a range) as the key and the string itself as the value.
example:
Here's something like the list I have: ['foo','bar','help','this','guy']
I'd like to end up with a dictionary like this:
{3:['foo','bar','guy], 4:['this','help']}
Using defaultdict so you don't have to check whether or not to create the list for a new key:
from collections import defaultdict
x = ['foo','bar','help','this','guy']
len_dict = defaultdict(list)
for word in x:
len_dict[len(word)].append(word)
len_dict
#
# Out[5]: defaultdict(list, {3: ['foo', 'bar', 'guy'], 4: ['help', 'this']})
You can use a dictionary as a container with setdefault:
lst = ['foo','bar','help','this','guy']
result = {}
for w in lst:
result.setdefault(len(w), []).append(w)
result
# {3: ['foo', 'bar', 'guy'], 4: ['help', 'this']}
You can do it like that:
d={}
lst=['foo','bar','help','this','guy']
for i in lst:
if len(i) in d:
d[len(i)].append(i)
else:
d[len(i)]=[i]
This solution is pythonic, elegant and fast: (by the Famous Raymond Hettinger in one of his many conferences).
dict.setdefault is the dictionary method that initialises a key-value if the key is not found in dict as well as performing dict.get for provided key.
l = ['foo','bar','help','this','guy']
d = {}
for e in l:
key = len(e)
d.setdefault(key, []).append(name)
print(d)
Output:
{3: ['foo', 'bar', 'guy'], 4: ['help', 'this']}
This solution is the modern way of the solution above:
defaultdict from collection is a subclass of dict that automatically initialises value to any given key that is not in the defaultdict.
from collections import defaultdict
l = ['foo','bar','help','this','guy']
d = defaultdict(list)
for e in l:
key = len(e)
d[key].append(e)
print(d)
Output:
defaultdict(<class 'list'>, {3: ['foo', 'bar', 'guy'], 4: ['help', 'this']})
Similar to what have been said, but using the get method of dict class:
the_list=['foo','bar','help','this','guy']
d = {}
for word in the_list:
key = len(word)
d[key] = d.get(key, []) + [word]
print(d)
# {3: ['foo', 'bar', 'guy'], 4: ['help', 'this']}
Another approach:
from collections import defaultdict
given_list=['foo','bar','help','this','guy']
len_words=[len(i) for i in given_list]
d=defaultdict(list)
for i,j in list(zip(len_words,given_list)):
d[i].append(j)

Merge Dictionaries Preserving Old Keys and New Values

I'm writing a Python script that parses RSS feeds. I want to maintain a dictionary of entries from the feed that gets updated periodically. Entries that no longer exist in the feed should be removed, new entries should get a default value, and the values for previously seen entries should remain unchanged.
This is best explained by example, I think:
>>> old = {
... 'a': 1,
... 'b': 2,
... 'c': 3
... }
>>> new = {
... 'c': 'x',
... 'd': 'y',
... 'e': 'z'
... }
>>> out = some_function(old, new)
>>> out
{'c': 3, 'd': 'y', 'e': 'z'}
Here's my current attempt at this:
def merge_preserving_old_values_and_new_keys(old, new):
out = {}
for k, v in new.items():
out[k] = v
for k, v in old.items():
if k in out:
out[k] = v
return out
This works, but it seems to me there might be a better or more clever way.
EDIT: If you feel like testing your function:
def my_merge(old, new):
pass
old = {'a': 1, 'b': 2, 'c': 3}
new = {'c': 'x', 'd': 'y', 'e': 'z'}
out = my_merge(old, new)
assert out == {'c': 3, 'd': 'y', 'e': 'z'}
EDIT 2:
Defining Martijn Pieters' answer as set_merge, bravosierra99's as loop_merge, and my first attempt as orig_merge, I get the following timing results:
>>> setup="""
... old = {'a': 1, 'b': 2, 'c': 3}
... new = {'c': 'x', 'd': 'y', 'e': 'z'}
... from __main__ import set_merge, loop_merge, orig_merge
... """
>>> timeit.timeit('set_merge(old, new)', setup=setup)
3.4415210600000137
>>> timeit.timeit('loop_merge(old, new)', setup=setup)
1.161155690000669
>>> timeit.timeit('orig_merge(old, new)', setup=setup)
1.1776735319999716
I find this surprising, since I didn't expect the dictionary view approach to be that much slower.
Dictionaries have dictionary view objects that act as sets. Use these to get the intersection between old and new:
def merge_preserving_old_values_and_new_keys(old, new):
result = new.copy()
result.update((k, old[k]) for k in old.viewkeys() & new.viewkeys())
return result
The above uses the Python 2 syntax; use old.keys() & new.keys() if you are using Python 3, for the same results:
def merge_preserving_old_values_and_new_keys(old, new):
# Python 3 version
result = new.copy()
result.update((k, old[k]) for k in old.keys() & new.keys())
return result
The above takes all key-value pairs from new as a starting point, then adds the values for old for any key that appears in both.
Demo:
>>> merge_preserving_old_values_and_new_keys(old, new)
{'c': 3, 'e': 'z', 'd': 'y'}
Note that the function, like your version, produces a new dictionary (albeit that the key and value objects are shared; it is a shallow copy).
You could also just update the new dictionary in-place if you don't need that new dictionary for anything else:
def merge_preserving_old_values_and_new_keys(old, new):
new.update((k, old[k]) for k in old.viewkeys() & new.viewkeys())
return new
You could also use a one-liner dict comprehension to build a new dictionary:
def merge_preserving_old_values_and_new_keys(old, new):
return {k: old[k] if k in old else v for k, v in new.items()}
This should be more efficient, since you are no longer iterating through the entire old.items(). Additionally, it's more clear what you are trying to do this way since you aren't overwriting some values.
for k, v in new.items():
if k in old.keys():
out[k] = old[k]
else:
out[k] = v
return out
old = {
'a': 1,
'b': 2,
'c': 3
}
new = {
'c': 'x',
'd': 'y',
'e': 'z'
}
def merge_preserving_old_values_and_new_keys(o, n):
out = {}
for k in n:
if k in o:
out[k] = o[k]
else:
out[k] = n[k]
return out
print merge_preserving_old_values_and_new_keys(old, new)
I'm not 100% the best way to add this information to the discussion: feel free to edit/redistribute it if necessary.
Here are timing results for all of the methods discussed here.
from timeit import timeit
def loop_merge(old, new):
out = {}
for k, v in new.items():
if k in old:
out[k] = old[k]
else:
out[k] = v
return out
def set_merge(old, new):
out = new.copy()
out.update((k, old[k]) for k in old.keys() & new.keys())
return out
def comp_merge(old, new):
return {k: old[k] if k in old else v for k, v in new.items()}
def orig_merge(old, new):
out = {}
for k, v in new.items():
out[k] = v
for k, v in old.items():
if k in out:
out[k] = v
return out
old = {'a': 1, 'b': 2, 'c': 3}
new = {'c': 'x', 'd': 'y', 'e': 'z'}
out = {'c': 3, 'd': 'y', 'e': 'z'}
assert loop_merge(old, new) == out
assert set_merge(old, new) == out
assert comp_merge(old, new) == out
assert orig_merge(old, new) == out
setup = """
from __main__ import old, new, loop_merge, set_merge, comp_merge, orig_merge
"""
for a in ['loop', 'set', 'comp', 'orig']:
time = timeit('{}_merge(old, new)'.format(a), setup=setup)
print('{}: {}'.format(a, time))
size = 10**4
large_old = {i: 'old' for i in range(size)}
large_new = {i: 'new' for i in range(size//2, size)}
setup = """
from __main__ import large_old, large_new, loop_merge, set_merge, comp_merge, orig_merge
"""
for a in ['loop', 'set', 'comp', 'orig']:
time = timeit('{}_merge(large_old, large_new)'.format(a), setup=setup)
print('{}: {}'.format(a, time))
The winner is the improved looping method!
$ python3 merge.py
loop: 0.7791572390015062 # small dictionaries
set: 3.1920828100010112
comp: 1.1180207730030816
orig: 1.1681104259987478
loop: 927.2149353210007 # large dictionaries
set: 1696.8342713210004
comp: 902.039078668
orig: 1373.0389542560006
I'm disappointed, because the dictionary view/set operation method is much cooler.
With larger dictionaries (10^4 items), the dictionary comprehension method pulls ahead of the improved looping method and far ahead of the original method. The set operation method still performs the slowest.

How I can get rid of None values in dictionary?

Something like:
for (a,b) in kwargs.iteritems():
if not b : del kwargs[a]
This code raise exception because changing of dictionary when iterating.
I discover only non pretty solution with another dictionary:
res ={}
res.update((a,b) for a,b in kwargs.iteritems() if b is not None)
Thanks
Another way to write it is
res = dict((k,v) for k,v in kwargs.iteritems() if v is not None)
In Python3, this becomes
res = {k:v for k,v in kwargs.items() if v is not None}
You can also use filter:
d = dict(a = 1, b = None, c = 3)
filtered = dict(filter(lambda item: item[1] is not None, d.items()))
print(filtered)
{'a': 1, 'c': 3}
d = {'a': None, 'b': 'myname', 'c': 122}
print dict(filter(lambda x:x[1], d.items()))
{'b': 'myname', 'c': 122}
I like the variation of your second method:
res = dict((a, b) for (a, b) in kwargs.iteritems() if b is not None)
it's Pythonic and I don't think that ugly. A variation of your first is:
for (a, b) in list(kwargs.iteritems()):
if b is None:
del kwargs[a]
If you need to handle nested dicts, then you can leverage a simple recursive approach:
# Python 2
from collections import Mapping
def filter_none(d):
if isinstance(d, Mapping):
return dict((k, filter_none(v)) for k, v, in d.iteritems() if v is not None)
else:
return d
# Python 3
from collections.abc import Mapping
def filter_none(d):
if isinstance(d, Mapping):
return {k: filter_none(v) for k, v in d.items() if v is not None}
else:
return d
To anybody who may interests, here's another way to get rid of None value. Instead of deleting the key, I change the value of None with a placeholder for the same key.
One use case is applying with Spark RDD.map onto null valued JSON.
def filter_null(data, placeholder="[spark]nonexists"):
# Replace all `None` in the dict to the value of `placeholder`
return dict((k, filter_null(v, placeholder) if isinstance(v, dict) else v if v
is not None else placeholder) for k, v in data.iteritems())
Sample output:
>>> filter_null({'a':None,'b':"nul", "c": {'a':None,'b':"nul"}})
{'a': '[spark]nonexists', 'c': {'a': '[spark]nonexists', 'b': 'nul'}, 'b': 'nul'}
For python3, change the iteritems() to items().
The recursive approach to also filter nested lists of dicts in the dictionary:
def filter_none(d):
if isinstance(d, dict):
return {k: filter_none(v) for k, v in d.items() if v is not None}
elif isinstance(d, list):
return [filter_none(v) for v in d]
else:
return d
Sample output:
data = {'a': 'b', 'c': None, 'd':{'e': 'f', 'h': None, 'i':[{'j': 'k', 'l': None}]}}
print(filter_none(data))
>>> {'a': 'b', 'd': {'e': 'f', 'i': [{'j': 'k'}]}}

Categories