Dataframe to Nested Dictionaries in Python - python

Having a bit of trouble here.. I need to take a dataframe
import pandas as pd
region = ['A','A','A','B','B','B']
sub_region = ['1','2','2','3','3','4']
state = ['a','b','c','d','e','f']
pd.DataFrame({"region":region,"sub_region":sub_region,"state":state})
and convert into a nested dictionary with the following format:
[{name: "thing", children: [{name:"sub_thing",children:[{...}] }]}]
That is, a list of nested dictionaries whose keys are always "name" and "children", except that leaf nodes (nodes with no children) omit the "children" key. The final desired output would be:
[{"name":"A",
"children":[{"name":"1","children":[{"name":"a"}]},
{"name":"2","children":[{"name":"b"},{"name":"c"}]}]
},
{"name":"B",
"children":[{"name":"3","children":[{"name":"d"},{"name":"e"}]},
{"name":"4","children":[{"name":"f"}]}]
}
]
Assume a generalized framework where the number of levels can vary.

I don't think you can do better than looping through the rows of the dataframe. That is, I don't see a way to vectorize this process. Also, if the number of levels can vary within the same dataframe, then the update function should be modified to handle missing (NaN) entries, e.g. by adding `and not pd.isna(row[1])` to the check that decides whether to descend into the next level. (Use `pd.isna` rather than `np.isnan`, because `np.isnan` raises a TypeError on string values.)
That said, I believe that the following script should be satisfactory.
import pandas as pd
region = ['A','A','A','B','B','B']
sub_region = ['1','2','2','3','3','4']
state = ['a','b','c','d','e','f']
df = pd.DataFrame({"region":region,"sub_region":sub_region,"state":state})
ls = []
def update(row, ls):
    """Insert one root-to-leaf path into a nested ``name``/``children`` tree.

    Args:
        row: Iterable of level names ordered from the outermost level to the
            innermost one (a list, a tuple or a pandas Series all work).
        ls: List of sibling node dicts for the current level; it is
            modified in place.

    An empty ``row`` is a no-op.  Leaf nodes get no ``'children'`` key.
    """
    # Work on a plain list so that indexing is always positional; integer
    # indexing on a label-indexed Series is deprecated in recent pandas.
    values = list(row)
    if not values:
        return
    head, rest = values[0], values[1:]
    for d in ls:
        if d['name'] == head:
            break
    else:
        # No sibling carries this name yet, so create it.
        d = {'name': head}
        ls.append(d)
    if rest:
        update(rest, d.setdefault('children', []))
# Plain tuples keep indexing positional and are cheaper to produce than the
# Series objects that iterrows() yields.
for r in df.itertuples(index=False, name=None):
    update(r, ls)
print(ls)
The resulting list ls:
[{'name': 'A',
'children': [{'name': '1', 'children': [{'name': 'a'}]},
{'name': '2', 'children': [{'name': 'b'}, {'name': 'c'}]}]},
{'name': 'B',
'children': [{'name': '3', 'children': [{'name': 'd'}, {'name': 'e'}]},
{'name': '4', 'children': [{'name': 'f'}]}]}]
Here's a version where childless children have 'children':[] in their dict, which I find a bit more natural.
import pandas as pd
region = ['A','A','A','B','B','B']
sub_region = ['1','2','2','3','3','4']
state = ['a','b','c','d','e','f']
df = pd.DataFrame({"region":region,"sub_region":sub_region,"state":state})
ls = []
def update(row, ls):
    """Insert one root-to-leaf path into a nested ``name``/``children`` tree.

    Unlike the variant that omits the key on leaves, every node created here
    carries a ``'children'`` list, which stays empty for leaves.

    Args:
        row: Iterable of level names ordered from the outermost level to the
            innermost one (a list, a tuple or a pandas Series all work).
        ls: List of sibling node dicts for the current level; it is
            modified in place.
    """
    # Work on a plain list so that indexing is always positional; integer
    # indexing on a label-indexed Series is deprecated in recent pandas.
    values = list(row)
    if not values:
        return
    head, rest = values[0], values[1:]
    for d in ls:
        if d['name'] == head:
            break
    else:
        # No sibling carries this name yet, so create it.
        d = {'name': head, 'children': []}
        ls.append(d)
    update(rest, d['children'])
# Plain tuples keep indexing positional and are cheaper to produce than the
# Series objects that iterrows() yields.
for r in df.itertuples(index=False, name=None):
    update(r, ls)
print(ls)
The resulting list ls:
[{'name': 'A',
'children': [{'name': '1', 'children': [{'name': 'a', 'children': []}]},
{'name': '2',
'children': [{'name': 'b', 'children': []},
{'name': 'c', 'children': []}]}]},
{'name': 'B',
'children': [{'name': '3',
'children': [{'name': 'd', 'children': []},
{'name': 'e', 'children': []}]},
{'name': '4', 'children': [{'name': 'f', 'children': []}]}]}]

Related

maintain dictionary structure while reducing nested dictionary

I have a list dd of pairs of nested dictionaries and would like to keep that pairing when reducing it to a list of dictionaries:
dd = [
[{'id': 'bla',
'detail': [{'name': 'discard', 'amount': '123'},
{'name': 'KEEP_PAIR_1A', 'amount': '2'}]},
{'id': 'bla2',
'detail': [{'name': 'discard', 'amount': '123'},
{'name': 'KEEP_PAIR_1B', 'amount': '1'}]}
],
[{'id': 'bla3',
'detail': [{'name': 'discard', 'amount': '123'},
{'name': 'KEEP_PAIR_2A', 'amount': '3'}]},
{'id': 'bla4',
'detail': [{'name': 'discard', 'amount': '123'},
{'name': 'KEEP_PAIR_2B', 'amount': '4'}]}
]
]
I want to reduce this to a list of paired dictionaries while extracting only some detail. For example, an expected output may look like this:
[{'name': ['KEEP_PAIR_1A', 'KEEP_PAIR_1B'], 'amount': [2, 1]},
{'name': ['KEEP_PAIR_2A', 'KEEP_PAIR_2B'], 'amount': [3, 4]}]
I have run my code:
# NOTE: `pair` is created once, before the loops, so the entries kept from
# every group accumulate in this single flat list.
pair=[]
for all_pairs in dd:
for output_pairs in all_pairs:
for d in output_pairs.get('detail'):
if d['name'] != 'discard':
pair.append(d)
# Maps each key found in `pair` to the list of that key's values.
output_pair = {
k: [d.get(k) for d in pair]
for k in set().union(*pair)
}
But it didn't maintain that structure :
{'name': ['KEEP_PAIR_1A', 'KEEP_PAIR_1B', 'KEEP_PAIR_2A', 'KEEP_PAIR_2B'],
'amount': ['2', '1', '3', '4']}
I assume I would need to use some list comprehension to solve this but where in the for loop should I do that to maintain the structure.
Since you want to combine dictionaries in lists, one option is to use dict.setdefault:
pair = []
for all_pairs in dd:
    # One accumulator per group is what keeps the pairs separate.
    dct = {}
    for output_pairs in all_pairs:
        # A missing 'detail' entry is treated as having nothing to keep.
        for d in output_pairs.get('detail', []):
            if d['name'] != 'discard':
                for k, v in d.items():
                    # Amounts arrive as strings; the expected output has ints.
                    dct.setdefault(k, []).append(int(v) if k == 'amount' else v)
    pair.append(dct)
Output:
[{'name': ['KEEP_PAIR_1A', 'KEEP_PAIR_1B'], 'amount': [2, 1]},
{'name': ['KEEP_PAIR_2A', 'KEEP_PAIR_2B'], 'amount': [3, 4]}]

How to create an empty list of dictionaries and populate it afterwards?

I need to initialize an empty list of dictionaries (LOD) in which every dictionary must have the following keys: "id", "name", "age", "gender". I want to create a loop/nested loop that populates the LOD. For populating it I have a list of IDs; the values for the rest of the keys are generated using the random function.
The ID list looks like this: id = ['1','2','3']
The result must look something like this.
LOD = [
{
'id': '1',
'name':'122121',
'age':'2131',
'gender':'121'
},
{
'id': '2',
'name':'122121',
'age':'2131',
'gender':'121'
},
{
'id': '3',
'name':'122121',
'age':'2131',
'gender':'121'
},
]
CJDB already does what you want. But if you'd perhaps prefer another approach:
ids = ['1','2','3']
keys = ["name","age", "gender"]
LOD = []
and then populate your list with dictionaries
for i in ids:
    # Start each record with its id, then fill in the remaining keys.
    your_dictionary = {"id": i}
    for key in keys:
        your_dictionary[key] = '{}_rnd_function_output'.format(key)
    LOD.append(your_dictionary)
And the output would be
>>> LOD
[{'id': '1',
'name': 'name_rnd_function_output',
'age': 'age_rnd_function_output',
'gender': 'gender_rnd_function_output'},
{'id': '2',
'name': 'name_rnd_function_output',
'age': 'age_rnd_function_output',
'gender': 'gender_rnd_function_output'},
{'id': '3',
'name': 'name_rnd_function_output',
'age': 'age_rnd_function_output',
'gender': 'gender_rnd_function_output'}
]
You might consider having sub-dictionaries within a dictionary. Your ids would be the keys of the main dictionary and the sub-dictionaries would be the values.
# Outer dictionary keyed by id; each value is that id's record.
LOD = {}
for i in ids:
    LOD[i] = {}
    for key in keys:
        LOD[i][key] = '{}_rnd_function_output'.format(key)
And the output
>>> LOD
{'1': {'name': 'name_rnd_function_output',
'age': 'age_rnd_function_output',
'gender': 'gender_rnd_function_output'},
'2': {'name': 'name_rnd_function_output',
'age': 'age_rnd_function_output',
'gender': 'gender_rnd_function_output'},
'3': {'name': 'name_rnd_function_output',
'age': 'age_rnd_function_output',
'gender': 'gender_rnd_function_output'}}
You can use a list comprehension for this:
ids = ['1', '2', '3']
# One record per id; every field other than 'id' is a fixed placeholder.
LOD = [dict(id=i, name='122121', age='2131', gender='121') for i in ids]
Output:
>>> LOD
[{'id': '1', 'name': '122121', 'age': '2131', 'gender': '121'},
{'id': '2', 'name': '122121', 'age': '2131', 'gender': '121'},
{'id': '3', 'name': '122121', 'age': '2131', 'gender': '121'}]
Or, using the random module:
import random
ids = ['1', '2', '3']
# One record per id; the other fields are random digit strings of fixed width
# (6, 4 and 3 digits respectively).
LOD = [
    dict(
        id=i,
        name=str(random.randint(100000, 999999)),
        age=str(random.randint(1000, 9999)),
        gender=str(random.randint(100, 999)),
    )
    for i in ids
]
Output:
>>> LOD
[{'id': '1', 'name': '727325', 'age': '5367', 'gender': '238'},
{'id': '2', 'name': '316019', 'age': '8963', 'gender': '702'},
{'id': '3', 'name': '464023', 'age': '4324', 'gender': '155'}]
Note that you should not use id as a variable name as it shadows the builtin python id object.
You can do it by initializing dict objects in list comprehensions
keys = ['id', 'name', 'age', 'gender']
ids = ['1', '2', '3']
# For every id build one record: 'id' takes the id itself, every other key
# gets a random integer between 1 and 100.
LOD = [
    {key: (i if key == 'id' else random.randint(1, 100)) for key in keys}
    for i in ids
]
print(LOD)
'''
[{'id': '1', 'name': 34, 'age': 10, 'gender': 57},
{'id': '2', 'name': 64, 'age': 13, 'gender': 21},
{'id': '3', 'name': 11, 'age': 17, 'gender': 2}]
'''

Remove duplicates from list of dictionaries within list of dictionaries

I have list:
my_list = [{'date': '10.06.2016',
'account': [{'name': 'a'},
{'name': 'a'},
{'name': 'b'},
{'name': 'b'}]},
{'date': '22.06.2016',
'account': [{'name': 'a'},
{'name': 'a'}]}]
I want to remove duplicates from the list of dictionaries in 'account':
my_list = [{'date': '10.06.2016',
'account': [{'name': 'a'},
{'name': 'b'}]},
{'date': '22.06.2016',
'account': [{'name': 'a'}]}]
When using set, I get the following error:
TypeError: unhashable type: 'dict'
Can anybody help me with this problem?
This structure is probably over complicated, but it gets the job done.
my_list = [{'date': '10.06.2016',
'account': [{'name': 'a'},
{'name': 'a'},
{'name': 'b'},
{'name': 'b'}]},
{'date': '22.06.2016',
'account': [{'name': 'a'},
{'name': 'a'}]}]
>>> [{'date': date,
'account': [{'name': name} for name in group]
} for group, date in zip([set(account.get('name')
for account in item.get('account'))
for item in my_list],
[d.get('date') for d in my_list])]
[{'account': [{'name': 'a'}, {'name': 'b'}], 'date': '10.06.2016'},
{'account': [{'name': 'a'}], 'date': '22.06.2016'}]
def deduplicate_account_names(l):
    """Drop repeated account names from every entry of *l*, in place.

    Each item's ``'account'`` list is rebuilt as one ``{'name': ...}`` dict
    per distinct name, in order of first appearance.

    Args:
        l: List of dicts, each holding an ``'account'`` list of dicts.
    """
    for d in l:
        # A set gives fast membership tests; the separate list preserves the
        # original order, which iterating over a set would not.
        seen = set()
        unique = []
        for account in d['account']:
            name = account.get('name')
            if name not in seen:
                seen.add(name)
                unique.append({'name': name})
        d['account'] = unique
# even shorter:
# def deduplicate_account_names(l):
# for d in l:
# d['account'] = [{'name': name} for name in set(map(lambda d: d.get('name'), d['account']))]
my_list = [{'date': '10.06.2016',
'account': [{'name': 'a'},
{'name': 'a'},
{'name': 'b'},
{'name': 'b'}]},
{'date': '22.06.2016',
'account': [{'name': 'a'},
{'name': 'a'}]}]
deduplicate_account_names(my_list)
print(my_list)
# [ {'date': '10.06.2016',
# 'account': [ {'name': 'a'},
# {'name': 'b'} ] },
# {'date': '22.06.2016',
# 'account': [ {'name': 'a'} ] } ]
Sets can only have hashable members and neither lists nor dicts are - but they can be checked for equality.
you can do
def without_duplicates(inlist):
    """Return a new list holding the first occurrence of each element.

    Elements are compared with ``==``, so unhashable items such as dicts and
    lists are supported.  The input is left unchanged.

    Args:
        inlist: Iterable of elements to de-duplicate.

    Returns:
        A list of the distinct elements in order of first appearance.
    """
    outlist = []
    for e in inlist:
        # Linear scan: the price of supporting unhashable elements.
        if e not in outlist:
            outlist.append(e)
    return outlist
this can be slow for really big lists
Give this code a try:
for d in my_list:
    # Only the 'account' entry is de-duplicated; items without one are left
    # as they are.
    if 'account' in d:
        v = []
        for d2 in d['account']:
            if d2 not in v:
                v.append(d2)
        d['account'] = v
This is what you get after running the snippet above:
In [347]: my_list
Out[347]:
[{'account': [{'name': 'a'}, {'name': 'b'}], 'date': '10.06.2016'},
{'account': [{'name': 'a'}], 'date': '22.06.2016'}]

Python - create tree from dictionary

I have a problem: I must write a function that creates a tree from a dictionary with any number of elements.
Example of dictionaries:
D1 = {'name': 'musica',
'children': [
{'name': 'rock',
'children': [
{'name': 'origini', 'children': []},
{'name': 'rock&roll', 'children': []},
{'name': 'hard rock', 'children': []}]},
{'name': 'jazz',
'children': [
{'name': 'origini',
'children': [{'name': '1900',
'children': [{'name': 'origini', 'children': []}]}]},
{'name': 'ragtime', 'children': []},
{'name': 'swing', 'children': []}]}]}
D2 = {'name': 'html',
'children': [
{'name': 'head',
'children': [
{'name': 'meta', 'children': []},
{'name': 'title', 'children': []},
{'name': 'style', 'children': []}]},
{'name': 'body',
'children': [
{'name': 'h1', 'children': []},
{'name': 'section',
'children': [
{'name': 'p',
'children': [
{'name': 'strong', 'children': []},
{'name': 'b', 'children': []},
{'name': 'em', 'children': []},
{'name': 'i', 'children': []}]},
{'name': 'p',
'children': [
{'name': 'q', 'children': []},
{'name': 'code', 'children': []},
{'name': 'kbd', 'children': []}]},
{'name': 'p',
'children': [
{'name': 'sup', 'children': []},
{'name': 'sub', 'children': []}]},
{'name': 'p',
'children': [
{'name': 'span', 'children': []}]}]},
{'name': 'footer',
'children': [
{'name': 'a',
'children': [
{'name': 'strong', 'children': []}]},
{'name': 'a',
'children': [
{'name': 'strong', 'children': []}]}]}]}]}
D3 = {'name': 'Giovanni di Bicci',
'children': [
{'name': 'Cosimo il vecchio',
'children': [
{'name': 'Piero il gottuso',
'children': [
{'name': 'Lorenzo il magnifico',
'children': [
{'name': 'Piero II',
'children': [
{'name': 'Lorenzo II', 'children': []}]},
{'name': 'Papa Leone X', 'children': []},
{'name': 'Giuliano', 'children': []}]}]},
{'name': 'Giovanni Carlo', 'children': []}]},
{'name': 'Lorenzo',
'children': [
{'name': 'Pierfrancesco',
'children': [
{'name': 'Lorenzo', 'children': []},
{'name': 'Giovanni',
'children': [
{'name': 'Giovanni dalle Bande Nere',
'children': [
{'name': 'Lorenzino', 'children': []},
{'name': 'Cosimo I',
'children': [
{'name': 'Francesco I',
'children': [
{'name': 'Maria', 'children': []}]},
{'name': 'Ferdinando I',
'children': {}}]}]}]}]}]}]}
Any solution?
Thanks a lot
ADDITION
Thank you all for the answers.
Now I am writing out the full exercise so you can better understand and answer me. I have implemented a class TNode with these methods:
class TNode(object):
    """Node of an ordered tree: a name plus an ordered list of child nodes."""

    def __init__(self, name, Sinistra=None, Destra=None):
        """Create a childless node called *name*.

        ``Sinistra`` and ``Destra`` are stored unchanged on the instance.
        """
        self._name = name
        self.Destra = Destra
        self.Sinistra = Sinistra
        self._children = []
        self._copy = []

    def add(self, c):
        """Append node *c* as the last child of this node."""
        self._children.append(c)

    def children(self):
        """Return a shallow copy of the child list, in insertion order."""
        # Copy so that callers cannot alter the tree through the result.
        self._copy = list(self._children)
        return self._copy

    def height(self):
        """Return the number of nodes on the longest downward path."""
        h = 1
        for node in self._children:
            h = max(h, node.height() + 1)
        return h

    def count(self):
        """Return the number of nodes in the subtree rooted here."""
        c = 1
        for node in self._children:
            c += node.count()
        return c

    def count_by_name(self, name):
        """Return the list of nodes in this subtree whose name is *name*.

        Despite the method's name, the result is the list of matching nodes
        (this node first, then the matches of each child in order), not a
        number.
        """
        lst = []
        if self._name == name:
            lst += [self]
        for node in self._children:
            lst += node.count_by_name(name)
        return lst

    def leaves(self):
        """Return the number of leaves in the subtree rooted here."""
        if not self._children:
            # A node without children is itself the only leaf.
            return 1
        return sum(node.leaves() for node in self._children)

    def paths(self, name):
        """Return the set of paths from this node to nodes called *name*.

        Each path is a tuple of node names, starting with this node's name
        and ending with *name*.
        """
        paths_s = set()
        if self._name == name:
            paths_s.add((name,))
        for node in self._children:
            for j in node.paths(name):
                paths_s.add((self._name,) + j)
        return paths_s
I also need to create a function create_tree(d) that, given a dictionary "d" that represents a tree, creates the corresponding tree with nodes of type TNode and returns the root. The function must add the children in the same order as they are listed in the lists under the 'children' keys.
Sorry if initially I did not write all that.
I am unable to write the function, based on this class, that creates a tree from a dictionary.
I use Python 2.7
Thanks.
You can create a tree with defaultdict:
from collections import defaultdict
def Tree():
    """Return a dict whose missing keys are filled with new, empty trees."""
    return defaultdict(Tree)
Then using it:
>>> tree = Tree()
>>> tree['house']['car']['red']['hubcap'] = 1950

python efficient group by

I am looking for the most efficient way to extract items from a list of dictionaries. I have a list of about 5k dictionaries. I need to extract those records/items for which grouping by a particular field gives more than a threshold number T of records. For example, if T = 2 and the dictionary key is 'id':
list = [{'name': 'abc', 'id' : 1}, {'name': 'bc', 'id' : 1}, {'name': 'c', 'id' : 1}, {'name': 'bbc', 'id' : 2}]
The result should be:
list = [{'name': 'abc', 'id' : 1}, {'name': 'bc', 'id' : 1}, {'name': 'c', 'id' : 1}]
i.e. all the records whose id is shared by more than T records (here, at least 3 records with the same id).
l = [{'name': 'abc', 'id' : 1}, {'name': 'bc', 'id' : 1}, {'name': 'c', 'id' : 1}, {'name': 'bbc', 'id' : 2}]
from collections import defaultdict
from itertools import chain
# Maps each id to the list of records carrying it.
d = defaultdict(list)
T = 2
for dct in l:
    d[dct["id"]].append(dct)
# Keep only the groups with more than T records and flatten them again.
print(list(chain.from_iterable(v for v in d.values() if len(v) > T)))
[{'name': 'abc', 'id': 1}, {'name': 'bc', 'id': 1}, {'name': 'c', 'id': 1}]
If you want to keep them in groups don't chain just use each value:
[v for v in d.values() if len(v) > T] # itervalues for python2
[[{'name': 'abc', 'id': 1}, {'name': 'bc', 'id': 1}, {'name': 'c', 'id': 1}]]
Avoid using list as a variable as it shadows the python list type and if you had a variable list then the code above would cause you a few problems in relation to d = defaultdict(list)
to start out I would make a dictionary to group by your id
# Maps each id to the list of records carrying it.
# NOTE: `list` here is the question's variable, which shadows the builtin.
control = {}
for d in list:
    control.setdefault(d['id'], []).append(d)
From here, all you have to do is check the length of each group in control to see if it's greater than your specified threshold.
put it in a function like so
def find_by_id(obj, threshold):
    """Print each group of records sharing an ``'id'`` that is large enough.

    Records are grouped by their ``'id'`` value; every group holding more
    than *threshold* records is printed as a list.

    Args:
        obj: Iterable of dicts, each with an ``'id'`` key.
        threshold: Group size that must be exceeded for a group to be
            printed.
    """
    control = {}
    for d in obj:
        control.setdefault(d['id'], []).append(d)
    for val in control.values():
        if len(val) > threshold:
            # The parenthesised form is valid on Python 2 and Python 3.
            print(val)

Categories