As an exercise, I wanted to be less reliant on pandas and build a custom merge function on a list of dictionaries. Essentially, this is a left merge, where the original list is preserved and if the key has multiple matches then extra rows are added. However in my case, the extra rows appear to be added but with the exact same information.
Could anyone steer me in the right direction, as to where this code is going wrong?
def merge(self, l2, key):
#self.data is a list of dictionaries
#l2 is the second list of dictionaries to merge
headers = l2[0]
found = {}
append_list = []
for row in self.data:
for row_b in l2:
if row_b[key] == row[key] and row[key] not in found:
found[row[key]] = ""
for header in headers:
row[header] = row_b[header]
elif row_b[key] == row[key]:
new_row = row
for header in headers:
new_row[header] = row_b[header]
append_list.append(new_row)
self.data.extend(append_list)
Edit: Here is some sample input, and expected output:
self.data = [{'Name':'James', 'Country':'Australia'}, {'Name':'Tom', 'Country':'France'}]
l2 = [{'Country':'France', 'Food':'Frog Legs'}, {'Country':'Australia', 'Food':'Meat Pie'},{'Country':'Australia', 'Food':'Pavlova'}]
I would expect self.data to equal the following after passing through the function, with a parameter of 'Country':
[{'Name':'James', 'Country':'Australia', 'Food':'Meat Pie'}, {'Name':'James', 'Country':'Australia', 'Food':'Pavlova'}, {'Name':'Tom', 'Country':'France', 'Food':'Frog Legs'}]
The function below takes two lists of dictionaries, where the dictionaries are expected to all have keyprop as one of their properties:
from collections import defaultdict
from itertools import product
def left_join(left_table, right_table, keyprop):
# create a dictionary indexed by `keyprop` on the left
left = defaultdict(list)
for row in left_table:
left[row[keyprop]].append(row)
# create a dictionary indexed by `keyprop` on the right
right = defaultdict(list)
for row in right_table:
right[row[keyprop]].append(row)
# now simply iterate through the "left side",
# grabbing rows from the "right side" if they are available
result = []
for key, left_rows in left.items():
right_rows = right.get(key)
if right_rows:
for left_row, right_row in product(left_rows, right_rows):
result.append({**left_row, **right_row})
else:
result.extend(left_rows)
return result
sample1 = [{'Name':'James', 'Country':'Australia'}, {'Name':'Tom', 'Country':'France'}]
sample2 = [{'Country':'France', 'Food':'Frog Legs'}, {'Country':'Australia', 'Food':'Meat Pie'},{'Country':'Australia', 'Food':'Pavlova'}]
print(left_join(sample1, sample2, 'Country'))
# outputs:
# [{'Name': 'James', 'Country': 'Australia', 'Food': 'Meat Pie'},
# {'Name': 'James', 'Country': 'Australia', 'Food': 'Pavlova'},
# {'Name': 'Tom', 'Country': 'France', 'Food': 'Frog Legs'}]
In a data set where you can assume that rows are unique on the value of keyprop in their respective data sets, the implementation is quite a bit simpler:
def left_join(left_table, right_table, keyprop):
# create a dictionary indexed by `keyprop` on the left
left = {row[keyprop]: row for row in left_table}
# create a dictionary indexed by `keyprop` on the right
right = {row[keyprop]: row for row in right_table}
# now simply iterate through the "left side",
# grabbing rows from the "right side" if they are available
return [{**leftrow, **right.get(key, {})} for key, leftrow in left.items()]
Related
I've seen that there are a fair few questions addressing more or less this issue, but I've not managed to apply them to my specific use-case, and I've been scratching my head and trying different solutions for a couple of days now.
I have a list of dictionaries, with their hierarchical position encoded as a string of index numbers - I want to rearrange the dictionaries into a nested hierarchy using these indices.
Here's some example data:
my_data = [{'id':1, 'text':'one', 'path':'1'},
{'id':2, 'text':'two', 'path':'3.1'},
{'id':3, 'text':'three', 'path':'2.1.1'},
{'id':4, 'text':'four', 'path':'3.2.1'},
{'id':5, 'text':'five', 'path':'2.1.2'},
{'id':6, 'text':'six', 'path':'3.2.2'},
{'id':7, 'text':'seven', 'path':'2'},
{'id':8, 'text':'eight', 'path':'3'},
{'id':9, 'text':'nine', 'path':'3.2'},
{'id':10, 'text':'ten', 'path':'2.1'}]
and here's what I'm trying to achieve:
result = {1:{'id':1, 'text':'one', 'path':'1'},
2:{'id':7, 'text':'seven', 'path':'2', 'children':{
1:{'id':10, 'text':'ten', 'path':'2.1', 'children':{
1:{'id':3, 'text':'three', 'path':'2.1.1'},
2:{'id':5, 'text':'five', 'path':'2.1.2'}
}}}},
3:{'id':8, 'text':'eight', 'path':'3', 'children':{
1:{'id':2, 'text':'two', 'path':'3.1'},
2:{'id':9, 'text':'nine', 'path':'3.2', 'children':{
1:{'id':4, 'text':'four', 'path':'3.2.1'},
2:{'id':6, 'text':'six', 'path':'3.2.2'}
}}}}
}
Since the paths of the individual data dictionaries don't appear in any logical order, I'm using dictionaries throughout rather than lists of dictionaries, as this allows me to create 'empty' spaces in the structure. I don't really want to rely on re-ordering the dictionaries in the initial list.
Here's my code:
#%%
class my_dict(dict):
def rec_update(self, index, dictObj): # extend the dict class with recursive update function
"""
Parameters
----------
index : list
path to dictObj.
dictObj : dict
data object.
Returns: updates the dictionary instance
-------
None.
"""
pos = index[0]
index.pop(0)
if len(index) != 0:
self.update({pos : {'children' : {self.rec_update(index, dictObj)}}})
else:
self.update({pos : dictObj})
#%%
dataOut = my_dict() #create empty dictionary to receive result
dataOut.clear()
# dictObj = my_data[0] # for testing
# dictObj = my_data[1]
for dictObj in my_data:
index = dictObj.get('path').split(".") # create the path list
dataOut.rec_update(index, dictObj) # place the current data dictionary in the hierarchy
The issue with the code is that the result of the nested function call in the class definition self.rec_update(index, dictObj) isn't ending up as the value of the 'children' key. Is this because I've not understood the scope of self properly?
I've noticed during testing that, if I run the dataOut.rec_update(index, dictObj) call for a single element of my_data, e.g. dictObj = my_data[1], that the index list variable in the console scope is modified, which is unexpected, as I thought the rec_update() function had its own distinct scope.
I think I can see a further bug where the 'children' element will be overwritten, but I'm not at that stage yet.
I'd welcome any explanation that can put me on the right track, please.
Here's a solution that you should be able to adapt to your needs. It's just a stand-alone function that transforms my_data into result:
def make_tree(data):
###
### Construct path_list and path_dict
###
path_dict = {}
path_list = []
for data in data:
path = data['path']
path_split = path.split('.')
assert len(path_split) >= 1
path_tuple = tuple(map(int, path_split))
assert path_tuple not in path_dict
path_dict[path_tuple] = data
path_list.append(path_tuple)
###
### Sort path_list. This is sorting the tuples corresponding to
### each path value. Among other things, this ensues that the
### parent of a path appears before the path.
###
path_list.sort()
###
### Construct and return the tree
###
new_path_dict = {}
tree = {}
for path_tuple in path_list:
data = path_dict[path_tuple]
path_leaf = path_tuple[-1]
new_data = data.copy()
if len(path_tuple) == 1:
assert path_leaf not in tree
tree[path_leaf] = new_data
else:
parent_path_tuple = path_tuple[:-1]
assert parent_path_tuple in new_path_dict
parent = new_path_dict[parent_path_tuple]
if 'children' not in parent:
children = {}
parent['children'] = children
else:
children = parent['children']
assert path_leaf not in children
children[path_leaf] = new_data
new_path_dict[path_tuple] = new_data
return tree
When called as:
result = make_tree(my_data)
It gives result the value:
{1: {'id': 1, 'text': 'one', 'path': '1'},
2: {'id': 7, 'text': 'seven', 'path': '2', 'children': {
1: {'id': 10, 'text': 'ten', 'path': '2.1', 'children': {
1: {'id': 3, 'text': 'three', 'path': '2.1.1'},
2: {'id': 5, 'text': 'five', 'path': '2.1.2'}}}}},
3: {'id': 8, 'text': 'eight', 'path': '3', 'children': {
1: {'id': 2, 'text': 'two', 'path': '3.1'},
2: {'id': 9, 'text': 'nine', 'path': '3.2', 'children': {
1: {'id': 4, 'text': 'four', 'path': '3.2.1'},
2: {'id': 6, 'text': 'six', 'path': '3.2.2'}}}}}}
Note that Python 3 dictionaries maintain the order of added elements, so in that sense, the constructed tree is "sorted" at each level by the corresponding path component.
Also note that the original source list, and the dictionaries it contains, are unchanged by this function.
I think I've cracked it! And I've learned a lot in the process. (I'd hope so - I've got at least 24 SO tabs open, 6 doc.python.org tabs, and maybe 20 others - so it's been a group effort!)
Here is a recursive function that creates the required nested data:
class my_dict(dict): # new class inherits dict()
def rec_update(self, index, dictObj):
pos = index[0] # store the first index position
index.pop(0) # remove the first position from the list
dictTemp = my_dict() # will be used to pass the nested branch to the recursive function - doesn't need defined here
if len(index) != 0: # ... then we've not arrived at the leaf yet
if not (pos in self and 'children' in self[pos]): # ... then create a new branch
self[pos] = {'children': {}} # create template
dictTemp = my_dict(self[pos]['children']) # cast the dictionary as my_dict so that it has access to the rec_update() function
self[pos]['children'] = dictTemp.rec_update(index, dictObj) # pass data on to next level, and recurse
else:
if (pos in self and 'children' in self[pos]): # ... then update existing branch
self[pos].update(dictObj) # add in the data alongside pre-existing children key
else: # populate new branch with data, finally!
self[pos] = dictObj
return self
and here is the calling code:
dataOut = my_dict()
for dictObj in my_data:
index = [int(i) for i in dictObj.get('path').split(".")] # turn path string into list and iterate; convert to integers
dataOut.rec_update(index, dictObj)
I still don't understand why changes to index inside the function alter index in the calling code - answers welcome :-)
But I did discover that I couldn't override dict.copy() with a __copy__() function inside my my_dict class definition, hence dictTemp = my_dict(self[pos]['children']) rather than dictTemp = self[pos]['children'].copy().
One final oddity which I've still to address: when I apply it to my production data, I have to run it twice!
Trying to create a dict that holds name,position and number for each player for each team. But when trying to create the final dictionary players[team_name] =dict(zip(number,name,position)) it throws an error (see below). I can't seem to get it right, any thoughts on what I'm doing wrong here would be highly appreciated. Many thanks,
from bs4 import BeautifulSoup as soup
import requests
from lxml import html
clubs_url = 'https://www.premierleague.com/clubs'
parent_url = clubs_url.rsplit('/', 1)[0]
data = requests.get(clubs_url).text
html = soup(data, 'html.parser')
team_name = []
team_link = []
for ul in html.find_all('ul', {'class': 'block-list-5 block-list-3-m block-list-1-s block-list-1-xs block-list-padding dataContainer'}):
for a in ul.find_all('a'):
team_name.append(str(a.h4).split('>', 1)[1].split('<')[0])
team_link.append(parent_url+a['href'])
team_link = [item.replace('overview', 'squad') for item in team_link]
team = dict(zip(team_name, team_link))
data = {}
players = {}
for team_name, team_link in team.items():
player_page = requests.get(team_link)
cont = soup(player_page.content, 'lxml')
clud_ele = cont.find_all('span', attrs={'class' : 'playerCardInfo'})
for i in clud_ele:
v_number = [100 if v == "-" else v.get_text(strip=True) for v in i.select('span.number')]
v_name = [v.get_text(strip=True) for v in i.select('h4.name')]
v_position = [v.get_text(strip=True) for v in i.select('span.position')]
key_number = [key for element in i.select('span.number') for key in element['class']]
key_name = [key for element in i.select('h4.name') for key in element['class']]
key_position = [key for element in i.select('span.position') for key in element['class']]
number = dict(zip(key_number,v_number))
name = dict(zip(key_name,v_name))
position = dict(zip(key_position,v_name))
players[team_name] = dict(zip(number,name,position))
---> 21 players[team_name] = dict(zip(number,name,position))
22
23
ValueError: dictionary update sequence element #0 has length 3; 2 is required
There are many problems in your code. The one causing the error is that you are trying to instantiate a dictionary with a 3-items tuple in list which is not possible. See the dict doc for details.
That said, I would suggest to rework the whole nested loop.
First, you have in clud_ele a list of player info, each player info concerns only one player and provides only one position, only one name and only one number. So there is no need to store those informations in lists, you could use simple variables:
for player_info in clud_ele:
number = player_info.select('span.number')[0].get_text(strip=True)
if number == '-':
number = 100
name = player_info.select('h4.name')[0].get_text(strip=True)
position = player_info.select('span.position')[0].get_text(strip=True)
Here, usage of select method returns a list but since you know that the list contains only one item, it's ok to get this item to call get_text on. But you could check that the player_info.select('span.number') length is actually 1 before continuing to work if you want to be sure...
This way, you get scalar data type which will be much easier to manipulate.
Also note that I renamed the i to player_info which is much more explicit.
Then you can easily add your player data to your players dict:
players[team_name].append({'name': name,
'position': position
'number': number})
This assume that you create the players[team_name] before the nested loop with players[team_name] = [].
Edit: as stated in the #kederrac's answer, usage of a defaultdict is a smart and convenient way to avoid the manual creation of each players[team_name] list
Finally, this will give you:
a dictionary containing values for name, position and number keys for each player
a team list containg player dictionaries for each team
a players dictionary associating a team list for each team_name
It is the data structure you seems to want, but other structures are possible. Remember to think about your data structure to make it logical AND easily manipulable.
you can't instantiate a dict with 3 arguments, the problem is the fact that you have 3 variables in the zip: zip(number, name, position) with which you want to instantiate a dict, you should give only 2 arguments at a time, the key and the value
I've rewritten your las part of the code:
from collections import defaultdict
data = {}
players = defaultdict(list)
for team_name, team_link in team.items():
player_page = requests.get(team_link)
cont = soup(player_page.text, 'lxml')
clud_ele = cont.find_all('span', attrs={'class' : 'playerCardInfo'})
for i in clud_ele:
num = i.select('span.number')[0].get_text(strip=True)
number = 100 if num == '-' else num
name = i.select('h4.name')[0].get_text(strip=True)
position = i.select('span.position')[0].get_text(strip=True)
players[team_name].append({'number': number, 'position': position, 'name': name})
output:
defaultdict(list,
{'Arsenal': [{'number': '1',
'position': 'Goalkeeper',
'name': 'Bernd Leno'},
{'number': '26',
'position': 'Goalkeeper',
'name': 'Emiliano Martínez'},
{'number': '33', 'position': 'Goalkeeper', 'name': 'Matt Macey'},
{'number': '2',
'position': 'Defender',
'name': 'Héctor Bellerín'},
.......................
Ok, so I am working on an application that can go through a number of different database objects, compare the string and return the associated id, first name and last name. I currently have it to where I am building a list of tuples and then populating a dictionary with the key and values(using a list). What I want to do next is find the Max percentage and then return the associated fist and last name from the dictionary. I know the description is a little confusing so please look at the below examples and code:
# My Dictionary:
{'percent': [51.9, 52.3, 81.8, 21.0], 'first_name': ['Bob', 'Bill', 'Matt', 'John'], 'last_name': ['Smith', 'Allen', 'Naran', 'Jacobs']}
# I would want this to be returned:
percent = 81.8 (Max percentage match)
first_name = 'Matt' (First name associated with the max percentage match)
last_name = 'Naran' (Last name associated with the max percentage match)
# Code so Far:
compare_list = []
compare_dict = {}
# Builds my list of Tuples
compare_list.append(tuple(("percent", percentage)))
compare_list.append(tuple(("first_name", first_name)))
compare_list.append(tuple(("last_name", last_name)))
# Builds my Dictionary
for x, y in compare_list:
compare_dict.setdefault(x, []).append(y)
Not sure where to go to return the first and last name associated with the Max percentage.
I really appreciate any and all help that you provide!
I hope this will help you:
data = {'percent': [51.9, 52.3, 81.8, 21.0], 'first_name': ['Bob', 'Bill', 'Matt', 'John'], 'last_name': ['Smith', 'Allen', 'Naran', 'Jacobs']}
percentage_list = data['percent']
percentage = max(percentage_list)
max_index = percentage_list.index(percentage)
first_name = data['first_name'][max_index]
last_name = data['last_name'][max_index]
# Code so Far:
compare_list = []
compare_dict = {}
# Builds my list of Tuples
compare_list.append(tuple(("percent", percentage)))
compare_list.append(tuple(("first_name", first_name)))
compare_list.append(tuple(("last_name", last_name)))
# Builds my Dictionary
for x, y in compare_list:
compare_dict.setdefault(x, []).append(y)
print compare_dict
I have a nested list of tuples of lists (of tuples, etc.) that looks like this:
[(' person',
[(('surname', u'Doe', True),),
(('name', u'John', False),),
('contact',
[(('email', u'john#doe.me', True),),
(('phone', u'+0123456789', False),),
(('fax', u'+0987654321', False),)]),
('connection',
[(('company', u'ibcn', True),),
('contact',
[(('email', u'mail#ibcn.com', True),),
(('address', u'main street 0', False),),
(('city', u'pythonville', False),),
(('fax', u'+0987654321', False),)])])])]
There is no way of knowing neither the number of (double) tuples within a list nor how deep nesting goes.
I want to convert it to a nested dictionary (of dictionaries), eliminating the boolean values, like this:
{'person': {'name': 'John', 'surname': 'Doe',
'contact': {'phone': '+0123456789', 'email': 'john#doe.me','fax': '+0987654321',
'connection': {'company name': 'ibcn', 'contact':{'phone': '+2345678901',
'email': 'mail#ibcn.com', 'address': 'main street 0'
'city': 'pythonville', 'fax': +0987654321'
}}}}}
All I have, so far, is a recursive method that can print the nested structure in a per-line fashion:
def tuple2dict(_tuple):
for item in _tuple:
if type(item) == StringType or type(item) == UnicodeType:
print item
elif type(item) == BooleanType:
pass
else:
tuple2dict(item)
but, I'm not sure I'm on the right track...
EDIT:
I've edited the original structure, since it was missing a comma.
You are on the right track. The recursive approach will work. As far as I can tell from your sample data, each tuple first has string item, containing the key. After that you have either another tuple or list as value, or a String value followed by a boolean true or false.
EDIT:
The trick with recursion is that you have to know when to stop. Basically, in your case it appears the deepest structure are the nested three tuples, matching names to values.
Hacking away a bit. I shamefully admit this is the ugliest code in the world.
def tuple2dict(data):
d = {}
for item in data:
if len(item) == 1 and isinstance(item, tuple):
# remove the nested structure, you may need a loop here
item = item[0]
key = item[0]
value = item[1]
d[key] = value
continue
key = item[0]
value = item[1]
if hasattr(value, '__getitem__'):
value = tuple2dict(value)
d[key] = value
return d
not beautiful ... but it works... basically
def t2d(t):
if isinstance(t,basestring):return t
length = len(t)
if length == 1:
return t2d(t[0])
if length == 2:
t1,t2 = t2d(t[0]),t2d(t[1])
print "T:",t1,t2
if isinstance(t1,dict) and len(t1) == 1:
t2['name'] = t1.values()[0]
t1 = t1.keys()[0]
return dict([[t1,t2]])
if length == 3 and isinstance(t[2],bool):
return t2d(t[:2])
L1 =[t2d(tp) for tp in t]
L2 = [lx.items() for lx in L1]
L3 = dict( [i[0] for i in L2])
return L3
I should mention it works specifically with the dict you posted... (seems like company wasnt quite setup right so I hacked it (see t2['name']...))
You are definitely on the right track. A recursive function is the way to go.
A dictionary can be built from an iterable giving tuples of length 2.
Now, to get rid of the boolean, you can use slicing. Just slice off everything except the first two elements.
>> (1,2,3)[:3]
(1,2)
This is my final solution. Not very elegant, I must admit.
It also takes care of multiple entries for a key by concatenating it with the existing one.
def tuple2dict(_obj):
_dict = {}
for item in _obj:
if isinstance(item, tuple) or isinstance(item, list):
if isinstance(item[0], basestring):
_dict[item[0]] = tuple2dict(item[1])
else:
if isinstance(item[0], tuple):
# if the key already exists, then concatenate the old
# value with the new one, adding a space in between.
_key = item[0][0]
if _key in _dict:
_dict[_key] = " ".join([_dict[_key], item[0][1]])
else:
_dict[_key] = item[0][1]
return _dict
To illustrate what I mean by this, here is an example
messages = [
('Ricky', 'Steve', 'SMS'),
('Steve', 'Karl', 'SMS'),
('Karl', 'Nora', 'Email')
]
I want to convert this list and a definition of groups to a list of integers and a lookup dictionary so that each element in the group gets a unique id. That id should map to the element in the lookup table like this
messages_int, lookup_table = create_lookup_list(
messages, ('person', 'person', 'medium'))
print messages_int
[ (0, 1, 0),
(1, 2, 0),
(2, 3, 1) ]
print lookup_table
{ 'person': ['Ricky', 'Steve', 'Karl', 'Nora'],
'medium': ['SMS', 'Email']
}
I wonder if there is an elegant and pythonic solution to this problem.
I am also open to better terminology than create_lookup_list etc
defaultdict combined with the itertools.count().next method is a good way to assign identifiers to unique items. Here's an example of how to apply this in your case:
from itertools import count
from collections import defaultdict
def create_lookup_list(data, domains):
domain_keys = defaultdict(lambda:defaultdict(count().next))
out = []
for row in data:
out.append(tuple(domain_keys[dom][val] for val, dom in zip(row, domains)))
lookup_table = dict((k, sorted(d, key=d.get)) for k, d in domain_keys.items())
return out, lookup_table
Edit: note that count().next becomes count().__next__ or lambda: next(count()) in Python 3.
Mine's about the same length and complexity:
import collections
def create_lookup_list(messages, labels):
# Collect all the values
lookup = collections.defaultdict(set)
for msg in messages:
for l, v in zip(labels, msg):
lookup[l].add(v)
# Make the value sets lists
for k, v in lookup.items():
lookup[k] = list(v)
# Make the lookup_list
lookup_list = []
for msg in messages:
lookup_list.append([lookup[l].index(v) for l, v in zip(labels, msg)])
return lookup_list, lookup
In Otto's answer (or anyone else's with string->id dicts), I'd replace (if obsessing over speed is your thing):
# create the lookup table
lookup_dict = {}
for group in indices:
lookup_dict[group] = sorted(indices[group].keys(),
lambda e1, e2: indices[group][e1]-indices[group][e2])
by
# k2i must map keys to consecutive ints [0,len(k2i)-1)
def inverse_indices(k2i):
inv=[0]*len(k2i)
for k,i in k2i.iteritems():
inv[i]=k
return inv
lookup_table = dict((g,inverse_indices(gi)) for g,gi in indices.iteritems())
This is better because direct assignment to each item in the inverse array directly is faster than sorting.
Here is my own solution - I doubt it's the best
def create_lookup_list(input_list, groups):
# use a dictionary for the indices so that the index lookup
# is fast (not necessarily a requirement)
indices = dict((group, {}) for group in groups)
output = []
# assign indices by iterating through the list
for row in input_list:
newrow = []
for group, element in zip(groups, row):
if element in indices[group]:
index = indices[group][element]
else:
index = indices[group][element] = len(indices[group])
newrow.append(index)
output.append(newrow)
# create the lookup table
lookup_dict = {}
for group in indices:
lookup_dict[group] = sorted(indices[group].keys(),
lambda e1, e2: indices[group][e1]-indices[group][e2])
return output, lookup_dict
This is a bit simpler, and more direct.
from collections import defaultdict
def create_lookup_list( messages, schema ):
def mapped_rows( messages ):
for row in messages:
newRow= []
for col, value in zip(schema,row):
if value not in lookups[col]:
lookups[col].append(value)
code= lookups[col].index(value)
newRow.append(code)
yield newRow
lookups = defaultdict(list)
return list( mapped_rows(messages) ), dict(lookups)
If the lookups were proper dictionaries, not lists, this could be simplified further.
Make your "lookup table" have the following structure
{ 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
'medium': {'SMS':0, 'Email':1}
}
And it can be further reduced in complexity.
You can turn this working copy of the lookups into it's inverse as follows:
>>> lookups = { 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
'medium': {'SMS':0, 'Email':1}
}
>>> dict( ( d, dict( (v,k) for k,v in lookups[d].items() ) ) for d in lookups )
{'person': {0: 'Ricky', 1: 'Steve', 2: 'Karl', 3: 'Nora'}, 'medium': {0: 'SMS', 1: 'Email'}}
Here is my solution, it's not better - it's just different :)
def create_lookup_list(data, keys):
encoded = []
table = dict([(key, []) for key in keys])
for record in data:
msg_int = []
for key, value in zip(keys, record):
if value not in table[key]:
table[key].append(value)
msg_int.append(table[key].index(value))
encoded.append(tuple(msg_int))
return encoded, table
Here is mine, the inner function lets me write the index-tuple as a generator.
def create_lookup_list( data, format):
table = {}
indices = []
def get_index( item, form ):
row = table.setdefault( form, [] )
try:
return row.index( item )
except ValueError:
n = len( row )
row.append( item )
return n
for row in data:
indices.append( tuple( get_index( item, form ) for item, form in zip( row, format ) ))
return table, indices