Related
I want to remove a key-value pair from an inner dictionary whenever the same key appears in another inner dictionary with a larger value. Suppose that I have a dictionary of dictionaries as below:
ex_dict = {'C1': {'I1': 1, 'I2': 1.5, 'I3': 2}, 'C2': {'I2': 2, 'I3': 3, 'I4': 3.5}, 'C3': {'I4': 2, 'I5': 4, 'I6': 3}, 'C4': {'I1': 2, 'I3': 3.5, 'I6': 5}, 'C5': {'I7': 1, 'I8': 1.5, 'I9': 2}}
The expected output is as follows:
new_ex_dict = {'C1': {}, 'C2': {'I2': 2, 'I4': 3.5}, 'C3': {'I5': 4}, 'C4': {'I1': 2, 'I3': 3.5, 'I6': 5}, 'C5': {'I7': 1, 'I8': 1.5, 'I9': 2}}
How can I do this efficiently? Any help will be much appreciated.
This is my quick solution
ex_dict = {'C1': {'I1': 1, 'I2': 1.5, 'I3': 2}, 'C2': {'I2': 2, 'I3': 3, 'I4': 3.5}, 'C3': {'I4': 2, 'I5': 4, 'I6': 3}, 'C4': {'I1': 2, 'I3': 3.5, 'I6': 5}, 'C5': {'I7': 1, 'I8': 1.5, 'I9': 2}}

# Invert the nesting: for every inner key, collect each (outer key, value)
# pair that carries it.
temp_dict = {}
for main_key, inner in ex_dict.items():
    for k, v in inner.items():
        temp_dict.setdefault(k, []).append((main_key, v))

# Seed every outer key with an empty dict so that a group which loses all of
# its items (here 'C1') still shows up in the result as {}.
new_ex_dict = {main_key: {} for main_key in ex_dict}
for k, candidates in temp_dict.items():
    # max() returns the first maximal pair, so a tie goes to the outer key
    # that was seen first.
    main_key, max_value = max(candidates, key=lambda pair: pair[1])
    new_ex_dict[main_key][k] = max_value
I have two lists in Python and I'm trying to map the values of one to the other.
List 1 (coordinates):
['7,16', '71,84', '72,48', '36,52', '75,36', '52,28', '76,44', '11,69', '56,35',
'15,21', '32,74', '88,32', '10,74', '61,34', '51,85', '10,75', '55,96',
'94,12', '34,64', '71,59', '76,75', '25,16', '54,100', '62,1', '60,85',
'16,32', '14,77', '40,78', '2,60', '71,4', '78,91', '100,98', '42,32', '37,49',
'49,34', '3,5', '42,77', '39,60', '38,77', '49,40', '40,53', '57,48', '14,99',
'66,67', '10,9', '97,3', '66,76', '86,68', '10,60', '8,87']
List 2 (index):
[3, 2, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 1, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
1, 2, 1, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 1, 2, 3, 3, 2, 2, 1, 1]
For the output, I need to have something like:
cluster_1: [x, y], [a,b]...
cluster_2: [c, d], [e, f]...
cluster_3: [g, h], [o, j]...
I tried doing this in a dictionary, but I can only get it to put in the last coordinate in the for loop for each value. It also always outputs keys starting from 0, and I'm looking to label them starting from 1.
# Asker's original attempt, kept exactly as posted (indentation was lost in this copy).
# NOTE(review): `cluster` and `dct` are presumably set in the elided "other stuff" — confirm.
for i in range(len(patients)):
# other stuff
k = 3
# range(k) yields 0..k-1, which is why the generated labels start at cluster_0.
for b in range(k):
if cluster == (k - b):
# Plain assignment replaces whatever was stored under this key, so only the
# last matching patient is kept for each label.
dct['cluster_%s' % b] = patients[i]
which outputs:
{'cluster_0': '97,3', 'cluster_1': '86,68', 'cluster_2': '8,87'}
I've tried using dct['cluster_%s' % b].append(patients[i]) but I get a key error on cluster_0. Any help would be much appreciated!
You can zip your indices and coordinates, then loop over them element-wise and populate a dictionary based on the index.
# Bucket each coordinate (split into its two string parts) under its cluster
# index; a bucket is created the first time its index is seen.
clusters = {}
for label, point in zip(index, coords):
    clusters.setdefault(label, []).append(point.split(','))
Result, where clusters[i] refers to the i-th cluster:
>>> clusters
{
3: [['7', '16'], ['72', '48'], ['36', '52'], ['75', '36'], ['52', '28'], ['76', '44'], ['56', '35'], ['15', '21'], ['88', '32'], ['61', '34'], ['94', '12'], ['71', '59'], ['25', '16'], ['62', '1'], ['16', '32'], ['71', '4'], ['42', '32'], ['37', '49'], ['49', '34'], ['3', '5'], ['49', '40'], ['40', '53'], ['57', '48'], ['10', '9'], ['97', '3']],
2: [['71', '84'], ['32', '74'], ['51', '85'], ['55', '96'], ['34', '64'], ['76', '75'], ['54', '100'], ['60', '85'], ['40', '78'], ['78', '91'], ['100', '98'], ['42', '77'], ['39', '60'], ['38', '77'], ['66', '67'], ['66', '76'], ['86', '68']],
1: [['11', '69'], ['10', '74'], ['10', '75'], ['14', '77'], ['2', '60'], ['14', '99'], ['10', '60'], ['8', '87']]
}
You could use defaultdict along with zip:
from collections import defaultdict
# defaultdict(list) creates the empty bucket on first access, so no
# membership test is needed before appending.
clusters = defaultdict(list)
pairs = zip(cluster_indices, values)
for cluster_id, raw in pairs:
    clusters[cluster_id].append(raw.split(","))
print(dict(clusters))  # {3: [['7', '16'], ['72', '48'], ...
A defaultdict can be converted to a dict with dict(clusters). However, this may not be necessary since defaultdict basically extends dict.
Note: If you need int values, then you may replace value.split(",") with [int(v) for v in value.split(",")] or list(map(int, value.split(","))). Casting them already at this point will save you an iteration later.
from collections import defaultdict
# Same grouping as before, but each "x,y" string is converted to a list of
# ints while it is being grouped.
clusters = defaultdict(list)
pairs = zip(cluster_indices, values)
for cluster_id, raw in pairs:
    clusters[cluster_id].append(list(map(int, raw.split(","))))
print(dict(clusters))  # {3: [[7, 16], [72, 48], ...
The group-by behaviour extracted to a function groupby (using a lambda function to allow any kind of transformation) so it can be reused:
from collections import defaultdict
def groupby(indices, values, map_fn=None):
    """Group ``values`` by the entry of ``indices`` at the same position.

    Args:
        indices: Iterable of hashable group keys, read in lockstep with
            ``values``.
        values: Iterable of items to group.
        map_fn: Optional callable ``map_fn(key, value)``; its return value is
            stored instead of ``value``. When omitted, values are stored
            unchanged.

    Returns:
        A plain ``dict`` mapping each key to the list of (transformed) values
        in input order. Pairing stops at the shorter of the two iterables.
    """
    grouped = defaultdict(list)
    # Named ``key`` rather than ``id`` so the builtin id() is not shadowed.
    for key, value in zip(indices, values):
        grouped[key].append(value if map_fn is None else map_fn(key, value))
    # Return a regular dict so a later lookup of a missing key raises KeyError
    # instead of silently creating an empty group.
    return dict(grouped)
# The transform ignores the cluster index (first argument) and splits each
# "x,y" string into its parts.
clusters = groupby(
    cluster_indices,
    values,
    lambda _, value: value.split(","),
)
print(clusters)  # {3: [['7', '16'], ['72', '48'], ...
Here just another way by using itertools.groupby:
from itertools import groupby
from operator import itemgetter
# itertools.groupby only merges adjacent items, so the (index, value) pairs
# are sorted on the index before grouping.
by_cluster = itemgetter(0)
data = sorted(zip(cluster_indices, values), key=by_cluster)
grouped = groupby(data, key=by_cluster)
clusters = {}
for cluster, members in grouped:
    # Each member is an (index, "x,y") pair; only the coordinate part is kept.
    clusters[cluster] = [coordinate.split(",") for _, coordinate in members]
print(clusters)  # keys come out in sorted order: {1: [['11', '69'], ['10', '74'], ...
However, I would use the defaultdict approach above or Cory Kramer's answer, as they are simpler and easier to read (and therefore preferable)!
I have a list of dictionaries that look like this:
[
{'ServiceID': 20, 'primary': '20', 'secondary': '12'},
{'ServiceID': 20, 'primary': '20', 'secondary': '12'},
{'ServiceID': 20, 'primary': '20', 'secondary': '12'},
{'ServiceID': 16, 'primary': '16', 'secondary': '8'},
{'ServiceID': 20, 'primary': '20', 'secondary': '12'},
{'ServiceID': 8, 'primary': '8', 'secondary': '16'},
{'ServiceID': 12, 'primary': '12', 'secondary': '20'},
{'ServiceID': 8, 'primary': '8', 'secondary': '16'}
]
I would like to create a new sorted dictionary where we have the following:
key = value of 'ServiceID'
key = value of how many times that particular 'ServiceID' is listed as a 'primary'
key = value of how many times that particular 'ServiceID' is listed as a 'secondary'
For example:
[
{'ServiceID': 8, 'primaryCount': 2, 'secondaryCount': 1},
{'ServiceID': 12, 'primaryCount': 1, 'secondaryCount': 4},
{'ServiceID': 16, 'primaryCount': 1, 'secondaryCount': 2},
{'ServiceID': 20, 'primaryCount': 4, 'secondaryCount': 1}
]
The code I have so far doesn't quite do what I want: I am unsure how to correctly increment the primary and secondary counts across the entire for loop, and how to ensure that I capture each unique 'ServiceID' only once.
I believe there is something wrong with my logic:
# Asker's attempt, kept exactly as posted (indentation was lost in this copy).
# It is not valid Python as written; see the notes on the two assignments below.
temp_count_list = list()
temp_primary_counts = 0
temp_secondary_counts = 0
for sub_dict in new_list:
# A fresh dict is created and appended for every input row, so a ServiceID
# that occurs several times is appended several times.
temp_dict = dict()
temp_dict['ServiceID'] = sub_dict['ServiceID']
# NOTE(review): the sample rows shown with the question use the keys
# 'primary' / 'secondary', not 'primarySlice' / 'secondarySlice' — confirm.
if sub_dict['ServiceID'] == int(sub_dict['primarySlice']):
# SyntaxError: `+=` is a statement and cannot be used on the right-hand side of `=`.
temp_dict['primaryCount'] = temp_primary_counts +=1
if sub_dict['ServiceID'] == int(sub_dict['secondarySlice']):
# Same SyntaxError as above.
temp_dict['secondaryCount'] = temp_secondary_counts +=1
temp_count_list.append(temp_dict)
Basic idea is, get all the ServiceID, primary, secondary in a dict (in code k), and then for each unique ServiceID count the frequency of that ServiceID in the primary and secondary.
l = [
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 16, 'primary': '16', 'secondary': '8'},
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 8, 'primary': '8', 'secondary': '16'},
    {'ServiceID': 12, 'primary': '12', 'secondary': '20'},
    {'ServiceID': 8, 'primary': '8', 'secondary': '16'},
]

# Transpose the records into three parallel columns. The two "...Count" columns
# hold the raw primary/secondary strings at this stage, not counts.
k = {
    'ServiceID': [row['ServiceID'] for row in l],
    'primaryCount': [row['primary'] for row in l],
    'secondaryCount': [row['secondary'] for row in l],
}

# One summary dict per distinct ServiceID, in ascending order. The id is
# stringified before counting because primary/secondary are stored as strings.
result = [
    {
        'ServiceID': service_id,
        'primaryCount': k['primaryCount'].count(str(service_id)),
        'secondaryCount': k['secondaryCount'].count(str(service_id)),
    }
    for service_id in sorted(set(k['ServiceID']))
]
print(result)
output
[
{'ServiceID': 8, 'primaryCount': 2, 'secondaryCount': 1},
{'ServiceID': 12, 'primaryCount': 1, 'secondaryCount': 4},
{'ServiceID': 16, 'primaryCount': 1, 'secondaryCount': 2},
{'ServiceID': 20, 'primaryCount': 4, 'secondaryCount': 1}
]
You can do the following (l is your list):
# Seed a zeroed pair of counters for every ServiceID present in the input.
d = {}
for record in l:
    d[record['ServiceID']] = {'primaryCount': 0, 'secondaryCount': 0}

# primary/secondary are stored as strings, so convert them to int to match the
# ServiceID keys. A value that never occurs as a ServiceID raises KeyError here.
for record in l:
    d[int(record['primary'])]['primaryCount'] += 1
    d[int(record['secondary'])]['secondaryCount'] += 1

# Flatten back to a list of dicts, in first-seen ServiceID order.
res = [{'ServiceID': service_id, **counts} for service_id, counts in d.items()]
Output:
>>> print(res)
[{'ServiceID': 20, 'primaryCount': 4, 'secondaryCount': 1}, {'ServiceID': 16, 'primaryCount': 1, 'secondaryCount': 2}, {'ServiceID': 8, 'primaryCount': 2, 'secondaryCount': 1}, {'ServiceID': 12, 'primaryCount': 1, 'secondaryCount': 4}]
It seems like the correct solution here would involve using collections.Counters (or largely equivalently in this case, collections.defaultdict(int)s) to allow you to cheaply and easily increment counts without relying on them being adjacent in the input, and without using intermediate data structures that add pointless overhead; why build the result all at once when you can count the parts you care about with simpler code, then build the result with equally simple code from those simple counts? You don't actually use the 'ServiceID' field in the input, so you may as well just count efficiently, and convert back to the preferred format at the end:
import pprint # For pretty-printing in the example
from collections import Counter
inp = [
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 16, 'primary': '16', 'secondary': '8'},
    {'ServiceID': 20, 'primary': '20', 'secondary': '12'},
    {'ServiceID': 8, 'primary': '8', 'secondary': '16'},
    {'ServiceID': 12, 'primary': '12', 'secondary': '20'},
    {'ServiceID': 8, 'primary': '8', 'secondary': '16'},
]

# Tally how often each id occurs as a primary and as a secondary.
primarycount = Counter(int(row['primary']) for row in inp)
secondarycount = Counter(int(row['secondary']) for row in inp)

# Intermediate results, shown for illustration only.
print(primarycount)
print(secondarycount)

# The union of both key sets means an id with even a single count appears in
# the output; sorting the union gives ascending ServiceID order. Looking up a
# missing id in a Counter yields 0 rather than raising.
result = []
for sid in sorted(primarycount.keys() | secondarycount.keys()):
    result.append({
        'ServiceID': sid,
        'primaryCount': primarycount[sid],
        'secondaryCount': secondarycount[sid],
    })
pprint.pprint(result)
Try it online!
which produces output:
Counter({20: 4, 8: 2, 16: 1, 12: 1})
Counter({12: 4, 16: 2, 8: 1, 20: 1})
[{'ServiceID': 8, 'primaryCount': 2, 'secondaryCount': 1},
{'ServiceID': 12, 'primaryCount': 1, 'secondaryCount': 4},
{'ServiceID': 16, 'primaryCount': 1, 'secondaryCount': 2},
{'ServiceID': 20, 'primaryCount': 4, 'secondaryCount': 1}]
This might be slightly wrong if some ServiceIDs might be seen in the input, but never as a primary or secondary (they won't appear in the output at all, rather than appearing with zero counts; unclear which is correct), or if primary or secondary values sometimes appear where the ServiceID corresponding never appears in the input (they'll show up in the output with counts, rather than being omitted; again, unclear on which is correct), but it's relatively trivial to fix. Flipping both behaviors would just involve changing primarycount.keys() | secondarycount.keys() to {d['ServiceID'] for d in inp} to ensure values come from the input ServiceID fields, not a combination of all values seen for primary and secondary. For the provided input, both approaches are equivalent (with the former being slightly faster in most cases, where there are many duplicate ServiceIDs in the input).
I have a pandas series of dicts like this:
print(df['genres'])
0 {'0': '1', '1': '4', '2': '23'}
1 {'0': '1', '1': '25', '2': '4', '3': '37'}
2 {'0': '9'}
print(type(df['genres']))
<class 'pandas.core.series.Series'>
print(type(df['genres'][0]))
<class 'dict'>
I want to count the values to get something like this:
{'1': 2, '4': 2, '9': 1, '23': 1, '25': 1, '37': 1}
I tried the following:
print(Counter(chain.from_iterable(df.genres.values)))
Counter({'0': 3, '1': 2, '2': 2, '3': 1})
print(pd.Series(df['genres']).value_counts())
{'0': '1', '1': '4', '2': '23'} 1
{'0': '1', '1': '25', '2': '4', '3': '37'} 1
{'0': '9'} 1
I think it is pretty easy for someone more experienced than me. But I really don't get it ...
Try:
# One row per original dict; stack() flattens the values into a single Series
# (dropping the gaps left by shorter dicts) so they can be counted together.
genre_rows = pd.DataFrame(list(df.genres))
genre_rows.stack().value_counts().to_dict()
Output:
{'1': 2, '4': 2, '37': 1, '9': 1, '23': 1, '25': 1}
I have this signature:
def aggregate_by_player_id(input, playerid, fields):
By 'fields', I mean the fields to sum up, grouped by 'playerid', within the 'input'.
I call the function like this:
aggregate_by_player_id(input, 'player', ['stat1','stat3'])
Input look like this:
[{'player': '1', 'stat1': '3', 'stat2': '4', 'stat3': '5'},
{'player': '1', 'stat1': '1', 'stat2': '4', 'stat3': '1'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '3'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '1'},
{'player': '3', 'stat1': '4', 'stat2': '1', 'stat3': '6'}]
My output structure is:
nested_dic = {value_of_playerid1: {'playerid': value_of_playerid1, 'stat1': value_of_stat1, 'stat2': value_of_stat2},
value_of_playerid2: {'playerid': value_of_playerid2, 'stat2': value_of_stat2, 'stat2': value_of_stat2},
value_of_playerid3: {'playerid': value_of_playerid3, 'stat3': value_of_stat3, 'stat3': value_of_stat3}}
Hence the output should look like:
{'1': {'player': '1', 'stat1': 4, 'stat3': 6},
'2': {'player': '2', 'stat1': 2, 'stat3': 4},
'3': {'player': '3', 'stat1': 4, 'stat3': 6}}
We can use itertools.groupby for this to group on playerid and then sum values across the fields.
from itertools import groupby
from operator import itemgetter
def aggregate_by_player_id(input_, playerid, fields):
    """Sum the requested stat fields per player.

    Args:
        input_: Iterable of dict records. Any order is accepted; a sorted copy
            is made internally and the caller's data is not modified.
        playerid: Name of the key that identifies the player in each record.
        fields: Names of the keys whose values are summed per player.

    Returns:
        A dict mapping each player id to ``{playerid: id, field: total, ...}``.
        Values are converted with ``int``; a field missing from a record
        counts as 0.

    Raises:
        KeyError: If a record has no ``playerid`` key.
        TypeError: If the player ids cannot be ordered against each other.
    """
    player = itemgetter(playerid)
    output = {}
    # groupby() only merges *adjacent* equal keys, so sort first; otherwise a
    # player whose records are scattered would have earlier totals overwritten
    # by the last run of records.
    for k, v in groupby(sorted(input_, key=player), key=player):
        data = list(v)  # the group iterator is single-use but is read once per field
        stats = {playerid: k}
        for field in fields:
            stats[field] = sum(int(d.get(field, 0)) for d in data)
        output[k] = stats
    return output
# groupby() only merges adjacent records, so order the input on the grouping
# key first. The key function is built here because the `player` getter used
# inside aggregate_by_player_id is local to that function and is not defined
# at this level.
data.sort(key=itemgetter('player'))
results = aggregate_by_player_id(data, 'player', ['stat1', 'stat3'])
{'1': {'player': '1', 'stat1': 4, 'stat3': 6},
'2': {'player': '2', 'stat1': 2, 'stat3': 4},
'3': {'player': '3', 'stat1': 4, 'stat3': 6}}
Capturing the result you're after in a single comprehension might be possible, but is likely not very readable. Here's a simple function that does the work:
data = [
{'player': '1', 'stat1': '3', 'stat2': '4', 'stat3': '5'},
{'player': '1', 'stat1': '1', 'stat2': '4', 'stat3': '1'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '3'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '1'},
{'player': '3', 'stat1': '4', 'stat2': '1', 'stat3': '6'}
]
def aggregate_dicts(ds, id_field, aggr_fields):
    """Sum selected fields of the dicts in ``ds``, grouped by ``id_field``.

    Args:
        ds: Iterable of dicts to aggregate.
        id_field: Key whose value identifies the group a dict belongs to.
        aggr_fields: Keys whose values are converted with ``int`` and summed.

    Returns:
        A dict mapping each identifier to ``{field: total}`` for every entry
        in ``aggr_fields``, in order of first appearance.
    """
    totals = {}
    for record in ds:
        key = record[id_field]
        try:
            bucket = totals[key]
        except KeyError:
            # First record for this identifier: start every total at zero.
            bucket = totals[key] = dict.fromkeys(aggr_fields, 0)
        for name in aggr_fields:
            bucket[name] += int(record[name])
    return totals
print(aggregate_dicts(data, 'player', ['stat1', 'stat3']))
Result:
{'1': {'stat1': 4, 'stat3': 6}, '2': {'stat1': 2, 'stat3': 4}, '3': {'stat1': 4, 'stat3': 6}}
If you want to repeat the identifier inside the dict, just add this line to the if block:
result[identifier][id_field] = identifier