I have an example dictionary of rules, quantifiers, and transformations. Each top-level key holds a list of entries, and every entry has its own id. I am trying to find all the entries that are related to a given set of ids and return the matching ids as a dictionary in this format:
dictionary = {'rules':[...], 'quantifiers':[...], 'transformations':[...]}
Here is the sample:
test_dict = {
'rules': [{'id': 123,'logic': '{"$or":[{"$and":[{"baseOperator":null,"operator":"does_not_contain_ignore_case","operand1":"metrics.123","operand2":"metrics.456"}]}]}',},
{'id': 589,
'logic': '{"$or":[{"$and":[{"baseOperator":null,"operator":"does_not_contain_ignore_case","operand1":"metrics.123","operand2":0}, {"baseOperator":null,"operator":"does_not_contain_ignore_case","operand1":"metrics.456","operand2":0}]}]}',},
{'id': 51,
'logic': '{"$or":[{"$and":[{"baseOperator":null,"operator":"does_not_contain_ignore_case","operand1":"metrics.789","operand2":"metrics.1"}]}]}',},],
'quant': [{'id':123,
'transIds': [1, 2, 3],
'qualifiedId': 'metrics.123'},
{'id':456,
'transIds': [1, 6],
'qualifiedId': 'metrics.456'},
{'id':789,
'transIds': [9],
'qualifiedId': 'metrics.789'}],
'trans': [{'id':1,
'rules': [123, 120]},
{'id':6,
'rules':[589, 2]}]
}
Here was my attempt. However, I realised that the trans and rules lists are only filled in once the loop reaches the relevant section; because 'rules' comes first in test_dict, the loop never captures any rules, since both lists are still empty at that point.
Essentially, I wanted to enter logic and capture all values metric that belong to the ids in quantifiers
Capture all ids from quantifiers that match the values inside attr
attr = [123, 456]
trans = []
rules = []

# Single pass over every section, visiting sections in test_dict's own key
# order and entries in list order.
# NOTE(review): the original indentation was lost; the three checks below are
# assumed to be siblings inside the inner loop - confirm against the source.
for section_entries in test_dict.values():
    for entry in section_entries:
        entry_id = entry.get('id')

        # Requested ids contribute their transformation ids.
        if entry_id in attr:
            trans_ids = entry.get('transIds')
            if trans_ids is not None:
                trans.extend(trans_ids)

        # Entries already recorded as transformations contribute rule ids.
        if entry_id in trans:
            rule_ids = entry.get('rules')
            if rule_ids is not None:
                rules.extend(rule_ids)

        # Entries already recorded as rules have their logic printed.
        if entry_id in rules:
            logic = entry.get('logic')
            if logic is not None:
                print(logic)
I figured it out thanks to the comments: instead of running everything inside a single loop, I split the work into separate loops, which solved the list issue. However, this attempt takes far too many lines of code:
import json

attr = [123, 456]

# Flatten all sections once; each pass below scans every entry regardless of
# which top-level key it lives under (same search scope for all three passes).
entries = [entry for section in test_dict.values() for entry in section]

# Pass 1: entries whose id was requested supply the qualified names and the
# transformation ids to follow.
qualified = set()
trans = set()
for entry in entries:
    if entry.get('id') in attr:
        # Entries without a qualifiedId (e.g. a rule that happens to share an
        # id) must not put None into the set of qualified names.
        if entry.get('qualifiedId') is not None:
            qualified.add(entry['qualifiedId'])
        trans.update(entry.get('transIds') or [])

# Pass 2: transformations that actually exist supply the rule ids to follow.
trans2 = set()
rules = set()
for entry in entries:
    if entry.get('id') in trans:
        trans2.add(entry['id'])
        rules.update(entry.get('rules') or [])

# Pass 3: for every rule that actually exists, walk its JSON logic
# ({"$or": [{"$and": [condition, ...]}, ...]}) and collect the quantifier ids
# referenced by either operand.
rules2 = set()
quant_id = set()
for entry in entries:
    if entry.get('id') not in rules:
        continue
    rules2.add(entry['id'])
    if entry.get('logic') is None:
        continue
    logic = json.loads(entry['logic'])
    for or_clauses in logic.values():
        for and_group in or_clauses:
            for conditions in and_group.values():
                for condition in conditions:
                    # Check both operands independently: a condition may
                    # reference a quantifier on each side.
                    for side in ('operand1', 'operand2'):
                        operand = condition.get(side)
                        # Operands may be numbers or null; only strings can
                        # name a quantifier.
                        if isinstance(operand, str) and operand in qualified:
                            quant_id.add(operand.split('.')[-1])

dictionary = {'rules':rules2, 'transformations': trans2, 'quantifiers': quant_id}
print(dictionary)
Result:
{'rules': {123, 589}, 'transformations': {1, 6}, 'quantifiers': {'456', '123'}}
Updated with set instead of list so only unique values remain.
Related
I am just having trouble with itertools.groupby. Given a list of strings,
my_list= [
"AD01", "AD01AA", "AD01AB", "AD01AC", "AD01AD","AD02", "AD02AA", "AD02AB", "AD02AC"]
from this list, I expected to create a dictionary, where the key is the shortest name and the values are the longest names
example
[
{"Legacy" : "AD01", "rphy" : ["AD01AA", "AD01AB", "AD01AC", "AD01AD"]},
{"Legacy" : "AD02", "rphy" : ["AD02AA", "AD02AB", "AD02AC"]},
]
could you help me please
You can use itertools.groupby, with some nexts:
from itertools import groupby

my_list= ["AD01", "AD01AA", "AD01AB", "AD01AC", "AD01AD","AD02", "AD02AA", "AD02AB", "AD02AC"]

# Runs of equal length alternate: one short "Legacy" name, then its long names.
groups = groupby(my_list, key=len)
output = []
for _, legacy_run in groups:
    # Read the legacy name first: advancing `groups` invalidates this run.
    legacy = next(legacy_run)
    # Pull the following run from the same iterator; it holds the rphy names.
    _, rphy_run = next(groups)
    output.append({'Legacy': legacy, 'rphy': list(rphy_run)})
print(output)
# [{'Legacy': 'AD01', 'rphy': ['AD01AA', 'AD01AB', 'AD01AC', 'AD01AD']},
# {'Legacy': 'AD02', 'rphy': ['AD02AA', 'AD02AB', 'AD02AC']}]
This is not robust to reordering of the input list.
Also, if there is some "gap" in the input, e.g., if "AD01" does not have corresponding 'rphy' entries, then it will throw a StopIteration error as you have found out. In that case you can use a more conventional approach:
from itertools import groupby

my_list= ["AD01", "AD02", "AD02AA", "AD02AB", "AD02AC"]

output = []
for name in my_list:
    if len(name) != 4:
        # Longer names belong to the most recently started legacy entry.
        dct['rphy'].append(name)
        continue
    # A 4-character name starts a new legacy entry with no rphy names yet.
    dct = {'Legacy': name, 'rphy': []}
    output.append(dct)
print(output)
# [{'Legacy': 'AD01', 'rphy': []}, {'Legacy': 'AD02', 'rphy': ['AD02AA', 'AD02AB', 'AD02AC']}]
One approach would be: (see the note at the end of the answer)
from itertools import groupby
from pprint import pprint

my_list = [
    "AD01",
    "AD01AA",
    "AD01AB",
    "AD01AC",
    "AD01AD",
    "AD02",
    "AD02AA",
    "AD02AB",
    "AD02AC",
]

res = []
for _, run in groupby(my_list, key=len):
    names = [*run]
    if len(names) != 1:
        # A run of several same-length names is attached, as one list, to the
        # entry created for the preceding key.
        res[-1]["rphy"].append(names)
        continue
    # A run of exactly one name is treated as a new key.
    res.append({"Legacy": names[0], "rphy": []})

pprint(res)
output:
[{'Legacy': 'AD01', 'rphy': [['AD01AA', 'AD01AB', 'AD01AC', 'AD01AD']]},
{'Legacy': 'AD02', 'rphy': [['AD02AA', 'AD02AB', 'AD02AC']]}]
This assumes that your data always starts with your desired key(the name which has the smallest name compare to the next values).
Basically, in every iteration you check the length of the list created from the current groupby group. If it is 1, this means it's your key; if not, its items are added to the most recently created dictionary.
Note: This code would break if there aren't at least 2 names with the length larger than the keys between two keys.
Recently I heard about lru_cache in Python and I tried it out with the following function:
#lru_cache(maxsize=1024)
def n_words_frequency(document: str, numberOfWords: int = 1) -> dict:
    """Count every run of ``numberOfWords`` consecutive words in ``document``.

    The document is split on whitespace and a sliding window of
    ``numberOfWords`` words is moved across it; each window, joined with a
    single space, is one expression.

    Args:
        document: Text to analyse.
        numberOfWords: Words per expression. Values below 1 behave like 1.

    Returns:
        A dict mapping each expression, spelled exactly as it occurs in the
        document, to its number of occurrences, ordered from most to least
        frequent. Expressions with equal counts keep first-seen order. The
        dict is empty when the document has fewer than ``numberOfWords`` words.
    """
    # Imported here so the function does not depend on a module-level import.
    from collections import Counter, deque

    # A bounded deque drops the oldest word automatically once it is full.
    window = deque(maxlen=max(numberOfWords, 1))
    counts = Counter()
    for word in document.split():
        window.append(word)
        # Only count once the window holds a complete expression.
        if len(window) == window.maxlen:
            counts[' '.join(window)] += 1

    # most_common() uses a stable descending sort, so ties stay in first-seen
    # order.
    return dict(counts.most_common())
The function takes a document string and the number of words wanted in each expression, and returns all of the expressions with their frequencies.
For example, "my name is liam what is your name" and numberOfWords equal to 1 will return:
{'name': 2, 'is': 2, 'my': 1, 'liam': 1, 'what': 1, 'your': 1}
Now, when I added lru_cache I got an improvement of more than 150x in speed, yet after reading cache_info I saw that lru_cache hadn't done anything:
cache_info -> CacheInfo(hits=0, misses=1, maxsize=1024, currsize=1)
Can someone please explain to me why it was so helpful?
I am trying to sort a large json file with Steam games in descending order based on the value of key: positive_ratings, without using the built in sort() function.
small_example = [
{'id':10,'game':'Counterstrike','positive_ratings':150},
{'id':20,'game':'Bioshock Infinite','positive_ratings':50},
{'id':30,'game':'Rust','positive_ratings':300},
{'id':40,'game':'Portal','positive_ratings':200}
]
The output in descending order would be the following list:
['Rust', 'Portal', 'Counterstrike', 'Bioshock Infinite']
For school we had to make a quick sort function that sorts lists like below. Now i would like to rewrite it so it sorts a list of dictionaries.
def quick_sort(sequence):
    """Return a new list with the items of ``sequence`` in ascending order.

    The last item is used as the pivot. The input is not modified.

    Args:
        sequence: Items that can be compared with ``>``.

    Returns:
        A new sorted list.
    """
    if len(sequence) <= 1:
        return list(sequence)

    # Unpack instead of pop() so the caller's list keeps its last element.
    *rest, centre = sequence
    items_bigger = []
    items_smaller = []
    for item in rest:
        # Items equal to the pivot go to the "smaller" side.
        (items_bigger if item > centre else items_smaller).append(item)
    return quick_sort(items_smaller) + [centre] + quick_sort(items_bigger)


print(quick_sort([1,2,5,6,2,10,34,54,23,1]))
In your code, you sort the list based on the element's value. But what you want is sorting list based on element['positive_ratings']. You just need to alter code a little bit:
def quick_sort(sequence):
    """Return a new list of dicts ordered by ascending ``'positive_ratings'``.

    The last item is used as the pivot. The input is not modified.

    Args:
        sequence: Dicts that each have a ``'positive_ratings'`` entry.

    Returns:
        A new list holding the same dicts in ascending rating order.
    """
    if len(sequence) <= 1:
        return list(sequence)

    # Unpack instead of pop() so the caller's list keeps its last element.
    *rest, centre = sequence
    centre_rating = centre['positive_ratings']
    items_bigger = []
    items_smaller = []
    for item in rest:
        # Compare on the rating, not on the dicts themselves.
        if item['positive_ratings'] > centre_rating:
            items_bigger.append(item)
        else:
            items_smaller.append(item)
    return quick_sort(items_smaller) + [centre] + quick_sort(items_bigger)
sort function also works like that, you just specify the key:
some_list.sort(key= lambda x: x['positive_ratings'])
We can adjust your code to look similar to sort function:
def quick_sort(sequence, key = lambda x: x):
    """Return a new list with the items of ``sequence`` in ascending key order.

    The last item is used as the pivot. The input is not modified.

    Args:
        sequence: Items to sort.
        key: One-argument function returning the value to compare for an
            item. Defaults to comparing the items themselves.

    Returns:
        A new sorted list.
    """
    if len(sequence) <= 1:
        return list(sequence)

    # Unpack instead of pop() so the caller's list keeps its last element.
    *rest, centre = sequence
    centre_key = key(centre)  # computed once, not once per comparison
    items_bigger = []
    items_smaller = []
    for item in rest:
        if key(item) > centre_key:
            items_bigger.append(item)
        else:
            items_smaller.append(item)
    # Pass key along so the sub-lists are ordered by the same criterion.
    return quick_sort(items_smaller, key) + [centre] + quick_sort(items_bigger, key)
You can call it like this:
quick_sort(small_example, key = lambda x: x['positive_ratings'])
Edit: I forgot to add key in the last line. Thanks to @DarrylG, I fixed that.
You can sort the example by the key positive_ratings, i.e. sort the positive_ratings values first and then build the output from that order:
small_example = [
    {'id':10,'game':'Counterstrike','positive_ratings':150},
    {'id':20,'game':'Bioshock Infinite','positive_ratings':50},
    {'id':30,'game':'Rust','positive_ratings':300},
    {'id':40,'game':'Portal','positive_ratings':200}
]


def func(data, key):
    """Return the dicts of ``data`` ordered by ascending ``item[key]``.

    Args:
        data: Iterable of dicts that all contain ``key``.
        key: Dictionary key whose values decide the order. The values must be
            hashable and comparable with ``>``.

    Returns:
        A new list with the same dicts; dicts that share a value keep their
        input order.
    """
    # Bucket the records by their value so only distinct values get sorted.
    dic = {}
    for record in data:
        dic.setdefault(record[key], []).append(record)

    dic_key = list(dic)
    # Hand-written exchange sort of the distinct values (no sort()/sorted());
    # any other sorting algorithm could be used here.
    for i in range(len(dic_key)):
        for j in range(i + 1, len(dic_key)):
            if dic_key[i] > dic_key[j]:
                dic_key[i], dic_key[j] = dic_key[j], dic_key[i]

    result = []
    for value in dic_key:
        result.extend(dic[value])
    return result


sol = func(small_example, 'positive_ratings')
print(sol)
output
[{'id': 20, 'game': 'Bioshock Infinite', 'positive_ratings': 50},
{'id': 10, 'game': 'Counterstrike', 'positive_ratings': 150},
{'id': 40, 'game': 'Portal', 'positive_ratings': 200},
{'id': 30, 'game': 'Rust', 'positive_ratings': 300}]
I have a list of strings that contains 'literal duplicates' and 'pseudo-duplicates' which differ only in lower- and uppercase writing. I am looking for a function that treats all literal duplicates as one group, returns their indices, and finds all pseudo-duplicates for these elements, again returning their indices.
Here's an example list:
a = ['bar','bar','foo','Bar','Foo','Foo']
And this is the output I am looking for (a list of lists of lists):
dupe_list = [[[0,1],[3]],[[2],[4,5]]]
Explanation: 'bar' appears twice at the indexes 0 and 1 and there is one pseudo-duplicate 'Bar' at index 3. 'foo' appears once at index 2 and there are two pseudo-duplicates 'Foo' at indexes 4 and 5.
Here is one solution (you didn't clarify what the logic of list items will be and i considered that you want the items in lower format as they are met from left to right in the list, let me know if it must be different):
# One bucket per lowercased word, created in first-seen order so the result
# order is deterministic. Each bucket is [literal indices, pseudo indices].
d = {word.lower(): [[], []] for word in a}
for i, word in enumerate(a):
    lowered = word.lower()
    # Slot 0: the word is already all lowercase (literal duplicate).
    # Slot 1: the word differs from its lowercase form (pseudo-duplicate).
    slot = 0 if word == lowered else 1
    d[lowered][slot].append(i)
result=list(d.values())
Output:
>>> print(result)
[[[0, 1], [3]], [[2], [4, 5]]]
Here's how I would achieve it. But you should consider using a dictionary and not a list of list of list. Dictionaries are excellent data structures for problems like this.
#default argument vars
a = ['bar','bar','foo','Bar','Foo','Foo']

# Pick one representative spelling per case-insensitive group: the first
# spelling seen in a. Using every distinct spelling as a key would give
# 'Bar' and 'Foo' groups of their own.
representative = {}
for word in a:
    representative.setdefault(word.lower(), word)

# For each representative, collect the indices of exact matches and of
# matches that differ only by letter case.
a_dict = {}
for key in representative.values():
    index_exact = []
    index_similar = []
    for i, word in enumerate(a):
        if word == key:
            index_exact.append(i)
        elif word.lower() == key.lower():
            index_similar.append(i)
    a_dict[key] = [index_exact, index_similar]

#print out dictionary values to assure answer
print(a_dict.items())

#segregate values from dictionary to its own list.
dup_list = list(a_dict.values())
print(dup_list)
Here is the solution. I have handled the situation where if there are only pseudo duplicates present or only literal duplicates present
a = ['bar', 'bar', 'foo', 'Bar', 'Foo', 'Foo', 'ka']

# Dictionaries to store the positions of words, both keyed by the lowercased
# word.
literal_duplicates = dict()
pseudo_duplicates = dict()

for index, item in enumerate(a):
    item_lower = item.lower()
    # Make sure the word has an entry in BOTH dictionaries, so words with only
    # literal or only pseudo duplicates still produce a complete pair. Both
    # lookups use the lowercased form, so an existing list is never replaced.
    literal = literal_duplicates.setdefault(item_lower, [])
    pseudo = pseudo_duplicates.setdefault(item_lower, [])

    # A word is "literal" when lowercasing leaves it unchanged; unlike
    # str.islower() this also holds for words without any cased characters.
    if item == item_lower:
        literal.append(index)
    else:
        pseudo.append(index)

# Form final list from the dictionaries
dupe_list = [[v, pseudo_duplicates[k]] for k, v in literal_duplicates.items()]
Here is the simple and easy to understand answer for you
a = ['bar','bar','foo','Bar','Foo','Foo']
dupe_list = []

# Indices already assigned to an earlier group. Tracking them separately
# leaves a untouched and avoids a sentinel string that could collide with
# real data.
claimed = set()

for i, word in enumerate(a):
    if i in claimed:
        continue
    # Every group starts with its own index, even when no later element
    # matches (e.g. a unique last element).
    ilist = [i]
    ilist2 = []
    for j in range(i + 1, len(a)):
        if j in claimed:
            continue
        if a[j] == word:
            ilist.append(j)       # literal duplicate
        elif a[j].casefold() == word:
            ilist2.append(j)      # pseudo-duplicate: equal after case folding
        else:
            continue
        claimed.add(j)
    dupe_list.append([ilist, ilist2])

print(dupe_list)
To illustrate what I mean by this, here is an example
messages = [
('Ricky', 'Steve', 'SMS'),
('Steve', 'Karl', 'SMS'),
('Karl', 'Nora', 'Email')
]
I want to convert this list and a definition of groups to a list of integers and a lookup dictionary so that each element in the group gets a unique id. That id should map to the element in the lookup table like this
messages_int, lookup_table = create_lookup_list(
messages, ('person', 'person', 'medium'))
print messages_int
[ (0, 1, 0),
(1, 2, 0),
(2, 3, 1) ]
print lookup_table
{ 'person': ['Ricky', 'Steve', 'Karl', 'Nora'],
'medium': ['SMS', 'Email']
}
I wonder if there is an elegant and pythonic solution to this problem.
I am also open to better terminology than create_lookup_list etc
defaultdict combined with the itertools.count().next method is a good way to assign identifiers to unique items. Here's an example of how to apply this in your case:
from itertools import count
from collections import defaultdict
def create_lookup_list(data, domains):
    """Encode each row of ``data`` as integer ids, one id space per domain.

    Args:
        data: Iterable of rows (sequences of hashable values).
        domains: One domain name per column; columns that share a name share
            one id space.

    Returns:
        ``(out, lookup_table)``: ``out`` is a list of id tuples, one per row;
        ``lookup_table`` maps each domain name to its values ordered by id,
        so ``lookup_table[domain][i]`` is the value with id ``i``.
    """
    def new_id_map():
        # Each domain gets its own counter; a missing value is assigned the
        # next number on first lookup. next(counter) works on Python 2 and 3.
        counter = count()
        return defaultdict(lambda: next(counter))

    domain_keys = defaultdict(new_id_map)
    out = [
        tuple(domain_keys[dom][val] for val, dom in zip(row, domains))
        for row in data
    ]
    lookup_table = dict((k, sorted(d, key=d.get)) for k, d in domain_keys.items())
    return out, lookup_table
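Edit: note that Python 2's count().next is spelled count().__next__ in Python 3. lambda: next(count()) is not a substitute, because it creates a fresh counter on every call and therefore always returns 0; keep a single counter instead (c = count(), then lambda: next(c)).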
Mine's about the same length and complexity:
import collections
def create_lookup_list(messages, labels):
    """Encode each message as a list of integer ids, one id space per label.

    Ids are assigned in first-seen order, so the result is deterministic.

    Args:
        messages: Iterable of rows (sequences of hashable values).
        labels: One label per column; columns that share a label share one
            id space.

    Returns:
        ``(lookup_list, lookup)``: ``lookup_list`` holds one list of ids per
        message; ``lookup`` maps each label to its values, where the position
        of a value is its id.
    """
    lookup = {}      # label -> values in id order
    positions = {}   # label -> {value: id}, for constant-time id lookup
    lookup_list = []
    for msg in messages:
        encoded = []
        for l, v in zip(labels, msg):
            ids = positions.setdefault(l, {})
            if v not in ids:
                # First sighting: the next free id is the current size.
                ids[v] = len(ids)
                lookup.setdefault(l, []).append(v)
            encoded.append(ids[v])
        lookup_list.append(encoded)
    return lookup_list, lookup
In Otto's answer (or anyone else's with string->id dicts), I'd replace (if obsessing over speed is your thing):
# create the lookup table
lookup_dict = {}
for group in indices:
    # Order each group's elements by their assigned index. A key function is
    # used because Python 3's sorted() no longer accepts a comparison function.
    lookup_dict[group] = sorted(indices[group], key=indices[group].get)
by
# k2i must map keys to the consecutive ints 0 .. len(k2i)-1
def inverse_indices(k2i):
    """Return a list ``inv`` such that ``inv[i]`` is the key mapped to ``i``.

    Args:
        k2i: Mapping from keys to the consecutive integers
            ``0 .. len(k2i)-1``, each used exactly once.

    Returns:
        A list with the keys placed at their mapped positions.
    """
    inv=[0]*len(k2i)
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for k,i in k2i.items():
        inv[i]=k
    return inv
# Invert every group's element->index mapping; .items() works on Python 2 and 3.
lookup_table = dict((g,inverse_indices(gi)) for g,gi in indices.items())
This is better because assigning each item directly into the inverse array is faster than sorting.
Here is my own solution - I doubt it's the best
def create_lookup_list(input_list, groups):
    """Encode each row of ``input_list`` as integer indices, per group.

    Args:
        input_list: Iterable of rows (sequences of hashable elements).
        groups: One group name per column; columns that share a name share
            one index space.

    Returns:
        ``(output, lookup_dict)``: ``output`` holds one list of indices per
        row; ``lookup_dict`` maps each group to its elements ordered by index.
    """
    # use a dictionary for the indices so that the index lookup
    # is fast (not necessarily a requirement)
    indices = dict((group, {}) for group in groups)
    output = []

    # assign indices by iterating through the list
    for row in input_list:
        newrow = []
        for group, element in zip(groups, row):
            group_indices = indices[group]
            # Existing elements keep their index; a new one gets the next
            # free index (len() is evaluated before the insertion).
            newrow.append(group_indices.setdefault(element, len(group_indices)))
        output.append(newrow)

    # create the lookup table: elements ordered by their assigned index.
    # A key function is used because Python 3's sorted() no longer accepts a
    # comparison function.
    lookup_dict = {}
    for group, group_indices in indices.items():
        lookup_dict[group] = sorted(group_indices, key=group_indices.get)

    return output, lookup_dict
This is a bit simpler, and more direct.
from collections import defaultdict
def create_lookup_list( messages, schema ):
    """Encode each message row as a list of integer codes, per schema column.

    Returns ``(rows, lookups)``: ``rows`` holds one list of codes per message
    and ``lookups`` is a plain dict mapping each schema name to its values,
    where the position of a value is its code.
    """
    lookups = defaultdict(list)
    encoded_rows = []
    for row in messages:
        codes = []
        for col, value in zip(schema, row):
            known_values = lookups[col]
            # Register a value the first time it is seen in this column.
            if value not in known_values:
                known_values.append(value)
            codes.append(known_values.index(value))
        encoded_rows.append(codes)
    return encoded_rows, dict(lookups)
If the lookups were proper dictionaries, not lists, this could be simplified further.
Make your "lookup table" have the following structure
{ 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
'medium': {'SMS':0, 'Email':1}
}
And it can be further reduced in complexity.
You can turn this working copy of the lookups into its inverse as follows:
>>> lookups = { 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
'medium': {'SMS':0, 'Email':1}
}
>>> dict( ( d, dict( (v,k) for k,v in lookups[d].items() ) ) for d in lookups )
{'person': {0: 'Ricky', 1: 'Steve', 2: 'Karl', 3: 'Nora'}, 'medium': {0: 'SMS', 1: 'Email'}}
Here is my solution, it's not better - it's just different :)
def create_lookup_list(data, keys):
    """Encode each record of ``data`` as a tuple of integer ids, per key.

    Returns ``(encoded, table)``: ``encoded`` holds one id tuple per record
    and ``table`` maps every key to its values, where the position of a value
    is its id.
    """
    table = {key: [] for key in keys}
    encoded = []
    for record in data:
        ids = []
        for key, value in zip(keys, record):
            values = table[key]
            try:
                position = values.index(value)
            except ValueError:
                # Unseen value: it takes the next free position.
                position = len(values)
                values.append(value)
            ids.append(position)
        encoded.append(tuple(ids))
    return encoded, table
Here is mine, the inner function lets me write the index-tuple as a generator.
def create_lookup_list( data, format):
    """Encode each row of ``data`` as a tuple of integer indices, per column form.

    Args:
        data: Iterable of rows (sequences of values).
        format: One form name per column; columns that share a name share
            one index space.

    Returns:
        ``(indices, table)``: ``indices`` holds one index tuple per row and
        ``table`` maps each form name to its values, where the position of a
        value is its index. The encoded rows come first, matching the call
        ``messages_int, lookup_table = create_lookup_list(...)``.
    """
    table = {}

    def get_index( item, form ):
        # Return the position of item within its form's list, appending it
        # first when it has not been seen before.
        values = table.setdefault( form, [] )
        try:
            return values.index( item )
        except ValueError:
            values.append( item )
            return len( values ) - 1

    indices = [
        tuple( get_index( item, form ) for item, form in zip( row, format ) )
        for row in data
    ]
    return indices, table