combining sets within a list

combining sets within a list - python

Hi so I'm trying to do the following but have gotten a bit stuck. Say I have a list of sets:
A = [set([1,2]), set([3,4]), set([1,6]), set([1,5])]
I want to create a new list which looks like the following:
B = [ set([1,2,5,6]), set([3,4]) ]
i.e., create a list of sets in which any sets that overlap are joined together. This is probably simple, but I can't quite get it right this morning.

This also works and is quite short:
import itertools
groups = [{'1', '2'}, {'3', '2'}, {'2', '4'}, {'5', '6'}, {'7', '8'}, {'7', '9'}]

# Repeatedly merge the first overlapping pair of sets.  Each pass replaces two
# sets with their union, so `groups` shrinks by one every time and the loop is
# guaranteed to terminate despite the `while True`.
while True:
    for s1, s2 in itertools.combinations(groups, 2):
        if s1.intersection(s2):
            break
    else:
        # The for-loop finished without finding an overlapping pair:
        # all remaining sets are pairwise disjoint, so we are done.
        break
    groups.remove(s1)
    groups.remove(s2)
    groups.append(s1.union(s2))
groups
This gives the following output:
[{'5', '6'}, {'1', '2', '3', '4'}, {'7', '8', '9'}]
The while True does seem a bit dangerous to me; any thoughts, anyone?

How about:
from collections import defaultdict
def sortOverlap(listOfTuples):
    """Merge the groups in *listOfTuples* that share at least one element.

    Groups are merged transitively: if A overlaps B and B overlaps C, all
    three end up in the same result even when A and C share nothing.

    Args:
        listOfTuples: a sequence of iterables (tuples, lists, sets, ...) of
            hashable elements.

    Returns:
        A list of lists, one per merged group, each holding the distinct
        elements of that group.  Empty input groups contribute nothing.
    """
    # The locations of the values: element -> indices of the groups holding it
    locations = defaultdict(list)
    for i, group in enumerate(listOfTuples):
        for element in group:
            locations[element].append(i)

    # 'Sorted' list to return
    sortedList = []
    # Indices of input groups that have already been absorbed into a result
    visited = set()
    for start in range(len(listOfTuples)):
        if start in visited:
            continue
        visited.add(start)
        merged = set()
        pending = [start]
        # Walk every group reachable through shared elements so that chains
        # of overlaps collapse into a single result.
        while pending:
            index = pending.pop()
            for element in listOfTuples[index]:
                if element in merged:
                    continue
                merged.add(element)
                for other in locations[element]:
                    if other not in visited:
                        visited.add(other)
                        pending.append(other)
        if merged:
            sortedList.append(list(merged))
    return sortedList
# Run the example (with tuples)
print(sortOverlap([(1, 2), (3, 4), (1, 5), (1, 6)]))
# Run the example (with sets)
print(sortOverlap([set([1, 2]), set([3, 4]), set([1, 5]), set([1, 6])]))

You could use intersection() and union() in for loops:
A = [set([1, 2]), set([3, 4]), set([1, 6]), set([1, 5])]

# Build the merged list incrementally instead of popping from A while looping
# over it.  `merged` always holds pairwise-disjoint sets; each new set absorbs
# every existing group it intersects, so independent overlapping groups stay
# separate and chains of overlaps collapse into one set.
merged = []
for someSet in A:
    combined = set(someSet)
    disjoint = []
    for anotherSet in merged:
        if combined.intersection(anotherSet):
            combined = combined.union(anotherSet)
        else:
            disjoint.append(anotherSet)
    merged = disjoint + [combined]
A = merged
print(A)
Output: [set([3, 4]), set([1, 2, 5, 6])]

A slightly more straightforward solution,
def overlaps(sets):
    """Return the sets in *sets* merged wherever they overlap.

    Merging is transitive, and the input sets are never modified: every
    result is a fresh ``set``.  A merged group keeps the position of the
    earliest group it absorbed.

    Args:
        sets: an iterable of sets.

    Returns:
        A list of pairwise-disjoint sets.
    """
    overlapping = []
    for a in sets:
        combined = set(a)  # copy, so the caller's set is left untouched
        kept = []
        slot = None  # position of the first group absorbed by `combined`
        for b in overlapping:
            if combined.intersection(b):
                combined.update(b)
                if slot is None:
                    slot = len(kept)
            else:
                kept.append(b)
        if slot is None:
            kept.append(combined)
        else:
            kept.insert(slot, combined)
        overlapping = kept
    return overlapping
examples
>>> overlaps([set([1,2]), set([1,3]), set([1,6]), set([3,5])])
[{1, 2, 3, 5, 6}]
>>> overlaps([set([1,2]), set([3,4]), set([1,6]), set([1,5])])
[{1, 2, 5, 6}, {3, 4}]

# For every set in A, build the union of that set with each other set that
# shares an item with it, and collect the distinct unions in B.
# NOTE: only sets that overlap `set_` directly are merged, so chains of
# overlaps are not fully collapsed (see the sample outputs below).
for set_ in A:
    new_set = set(set_)
    for other_set in A:
        if other_set == new_set:
            continue
        for item in other_set:
            if item in set_:
                new_set = new_set.union(other_set)
                break
    if new_set not in B:
        B.append(new_set)
Input/Output:
A = [set([1,2]), set([3,4]), set([2,3]) ]
B = [set([1, 2, 3]), set([2, 3, 4]), set([1, 2, 3, 4])]
A = [set([1,2]), set([3,4]), set([1,6]), set([1,5])]
B = [set([1, 2, 5, 6]), set([3, 4])]
A = [set([1,2]), set([1,3]), set([1,6]), set([3,5])]
B = [set([1, 2, 3, 6]), set([1, 2, 3, 5, 6]), set([1, 3, 5])]

This function will do the job, without touching the input:
from copy import deepcopy
def remove_overlapped(input_list):
    """Return the sets of *input_list* merged wherever they overlap.

    The input is deep-copied first, so neither the list nor its sets are
    modified.

    Args:
        input_list: a list of sets.

    Returns:
        A list of pairwise-disjoint sets, ordered by the first input set
        that contributed to each of them.
    """
    pending = deepcopy(input_list)
    output = []
    while pending:
        head = pending.pop(0)
        index = 0
        while index < len(pending):
            if head & pending[index]:
                # Remove by position, not by equality: after the update the
                # head may compare equal to the item being absorbed.
                head.update(pending.pop(index))
                # The head grew, so sets skipped earlier may now overlap.
                index = 0
            else:
                index += 1
        output.append(head)
    return output

Here is a function that does what you want. Probably not the most pythonic one but does the job, most likely can be improved a lot.
from sets import Set
A = [set([1, 2]), set([3, 4]), set([2, 3])]
# Start from a copy so B is defined even when nothing overlaps at all.
B = list(A)
merges = any(a & b for a in A for b in A if a != b)
# Keep making passes until no two distinct sets overlap any more.
while merges:
    B = [A[0]]
    for a in A[1:]:
        merged = False
        for i, b in enumerate(B):
            if a & b:
                # Fold `a` into the first set of B it overlaps with.
                B[i] = b | a
                merged = True
                break
        if not merged:
            B.append(a)
    A = B
    merges = any(a & b for a in A for b in A if a != b)
print(B)
What is happening there is the following: we loop over all the sets in A (except the first, since we have already added that one to B). We check the intersection with each set in B; if the intersection is anything but an empty set (which is falsy), we replace that set with the union of the two and move on to the next iteration. The whole pass is repeated until no two sets overlap. For more about set operations, check this page:
https://docs.python.org/2/library/sets.html
& is intersection operator
| is union operator
You can probably go more pythonic using any() etc., but that would have required more processing, so I avoided it.

Related

Find consecutive and nonconsecutive ordered sequences of items in a list

I have two lists:
lookup_list = [1,2,3]
my_list = [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
I want to count how many times the lookup_list appeared in my_list with the following logic:
The order should be 1 -> 2 -> 3
In my_list, the lookup_list items don't have to be next to each other: 1,4,2,1,5,3 -> should generate a match, since a 2 comes after a 1 and a 3 comes after that 2.
The matches based on the logic:
1st match: [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
2nd match: [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
3rd match: [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
4th match: [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
The lookup_list is dynamic; it could be defined as [1,2] or [1,2,3,4], etc. How can I solve it? All the answers I've found are about finding matches where 1,2,3 appear next to each other in order, like this one: Find matching sequence of items in a list
I can find the count of consecutive sequences with the below code but it doesn't count the nonconsecutive sequences:
from nltk import ngrams
from collections import Counter

lookup_list = [1, 2, 3]
my_list = [1, 2, 3, 4, 5, 2, 1, 2, 2, 1, 2, 3, 4, 5, 1, 3, 2, 3, 1]
# Count every consecutive window of len(lookup_list) items in my_list.
all_counts = Counter(ngrams(my_list, len(lookup_list)))
counts = {k: all_counts[k] for k in [tuple(lookup_list)]}
counts
>>> {(1, 2, 3): 2}
I tried using pandas rolling window functions but they don't have a custom reset option.

def find_all_sequences(source, sequence):
    """Yield index tuples of non-overlapping, in-order matches of *sequence*.

    The items of a match need not be adjacent in *source*; they only have to
    appear in order.  An index of *source* is used by at most one match.

    Args:
        source: a sequence supporting ``index(value, start)`` (e.g. a list).
        sequence: the items to look for, in order.

    Yields:
        One tuple of indexes into *source* per match.  Nothing is yielded
        when *sequence* is empty.
    """
    sequence = list(sequence)
    if not sequence:
        return
    first, rest = sequence[0], sequence[1:]
    used = set()  # indexes already consumed by earlier matches

    def find_rest(index):
        # Yield, for each remaining item, the next free index after `index`.
        for item in rest:
            while True:
                index = source.index(item, index + 1)
                if index not in used:
                    break
            yield index

    index = -1
    while True:
        try:
            index = source.index(first, index + 1)
            if index in used:
                # Already part of an earlier match (possible when the
                # sequence repeats a value); try the next occurrence.
                continue
            indexes = (index, *find_rest(index))
        except ValueError:
            # list.index() found nothing more: no further match is possible.
            break
        used.update(indexes)
        yield indexes
Usage:
lookup_list = [1,2,3]
my_list = [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
print(*find_all_sequences(my_list, lookup_list), sep="\n")
Output:
(0, 1, 2)
(6, 7, 11)
(9, 10, 15)
(14, 16, 17)
The generator function find_all_sequences() yields tuples with the indexes of each match of the sequence. It runs a loop that stops when a list.index() call raises ValueError. The internal generator function find_sequence() yields the index of every remaining sequence item.
According to this benchmark, my method is about 60% faster than one from Andrej Kesely's answer.

The function find_matches() returns indices where the matches from lookup_list are:
def find_matches(lookup_list, lst):
    """Return the index lists of in-order matches of *lookup_list* in *lst*.

    Matched items need not be adjacent.  Partial matches are kept in
    "buckets"; each incoming value extends the oldest bucket that is waiting
    for it, or opens a new bucket when it equals the first lookup item and
    no bucket could use it.

    Args:
        lookup_list: the items to find, in order.
        lst: the list to search.

    Returns:
        A list of index lists, in the order the matches were completed.
        Empty when *lookup_list* is empty.
    """
    if not lookup_list:
        return []
    target = len(lookup_list)
    buckets = []  # partial matches, oldest first
    rv = []
    for i, v in enumerate(lst):
        for b in buckets:
            if v == lookup_list[len(b)]:
                b.append(i)
                if len(b) == target:
                    buckets.remove(b)
                    rv.append(b)
                break
        else:
            # No bucket was waiting for this value.
            if v == lookup_list[0]:
                if target == 1:
                    # A one-item lookup is complete as soon as it starts.
                    rv.append([i])
                else:
                    buckets.append([i])
    return rv
lookup_list = [1, 2, 3]
my_list = [1, 2, 3, 4, 5, 2, 1, 2, 2, 1, 2, 3, 4, 5, 1, 3, 2, 3, 1]
print(find_matches(lookup_list, my_list))
Prints:
[[0, 1, 2], [6, 7, 11], [9, 10, 15], [14, 16, 17]]

Here is a recursive solution:
lookup_list = [1,2,3]
my_list = [1,2,3,4,5,2,1,2,2,1,2,3,4,5,1,3,2,3,1]
def find(my_list, continue_from_index):
    """Count in-order occurrences of the global ``lookup_list`` in *my_list*.

    Starting at *continue_from_index*, the items of ``lookup_list`` are
    searched one after another (not necessarily adjacent).  After each full
    match the search restarts one position after the index where that match
    began, so later matches may reuse items of earlier ones.

    Args:
        my_list: the list to search.
        continue_from_index: index at which the search starts.

    Returns:
        The number of matches found (0 when ``lookup_list`` is empty).
    """
    if not lookup_list:
        return 0
    size = len(my_list)
    matches = 0
    start = continue_from_index
    # Iterative form of the original recursion, so long lists cannot hit the
    # recursion limit.
    while start < size:
        position = start
        first_occuring_index = None
        for wanted in lookup_list:
            # Compare by value (==); identity (`is`) is unreliable for ints.
            while position < size and my_list[position] != wanted:
                position += 1
            if position == size:
                # This item cannot be found any more: no further match.
                return matches
            if first_occuring_index is None:
                first_occuring_index = position
            # The next item must come strictly after this one.
            position += 1
        matches += 1
        start = first_occuring_index + 1
    return matches
print(find(my_list, 0))

my_list = [5, 6, 3, 8, 2, 1, 7, 1]
lookup_list = [8, 2, 7]
# Consume a single iterator over my_list: each `in` test advances it past the
# item it finds, so the lookup items must appear in my_list in this order
# (not necessarily adjacent) for the result to be True.
remaining = iter(my_list)
result = all(item in remaining for item in lookup_list)
print(result)

Take the mean of values in a list if a duplicate is found

I have 2 lists which are associated with each other. E.g., here, 'John' is associated with '1', 'Bob' is associated with 4, and so on:
l1 = ['John', 'Bob', 'Stew', 'John']
l2 = [1, 4, 7, 3]
My problem is with the duplicate John. Instead of adding the duplicate John, I want to take the mean of the values associated with the Johns, i.e., 1 and 3, which is (3 + 1)/2 = 2. Therefore, I would like the lists to actually be:
l1 = ['John', 'Bob', 'Stew']
l2 = [2, 4, 7]
I have experimented with some solutions including for-loops and the "contains" function, but can't seem to piece it together. I'm not very experienced with Python, but linked lists sound like they could be used for this.
Thank you

I believe you should use a dict. :)
def mean_duplicate(l1, l2):
    """Return a dict mapping each name in *l1* to the mean of its values.

    Args:
        l1: the names (hashable); duplicates are allowed.
        l2: the values, paired with *l1* by position.  Only positions
            present in both lists are used.

    Returns:
        ``{name: mean of the values paired with that name}``.
    """
    totals = {}
    counts = {}
    # Iterating through both lists, accumulating a sum and a count per name.
    for name, value in zip(l1, l2):
        totals[name] = totals.get(name, 0) + value
        counts[name] = counts.get(name, 0) + 1
    # Then for the average you're looking for...
    return {name: totals[name] / counts[name] for name in totals}
def median_between_listsElements(l1, l2):
    """Return a dict mapping each name in *l1* to the median of its values.

    Args:
        l1: the names (hashable); duplicates are allowed.
        l2: the numeric values, paired with *l1* by position.

    Returns:
        ``{name: median of the values paired with that name}`` as floats.
    """
    # Standard-library median, imported locally so the function does not
    # depend on a module-level `np` name.
    from statistics import median

    grouped = {}
    for name, value in zip(l1, l2):
        # Creating key + list if doesn't exist.
        grouped.setdefault(name, []).append(value)
    return {name: float(median(values)) for name, values in grouped.items()}
l1 = ['John', 'Bob', 'Stew', 'John']
l2 = [1, 4, 7, 3]
print(mean_duplicate(l1, l2))
print(median_between_listsElements(l1, l2))
# {'Bob': 4, 'John': 2, 'Stew': 7}
# {'Bob': 4.0, 'John': 2.0, 'Stew': 7.0}

The following might give you an idea. It uses an OrderedDict assuming that you want the items in the order of appearance from the original list:
from collections import OrderedDict
d = OrderedDict()
# Group the values by name, keeping names in order of first appearance.
for x, y in zip(l1, l2):
    d.setdefault(x, []).append(y)
# OrderedDict([('John', [1, 3]), ('Bob', [4]), ('Stew', [7])])
names, values = zip(*((k, sum(v) / len(v)) for k, v in d.items()))
# ('John', 'Bob', 'Stew')
# (2.0, 4.0, 7.0)

Here is a shorter version using dict,
l1 = ['John', 'Bob', 'Stew', 'John']
l2 = [1, 4, 7, 3]
# Collect every value per name first: averaging pairwise as the values arrive
# only gives the true mean when a name occurs at most twice.
_values_by_name = {}
for name, value in zip(l1, l2):
    _values_by_name.setdefault(name, []).append(value)
final_dict = {
    name: sum(values) / len(values)
    for name, values in _values_by_name.items()
}
print(final_dict)

Something like this:
#!/usr/bin/python
l1 = ['John', 'Bob', 'Stew', 'John']
l2 = [1, 4, 7, 3]
# Group the values by name.
d = {}
for key, value in zip(l1, l2):
    d.setdefault(key, []).append(value)
# Pair every name with the mean of its own values.
r = []
for key, values in d.items():
    r.append((key, sum(values) / len(values)))
print(r)

Hope following code helps
l1 = ['John', 'Bob', 'Stew', 'John']
l2 = [1, 4, 7, 3]
def remove_repeating_names(names_list, numbers_list):
    """Collapse repeated names, averaging the numbers that belong to them.

    Args:
        names_list: the names (hashable); duplicates are allowed.
        numbers_list: the numbers, paired with *names_list* by position.

    Returns:
        ``[names, means]``: the distinct names in order of first appearance,
        and the mean of the numbers paired with each name.

    Raises:
        IndexError: if *numbers_list* is shorter than *names_list*.
    """
    totals = {}
    counts = {}
    new_names_list = []
    # Single pass: accumulate a sum and an occurrence count per name.
    for index, name in enumerate(names_list):
        number = numbers_list[index]
        if name in totals:
            totals[name] += number
            counts[name] += 1
        else:
            new_names_list.append(name)
            totals[name] = number
            counts[name] = 1
    new_numbers_list = [totals[name] / counts[name] for name in new_names_list]
    return [new_names_list, new_numbers_list]
# Unmodified arrays
print(l1)
print(l2)
l1, l2 = remove_repeating_names(l1, l2)
# If you want numbers list to be integer, not float, uncomment following line:
# l2 = [int(number) for number in l2]
# Modified arrays
print(l1)
print(l2)

Finding item frequency in list of lists

Let's say I have a list of lists and I want to find the frequency in which pairs (or more) of elements appears in total.
For example, if I have [[a,b,c],[b,c,d],[c,d,e]]
I want :(a,b) = 1, (b,c) = 2, (c,d) = 2, etc.
I tried finding a usable apriori algorithm that would allow me to do this, but I couldn't find an easy-to-implement one in Python.
How would I approach this problem in a better way?

This is a way to do it:
from itertools import combinations
l = [['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e']]
d = {}
for i in l:
    # for every item on l take all the possible combinations of 2
    for c in combinations(i, 2):
        k = ''.join(c)
        # Count how many inner lists contain this pair.
        d[k] = d.get(k, 0) + 1
Result:
>>> d
{'bd': 1, 'ac': 1, 'ab': 1, 'bc': 2, 'de': 1, 'ce': 1, 'cd': 2}

Python - Comparing lists

I have 3 lists, the first of which is a list of 5 random digits, as shown below:
import random
def rollDice():
    """Roll five six-sided dice and return the results as a sorted list."""
    return sorted(random.randint(1, 6) for _ in range(5))
dice = rollDice()
largeStraight = [[1,2,3,4,5] , [2,3,4,5,6]]
smallStraight = [[1,2,3,4] , [2,3,4,5] , [3,4,5,6]]
My question is what is the best way to see if dice is equal to either of the nested lists in largeStraight, and secondly whether any of the nested lists in smallStraight are a subset of dice. I am looking for a simple true or false return.
Thanks for any help.

Use sets instead of lists:
largeStraight = [{1, 2, 3, 4, 5}, {2, 3, 4, 5, 6}]
smallStraight = [{1, 2, 3, 4}, {2, 3, 4, 5} , {3 ,4, 5, 6}]
Now you can use set operations:
if any(ls.issubset(dice) for ls in largeStraight):
    # a large straight
    pass
elif any(ss.issubset(dice) for ss in smallStraight):
    # a small straight
    pass
You can still turn each list in largeStraight and smallStraight into a set in the generator expression passed to any(), but that'd be a waste of CPU cycles.
Demo:
>>> dice = [2, 3, 5, 1, 4]
>>> if any(ls.issubset(dice) for ls in largeStraight):
... print 'Large!'
... elif any(ss.issubset(dice) for ss in smallStraight):
... print 'Small!'
...
Large!
>>> dice = [2, 3, 5, 1, 6]
>>> if any(ls.issubset(dice) for ls in largeStraight):
... print 'Large!'
... elif any(ss.issubset(dice) for ss in smallStraight):
... print 'Small!'
...
>>> dice = [2, 3, 6, 4, 1]
>>> if any(ls.issubset(dice) for ls in largeStraight):
... print 'Large!'
... elif any(ss.issubset(dice) for ss in smallStraight):
... print 'Small!'
...
Small!

If order is important, for the large straight, you could simply do this:
dice in largeStraight
and for the small straight, you could do this:
any(i in (dice[0:4], dice[1:5]) for i in smallStraight)
Alternatively, you could replace the rollDice function with this:
def rollDice():
    """Roll five six-sided dice and return the distinct values as a set.

    Repeated rolls collapse, so the set may hold fewer than five values.
    """
    dice = set()
    for i in range(5):
        dice.add(random.randint(1, 6))
    return dice
and use sets as suggested in other answers.
You could also replace your current rollDice definition with a list comprehension:
def rollDice():
    """Roll five six-sided dice and return the results as a sorted list."""
    return sorted([random.randint(1, 6) for _ in range(5)])
or
def rollDice():
    """Roll five six-sided dice and return the distinct values as a set."""
    return {random.randint(1, 6) for _ in range(5)}
for a set.
However, I'd advise against using sets. It would work in this case, but I presume that this is part of a larger program. Sets can't have duplicate elements, so if you later wanted to check if there was a pair of equal numbers in dice, and it was a set, then you would always get a negative response.

Here is a possible solution:
is_large_straight = any(set(x) & set(dice) == set(x) for x in largeStraight)
is_small_straight = any(set(x) & set(dice) == set(x) for x in smallStraight)
I hope it'll help you.

efficient list mapping in python

I have the following input:
input = [(dog, dog, cat, mouse), (cat, ruby, python, mouse)]
and trying to have the following output:
outputlist = [[0, 0, 1, 2], [1, 3, 4, 2]]
outputmapping = {0:dog, 1:cat, 2:mouse, 3:ruby, 4:python, 5:mouse}
Any tips on how to handle given with scalability in mind (var input can get really large).

You probably want something like:
import collections
import itertools
def build_catalog(L):
    """Replace every item in *L* by an integer ID and return the ID catalog.

    IDs are assigned from 0 upwards in order of first appearance.

    Args:
        L: an iterable of iterables of hashable items.

    Returns:
        A ``(result, catalog)`` tuple: ``result`` is a list of lists of IDs
        mirroring *L*, and ``catalog`` maps each ID back to its item.
    """
    counter = itertools.count()
    # Looking up an unseen item stores the next free ID for it.
    names = collections.defaultdict(lambda: next(counter))
    result = [[names[item] for item in t] for t in L]
    catalog = {idx: name for name, idx in names.items()}
    return result, catalog
Using it:
>>> input = [('dog', 'dog', 'cat', 'mouse'), ('cat', 'ruby', 'python', 'mouse')]
>>> outputlist, outputmapping = build_catalog(input)
>>> outputlist
[[0, 0, 1, 2], [1, 3, 4, 2]]
>>> outputmapping
{0: 'dog', 1: 'cat', 2: 'mouse', 3: 'ruby', 4: 'python'}

This class will automatically map objects to increasing integer values:
class AutoMapping(object):
    """Map hashable objects to increasing integers, starting at 0.

    Indexing with an object that has not been seen yet assigns it the next
    integer; indexing with a known object returns its existing integer.
    """

    def __init__(self):
        # object -> assigned integer
        self.map = {}
        # objects in assignment order; objects[i] is the object mapped to i
        self.objects = []

    def __getitem__(self, val):
        """Return the integer for *val*, assigning a new one if needed."""
        if val not in self.map:
            self.map[val] = len(self.objects)
            self.objects.append(val)
        return self.map[val]
Example usage, for your input:
>>> input = [('dog', 'dog', 'cat', 'mouse'), ('cat', 'ruby', 'python', 'mouse')]
>>> map = AutoMapping()
>>> [[map[x] for x in y] for y in input]
[[0, 0, 1, 2], [1, 3, 4, 2]]
>>> map.objects
['dog', 'cat', 'mouse', 'ruby', 'python']
>>> dict(enumerate(map.objects))
{0: 'dog', 1: 'cat', 2: 'mouse', 3: 'ruby', 4: 'python'}

Here is one possible solution, although it isn't the greatest. It could be made slightly more efficient if you know how many elements each entry in the list will have before-hand, by pre-allocating them.
labels = []       # labels in order of first appearance; position == index
label2index = {}  # label -> index into `labels`
outputlist = []
for group in input:
    current = []
    for label in group:
        if label not in label2index:
            label2index[label] = len(labels)
            labels.append(label)
        current.append(label2index[label])
    outputlist.append(current)
# index -> label, the reverse of label2index
outputmapping = dict(enumerate(labels))

I had the same problem quite often in my projects, so I wrapped up a class some time ago that does exactly this:
class UniqueIdGenerator(object):
    """A dictionary-like class that can be used to assign unique integer IDs to
    names.

    Usage:

    >>> gen = UniqueIdGenerator()
    >>> gen["A"]
    0
    >>> gen["B"]
    1
    >>> gen["C"]
    2
    >>> gen["A"]      # Retrieving already existing ID
    0
    >>> len(gen)      # Number of already used IDs
    3
    """

    def __init__(self, id_generator=None):
        """Creates a new unique ID generator. `id_generator` specifies how do we
        assign new IDs to elements that do not have an ID yet. If it is `None`,
        elements will be assigned integer identifiers starting from 0. If it is
        an integer, elements will be assigned identifiers starting from the given
        integer. If it is an iterator or generator, the built-in `next` will be
        called on it every time a new ID is needed."""
        if id_generator is None:
            id_generator = 0
        if isinstance(id_generator, int):
            import itertools
            self._generator = itertools.count(id_generator)
        else:
            self._generator = id_generator
        # item -> assigned ID
        self._ids = {}

    def __getitem__(self, item):
        """Retrieves the ID corresponding to `item`. Generates a new ID for `item`
        if it is the first time we request an ID for it."""
        try:
            return self._ids[item]
        except KeyError:
            # next() works for both Python 2 and Python 3 iterators.
            self._ids[item] = next(self._generator)
            return self._ids[item]

    def __len__(self):
        """Retrieves the number of added elements in this UniqueIDGenerator"""
        return len(self._ids)

    def reverse_dict(self):
        """Returns the reversed mapping, i.e., the one that maps generated IDs to their
        corresponding items"""
        return dict((v, k) for k, v in self._ids.items())

    def values(self):
        """Returns the list of items added so far. Items are ordered according to
        the standard sorting order of their IDs, so the values will be exactly
        in the same order they were added if the ID generator generates IDs in
        ascending order. This holds, for instance, for numeric ID generators that
        assign integers starting from a given number."""
        return sorted(self._ids.keys(), key=self._ids.__getitem__)
Usage example:
>>> input = [('dog', 'dog', 'cat', 'mouse'), ('cat', 'ruby', 'python', 'mouse')]
>>> gen = UniqueIdGenerator()
>>> outputlist = [[gen[x] for x in y] for y in input]
>>> print(outputlist)
[[0, 0, 1, 2], [1, 3, 4, 2]]
>>> outputmapping = gen.reverse_dict()
>>> print(outputmapping)
{0: 'dog', 1: 'cat', 2: 'mouse', 3: 'ruby', 4: 'python'}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

combining sets within a list - python

Related

Find consecutive and nonconsecutive ordered sequences of items in a list

Take the mean of values in a list if a duplicate is found

Finding item frequency in list of lists

Python - Comparing lists

efficient list mapping in python

Categories

Resources