return all possible combos of DNA - python

My code below gives me all possible combinations of DNA. Is there a more efficient, cleaner way to do this? Also, for any bioinformatics or biotech programmers, which modules should I become most familiar with?
DNA = 'a', 't', 'g', 'c'
lis = []


def all_combos():
    """Return every length-4 sequence over ``DNA`` as a list of 4-item lists.

    The rows are written into the module-level ``lis`` (which is also the
    object returned), ordered with the last position varying fastest.
    """
    # The loops used to iterate over an undefined name ``A`` (NameError); they
    # must range over ``DNA``. Slice assignment refills the shared list, so a
    # second call no longer appends a duplicate set of rows.
    lis[:] = [[a, t, g, c] for a in DNA for t in DNA for g in DNA for c in DNA]
    return lis


print(all_combos())

You can use itertools.product to generate the list of all combinations. This will generate tuples instead of lists, but I guess that's fine?
from itertools import product
# Materialise the lazy product iterator: one 4-tuple per possible sequence.
lis = [*product('atgc', repeat=4)]
Here 4 means you want to construct 4-tuples.
The algorithm is of course not faster - complexity-wise - than using the for loops, since it is inherently O(m^n), with m the number of elements (len('atgc')) and n=4 (the number of elements per tuple). Both algorithms are equally fast in big-oh terms (although constant factors can differ).
This yields:
>>> list(product('atgc',repeat=4))
[('a', 'a', 'a', 'a'), ('a', 'a', 'a', 't'), ('a', 'a', 'a', 'g'), ('a', 'a', 'a', 'c'), ('a', 'a', 't', 'a'), ('a', 'a', 't', 't'), ('a', 'a', 't', 'g'), ('a', 'a', 't', 'c'), ('a', 'a', 'g', 'a'), ('a', 'a', 'g', 't'), ('a', 'a', 'g', 'g'), ('a', 'a', 'g', 'c'), ('a', 'a', 'c', 'a'), ('a', 'a', 'c', 't'), ('a', 'a', 'c', 'g'), ('a', 'a', 'c', 'c'), ('a', 't', 'a', 'a'), ('a', 't', 'a', 't'), ('a', 't', 'a', 'g'), ('a', 't', 'a', 'c'), ('a', 't', 't', 'a'), ('a', 't', 't', 't'), ('a', 't', 't', 'g'), ('a', 't', 't', 'c'), ('a', 't', 'g', 'a'), ('a', 't', 'g', 't'), ('a', 't', 'g', 'g'), ('a', 't', 'g', 'c'), ('a', 't', 'c', 'a'), ('a', 't', 'c', 't'), ('a', 't', 'c', 'g'), ('a', 't', 'c', 'c'), ('a', 'g', 'a', 'a'), ('a', 'g', 'a', 't'), ('a', 'g', 'a', 'g'), ('a', 'g', 'a', 'c'), ('a', 'g', 't', 'a'), ('a', 'g', 't', 't'), ('a', 'g', 't', 'g'), ('a', 'g', 't', 'c'), ('a', 'g', 'g', 'a'), ('a', 'g', 'g', 't'), ('a', 'g', 'g', 'g'), ('a', 'g', 'g', 'c'), ('a', 'g', 'c', 'a'), ('a', 'g', 'c', 't'), ('a', 'g', 'c', 'g'), ('a', 'g', 'c', 'c'), ('a', 'c', 'a', 'a'), ('a', 'c', 'a', 't'), ('a', 'c', 'a', 'g'), ('a', 'c', 'a', 'c'), ('a', 'c', 't', 'a'), ('a', 'c', 't', 't'), ('a', 'c', 't', 'g'), ('a', 'c', 't', 'c'), ('a', 'c', 'g', 'a'), ('a', 'c', 'g', 't'), ('a', 'c', 'g', 'g'), ('a', 'c', 'g', 'c'), ('a', 'c', 'c', 'a'), ('a', 'c', 'c', 't'), ('a', 'c', 'c', 'g'), ('a', 'c', 'c', 'c'), ('t', 'a', 'a', 'a'), ('t', 'a', 'a', 't'), ('t', 'a', 'a', 'g'), ('t', 'a', 'a', 'c'), ('t', 'a', 't', 'a'), ('t', 'a', 't', 't'), ('t', 'a', 't', 'g'), ('t', 'a', 't', 'c'), ('t', 'a', 'g', 'a'), ('t', 'a', 'g', 't'), ('t', 'a', 'g', 'g'), ('t', 'a', 'g', 'c'), ('t', 'a', 'c', 'a'), ('t', 'a', 'c', 't'), ('t', 'a', 'c', 'g'), ('t', 'a', 'c', 'c'), ('t', 't', 'a', 'a'), ('t', 't', 'a', 't'), ('t', 't', 'a', 'g'), ('t', 't', 'a', 'c'), ('t', 't', 't', 'a'), ('t', 't', 't', 't'), ('t', 't', 't', 'g'), ('t', 't', 't', 'c'), ('t', 't', 'g', 'a'), ('t', 't', 'g', 't'), ('t', 't', 'g', 
'g'), ('t', 't', 'g', 'c'), ('t', 't', 'c', 'a'), ('t', 't', 'c', 't'), ('t', 't', 'c', 'g'), ('t', 't', 'c', 'c'), ('t', 'g', 'a', 'a'), ('t', 'g', 'a', 't'), ('t', 'g', 'a', 'g'), ('t', 'g', 'a', 'c'), ('t', 'g', 't', 'a'), ('t', 'g', 't', 't'), ('t', 'g', 't', 'g'), ('t', 'g', 't', 'c'), ('t', 'g', 'g', 'a'), ('t', 'g', 'g', 't'), ('t', 'g', 'g', 'g'), ('t', 'g', 'g', 'c'), ('t', 'g', 'c', 'a'), ('t', 'g', 'c', 't'), ('t', 'g', 'c', 'g'), ('t', 'g', 'c', 'c'), ('t', 'c', 'a', 'a'), ('t', 'c', 'a', 't'), ('t', 'c', 'a', 'g'), ('t', 'c', 'a', 'c'), ('t', 'c', 't', 'a'), ('t', 'c', 't', 't'), ('t', 'c', 't', 'g'), ('t', 'c', 't', 'c'), ('t', 'c', 'g', 'a'), ('t', 'c', 'g', 't'), ('t', 'c', 'g', 'g'), ('t', 'c', 'g', 'c'), ('t', 'c', 'c', 'a'), ('t', 'c', 'c', 't'), ('t', 'c', 'c', 'g'), ('t', 'c', 'c', 'c'), ('g', 'a', 'a', 'a'), ('g', 'a', 'a', 't'), ('g', 'a', 'a', 'g'), ('g', 'a', 'a', 'c'), ('g', 'a', 't', 'a'), ('g', 'a', 't', 't'), ('g', 'a', 't', 'g'), ('g', 'a', 't', 'c'), ('g', 'a', 'g', 'a'), ('g', 'a', 'g', 't'), ('g', 'a', 'g', 'g'), ('g', 'a', 'g', 'c'), ('g', 'a', 'c', 'a'), ('g', 'a', 'c', 't'), ('g', 'a', 'c', 'g'), ('g', 'a', 'c', 'c'), ('g', 't', 'a', 'a'), ('g', 't', 'a', 't'), ('g', 't', 'a', 'g'), ('g', 't', 'a', 'c'), ('g', 't', 't', 'a'), ('g', 't', 't', 't'), ('g', 't', 't', 'g'), ('g', 't', 't', 'c'), ('g', 't', 'g', 'a'), ('g', 't', 'g', 't'), ('g', 't', 'g', 'g'), ('g', 't', 'g', 'c'), ('g', 't', 'c', 'a'), ('g', 't', 'c', 't'), ('g', 't', 'c', 'g'), ('g', 't', 'c', 'c'), ('g', 'g', 'a', 'a'), ('g', 'g', 'a', 't'), ('g', 'g', 'a', 'g'), ('g', 'g', 'a', 'c'), ('g', 'g', 't', 'a'), ('g', 'g', 't', 't'), ('g', 'g', 't', 'g'), ('g', 'g', 't', 'c'), ('g', 'g', 'g', 'a'), ('g', 'g', 'g', 't'), ('g', 'g', 'g', 'g'), ('g', 'g', 'g', 'c'), ('g', 'g', 'c', 'a'), ('g', 'g', 'c', 't'), ('g', 'g', 'c', 'g'), ('g', 'g', 'c', 'c'), ('g', 'c', 'a', 'a'), ('g', 'c', 'a', 't'), ('g', 'c', 'a', 'g'), ('g', 'c', 'a', 'c'), ('g', 'c', 't', 'a'), ('g', 'c', 
't', 't'), ('g', 'c', 't', 'g'), ('g', 'c', 't', 'c'), ('g', 'c', 'g', 'a'), ('g', 'c', 'g', 't'), ('g', 'c', 'g', 'g'), ('g', 'c', 'g', 'c'), ('g', 'c', 'c', 'a'), ('g', 'c', 'c', 't'), ('g', 'c', 'c', 'g'), ('g', 'c', 'c', 'c'), ('c', 'a', 'a', 'a'), ('c', 'a', 'a', 't'), ('c', 'a', 'a', 'g'), ('c', 'a', 'a', 'c'), ('c', 'a', 't', 'a'), ('c', 'a', 't', 't'), ('c', 'a', 't', 'g'), ('c', 'a', 't', 'c'), ('c', 'a', 'g', 'a'), ('c', 'a', 'g', 't'), ('c', 'a', 'g', 'g'), ('c', 'a', 'g', 'c'), ('c', 'a', 'c', 'a'), ('c', 'a', 'c', 't'), ('c', 'a', 'c', 'g'), ('c', 'a', 'c', 'c'), ('c', 't', 'a', 'a'), ('c', 't', 'a', 't'), ('c', 't', 'a', 'g'), ('c', 't', 'a', 'c'), ('c', 't', 't', 'a'), ('c', 't', 't', 't'), ('c', 't', 't', 'g'), ('c', 't', 't', 'c'), ('c', 't', 'g', 'a'), ('c', 't', 'g', 't'), ('c', 't', 'g', 'g'), ('c', 't', 'g', 'c'), ('c', 't', 'c', 'a'), ('c', 't', 'c', 't'), ('c', 't', 'c', 'g'), ('c', 't', 'c', 'c'), ('c', 'g', 'a', 'a'), ('c', 'g', 'a', 't'), ('c', 'g', 'a', 'g'), ('c', 'g', 'a', 'c'), ('c', 'g', 't', 'a'), ('c', 'g', 't', 't'), ('c', 'g', 't', 'g'), ('c', 'g', 't', 'c'), ('c', 'g', 'g', 'a'), ('c', 'g', 'g', 't'), ('c', 'g', 'g', 'g'), ('c', 'g', 'g', 'c'), ('c', 'g', 'c', 'a'), ('c', 'g', 'c', 't'), ('c', 'g', 'c', 'g'), ('c', 'g', 'c', 'c'), ('c', 'c', 'a', 'a'), ('c', 'c', 'a', 't'), ('c', 'c', 'a', 'g'), ('c', 'c', 'a', 'c'), ('c', 'c', 't', 'a'), ('c', 'c', 't', 't'), ('c', 'c', 't', 'g'), ('c', 'c', 't', 'c'), ('c', 'c', 'g', 'a'), ('c', 'c', 'g', 't'), ('c', 'c', 'g', 'g'), ('c', 'c', 'g', 'c'), ('c', 'c', 'c', 'a'), ('c', 'c', 'c', 't'), ('c', 'c', 'c', 'g'), ('c', 'c', 'c', 'c')]
Mind that itertools functions usually work lazily: they return an iterator. Since O(m^n) blows up fast, it can therefore be useful to consume the iterator directly instead of constructing a list: in that case at least you save on memory. Furthermore, if n is large (like 16 or larger for m=4), a computer will usually start having difficulty processing all the elements.

Guess I was beaten while trying this out .. will leave it up just for my statistics.
If what you want is actually all possible sequences with repetition (i.e. aaaa, aaat, aaag, aaac, ...), you can use itertools this way:
from itertools import product
# Expand the lazy product into a list so the whole result is printed at once.
print([*product('atgc', repeat=4)])

There is a python function for generating combinations from a list:
itertools.combinations
A person was trying to list all combinations of a list taken two at a time in this post: Python - list the combination pair for a function value

As a student's exercise, your code is readable and does what you want.
I guess the question is, why do you need all these combinations? Practical bioinformatics is, among other things, a mess of file types and formats, and you'll probably encounter some input data using a different alphabet than the one you're working with.
Regarding modules, there are two general-purpose ones I'll mention. The rest really depends on what specific task you're trying to accomplish. Biopython is the more mature and widely supported, but the code base is showing its age. scikit-bio is the new kid on the block with beautiful, fully tested code, but with fewer features and less support for obscure file formats.

list comprehension:
proteins = ['a', 't', 'c', 'g']
# Every ordered two-letter string over the alphabet; the second letter varies fastest.
all_combos = [
    left + right
    for left in proteins
    for right in proteins
]

Related

How to Pair elements from a list without repeating the same combination?

I have a list of elements, that I would like to group (of size 2,3,4 etc.) and find some unique combinations in each iteration. I have the following snippet, that forms combinations of size group_size of members.
I would like to know how can I avoid duplicate combinations in the new iterations.
For group_size > 2, I want to also avoid any two elements of members repeating. Let's say: group_size = 3; then ['A', 'B', 'C'] is accepted, but any other combination of ['A', 'B',~] or ['B', 'C',~] or ['A', 'C',~] is not accepted in the future iterations, where '~' represents any element other than ['A', 'B', 'C'].
import random
from itertools import zip_longest
members = ['A', 'B', 'C', 'D', 'E', 'F', 'U', 'V', 'W', 'X', 'Y', 'Z']
group_size = 2
for i in range(10):
    random.shuffle(members)
    # Passing the SAME iterator group_size times makes zip_longest pull
    # consecutive members into each tuple, i.e. it chunks the shuffled list.
    shared = iter(members)
    pairs = zip_longest(*(shared,) * group_size)
    print(*pairs)
Honestly I'm not sure I understood correctly what you want to do, but let me try, maybe it is useful to you all the same.
For the first point Python already has what (I believe that) you're looking for: itertools.combinations.
For the second point we need some code. One note: I'm sure you realize that with this second requirement you will have some cases when not all members appear in at least one combination: e.g., with 12 members and a groupsize > 6.
The code:
def select_combos(members, groupsize):
    """Greedily pick groups of ``groupsize`` in which no pair of members repeats.

    ``members`` is shuffled in place first, so the selection differs between
    calls. For ``groupsize == 2`` every possible pair is returned. For larger
    sizes, candidate groups are scanned in ``itertools.combinations`` order and
    a group is kept only if none of its member pairs already appeared in a
    previously kept group.

    Members must be hashable, because used pairs are tracked in a set.

    Returns a list of tuples, each of length ``groupsize``.
    """
    # Imported locally: the snippet never brought these names into scope.
    from itertools import combinations
    from random import shuffle

    assert groupsize > 1
    shuffle(members)
    if groupsize == 2:
        return list(combinations(members, 2))
    finalcombos = []
    # A set gives O(1) pair lookups; the pairs always follow the order of
    # ``members``, so the same pair is always spelled as the same tuple.
    usedpairs = set()
    for candidate in combinations(members, groupsize):
        pairs = set(combinations(candidate, 2))
        if usedpairs.isdisjoint(pairs):
            usedpairs |= pairs
            finalcombos.append(candidate)
    return finalcombos
m = ['A', 'B', 'C', 'D', 'E', 'F', 'U', 'V', 'W', 'X', 'Y', 'Z']
select_combos(m, 2)
[('C', 'A'), ('C', 'Z'), ('C', 'Y'), ('C', 'E'), ('C', 'W'), ('C', 'B'), ('C', 'U'), ('C', 'X'), ('C', 'D'), ('C', 'V'), ('C', 'F'), ('A', 'Z'), ('A', 'Y'), ('A', 'E'), ('A', 'W'), ('A', 'B'), ('A', 'U'), ('A', 'X'), ('A', 'D'), ('A', 'V'), ('A', 'F'), ('Z', 'Y'), ('Z', 'E'), ('Z', 'W'), ('Z', 'B'), ('Z', 'U'), ('Z', 'X'), ('Z', 'D'), ('Z', 'V'), ('Z', 'F'), ('Y', 'E'), ('Y', 'W'), ('Y', 'B'), ('Y', 'U'), ('Y', 'X'), ('Y', 'D'), ('Y', 'V'), ('Y', 'F'), ('E', 'W'), ('E', 'B'), ('E', 'U'), ('E', 'X'), ('E', 'D'), ('E', 'V'), ('E', 'F'), ('W', 'B'), ('W', 'U'), ('W', 'X'), ('W', 'D'), ('W', 'V'), ('W', 'F'), ('B', 'U'), ('B', 'X'), ('B', 'D'), ('B', 'V'), ('B', 'F'), ('U', 'X'), ('U', 'D'), ('U', 'V'), ('U', 'F'), ('X', 'D'), ('X', 'V'), ('X', 'F'), ('D', 'V'), ('D', 'F'), ('V', 'F')]
select_combos(m, 5)
[('W', 'V', 'C', 'U', 'E'), ('W', 'A', 'X', 'B', 'F'), ('V', 'A', 'D', 'Y', 'Z')]
EDIT
Now that it's clearer, the request for group size 2 is equivalent to scheduling a round-robin tournament, so we can use the standard circle method here.
One quick and dirty implementation of the rotation:
def rotate(roster):
    """Rotate a round-robin roster one step in place and print the pairings.

    The roster is read as two rows: the first half on top, the second half
    below, with ``roster[i]`` playing ``roster[i + half]``. ``roster[0]`` stays
    fixed; the other members move one seat around the circle (top row shifts
    right, bottom row shifts left).

    Raises ValueError for an odd number of members, before anything is
    changed; add a placeholder "bye" entry to schedule an odd group.
    """
    size = len(roster)
    if size % 2:
        raise ValueError("rotate() needs an even number of members")
    half = size // 2
    if size > 2:
        # Build the new seating from the untouched roster first. The previous
        # element-by-element shuffle overwrote roster[1] before reading it
        # back when half == 2, which lost a member on 4-member rosters.
        rotated = [roster[half]]
        rotated.extend(roster[k] for k in range(1, half - 1))
        rotated.extend(roster[k] for k in range(half + 1, size))
        rotated.append(roster[half - 1])
        for seat, member in enumerate(rotated, start=1):
            roster[seat] = member
    for i in range(half):
        print(f'{roster[i]}-{roster[i+half]} ', end = '')
    print()
# ``shuffle`` was called without ever being imported (NameError).
from random import shuffle

members = ['A', 'B', 'C', 'D', 'E', 'F', 'U', 'V', 'W', 'X', 'Y', 'Z']
shuffle(members)
# One call per member: the final round repeats the first, because the circle
# method only produces len(members) - 1 distinct rounds.
for _ in range(len(members)):
    rotate(members)
At each iteration this will rotate the roster one step and print the pairings. Note that at the n-th iteration the roster, and hence the pairings, will be the same as at the start.
You can use a dictionary of sets to keep track of the pairings that have already been used in previous groups. Then assemble a new group based on the eligible members that you constrain with each addition to the group. Note that it is possible to hit a dead-end so your group forming logic needs to be able to reset itself and start over with different random members:
import random
members = ['A', 'B', 'C', 'D', 'E', 'F', 'U', 'V', 'W', 'X', 'Y', 'Z']
# For each member, the partners they have not yet shared a group with.
remaining = {M: set(members) - {M} for M in members}
group_size = 3
for _ in range(10):
    more = set()   # members that can still be added to the current group
    group = []     # current group
    while len(group) < group_size:
        if len(more) + len(group) < group_size:  # group not feasible
            # Dead end (or first pass): start over from every member that
            # still has at least one unused partner.
            more = {m for m, r in remaining.items() if r}
            group = []
        # random.sample() stopped accepting sets in Python 3.11 (TypeError),
        # so draw from a tuple snapshot of the candidates instead.
        m = random.choice(tuple(more))
        group.append(m)
        # Only members still unpaired with everyone chosen so far stay eligible.
        more &= remaining[m]
    print(group)
    for m in group:  # these pairings are now used up
        remaining[m].difference_update(group)
['B', 'Z', 'Y']
['X', 'U', 'W']
['Y', 'U', 'C']
['D', 'Y', 'W']
['B', 'X', 'A']
['X', 'E', 'D']
['B', 'V', 'F']
['D', 'V', 'Z']
['E', 'A', 'F']
['A', 'C', 'Z']

Filter List of Tuples to Exclude from Another List of Tuples which Contains

(Using Python3)
I have a list of tuples, (of strings)
have = [
('a', 'b', 'c', 'd'), ('a', 'b', 'c', 'e'), ('a', 'b', 'c', 'f'), ('a', 'b', 'c', 'g'), ('a', 'b', 'd', 'e'),
('a', 'b', 'd', 'f'), ('a', 'b', 'd', 'g'), ('a', 'b', 'e', 'f'), ('a', 'b', 'e', 'g'), ('a', 'b', 'f', 'g'),
('a', 'c', 'd', 'e'), ('a', 'c', 'd', 'f'), ('a', 'c', 'd', 'g'), ('a', 'c', 'e', 'f'), ('a', 'c', 'e', 'g'),
('a', 'c', 'f', 'g'), ('a', 'd', 'e', 'f'), ('a', 'd', 'e', 'g'), ('a', 'd', 'f', 'g'), ('a', 'e', 'f', 'g'),
('b', 'c', 'd', 'e'), ('b', 'c', 'd', 'f'), ('b', 'c', 'd', 'g'), ('b', 'c', 'e', 'f'), ('b', 'c', 'e', 'g'),
('b', 'c', 'f', 'g'), ('b', 'd', 'e', 'f'), ('b', 'd', 'e', 'g'), ('b', 'd', 'f', 'g'), ('b', 'e', 'f', 'g'),
('c', 'd', 'e', 'f'), ('c', 'd', 'e', 'g'), ('c', 'd', 'f', 'g'), ('c', 'e', 'f', 'g'), ('d', 'e', 'f', 'g')
]
I also have a list of tuples (also strings) which I want to "exclude"
exclude = [('a', 'd'), ('b', 'c')]
I'm trying to find an efficient way to remove any element in have that contains both the elements in each exclude tuple. Ordering does not matter.
My goal is to return something like this:
[
('a', 'b', 'e', 'f'), ('a', 'b', 'e', 'g'), ('a', 'b', 'f', 'g'), ('a', 'c', 'e', 'f'), ('a', 'c', 'e', 'g'),
('a', 'c', 'f', 'g'), ('a', 'e', 'f', 'g'), ('b', 'd', 'e', 'f'), ('b', 'd', 'e', 'g'), ('b', 'd', 'f', 'g'),
('b', 'e', 'f', 'g'), ('c', 'd', 'e', 'f'), ('c', 'd', 'e', 'g'), ('c', 'd', 'f', 'g'), ('c', 'e', 'f', 'g'),
('d', 'e', 'f', 'g')
]
You could convert the exclude tuples to sets and then, for each element of have, check that none of the excluded sets is a subset of it:
# One set per exclusion tuple, so containment can be tested with issubset().
excludeSet = list(map(set, exclude))
# Keep a tuple unless some (non-empty) exclusion set is entirely contained in it.
filteredHave = [
    h
    for h in have
    if not any(e and e.issubset(h) for e in excludeSet)
]

Creating combination on list values

I have a requirement where say i have a list
lis = ['a','b','c','d','e','f']
I have to now create a combination of them eg:
l1 = [a],['b,c,d,e,f]
l2: [b],[a,c,d,e,f]
.
.
l10 [a,b,c],[d,e,f]
.
l11 [a,b,c,d] [e,f]
The repeated elements on the left and right nodes will be removed:
eg: i don't need two lists as:
l1: [b,c] , [a,d,e,f]
l2: [a,d,e,f], [b,c]
Since they are the same
The pseudo code i have in mind is:
for length = 1, i will take one element from list and club others
similar for length=2, will take two element and club others
till length=len(list)-1, will do the clubbing
and then later remove the duplicates.
Any better solution i could try?
This may not be optimal, but it is very straightforward:
from itertools import chain, combinations
def power_set(iterable):
    """Yield the non-empty subsets of at most half the input's size.

    power_set([1,2,3,4]) --> (1,) (2,) (3,) (4,) (1,2) (1,3) (1,4) (2,3) (2,4) (3,4)

    Subsets are produced smallest first, each as a tuple in input order. The
    empty subset and subsets larger than ``len // 2`` are not generated.
    """
    pool = list(iterable)
    sizes = range(1, len(pool) // 2 + 1)
    per_size = (combinations(pool, size) for size in sizes)
    return chain.from_iterable(per_size)
def complement(source, universe):
    """Return the items of ``universe`` that are not in ``source``, as a tuple.

    Items keep the order in which they first appear in ``universe`` and
    duplicates are dropped. A stable order matters: callers compare these
    tuples for equality to recognise mirrored splits, and an arbitrary set
    order would make ('d', 'e', 'f') and ('d', 'f', 'e') look different.
    """
    excluded = set(source)
    return tuple(item for item in dict.fromkeys(universe) if item not in excluded)
lst = ['a', 'b', 'c', 'd', 'e', 'f']
# Each split is stored as an unordered pair {subset, rest}, so a split and its
# mirror image collapse to one entry whenever the two tuples compare equal.
result = {frozenset((si, complement(si, lst))) for si in power_set(lst)}
for pair in result:
    s, c = pair
    print(s, c)
Output
('a', 'd', 'e') ('f', 'c', 'b')
('f', 'a', 'c', 'b') ('d', 'e')
('b', 'e') ('d', 'f', 'a', 'c')
('a', 'b', 'f') ('d', 'e', 'c')
('e', 'd', 'a', 'f', 'b') ('c',)
('c', 'f') ('d', 'a', 'e', 'b')
('d', 'f') ('a', 'e', 'b', 'c')
('d',) ('e', 'c', 'a', 'f', 'b')
('f', 'a', 'e', 'c') ('b', 'd')
('e', 'c', 'd', 'a', 'b') ('f',)
('b', 'c', 'd') ('f', 'a', 'e')
('a', 'b', 'e') ('d', 'f', 'c')
('b', 'c') ('d', 'f', 'a', 'e')
('f', 'a', 'b') ('c', 'd', 'e')
('d', 'e', 'b', 'c') ('a', 'f')
('c', 'd', 'f') ('a', 'e', 'b')
('e', 'c', 'd', 'f', 'b') ('a',)
('a', 'c') ('d', 'f', 'e', 'b')
('f', 'e', 'c') ('a', 'b', 'd')
('a', 'd') ('f', 'e', 'b', 'c')
('b', 'c', 'e') ('d', 'f', 'a')
('a', 'c', 'e') ('d', 'f', 'b')
('d', 'e', 'f') ('a', 'c', 'b')
('a', 'c', 'd') ('f', 'e', 'b')
('d', 'f', 'e', 'c') ('a', 'b')
('f', 'a', 'e', 'b') ('c', 'd')
('d', 'a', 'c') ('b', 'e', 'f')
('a', 'e') ('d', 'f', 'c', 'b')
('a', 'b', 'c') ('d', 'f', 'e')
('a', 'd', 'f') ('e', 'b', 'c')
('d', 'e', 'b') ('a', 'c', 'f')
('c', 'd', 'a', 'f', 'b') ('e',)
('b',) ('e', 'c', 'd', 'a', 'f')
('e', 'f') ('d', 'a', 'c', 'b')
('d', 'c', 'b') ('a', 'e', 'f')
('b', 'f') ('d', 'a', 'e', 'c')
('d', 'a', 'e') ('b', 'c', 'f')
('b', 'd', 'e') ('f', 'a', 'c')
('a', 'e', 'c') ('b', 'd', 'f')
('c', 'e') ('d', 'f', 'a', 'b')
('d', 'a', 'b') ('c', 'e', 'f')

Getting one element from each list, random number of lists

I want to create a function to take one number from each list and create possible combinations of the numbers. I don't know how many lists will appear every time... For example:
Getting 4 lists: x1, x2, x3 and x4, I need to get output:
(x1[0], x2[0], x3[0], x4[0]), (x1[0], x2[0], x3[0], x4[1]), (x1[0], x2[0], x3[0], x4[2])... or in any other format.
I know how to get it from specific amount of lists using for loop:
for i in range(len(x1)):
for j in range(len(x2)):
etc ...
but how may I get it when the number of lists is random?
Thanks
itertools.product does exactly that. It takes a variable number of arguments, so we can use *lists to unpack our list-of-lists into one argument for each list:
>>> import itertools
>>> x1 = ['f', 'g', 't']
>>> x2 = ['a', 'e', 'o']
>>> x3 = ['t', 'd', 'r']
>>> lists = [x1, x2, x3]
>>> list(itertools.product(*lists))
[('f', 'a', 't'), ('f', 'a', 'd'), ('f', 'a', 'r'), ('f', 'e', 't'), ('f', 'e', 'd'), ('f', 'e', 'r'), ('f', 'o', 't'), ('f', 'o', 'd'), ('f', 'o', 'r'), ('g', 'a', 't'), ('g', 'a', 'd'), ('g', 'a', 'r'), ('g', 'e', 't'), ('g', 'e', 'd'), ('g', 'e', 'r'), ('g', 'o', 't'), ('g', 'o', 'd'), ('g', 'o', 'r'), ('t', 'a', 't'), ('t', 'a', 'd'), ('t', 'a', 'r'), ('t', 'e', 't'), ('t', 'e', 'd'), ('t', 'e', 'r'), ('t', 'o', 't'), ('t', 'o', 'd'), ('t', 'o', 'r')]

Python - Calculate combinations of different values as a sum

Given a list of tuples as following:
values = [
('a', 'b', 'c'),
('d', 'e'),
('f', 'g', 'h')
]
I'd like to calculate different combinations of those values, but not as a cartesian product, rather as a sum on some custom rules. To clarify, if we calculate the cartesian product between those tuples, we will get 3*2*3 = 18 different combinations. But my desire is to get something like this:
combinations = [
('a', 'd', 'f'),
('a', 'e', 'g'),
('a', 'e', 'h'),
('b', 'd', 'f'),
('b', 'e', 'g'),
('b', 'e', 'h'),
('c', 'd', 'f'),
('c', 'e', 'g'),
('c', 'e', 'h')
]
So the resulting list contains 9 different combinations instead of 18.
Example with 4 tuples:
values = [
('a', 'b', 'c'),
('d', 'e'),
('f', 'g', 'h'),
('i', 'j', 'k', 'l')
]
The result would be
combinations = [
('a', 'd', 'f', 'i'),
('a', 'e', 'g', 'j'),
('a', 'e', 'h', 'k'),
('a', 'e', 'h', 'l'),
('b', 'd', 'f', 'i'),
('b', 'e', 'g', 'j'),
('b', 'e', 'h', 'k'),
('b', 'e', 'h', 'l'),
('c', 'd', 'f', 'i'),
('c', 'e', 'g', 'j'),
('c', 'e', 'h', 'k'),
('c', 'e', 'h', 'l'),
]
To Explain the logic for the outputs further:
In both inputs, the first tuple is behaving as it would in a cartesian product.
However, all the other tuples except the first are being iterated (or zipped) together. Additionally, if one of the tuples being iterated together "runs out of values" so to speak, we use the last value in the tuple instead.
What would be the efficient way to achieve this?
With the extra example provided, we can figure out how the logic will look. Essentially, the first row is being treated specially and used in the normal "cartesian product" sense.
However, the rest of the rows are being effectively extended to the largest length, and being zipped together. Coding that up, it can look something like follows:
from itertools import product
def extend_to_max_len(tup, length):
    '''Pad ``tup`` on the right with copies of its last element.

    Returns a new tuple with at least ``length`` items; if ``tup`` is already
    that long (or longer) its items are returned unchanged.
    '''
    missing = length - len(tup)
    padding = (tup[-1],) * missing  # a non-positive count yields no padding
    return tuple(tup) + padding
def non_cartesian_sum(values):
    '''Combine rows of values: the first row is crossed with the zip of the rest.

    Expects a list of tuples.
    top: the first row, every item of which is paired with each zipped column
    bottom: the remaining rows, each padded to the longest of them (repeating
            its last item) and then zipped together column by column

    Returns a list of tuples, or None (after printing a message) when fewer
    than two rows are given.
    '''
    if len(values) < 2:
        print("Check length of input provided")
        return None

    top, *bottom = values
    widest = max(map(len, bottom))
    padded = [extend_to_max_len(row, widest) for row in bottom]
    columns = list(zip(*padded))

    out = []
    for first in top:
        for rest in columns:
            out.append((first, *rest))
    return out
values = [
('a', 'b', 'c'),
('d', 'e'),
('f', 'g', 'h'),
('i', 'j', 'k', 'l')
]
out = non_cartesian_sum(values)
print(out)
Output:
[('a', 'd', 'f', 'i'),
('a', 'e', 'g', 'j'),
('a', 'e', 'h', 'k'),
('a', 'e', 'h', 'l'),
('b', 'd', 'f', 'i'),
('b', 'e', 'g', 'j'),
('b', 'e', 'h', 'k'),
('b', 'e', 'h', 'l'),
('c', 'd', 'f', 'i'),
('c', 'e', 'g', 'j'),
('c', 'e', 'h', 'k'),
('c', 'e', 'h', 'l')]
Note that you may want to add more input validation as required, before using this function for your use case.
This works for the data provided.
values = [
    ('a', 'b', 'c'),
    ('d', 'e'),
    ('f', 'g', 'h'),
]
length_of_1 = len(values[1])
length_of_2 = len(values[2])
# Walk rows 1 and 2 in lockstep; once a row runs out, clamp the index so its
# last item keeps being reused. Every item of row 0 is combined with each step.
output = [
    (
        item0,
        values[1][-1] if i >= length_of_1 else values[1][i],
        values[2][-1] if i >= length_of_2 else values[2][i],
    )
    for item0 in values[0]
    for i in range(max(length_of_1, length_of_2))
]
for tup in output:
    print(tup)
Output:
('a', 'd', 'f')
('a', 'e', 'g')
('a', 'e', 'h')
('b', 'd', 'f')
('b', 'e', 'g')
('b', 'e', 'h')
('c', 'd', 'f')
('c', 'e', 'g')
('c', 'e', 'h')
Try this
values = [
    ('a', 'b', 'c'),
    ('d', 'e'),
    ('f', 'g', 'h'),
]
# Full Cartesian product of the three rows; the last row varies fastest.
first_row, second_row, third_row = values
combination = [
    (a, b, c)
    for a in first_row
    for b in second_row
    for c in third_row
]
print(combination)

Categories