Alternatives to cartesian in Spark? - python

I have a RDD of this type:
[(1, [3, 10, 11]), (2, [3, 4, 10, 11]), (3, [1, 4]), (4, [2, 3, 10])...]
And I need a function that follows this rule:
if the key x does not contain the key y (and vice versa) in its value-list, then outputs a tuple having the following syntax:
[(x, [y, len(values_x ^ values_y)]), ...]
where len(values_x ^ values_y) is the number of values in common between the two keys. If this value is 0 (i.e., no values in common), just skip this pair of keys.
E.g., from the sample above, the output should be:
(1, [2, 3]) # because keys 1 and 2 share the values 3, 10, 11
(1, [4, 2]) # because keys 1 and 4 share the values 3, 10
skipping: (2, [1, 3]) is the inverse of (1, [2, 3]), so it can be skipped
(2, [3, 1]) # because keys 2 and 3 share the value 4
...
The pair of keys 1 and 3 (and other similar cases) is skipped because key 3 is included in the list-value of key 1 and vice versa.
A solution that I've implemented (but I don't like at all), is using the cartesian function to create all the combinations between keys and then a mapping and a filtering to delete unnecessary pairs.
Is there a better solution without using cartesian?

First lets define some helpers:
def swap(x):
"""Given a tuple (x1, x2) return (x2, 1)"""
return (x[1], 1)
def filter_source(x):
"""Check if s1 < s2 in (x, (s1, s2))"""
return x[1][0] < x[1][1]
def reshape(kv):
"""Reshape ((k1, k2), v) to get final result"""
((k1, k2), v) = kv
return (k1, (k2, v))
and create an example RDD:
rdd = sc.parallelize([
(1, [3, 10, 11]), (2, [3, 4, 10, 11]),
(3, [1, 4]), (4, [2, 3, 10])])
Finally you can do something like this:
from operator import add
flattened = rdd.flatMap(lambda kv: ((v, kv[0]) for v in kv[1])) # Flatten input
flattened.first()
# (1, 3) <- from (3, [1, 4])
result = (flattened
.join(flattened) # Perform self join using value from input as key
.filter(filter_source) # Remove pairs from the same source
.map(swap)
.reduceByKey(add)
.map(reshape)) # Get final output

Related

Fastest algorithm to get all the tuple groups that has the same N intersections from a list of tuples without duplicates

I have a list of 100 tuples. Each tuple contains 5 unique integers. I want to know the fastest way to find all the groups that have exactly the same N = 2 intersections. If a tuple has multiple pairs of elements that has 2 intersections with other tuples, then find all of them and store in different groups. The expected output is a list of unique lists ([(1,2,3,4,5),(4,5,6,7,8)] is the same as [(4,5,6,7,8),(1,2,3,4,5)]), where each list is a group that has all tuples with the same N=2 intersections. Below is my code:
from collections import defaultdict
from random import sample, choice
lst = [tuple(sample(range(10), 5)) for _ in range(100)]
dct = defaultdict(list)
N = 2
for i in lst:
for j in lst:
if len(set(i).intersection(set(j))) == N:
dct[i].append(j)
key = choice(list(dct))
print([key] + dct[key])
>>> [(4, 5, 2, 3, 7), (4, 6, 2, 5, 0), (9, 4, 2, 1, 8), (7, 6, 5, 2, 0), (2, 4, 0, 7, 8)]
Obviously, all last 4 tuples have 2 intersections with the first tuple, but not necessarily the same 2 elements. So how should I get the tuples that has the same 2 intersections?
An obvious solution is to brute force enumerate all possible (x, y) integer pairs and group tuples that has this (x, y) intersection accordingly, but is there a faster algorithm to do this?
Edit: [(1, 2, 3, 4, 5), (4, 5, 6, 7, 8), (4, 5, 9, 10, 11)] is allowed to be in a same group, but [(1, 2, 3, 4, 5),(4, 5, 6, 7, 8), (4, 5, 6, 10, 11)] is not, because (4, 5, 6, 7, 8) has 3 intersections with (4, 5, 6, 10, 11). In this case, it should be divided into 2 groups [(1, 2, 3, 4, 5), (4, 5, 6, 7, 8)] and [(1, 2, 3, 4, 5), (4, 5, 6, 10, 11)]. The final result will of course contains groups with various sizes, including many short lists with only two tuples, but this is what I want.
a simple combinations-based approach will suffice:
from collections import defaultdict
from itertools import combinations
res = defaultdict(set)
for t1, t2 in combinations(tuples, 2):
overlap = set(t1) & set(t2)
if len(overlap) == 2:
cur = res[frozenset(overlap)]
cur.add(t1)
cur.add(t2)
result:
defaultdict(set,
{frozenset({2, 4}): {(2, 4, 0, 7, 8),
(4, 5, 2, 2, 4),
(4, 6, 2, 6, 0),
(8, 4, 2, 1, 8)},
frozenset({2, 5}): {(4, 5, 2, 2, 4), (7, 6, 5, 2, 0)}})
I like how clean #acushner's solution looks, but I wrote one that's substantially faster:
def all_n_intersections2(xss, n):
xss = [frozenset(xs) for xs in xss]
result = {}
while xss:
xsa = xss.pop()
for xsb in xss:
ixs = xsa.intersection(xsb)
if len(ixs) == n:
if ixs not in result:
result[ixs] = [xsa, xsb]
else:
result[ixs].append(xsb)
return result
If I pit them against each other:
from timeit import timeit
from random import sample
from collections import defaultdict
from itertools import combinations
def all_n_intersections1(xss, n):
res = defaultdict(set)
for t1, t2 in combinations(xss, 2):
overlap = set(t1) & set(t2)
if len(overlap) == n:
cur = res[frozenset(overlap)]
cur.add(t1)
cur.add(t2)
def all_n_intersections2(xss, n):
xss = [frozenset(xs) for xs in xss]
result = {}
while xss:
xsa = xss.pop()
for xsb in xss:
ixs = xsa.intersection(xsb)
if len(ixs) == n:
if ixs not in result:
result[ixs] = [xsa, xsb]
else:
result[ixs].append(xsb)
return result
data = [tuple(sample(range(10), 5)) for _ in range(100)]
print(timeit(lambda: all_n_intersections1(data, 2), number=1000))
print(timeit(lambda: all_n_intersections2(data, 2), number=1000))
Results:
3.4294801999999995
1.4871790999999999
With some commentary:
def all_n_intersections2(xss, n):
# using frozensets to be able to use them as dict keys, convert only once
xss = [frozenset(xs) for xs in xss]
result = {}
# keep going until there are no more items left to combine
while xss:
# popping to compare against all others remaining, intersect each pair only once
xsa = xss.pop()
for xsb in xss:
# using library intersection, assuming the native implementation is fastest
ixs = xsa.intersection(xsb)
if len(ixs) == n:
if ixs not in result:
# not using default dict, initialising with these two
result[ixs] = [xsa, xsb]
else:
# otherwise, xsa was already in there, appending xsb
result[ixs].append(xsb)
return result
What the solution does:
for each combination of xsa, xsb from xss, it computes the intersection
if the intersection ixs is the target length n, xsa and xsb are added to a list in a dictionary using ixs as a key
duplicate appends are avoided (unless there are duplicate tuples in the source data)

More elegant way of find a range of repeating elements

I have this problem.
let l be a list containing only 0's and 1's, find all tuples that represents the start and end of a repeating sequence of 1's.
example
l=[1,1,0,0,0,1,1,1,0,1]
answer:
[(0,2),(5,8),(9,10)]
i solved the problem with the following code, but i think it is pretty messy, i would like to know if there is a cleaner way to solve this problem (maybe using map/reduce ?)
from collections import deque
def find_range(l):
pairs=deque((i,i+1) for i,e in enumerate(l) if e==1)
ans=[]
p=[0,0]
while(len(pairs)>1):
act=pairs.popleft()
nex=pairs[0]
if p==[0,0]:
p=list(act)
if act[1]==nex[0]:
p[1]=nex[1]
else:
ans.append(tuple(p))
p=[0,0]
if(len(pairs)==1):
if p==[0,0]:
ans.append(pairs.pop())
else:
ans.append((p[0],pairs.pop()[1]))
return ans
With itertools.groupby magic:
from itertools import groupby
lst = [1, 1, 0, 0, 0, 1, 1, 1, 0, 1]
indices, res = range(len(lst)), []
for k, group in groupby(indices, key=lambda i: lst[i]):
if k == 1:
group = list(group)
sl = group[0], group[-1] + 1
res.append(sl)
print(res)
The output:
[(0, 2), (5, 8), (9, 10)]
Or with a more efficient generator function:
def get_ones_coords(lst):
indices = range(len(lst))
for k, group in groupby(indices, key=lambda i: lst[i]):
if k == 1:
group = list(group)
yield group[0], group[-1] + 1
lst = [1, 1, 0, 0, 0, 1, 1, 1, 0, 1]
print(list(get_ones_coords(lst))) # [(0, 2), (5, 8), (9, 10)]
As a short bonus, here's alternative numpy approach, though sophisticated, based on discrete difference between consecutive numbers (numpy.diff) and extracting indices of non-zero items (numpy.faltnonzero):
In [137]: lst = [1,1,0,0,0,1,1,1,0,1]
In [138]: arr = np.array(lst)
In [139]: np.flatnonzero(np.diff(np.r_[0, arr, 0]) != 0).reshape(-1, 2)
Out[139]:
array([[ 0, 2],
[ 5, 8],
[ 9, 10]])
Code:
a = [[l.index(1)]]
[l[i] and len(a[-1])==2 and a.append([i]) or l[i] or len(a[-1])==1 and a[-1].append(i) for i in range(len(l))]
Output:
[[0, 2], [5, 8], [9]]
Code:
l=[1,1,0,0,0,1,1,1,0,1]
indices = [ind for ind, elem in enumerate(l) if elem == 1]
diff = [0]+[x - indices[i - 1] for i, x in enumerate(indices)][1:]
change_ind = [0]+[i for i, change in enumerate(diff) if change > 1]+[len(indices)]
split_indices = [tuple(indices[i:j]) for i,j in zip(change_ind,change_ind[1:])]
proper_tuples = [(tup[0], tup[-1]) if len(tup)>2 else tup for tup in split_indices]
print(proper_tuples)
Logic:
indices is the list of indices where l elements = 1 => [0, 1, 5, 6, 7, 9]
diff calculates the difference between the indices found above and appends a 0 at the start to keep their lengths the same => [0, 1, 4, 1, 1, 2]
change_ind indicates the locations where a split needs to happen which corresponds to where diff is greater than 1. Also append the first index and last index for later use or else you will only have the middle tuple => [0, 2, 5, 6]
split_indices creates tuples based on the range indicated in consecutive elements in change_ind (using zip which creates the combination of ranges) => [(0, 1), (5, 6, 7), (9,)]
Lastly, proper_tuples loops through the tuples create in split_indices and insures that if their length is greater than 2, then only consider the first and last elements, otherwise keep as is => [(0, 1), (5, 7), (9,)]
Output:
[(0, 1), (5, 7), (9,)]
Final Comments:
Although this does not match what OP suggested in the original question:
[(0,2),(5,8),(9,10)]
It does make more logical sense and seems to follow what OP indicated in the comments.
For example, at the start of l there are two ones - so the tuple should be (0, 1) not (0, 2) to match the proposed (start, end) notation.
Likewise at the end there is only a single one - so the tuple corresponding to this is (9,) not (9, 10)

How does a Python custom comparator work?

I have the following Python dict:
[(2, [3, 4, 5]), (3, [1, 0, 0, 0, 1]), (4, [-1]), (10, [1, 2, 3])]
Now I want to sort them on the basis of sum of values of the values of dictionary, so for the first key the sum of values is 3+4+5=12.
I have written the following code that does the job:
def myComparator(a,b):
print "Values(a,b): ",(a,b)
sum_a=sum(a[1])
sum_b=sum(b[1])
print sum_a,sum_b
print "Comparision Returns:",cmp(sum_a,sum_b)
return cmp(sum_a,sum_b)
items.sort(myComparator)
print items
This is what the output that I get after running above:
Values(a,b): ((3, [1, 0, 0, 0, 1]), (2, [3, 4, 5]))
2 12
Comparision Returns: -1
Values(a,b): ((4, [-1]), (3, [1, 0, 0, 0, 1]))
-1 2
Comparision Returns: -1
Values(a,b): ((10, [1, 2, 3]), (4, [-1]))
6 -1
Comparision Returns: 1
Values(a,b): ((10, [1, 2, 3]), (3, [1, 0, 0, 0, 1]))
6 2
Comparision Returns: 1
Values(a,b): ((10, [1, 2, 3]), (2, [3, 4, 5]))
6 12
Comparision Returns: -1
[(4, [-1]), (3, [1, 0, 0, 0, 1]), (10, [1, 2, 3]), (2, [3, 4, 5])]
Now I am unable to understand as to how the comparator is working, which two values are being passed and how many such comparisons would happen? Is it creating a sorted list of keys internally where it keeps track of each comparison made? Also the behavior seems to be very random. I am confused, any help would be appreciated.
The number and which comparisons are done is not documented and in fact, it can freely change from different implementations. The only guarantee is that if the comparison function makes sense the method will sort the list.
CPython uses the Timsort algorithm to sort lists, so what you see is the order in which that algorithm is performing the comparisons (if I'm not mistaken for very short lists Timsort just uses insertion sort)
Python is not keeping track of "keys". It just calls your comparison function every time a comparison is made. So your function can be called many more than len(items) times.
If you want to use keys you should use the key argument. In fact you could do:
items.sort(key=lambda x: sum(x[1]))
This will create the keys and then sort using the usual comparison operator on the keys. This is guaranteed to call the function passed by key only len(items) times.
Given that your list is:
[a,b,c,d]
The sequence of comparisons you are seeing is:
b < a # -1 true --> [b, a, c, d]
c < b # -1 true --> [c, b, a, d]
d < c # 1 false
d < b # 1 false
d < a # -1 true --> [c, b, d, a]
how the comparator is working
This is well documented:
Compare the two objects x and y and return an integer according to the outcome. The return value is negative if x < y, zero if x == y and strictly positive if x > y.
Instead of calling the cmp function you could have written:
sum_a=sum(a[1])
sum_b=sum(b[1])
if sum_a < sum_b:
return -1
elif sum_a == sum_b:
return 0
else:
return 1
which two values are being passed
From your print statements you can see the two values that are passed. Let's look at the first iteration:
((3, [1, 0, 0, 0, 1]), (2, [3, 4, 5]))
What you are printing here is a tuple (a, b), so the actual values passed into your comparison functions are
a = (3, [1, 0, 0, 0, 1])
b = (2, [3, 4, 5]))
By means of your function, you then compare the sum of the two lists in each tuple, which you denote sum_a and sum_b in your code.
and how many such comparisons would happen?
I guess what you are really asking: How does the sort work, by just calling a single function?
The short answer is: it uses the Timsort algorithm, and it calls the comparison function O(n * log n) times (note that the actual number of calls is c * n * log n, where c > 0).
To understand what is happening, picture yourself sorting a list of values, say v = [4,2,6,3]. If you go about this systematically, you might do this:
start at the first value, at index i = 0
compare v[i] with v[i+1]
If v[i+1] < v[i], swap them
increase i, repeat from 2 until i == len(v) - 2
start at 1 until no further swaps occurred
So you get, i =
0: 2 < 4 => [2, 4, 6, 3] (swap)
1: 6 < 4 => [2, 4, 6, 3] (no swap)
2: 3 < 6 => [2, 4, 3, 6] (swap)
Start again:
0: 4 < 2 => [2, 4, 3, 6] (no swap)
1: 3 < 4 => [2, 3, 4, 6] (swap)
2: 6 < 4 => [2, 3, 4, 6] (no swap)
Start again - there will be no further swaps, so stop. Your list is sorted. In this example we have run through the list 3 times, and there were 3 * 3 = 9 comparisons.
Obviously this is not very efficient -- the sort() method only calls your comparator function 5 times. The reason is that it employs a more efficient sort algorithm than the simple one explained above.
Also the behavior seems to be very random.
Note that the sequence of values passed to your comparator function is not, in general, defined. However, the sort function does all the necessary comparisons between any two values of the iterable it receives.
Is it creating a sorted list of keys internally where it keeps track of each comparison made?
No, it is not keeping a list of keys internally. Rather the sorting algorithm essentially iterates over the list you give it. In fact it builds subsets of lists to avoid doing too many comparisons - there is a nice visualization of how the sorting algorithm works at Visualising Sorting Algorithms: Python's timsort by Aldo Cortesi
Basically, for the simple list such as [2, 4, 6, 3, 1] and the complex list you provided, the sorting algorithms are the same.
The only differences are the complexity of elements in the list and the comparing scheme that how to compare any tow elements (e.g. myComparator you provided).
There is a good description for Python Sorting: https://wiki.python.org/moin/HowTo/Sorting
First, the cmp() function:
cmp(...)
cmp(x, y) -> integer
Return negative if x<y, zero if x==y, positive if x>y.
You are using this line: items.sort(myComparator) which is equivalent to saying: items.sort(-1) or items.sort(0) or items.sort(1)
Since you want to sort based on the sum of each tuples list, you could do this:
mylist = [(2, [3, 4, 5]), (3, [1, 0, 0, 0, 1]), (4, [-1]), (10, [1, 2, 3])]
sorted(mylist, key=lambda pair: sum(pair[1]))
What this is doing is, I think, exactly what you wanted. Sorting mylist based on the sum() of each tuples list

Unsort a list back to original sequence

aList = [2, 1, 4, 3, 5]
aList.sort()
=[1, 2, 3, 4, 5]
del aList[2]
=[1, 2, 4, 5]
**unsort the list back to original sequence with '3' deleted**
=[2, 1, 4, 5]
In reality I have a list of tuples that contain (Price, Quantity, Total).
I want to sort the list, allow the user to delete items in the list and
then put it back in the original order minus the deleted items.
One thing to note is that the values in the tuples can repeat in the list,
such as:
aList = [(4.55, 10, 45.5), (4.55, 10, 45.5), (1.99, 3, 5.97), (1.99, 1, 1.99)]
You cannot unsort the list but you could keep the original unsorted index to restore positions.
E.g.
from operator import itemgetter
aList = [(4.55, 10, 45.5), (4.55, 10, 45.5), (1.99, 3, 5.97), (1.99, 1, 1.99)]
# In keyList:
# * every element has a unique id (it also saves the original position in aList)
# * list is sorted by some criteria specific to your records
keyList = sorted(enumerate(aList), key = itemgetter(1))
# User want to delete item 1
for i, (key, record) in enumerate(keyList):
if key == 1:
del keyList[i]
break
# "Unsort" the list
theList = sorted(keyList, key = itemgetter(0))
# We don't need the unique id anymore
result = [record for key, record in theList]
As you can see this works with duplicate values.
Unsorting can be done
This approach is like others - the idea is to keep the original indices to restore the positions. I wanted to add a clearer example on how this is done.
In the example below, we keep track of the original positions of the items in a by associating them with their list index.
>>> a = [4, 3, 2, 1]
>>> b = [(a[i], i) for i in range(len(a))]
>>> b
[(4, 0), (3, 1), (2, 2), (1, 3)]
b serves as a mapping between the list values and their indices in the unsorted list.
Now we can sort b. Below, each item of b is sorted by the first tuple member, which is the corresponding value in the original list.
>>> c = sorted(b)
>>> c
[(1, 3), (2, 2), (3, 1), (4, 0)]
There it is... sorted.
Going back to the original order requires another sort, except using the second tuple item as the key.
>>> d = sorted(c, key=lambda t: t[1])
>>> d
[(4, 0), (3, 1), (2, 2), (1, 3)]
>>>
>>> d == b
True
And now it's back in its original order.
One use for this could be to transform a list of non sequential values into their ordinal values while maintaining the list order. For instance, a sequence like [1034 343 5 72 8997] could be transformed to [3, 2, 0, 1, 4].
>>> # Example for converting a list of non-contiguous
>>> # values in a list into their relative ordinal values.
>>>
>>> def ordinalize(a):
... idxs = list(range(len(a)))
... b = [(a[i], i) for i in idxs]
... b.sort()
... c = [(*b[i], i) for i in idxs]
... c.sort(key=lambda item: item[1])
... return [c[i][2] for i in idxs]
...
>>> ordinalize([58, 42, 37, 25, 10])
[4, 3, 2, 1, 0]
Same operation
>>> def ordinalize(a):
... idxs = range(len(a))
... a = sorted((a[i], i) for i in idxs)
... a = sorted(((*a[i], i) for i in idxs),
... key=lambda item: item[1])
... return [a[i][2] for i in idxs]
You can't really do an "unsort", the best you can do is:
aList = [2, 1, 4, 3, 5]
aList.remove(sorted(aList)[2])
>>> print aList
[2, 1, 4, 5]
Try this to unsort a sorted list
import random
li = list(range(101))
random.shuffle(li)
Here's how I recommend to sort a list, do something, then unsort back to the original ordering:
# argsort is the inverse of argsort, so we use that
# for undoing the sorting.
sorter = np.argsort(keys)
unsorter = np.argsort(sorter)
sorted_keys = np.array(keys)[sorter]
result = do_a_thing_that_preserves_order(sorted_keys)
unsorted_result = np.array(result)[unsorter]
I had the same use case and I found an easy solution for that, which is basically random the list:
import random
sorted_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
unsorted_list = random.sample(sorted_list, len(sorted_list))

Dictionary works for len(string) multiple of 3. Function deletes remainders but now doesn't translate with dictionary. Python 2.7.1

I made a function with a dictionary. The purpose of the function is to separate the input string into sets of 3 . If the input string value is not a multiple of 3, I want to delete the remainder [1 or 2]
my function was working perfectly until I added the part for deleting the remainders
def func(fx):
d={'AAA':1,'BBB':2,'CCC':3}
length=len(fx)
if length % 3 == 0:
return fx
if length % 3 == 1:
return fx[:-1]
if length % 3 == 2:
return fx[:-2]
Fx=fx.upper()
Fx3=[Fx[i:i+3] for i in range(0,len(Fx),3)]
translate=[d[x] for x in Fx3]
return translate
x='aaabbbcc'
output = func(x)
print output
>>>
aaabbb
the function is recognizing that the input sequence is not a multiple of 3 so its deleting the 2 values which is what i want. However, its splitting the new string into 3 letter words to be translated with my dictionary anymore. If you delete the if statements, the function works but only for strings that are a multiple of 3.
What am I doing wrong ???
You are returning fx when you probably should be reassigning it
def func(fx):
d={'AAA':1,'BBB':2,'CCC':3}
length=len(fx)
if length % 3 == 0:
pass
elif length % 3 == 1:
fx = fx[:-1]
elif length % 3 == 2:
fx = fx[:-2]
Fx=fx.upper()
Fx3=[Fx[i:i+3] for i in range(0,len(Fx),3)]
translate=[d[x] for x in Fx3]
return translate
Here is an alternate function for you to figure out when you know some more Python
def func(fx):
d = {'AAA':1,'BBB':2,'CCC':3}
return [d["".join(x).upper()] for x in zip(*[iter(fx)]*3)]
Does this do what you want?
def func(fx):
d = {'AAA': 1, 'BBB': 2, 'CCC': 3}
fx = fx[:-(len(fx)%3)].upper()
groups = [fx[i:i+3] for i in range(0, len(fx), 3)]
translate = [d[group] for group in groups]
return translate
x='aaabbbcc'
print func(x)
When trimming the end of the string, you were returning the result when you wanted to just store it in a variable or assign it back to fx.
Rather than the if .. elifs you can just use the result of the length modulo 3 directly.
There is no need of a function, it can be done in a one liner less complex than the gnibbler's one.
Acom's solution is nearly mine.
d={'AAA':1,'BBB':2,'CCC':3}
for fx in ('bbbcccaaabbbcccbbbcccaaabbbcc',
'bbbcccaaabbbaaa','bbbcccaaabbbaa','bbbcccaaabbba',
'bbbcccaaabbb','bbbcccaaabb','bbbcccaaab',
'bbbcccaaa','bbbcccaa','bbbccca',
'bbbccc','bbbcc','bbbc',
'bbb','bb','b',''):
print fx
print tuple( d[fx[i:i+3].upper()] for i in xrange(0, len(fx)-len(fx)%3, 3) )
produces
bbbcccaaabbbcccbbbcccaaabbbcc
(2, 3, 1, 2, 3, 2, 3, 1, 2)
bbbcccaaabbbaaa
(2, 3, 1, 2, 1)
bbbcccaaabbbaa
(2, 3, 1, 2)
bbbcccaaabbba
(2, 3, 1, 2)
bbbcccaaabbb
(2, 3, 1, 2)
bbbcccaaabb
(2, 3, 1)
bbbcccaaab
(2, 3, 1)
bbbcccaaa
(2, 3, 1)
bbbcccaa
(2, 3)
bbbccca
(2, 3)
bbbccc
(2, 3)
bbbcc
(2,)
bbbc
(2,)
bbb
(2,)
bb
()
b
()
()
.
I think you have to treat strings that can contain only 3 characters strings 'aaa','bbb','ccc' at the positions 0,3,6,9,etc
Then the preceding programs won't crash if there's an heterogenous 3-characters string at one of these positions instead of one of these set 'aaa','bbb','ccc'
In this case, note that you could use the dictionary's method get that returns a default value when a pased argument isn't a key of the dictionary.
In the following code, I put the default returned value as 0:
d={'AAA':1,'BBB':2,'CCC':3}
for fx in ('bbbcccaaa###bbbccc"""bbbcc',
'bbb aaabbbaaa','bbbccc^^^bbbaa','bbbc;;;aabbba',
'bbbc^caaabbb',']]bccca..bb','bbb%%%aaab',
'bbbcccaaa','bbb!ccaa','b#bccca',
'bbbccc','bbbcc','bbbc',
'b&b','bb','b',''):
print fx
print [d.get(fx[i:i+3].upper(), 0) for i in xrange(0, len(fx)-len(fx)%3, 3)]
produces
bbbcccaaa###bbbccc"""bbbcc
[2, 3, 1, 0, 2, 3, 0, 2]
bbb aaabbbaaa
[2, 0, 1, 2, 1]
bbbccc^^^bbbaa
[2, 3, 0, 2]
bbbc;;;aabbba
[2, 0, 0, 2]
bbbc^caaabbb
[2, 0, 1, 2]
]]bccca..bb
[0, 3, 0]
bbb%%%aaab
[2, 0, 1]
bbbcccaaa
[2, 3, 1]
bbb!ccaa
[2, 0]
b#bccca
[0, 3]
bbbccc
[2, 3]
bbbcc
[2]
bbbc
[2]
b&b
[0]
bb
[]
b
[]
[]
By the way, I preferred to create a tuple instead of a list because for the kind of invariable objects that are in the result, I think it is better not to create a list

Categories