From Section 15.2 of Programming Pearls
The C code can be viewed here: http://www.cs.bell-labs.com/cm/cs/pearls/longdup.c
When I implement it in Python using a suffix array:
example = open("iliad10.txt").read()
def comlen(p, q):
i = 0
for x in zip(p, q):
if x[0] == x[1]:
i += 1
else:
break
return i
suffix_list = []
example_len = len(example)
idx = list(range(example_len))
idx.sort(cmp = lambda a, b: cmp(example[a:], example[b:])) #VERY VERY SLOW
max_len = -1
for i in range(example_len - 1):
this_len = comlen(example[idx[i]:], example[idx[i+1]:])
print this_len
if this_len > max_len:
max_len = this_len
maxi = i
I found the idx.sort step very slow. I think it's slow because Python needs to pass the substrings by value instead of by reference (as the C code above does).
The tested file can be downloaded from here
The C code needs only 0.3 seconds to finish:
time cat iliad10.txt |./longdup
On this the rest of the Achaeans with one voice were for
respecting the priest and taking the ransom that he offered; but
not so Agamemnon, who spoke fiercely to him and sent him roughly
away.
real 0m0.328s
user 0m0.291s
sys 0m0.006s
But the Python code never finishes on my computer (I waited 10 minutes and then killed it).
Does anyone have ideas on how to make this code efficient? (For example, under 10 seconds.)
My solution is based on suffix arrays, constructed by prefix doubling of the longest common prefix. The worst-case complexity is O(n (log n)^2). The file "iliad.mb.txt" takes 4 seconds on my laptop. The longest_common_substring function is short and can be easily modified, e.g. for searching the 10 longest non-overlapping substrings. This Python code is faster than the original C code from the question if the duplicated strings are longer than 10000 characters.
from itertools import groupby
from operator import itemgetter
def longest_common_substring(text):
"""Get the longest common substrings and their positions.
>>> longest_common_substring('banana')
{'ana': [1, 3]}
>>> text = "not so Agamemnon, who spoke fiercely to "
>>> sorted(longest_common_substring(text).items())
[(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])]
    This function can be easily modified for any criteria, e.g. for searching the
    ten longest non-overlapping repeated substrings.
"""
sa, rsa, lcp = suffix_array(text)
maxlen = max(lcp)
result = {}
for i in range(1, len(text)):
if lcp[i] == maxlen:
j1, j2, h = sa[i - 1], sa[i], lcp[i]
assert text[j1:j1 + h] == text[j2:j2 + h]
substring = text[j1:j1 + h]
if not substring in result:
result[substring] = [j1]
result[substring].append(j2)
return dict((k, sorted(v)) for k, v in result.items())
def suffix_array(text, _step=16):
"""Analyze all common strings in the text.
    Short substrings of length _step are pre-sorted first. The results are then
    repeatedly merged so that the guaranteed number of compared characters is
    doubled in every iteration until all substrings are sorted exactly.
Arguments:
text: The text to be analyzed.
_step: Is only for optimization and testing. It is the optimal length
of substrings used for initial pre-sorting. The bigger value is
faster if there is enough memory. Memory requirements are
approximately (estimate for 32 bit Python 3.3):
len(text) * (29 + (_size + 20 if _size > 2 else 0)) + 1MB
Return value: (tuple)
(sa, rsa, lcp)
sa: Suffix array for i in range(1, size):
assert text[sa[i-1]:] < text[sa[i]:]
rsa: Reverse suffix array for i in range(size):
assert rsa[sa[i]] == i
lcp: Longest common prefix for i in range(1, size):
assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]]
if sa[i-1] + lcp[i] < len(text):
assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]]
>>> suffix_array(text='banana')
([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
The Longest Common String is 'ana': lcp[2] == 3 == len('ana')
It is between tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:]
"""
tx = text
size = len(tx)
step = min(max(_step, 1), len(tx))
sa = list(range(len(tx)))
sa.sort(key=lambda i: tx[i:i + step])
grpstart = size * [False] + [True] # a boolean map for iteration speedup.
    # It helps to skip already resolved values. The last value True is a sentinel.
rsa = size * [None]
stgrp, igrp = '', 0
for i, pos in enumerate(sa):
st = tx[pos:pos + step]
if st != stgrp:
grpstart[igrp] = (igrp < i - 1)
stgrp = st
igrp = i
rsa[pos] = igrp
sa[i] = pos
grpstart[igrp] = (igrp < size - 1 or size == 0)
while grpstart.index(True) < size:
# assert step <= size
nextgr = grpstart.index(True)
while nextgr < size:
igrp = nextgr
nextgr = grpstart.index(True, igrp + 1)
glist = []
for ig in range(igrp, nextgr):
pos = sa[ig]
if rsa[pos] != igrp:
break
newgr = rsa[pos + step] if pos + step < size else -1
glist.append((newgr, pos))
glist.sort()
for ig, g in groupby(glist, key=itemgetter(0)):
g = [x[1] for x in g]
sa[igrp:igrp + len(g)] = g
grpstart[igrp] = (len(g) > 1)
for pos in g:
rsa[pos] = igrp
igrp += len(g)
step *= 2
del grpstart
# create LCP array
lcp = size * [None]
h = 0
for i in range(size):
if rsa[i] > 0:
j = sa[rsa[i] - 1]
while i != size - h and j != size - h and tx[i + h] == tx[j + h]:
h += 1
lcp[rsa[i]] = h
if h > 0:
h -= 1
if size > 0:
lcp[0] = 0
return sa, rsa, lcp
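A short usage sketch (the file name is the one mentioned above; any text file works):
if __name__ == '__main__':
    text = open('iliad.mb.txt').read()
    for substring, positions in longest_common_substring(text).items():
        # print a truncated repr of each longest repeated substring and where it occurs
        print('%r (length %d) at positions %s' % (substring[:50], len(substring), positions))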
I prefer this solution over more complicated O(n log n) algorithms because Python has a very fast list sorting algorithm (Timsort). Python's sort is probably faster than the strictly linear-time operations in the method from that article, which should be O(n) under very special assumptions of random strings together with a small alphabet (typical for DNA genome analysis). I read in Gog 2011 that the worst-case O(n log n) of my algorithm can in practice be faster than many O(n) algorithms that cannot use the CPU memory cache.
The code in another answer based on grow_chains is 19 times slower than the original example from the question if the text contains a repeated string 8 kB long. Long repeated texts are not typical for classical literature, but they are frequent, e.g. in "independent" school homework collections. The program should not freeze on such input.
I wrote an example and tests with the same code for Python 2.7, 3.3 - 3.6.
The translation of the algorithm into Python:
from itertools import imap, izip, starmap, tee
from os.path import commonprefix
def pairwise(iterable): # itertools recipe
a, b = tee(iterable)
next(b, None)
return izip(a, b)
def longest_duplicate_small(data):
suffixes = sorted(data[i:] for i in xrange(len(data))) # O(n*n) in memory
return max(imap(commonprefix, pairwise(suffixes)), key=len)
buffer() allows getting a substring without copying:
def longest_duplicate_buffer(data):
n = len(data)
sa = sorted(xrange(n), key=lambda i: buffer(data, i)) # suffix array
def lcp_item(i, j): # find longest common prefix array item
start = i
while i < n and data[i] == data[i + j - start]:
i += 1
return i - start, start
size, start = max(starmap(lcp_item, pairwise(sa)), key=lambda x: x[0])
return data[start:start + size]
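A usage sketch (assuming the same input file as above):
if __name__ == "__main__":
    with open("iliad.mb.txt") as f:
        # print the longest substring that occurs at least twice
        print longest_duplicate_buffer(f.read())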
It takes 5 seconds on my machine for the iliad.mb.txt.
In principle it is possible to find the duplicate in O(n) time and O(n) memory using a suffix array augmented with an lcp array.
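For illustration, here is a sketch of just the extraction step, assuming a suffix_array() helper that returns (sa, rsa, lcp) like the function in the answer above (getting O(n) overall would additionally require a linear-time suffix array construction, which is not shown):
def longest_duplicate_sa(text):
    sa, rsa, lcp = suffix_array(text)
    if not lcp:
        return ''
    h = max(lcp)        # length of the longest repeated substring
    i = lcp.index(h)    # lcp[i] refers to the suffixes sa[i-1] and sa[i]
    return text[sa[i]:sa[i] + h]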
Note: the *_memoryview() version is deprecated in favor of the *_buffer() version.
A more memory-efficient version (compared to longest_duplicate_small()):
def cmp_memoryview(a, b):
for x, y in izip(a, b):
if x < y:
return -1
elif x > y:
return 1
return cmp(len(a), len(b))
def common_prefix_memoryview((a, b)):
for i, (x, y) in enumerate(izip(a, b)):
if x != y:
return a[:i]
return a if len(a) < len(b) else b
def longest_duplicate(data):
mv = memoryview(data)
suffixes = sorted((mv[i:] for i in xrange(len(mv))), cmp=cmp_memoryview)
result = max(imap(common_prefix_memoryview, pairwise(suffixes)), key=len)
return result.tobytes()
It takes 17 seconds on my machine for the iliad.mb.txt. The result is:
On this the rest of the Achaeans with one voice were for respecting
the priest and taking the ransom that he offered; but not so Agamemnon,
who spoke fiercely to him and sent him roughly away.
I had to define custom functions to compare memoryview objects because memoryview comparison either raises an exception in Python 3 or produces the wrong result in Python 2:
>>> s = b"abc"
>>> memoryview(s[0:]) > memoryview(s[1:])
True
>>> memoryview(s[0:]) < memoryview(s[1:])
True
Related questions:
Find the longest repeating string and the number of times it repeats in a given string
finding long repeated substrings in a massive string
The main problem seems to be that Python does slicing by copy: https://stackoverflow.com/a/5722068/538551
You'll have to use a memoryview instead to get a reference rather than a copy. When I did this, the program hung after the idx.sort step (which itself was very fast).
I'm sure that with a little work, you can get the rest working.
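Here's a quick sketch of the difference (the sizes are illustrative, and sys.getsizeof is CPython-specific):
import sys

data = b"x" * 10**6
s_slice = data[1:]              # slicing bytes/str copies about 1 MB
m_slice = memoryview(data)[1:]  # slicing a memoryview creates a view, no copy
print(sys.getsizeof(s_slice))   # roughly 10**6
print(sys.getsizeof(m_slice))   # a small, constant-size object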
Edit:
The above change will not work as a drop-in replacement because cmp does not work the same way as strcmp. For example, try the following C code:
#include <stdio.h>
#include <string.h>
int main() {
char* test1 = "ovided by The Internet Classics Archive";
char* test2 = "rovided by The Internet Classics Archive.";
printf("%d\n", strcmp(test1, test2));
}
And compare the result to this python:
test1 = "ovided by The Internet Classics Archive";
test2 = "rovided by The Internet Classics Archive."
print(cmp(test1, test2))
The C code prints -3 on my machine while the Python version prints -1. It looks like the example C code is abusing the return value of strcmp (it IS used in qsort, after all). I couldn't find any documentation on when strcmp will return something other than [-1, 0, 1], but adding a printf to pstrcmp in the original code showed a lot of values outside of that range (3, -31, 5 were the first 3 values).
To make sure that -3 wasn't some error code, if we reverse test1 and test2, we'll get 3.
Edit:
The above is interesting trivia, but not actually correct in terms of affecting either chunk of code. I realized this just as I shut my laptop and left a wifi zone... I really should double-check everything before I hit Save.
FWIW, cmp most certainly works on memoryview objects (prints -1 as expected):
print(cmp(memoryview(test1), memoryview(test2)))
I'm not sure why the code isn't working as expected. Printing out the list on my machine does not look as expected. I'll look into this and try to find a better solution instead of grasping at straws.
This version takes about 17 seconds on my circa-2007 desktop using a totally different algorithm:
#!/usr/bin/env python
ex = open("iliad.mb.txt").read()
chains = dict()
# populate initial chains dictionary
for (a,b) in enumerate(zip(ex,ex[1:])) :
s = ''.join(b)
if s not in chains :
chains[s] = list()
chains[s].append(a)
def grow_chains(chains) :
new_chains = dict()
for (string,pos) in chains :
offset = len(string)
for p in pos :
if p + offset >= len(ex) : break
# add one more character
s = string + ex[p + offset]
if s not in new_chains :
new_chains[s] = list()
new_chains[s].append(p)
return new_chains
# grow and filter, grow and filter
while len(chains) > 1 :
print 'length of chains', len(chains)
# remove chains that appear only once
chains = [(i,chains[i]) for i in chains if len(chains[i]) > 1]
print 'non-unique chains', len(chains)
print [i[0] for i in chains[:3]]
chains = grow_chains(chains)
The basic idea is to create a list of substrings and the positions where they occur, thus eliminating the need to compare the same strings again and again. The resulting list looks like [('ind him, but', [466548, 739011]), (' bulwark bot', [428251, 428924]), (' his armour,', [121559, 124919, 193285, 393566, 413634, 718953, 760088])]. Unique strings are removed. Then every list member grows by one character and a new list is created. Unique strings are removed again, and so on and so forth.
Consider some lists of integers:
#--------------------------------------
0 [0,1,3]
1 [1,0,3,4,5,10,...]
2 [2,8]
3 [3,1,0,...]
...
n []
#--------------------------------------
The question is to merge all lists having at least one common element. So the result for just the part shown will be as follows:
#--------------------------------------
0 [0,1,3,4,5,10,...]
2 [2,8]
#--------------------------------------
What is the most efficient way to do this on large data (the elements are just numbers)?
Is a tree structure something to think about?
I do the job now by converting the lists to sets and iterating over intersections, but it is slow! Furthermore, I have a feeling it is so elementary! In addition, the implementation lacks something (unknown), because some lists remain unmerged sometimes! Having said that, if you propose a self-implementation, please be generous and provide simple sample code [apparently Python is my favorite :)] or pseudo-code.
Update 1:
Here is the code I was using:
#--------------------------------------
lsts = [[0,1,3],
[1,0,3,4,5,10,11],
[2,8],
[3,1,0,16]];
#--------------------------------------
The function is (buggy!!):
#--------------------------------------
def merge(lsts):
sts = [set(l) for l in lsts]
i = 0
while i < len(sts):
j = i+1
while j < len(sts):
if len(sts[i].intersection(sts[j])) > 0:
sts[i] = sts[i].union(sts[j])
sts.pop(j)
else: j += 1 #---corrected
i += 1
lst = [list(s) for s in sts]
return lst
#--------------------------------------
The result is:
#--------------------------------------
>>> merge(lsts)
[[0, 1, 3, 4, 5, 10, 11, 16], [8, 2]]
#--------------------------------------
Update 2:
In my experience, the code given by Niklas Baumstark below turned out to be a bit faster for the simple cases. I have not tested the method given by "Hooked" yet, since it is a completely different approach (by the way, it seems interesting).
The testing procedure for all of these could be really hard or impossible when it comes to being sure of the results. The real data set I will use is so large and complex that it is impossible to trace any error just by repeating the run. That is, I need to be 100% satisfied with the reliability of the method before pushing it into its place within a large code base as a module. For now, Niklas's method is simply faster, and the answer for simple sets is of course correct.
However, how can I be sure that it works well for a really large data set, given that I will not be able to trace the errors visually?
Update 3:
Note that reliability of the method is much more important than speed for this problem. I hope to eventually be able to translate the Python code to Fortran for maximum performance.
Update 4:
There are many interesting points in this post, along with generously given answers and constructive comments. I would recommend reading everything thoroughly. Please accept my appreciation for the development of the question, the amazing answers, and the constructive comments and discussion.
My attempt:
def merge(lsts):
sets = [set(lst) for lst in lsts if lst]
merged = True
while merged:
merged = False
results = []
while sets:
common, rest = sets[0], sets[1:]
sets = []
for x in rest:
if x.isdisjoint(common):
sets.append(x)
else:
merged = True
common |= x
results.append(common)
sets = results
return sets
lst = [[65, 17, 5, 30, 79, 56, 48, 62],
[6, 97, 32, 93, 55, 14, 70, 32],
[75, 37, 83, 34, 9, 19, 14, 64],
[43, 71],
[],
[89, 49, 1, 30, 28, 3, 63],
[35, 21, 68, 94, 57, 94, 9, 3],
[16],
[29, 9, 97, 43],
[17, 63, 24]]
print merge(lst)
Benchmark:
import random
# adapt parameters to your own usage scenario
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
if False: # change to true to generate the test data file (takes a while)
with open("/tmp/test.txt", "w") as f:
lists = []
classes = [
range(class_size * i, class_size * (i + 1)) for i in range(class_count)
]
for c in classes:
# distribute each class across ~300 lists
for i in xrange(list_count_per_class):
lst = []
if random.random() < large_list_probability:
size = random.choice(large_list_sizes)
else:
size = random.choice(small_list_sizes)
nums = set(c)
for j in xrange(size):
x = random.choice(list(nums))
lst.append(x)
nums.remove(x)
random.shuffle(lst)
lists.append(lst)
random.shuffle(lists)
for lst in lists:
f.write(" ".join(str(x) for x in lst) + "\n")
setup = """
# Niklas'
def merge_niklas(lsts):
sets = [set(lst) for lst in lsts if lst]
merged = 1
while merged:
merged = 0
results = []
while sets:
common, rest = sets[0], sets[1:]
sets = []
for x in rest:
if x.isdisjoint(common):
sets.append(x)
else:
merged = 1
common |= x
results.append(common)
sets = results
return sets
# Rik's
def merge_rik(data):
sets = (set(e) for e in data if e)
results = [next(sets)]
for e_set in sets:
to_update = []
for i, res in enumerate(results):
if not e_set.isdisjoint(res):
to_update.insert(0, i)
if not to_update:
results.append(e_set)
else:
last = results[to_update.pop(-1)]
for i in to_update:
last |= results[i]
del results[i]
last |= e_set
return results
# katrielalex's
def pairs(lst):
i = iter(lst)
first = prev = item = i.next()
for item in i:
yield prev, item
prev = item
yield item, first
import networkx
def merge_katrielalex(lsts):
g = networkx.Graph()
for lst in lsts:
for edge in pairs(lst):
g.add_edge(*edge)
return networkx.connected_components(g)
# agf's (optimized)
from collections import deque
def merge_agf_optimized(lists):
sets = deque(set(lst) for lst in lists if lst)
results = []
disjoint = 0
current = sets.pop()
while True:
merged = False
newsets = deque()
for _ in xrange(disjoint, len(sets)):
this = sets.pop()
if not current.isdisjoint(this):
current.update(this)
merged = True
disjoint = 0
else:
newsets.append(this)
disjoint += 1
if sets:
newsets.extendleft(sets)
if not merged:
results.append(current)
try:
current = newsets.pop()
except IndexError:
break
disjoint = 0
sets = newsets
return results
# agf's (simple)
def merge_agf_simple(lists):
newsets, sets = [set(lst) for lst in lists if lst], []
while len(sets) != len(newsets):
sets, newsets = newsets, []
for aset in sets:
for eachset in newsets:
if not aset.isdisjoint(eachset):
eachset.update(aset)
break
else:
newsets.append(aset)
return newsets
# alexis'
def merge_alexis(data):
bins = range(len(data)) # Initialize each bin[n] == n
nums = dict()
data = [set(m) for m in data] # Convert to sets
for r, row in enumerate(data):
for num in row:
if num not in nums:
# New number: tag it with a pointer to this row's bin
nums[num] = r
continue
else:
dest = locatebin(bins, nums[num])
if dest == r:
continue # already in the same bin
if dest > r:
dest, r = r, dest # always merge into the smallest bin
data[dest].update(data[r])
data[r] = None
# Update our indices to reflect the move
bins[r] = dest
r = dest
# Filter out the empty bins
have = [m for m in data if m]
return have
def locatebin(bins, n):
while bins[n] != n:
n = bins[n]
return n
lsts = []
size = 0
num = 0
max = 0
for line in open("/tmp/test.txt", "r"):
lst = [int(x) for x in line.split()]
size += len(lst)
if len(lst) > max:
max = len(lst)
num += 1
lsts.append(lst)
"""
setup += """
print "%i lists, {class_count} equally distributed classes, average size %i, max size %i" % (num, size/num, max)
""".format(class_count=class_count)
import timeit
print "niklas"
print timeit.timeit("merge_niklas(lsts)", setup=setup, number=3)
print "rik"
print timeit.timeit("merge_rik(lsts)", setup=setup, number=3)
print "katrielalex"
print timeit.timeit("merge_katrielalex(lsts)", setup=setup, number=3)
print "agf (1)"
print timeit.timeit("merge_agf_optimized(lsts)", setup=setup, number=3)
print "agf (2)"
print timeit.timeit("merge_agf_simple(lsts)", setup=setup, number=3)
print "alexis"
print timeit.timeit("merge_alexis(lsts)", setup=setup, number=3)
These timings are obviously dependent on the specific parameters of the benchmark, like the number of classes, number of lists, list size, etc. Adapt those parameters to your needs to get more helpful results.
Below are some example outputs on my machine for different parameters. They show that all the algorithms have their strengths and weaknesses, depending on the kind of input they get:
=====================
# many disjoint classes, large lists
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
=====================
niklas
5000 lists, 50 equally distributed classes, average size 298, max size 999
4.80084705353
rik
5000 lists, 50 equally distributed classes, average size 298, max size 999
9.49251699448
katrielalex
5000 lists, 50 equally distributed classes, average size 298, max size 999
21.5317108631
agf (1)
5000 lists, 50 equally distributed classes, average size 298, max size 999
8.61671280861
agf (2)
5000 lists, 50 equally distributed classes, average size 298, max size 999
5.18117713928
=> alexis
=> 5000 lists, 50 equally distributed classes, average size 298, max size 999
=> 3.73504281044
===================
# fewer classes, large lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
===================
niklas
4500 lists, 15 equally distributed classes, average size 296, max size 999
1.79993700981
rik
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.58237695694
katrielalex
4500 lists, 15 equally distributed classes, average size 296, max size 999
19.5465381145
agf (1)
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.75445604324
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 296, max size 999
=> 1.77850699425
alexis
4500 lists, 15 equally distributed classes, average size 296, max size 999
3.23530197144
===================
# fewer classes, smaller lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.1
===================
niklas
4500 lists, 15 equally distributed classes, average size 95, max size 997
0.773697137833
rik
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.0523750782
katrielalex
4500 lists, 15 equally distributed classes, average size 95, max size 997
6.04466891289
agf (1)
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.20285701752
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 95, max size 997
=> 0.714507102966
alexis
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.1286110878
I tried to summarize everything that's been said and done about this topic, in this question and in the duplicate one.
I tried to test and time every solution (all the code is here).
Testing
This is the TestCase from the testing module:
class MergeTestCase(unittest.TestCase):
def setUp(self):
with open('./lists/test_list.txt') as f:
self.lsts = json.loads(f.read())
self.merged = self.merge_func(deepcopy(self.lsts))
def test_disjoint(self):
"""Check disjoint-ness of merged results"""
from itertools import combinations
for a,b in combinations(self.merged, 2):
self.assertTrue(a.isdisjoint(b))
def test_coverage(self): # Credit to katrielalex
"""Check coverage original data"""
merged_flat = set()
for s in self.merged:
merged_flat |= s
original_flat = set()
for lst in self.lsts:
original_flat |= set(lst)
self.assertTrue(merged_flat == original_flat)
def test_subset(self): # Credit to WolframH
"""Check that every original data is a subset"""
for lst in self.lsts:
self.assertTrue(any(set(lst) <= e for e in self.merged))
This test assumes a list of sets as the result, so I couldn't test a couple of solutions that worked with lists.
I couldn't test the following:
katrielalex
steabert
Among the ones I could test, two failed:
-- Going to test: agf (optimized) --
Check disjoint-ness of merged results ... FAIL
-- Going to test: robert king --
Check disjoint-ness of merged results ... FAIL
Timing
Performance is strongly related to the test data employed.
So far, three answers have tried to time their own and the other solutions. Since they used different test data, they got different results.
Niklas' benchmark is very tweakable: with it, one can run different tests by changing some parameters.
I've used the same three sets of parameters he used in his own answer, and I put them in three different files:
filename = './lists/timing_1.txt'
class_count = 50,
class_size = 1000,
list_count_per_class = 100,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.5,
filename = './lists/timing_2.txt'
class_count = 15,
class_size = 1000,
list_count_per_class = 300,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.5,
filename = './lists/timing_3.txt'
class_count = 15,
class_size = 1000,
list_count_per_class = 300,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.1,
These are the results that I got:
From file: timing_1.txt
Timing with: >> Niklas << Benchmark
Info: 5000 lists, average size 305, max size 999
Timing Results:
10.434 -- alexis
11.476 -- agf
11.555 -- Niklas B.
13.622 -- Rik. Poggi
14.016 -- agf (optimized)
14.057 -- ChessMaster
20.208 -- katrielalex
21.697 -- steabert
25.101 -- robert king
76.870 -- Sven Marnach
133.399 -- hochl
From file: timing_2.txt
Timing with: >> Niklas << Benchmark
Info: 4500 lists, average size 305, max size 999
Timing Results:
8.247 -- Niklas B.
8.286 -- agf
8.637 -- Rik. Poggi
8.967 -- alexis
9.090 -- ChessMaster
9.091 -- agf (optimized)
18.186 -- katrielalex
19.543 -- steabert
22.852 -- robert king
70.486 -- Sven Marnach
104.405 -- hochl
From file: timing_3.txt
Timing with: >> Niklas << Benchmark
Info: 4500 lists, average size 98, max size 999
Timing Results:
2.746 -- agf
2.850 -- Niklas B.
2.887 -- Rik. Poggi
2.972 -- alexis
3.077 -- ChessMaster
3.174 -- agf (optimized)
5.811 -- katrielalex
7.208 -- robert king
9.193 -- steabert
23.536 -- Sven Marnach
37.436 -- hochl
With Sven's testing data I got the following results:
Timing with: >> Sven << Benchmark
Info: 200 lists, average size 10, max size 10
Timing Results:
2.053 -- alexis
2.199 -- ChessMaster
2.410 -- agf (optimized)
3.394 -- agf
3.398 -- Rik. Poggi
3.640 -- robert king
3.719 -- steabert
3.776 -- Niklas B.
3.888 -- hochl
4.610 -- Sven Marnach
5.018 -- katrielalex
And finally with Agf's benchmark I got:
Timing with: >> Agf << Benchmark
Info: 2000 lists, average size 246, max size 500
Timing Results:
3.446 -- Rik. Poggi
3.500 -- ChessMaster
3.520 -- agf (optimized)
3.527 -- Niklas B.
3.527 -- agf
3.902 -- hochl
5.080 -- alexis
15.997 -- steabert
16.422 -- katrielalex
18.317 -- robert king
1257.152 -- Sven Marnach
As I said at the beginning, all the code is available in this git repository. All the merging functions are in a file called core.py; every function there whose name ends with _merge will be auto-loaded during the tests, so it shouldn't be hard to add/test/improve your own solution.
Let me also know if there's something wrong; it's been a lot of coding and I could use a couple of fresh eyes :)
Using Matrix Manipulations
Let me preface this answer with the following comment:
THIS IS THE WRONG WAY TO DO THIS. IT IS PRONE TO NUMERICAL INSTABILITY AND IS MUCH SLOWER THAN THE OTHER METHODS PRESENTED, USE AT YOUR OWN RISK.
That being said, I couldn't resist solving the problem from a dynamical point of view (and I hope you'll get a fresh perspective on the problem). In theory this should work all the time, but eigenvalue calculations can often fail. The idea is to think of your list as a flow from rows to columns. If two rows share a common value, there is a connecting flow between them. If we were to think of these flows as water, we would see that the flows cluster into little pools when there is a connecting path between them. For simplicity, I'm going to use a smaller set, though it works with your data set as well:
from numpy import where, newaxis
from scipy import linalg, array, zeros
X = [[0,1,3],[2],[3,1]]
We need to convert the data into a flow graph. If row i flows into value j we put it in the matrix. Here we have 3 rows and 4 unique values:
A = zeros((4,len(X)), dtype=float)
for i,row in enumerate(X):
for val in row: A[val,i] = 1
In general, you'll need to change the 4 to capture the number of unique values you have. If the set is a list of integers starting from 0, as we have here, you can simply make this the largest number plus one. We now perform an eigenvalue decomposition, an SVD to be exact, since our matrix is not square.
S = linalg.svd(A)
We want to keep only the 3x3 portion of this answer, since it will represent the flow of the pools. In fact we only want the absolute values of this matrix; we only care if there is a flow in this cluster space.
M = abs(S[2])
We can think of this matrix M as a Markov matrix and make it explicit by row normalizing. Once we have this we compute the (left) eigenvalue decomp. of this matrix.
M /= M.sum(axis=1)[:,newaxis]
U,V = linalg.eig(M,left=True, right=False)
V = abs(V)
Now a disconnected (non-ergodic) Markov matrix has the nice property that, for each non-connected cluster, there is an eigenvalue of unity. The eigenvectors associated with these unity eigenvalues are the ones we want:
idx = where(U > .999)[0]
C = V.T[idx] > 0
I have to use .999 due to the aforementioned numerical instability. At this point, we are done! Each independent cluster can now pull the corresponding rows out:
for cluster in C:
print where(A[:,cluster].sum(axis=1))[0]
Which gives, as intended:
[0 1 3]
[2]
Change X to your lst and you'll get: [ 0 1 3 4 5 10 11 16] [2 8].
Addendum
Why might this be useful? I don't know where your underlying data comes from, but what happens when the connections are not absolute? Say row 1 has entry 3 80% of the time; how would you generalize the problem? The flow method above would work just fine, and would be completely parametrized by that .999 value: the further away from unity it is, the looser the association.
Visual Representation
Since a picture is worth a thousand words, here are the plots of the matrices A and V for my example and your lst, respectively. Notice how V splits into two clusters (it is a block-diagonal matrix with two blocks after permutation), since for each example there were only two unique lists!
Faster Implementation
In hindsight, I realized that you can skip the SVD step and compute only a single decomp:
from numpy import dot
M = dot(A.T, A)
M /= M.sum(axis=1)[:,newaxis]
U,V = linalg.eig(M,left=True, right=False)
The advantage with this method (besides speed) is that M is now symmetric, hence the computation can be faster and more accurate (no imaginary values to worry about).
EDIT: OK, the other question has been closed; posting here.
Nice question! It's much simpler if you think of it as a connected-components problem in a graph. The following code uses the excellent networkx graph library and the pairs function from this question.
def pairs(lst):
i = iter(lst)
first = prev = item = i.next()
for item in i:
yield prev, item
prev = item
yield item, first
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
import networkx
g = networkx.Graph()
for sub_list in lists:
for edge in pairs(sub_list):
g.add_edge(*edge)
networkx.connected_components(g)
[[1, 2, 3, 5, 6], [8, 9, 10], [11, 12, 13]]
Explanation
We create a new (empty) graph g. For each sub-list in lists, consider its elements as nodes of the graph and add an edge between them. (Since we only care about connectedness, we don't need to add all the edges -- only adjacent ones!) Note that add_edge takes two objects, treats them as nodes (and adds them if they aren't already there), and adds an edge between them.
Then, we just find the connected components of the graph -- a solved problem! -- and output them as our intersecting sets.
Here's my answer. I haven't checked it against today's batch of answers.
The intersection-based algorithms are O(N^2) since they check each new set against all the existing ones, so I used an approach that indexes each number and runs close to O(N) (if we accept that dictionary lookups are O(1)). Then I ran the benchmarks and felt like a complete idiot because it ran slower, but on closer inspection it turned out that the test data ends up with only a handful of distinct result sets, so the quadratic algorithms don't have a lot of work to do. Test it with more than 10-15 distinct bins and my algorithm is much faster. Try test data with more than 50 distinct bins, and it is enormously faster.
(Edit: There was also a problem with the way the benchmark is run, but I was wrong in my diagnosis. I altered my code to work with the way the repeated tests are run.)
def mergelists5(data):
"""Check each number in our arrays only once, merging when we find
a number we have seen before.
"""
bins = range(len(data)) # Initialize each bin[n] == n
nums = dict()
data = [set(m) for m in data ] # Convert to sets
for r, row in enumerate(data):
for num in row:
if num not in nums:
# New number: tag it with a pointer to this row's bin
nums[num] = r
continue
else:
dest = locatebin(bins, nums[num])
if dest == r:
continue # already in the same bin
if dest > r:
dest, r = r, dest # always merge into the smallest bin
data[dest].update(data[r])
data[r] = None
# Update our indices to reflect the move
bins[r] = dest
r = dest
# Filter out the empty bins
have = [ m for m in data if m ]
print len(have), "groups in result"
return have
def locatebin(bins, n):
"""
Find the bin where list n has ended up: Follow bin references until
we find a bin that has not moved.
"""
while bins[n] != n:
n = bins[n]
return n
This new function only does the minimum necessary number of disjointness tests, something the other similar solutions fail to do. It also uses a deque to avoid as many linear-time operations as possible, like list slicing and deletion from early in the list.
from collections import deque
def merge(lists):
sets = deque(set(lst) for lst in lists if lst)
results = []
disjoint = 0
current = sets.pop()
while True:
merged = False
newsets = deque()
for _ in xrange(disjoint, len(sets)):
this = sets.pop()
if not current.isdisjoint(this):
current.update(this)
merged = True
disjoint = 0
else:
newsets.append(this)
disjoint += 1
if sets:
newsets.extendleft(sets)
if not merged:
results.append(current)
try:
current = newsets.pop()
except IndexError:
break
disjoint = 0
sets = newsets
return results
The less overlap between the sets in a given set of data, the better this will do compared to the other functions.
Here is an example case. If you have 4 sets, you need to compare:
1, 2
1, 3
1, 4
2, 3
2, 4
3, 4
If 1 overlaps with 3, then 2 needs to be re-tested to see if it now overlaps with 1, in order to safely skip testing 2 against 3.
There are two ways to deal with this. The first is to restart the testing of set 1 against the other sets after each overlap and merge. The second is to continue with the testing by comparing 1 with 4, then going back and re-testing. The latter results in fewer disjointness tests, as more merges happen in a single pass, so on the re-test pass, there are fewer sets left to test against.
The problem is to track which sets have to be re-tested. In the above example, 1 needs to be re-tested against 2 but not against 4, since 1 was already in its current state before 4 was tested the first time.
The disjoint counter allows this to be tracked.
My answer doesn't help with the main problem of finding an improved algorithm for recoding into FORTRAN; it is just what appears to me to be the simplest and most elegant way to implement the algorithm in Python.
According to my testing (or the test in the accepted answer), it's slightly (up to 10%) faster than the next fastest solution.
def merge0(lists):
newsets, sets = [set(lst) for lst in lists if lst], []
while len(sets) != len(newsets):
sets, newsets = newsets, []
for aset in sets:
for eachset in newsets:
if not aset.isdisjoint(eachset):
eachset.update(aset)
break
else:
newsets.append(aset)
return newsets
No need for the un-Pythonic counters (i, range) or complicated mutation (del, pop, insert) used in the other implementations. It uses only simple iteration, merges overlapping sets in the simplest manner, and builds a single new list on each pass through the data.
My (faster and simpler) version of the test code:
import random
tenk = range(10000)
lsts = [random.sample(tenk, random.randint(0, 500)) for _ in range(2000)]
setup = """
def merge0(lists):
newsets, sets = [set(lst) for lst in lists if lst], []
while len(sets) != len(newsets):
sets, newsets = newsets, []
for aset in sets:
for eachset in newsets:
if not aset.isdisjoint(eachset):
eachset.update(aset)
break
else:
newsets.append(aset)
return newsets
def merge1(lsts):
sets = [set(lst) for lst in lsts if lst]
merged = 1
while merged:
merged = 0
results = []
while sets:
common, rest = sets[0], sets[1:]
sets = []
for x in rest:
if x.isdisjoint(common):
sets.append(x)
else:
merged = 1
common |= x
results.append(common)
sets = results
return sets
lsts = """ + repr(lsts)
import timeit
print timeit.timeit("merge0(lsts)", setup=setup, number=10)
print timeit.timeit("merge1(lsts)", setup=setup, number=10)
Here's an implementation using a disjoint-set data structure (specifically a disjoint forest), thanks to comingstorm's hint at merging sets which have even one element in common. I'm using path compression for a slight (~5%) speed improvement; it's not entirely necessary (and it prevents find being tail recursive, which could slow things down). Note that I'm using a dict to represent the disjoint forest; given that the data are ints, an array would also work although it might not be much faster.
def merge(data):
parents = {}
def find(i):
j = parents.get(i, i)
if j == i:
return i
k = find(j)
if k != j:
parents[i] = k
return k
for l in filter(None, data):
parents.update(dict.fromkeys(map(find, l), find(l[0])))
merged = {}
for k, v in parents.items():
merged.setdefault(find(v), []).append(k)
return merged.values()
This approach is comparable to the other best algorithms on Rik's benchmarks.
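A usage sketch with the sample data from the question (merge() returns a dict view; group and element order may differ):
data = [[0, 1, 3], [1, 0, 3, 4, 5, 10, 11], [2, 8], [3, 1, 0, 16]]
print(list(merge(data)))
# expected grouping: [0, 1, 3, 4, 5, 10, 11, 16] and [2, 8]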
This would be my updated approach:
def merge(data):
sets = (set(e) for e in data if e)
results = [next(sets)]
for e_set in sets:
to_update = []
for i,res in enumerate(results):
if not e_set.isdisjoint(res):
to_update.insert(0,i)
if not to_update:
results.append(e_set)
else:
last = results[to_update.pop(-1)]
for i in to_update:
last |= results[i]
del results[i]
last |= e_set
return results
Note: During the merging, empty lists will be removed.
Update: Reliability.
You need two tests for 100% reliability of success:
Check that all the resulting sets are mutually disjoint:
merged = [{0, 1, 3, 4, 5, 10, 11, 16}, {8, 2}, {8}]
from itertools import combinations
for a,b in combinations(merged,2):
if not a.isdisjoint(b):
raise Exception(a,b) # just an example
Check that the merged sets cover the original data (as suggested by katrielalex).
I think this will take some time, but maybe it'll be worth it if you want to be 100% sure.
Just for fun...
def merge(mylists):
results, sets = [], [set(lst) for lst in mylists if lst]
upd, isd, pop = set.update, set.isdisjoint, sets.pop
while sets:
if not [upd(sets[0],pop(i)) for i in xrange(len(sets)-1,0,-1) if not isd(sets[0],sets[i])]:
results.append(pop(0))
return results
And my rewrite of the best answer:
def merge(lsts):
sets = map(set,lsts)
results = []
while sets:
first, rest = sets[0], sets[1:]
merged = False
sets = []
for s in rest:
if s and s.isdisjoint(first):
sets.append(s)
else:
first |= s
merged = True
if merged: sets.append(first)
else: results.append(first)
return results
Here's a function (Python 3.1) to check if the result of a merge function is OK. It checks:
Are the result sets disjoint? (number of elements of union == sum of numbers of elements)
Are the elements of the result sets the same as of the input lists?
Is every input list a subset of a result set?
Is every result set minimal, i.e. is it impossible to split it into two smaller sets?
It does not check if there are empty result sets - I don't know if you want them or not...
from itertools import chain
def check(lsts, result):
lsts = [set(s) for s in lsts]
all_items = set(chain(*lsts))
all_result_items = set(chain(*result))
num_result_items = sum(len(s) for s in result)
if num_result_items != len(all_result_items):
print("Error: result sets overlap!")
print(num_result_items, len(all_result_items))
print(sorted(map(len, result)), sorted(map(len, lsts)))
if all_items != all_result_items:
print("Error: result doesn't match input lists!")
    if not all(any(set(s).issubset(t) for t in result) for s in lsts):
print("Error: not all input lists are contained in a result set!")
seen = set()
todo = list(filter(bool, lsts))
done = False
while not done:
deletes = []
for i, s in enumerate(todo): # intersection with seen, or with unseen result set, is OK
if not s.isdisjoint(seen) or any(t.isdisjoint(seen) for t in result if not s.isdisjoint(t)):
seen.update(s)
deletes.append(i)
for i in reversed(deletes):
del todo[i]
done = not deletes
if todo:
print("Error: A result set should be split into two or more parts!")
print(todo)
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
import networkx as nx
g = nx.Graph()
for sub_list in lists:
for i in range(1,len(sub_list)):
g.add_edge(sub_list[0],sub_list[i])
print nx.connected_components(g)
#[[1, 2, 3, 5, 6], [8, 9, 10], [11, 12, 13]]
Performance:
5000 lists, 5 classes, average size 74, max size 1000
15.2264976415
Performance of merge1:
print timeit.timeit("merge1(lsts)", setup=setup, number=10)
5000 lists, 5 classes, average size 74, max size 1000
1.26998780571
So it is 11x slower than the fastest solution, but the code is much simpler and more readable!
This is slower than the solution offered by Niklas (I got 3.9 s on test.txt instead of 0.5 s for his solution), but it yields the same result and might be easier to implement in e.g. Fortran, since it doesn't use sets, only sorting of the total number of elements followed by a single run through all of them.
It returns a list with the ids of the merged lists, so it also keeps track of empty lists; they stay unmerged.
def merge(lsts):
# this is an index list that stores the joined id for each list
joined = range(len(lsts))
# create an ordered list with indices
indexed_list = sorted((el,index) for index,lst in enumerate(lsts) for el in lst)
    # loop through the ordered list, and if two elements are the same and
# the lists are not yet joined, alter the list with joined id
el_0,idx_0 = None,None
for el,idx in indexed_list:
if el == el_0 and joined[idx] != joined[idx_0]:
old = joined[idx]
rep = joined[idx_0]
joined = [rep if id == old else id for id in joined]
el_0, idx_0 = el, idx
return joined
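For example, with the sample lists from the question (a sketch; the joined ids shown are worked out by hand):
lsts = [[0, 1, 3], [1, 0, 3, 4, 5, 10, 11], [2, 8], [3, 1, 0, 16]]
print(merge(lsts))
# [0, 0, 2, 0] -> lists 0, 1 and 3 were joined; list 2 stays on its own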
Firstly, I'm not exactly sure the benchmarks are fair:
Adding the following code to the start of my function:
from collections import Counter
from itertools import chain
c = Counter(chain(*lists))
print c[1]
"88"
This means that out of all the values in all the lists, there are only 88 distinct values. Usually in the real world duplicates are rare, and you would expect a lot more distinct values. (Of course, I don't know where your data comes from, so I can't make assumptions.)
Because duplicates are more common, it means the sets are less likely to be disjoint. This means the set.isdisjoint() method will be much faster, because it only needs a few tests to find that the sets aren't disjoint.
Having said all that, I do believe the methods presented that use isdisjoint are the fastest anyway; I'm just saying that instead of being 20x faster, maybe they should only be 10x faster than the other methods with different benchmark testing.
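A tiny illustrative sketch of that effect (the numbers are not from the benchmark): isdisjoint() can return as soon as it sees one shared element, so it answers False for overlapping sets much faster than it answers True for truly disjoint ones.
import timeit
a = set(range(1000))
overlapping = set(range(500, 1500))   # shares elements with a
disjoint = set(range(1000, 2000))     # shares nothing with a
print timeit.timeit(lambda: a.isdisjoint(overlapping), number=100000)  # fast, early exit
print timeit.timeit(lambda: a.isdisjoint(disjoint), number=100000)     # slower, full scan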
Anyway, I thought I would try a slightly different technique to solve this; however, the merge sorting was too slow. This method is about 20x slower than the two fastest methods in the benchmarking:
I thought I would order everything:
import heapq
from itertools import chain
def merge6(lists):
for l in lists:
l.sort()
one_list = heapq.merge(*[zip(l,[i]*len(l)) for i,l in enumerate(lists)]) #iterating through one_list takes 25 seconds!!
previous = one_list.next()
d = {i:i for i in range(len(lists))}
for current in one_list:
if current[0]==previous[0]:
d[current[1]] = d[previous[1]]
previous=current
groups=[[] for i in range(len(lists))]
for k in d:
groups[d[k]].append(lists[k]) #add a each list to its group
    return [set(chain(*g)) for g in groups if g] # since each subgroup in each g is sorted, it would be faster to merge these subgroups, removing duplicates along the way.
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
print merge6(lists)
"[set([1, 2, 3, 5, 6]), set([8, 9, 10]), set([11, 12, 13])]"
import timeit
print timeit.timeit("merge1(lsts)", setup=setup, number=10)
print timeit.timeit("merge4(lsts)", setup=setup, number=10)
print timeit.timeit("merge6(lsts)", setup=setup, number=10)
5000 lists, 5 classes, average size 74, max size 1000
1.26732238315
5000 lists, 5 classes, average size 74, max size 1000
1.16062907437
5000 lists, 5 classes, average size 74, max size 1000
30.7257182826
I found @Niklas B.'s answer really helpful... but it took me a while to read through it and understand the logic. This is a re-write of exactly the same code with new variable names and more explanation... to help the other N00bs out there!
def mergeUntilOnlyDisjointSetsRemain(_listsOfLists):
"""Function for merging lists until there are only disjoint sets"""
"""Imagine this algorithm as if it were processing train cars filled with
integers. It takes the first car of the train, separates it from the rest,
and then compares the first car to each subsequent car.
Start by renaming the first car to 'common'
If the two train cars have a common integer, you merge the two cars into
common, and continue down the line until you reach the end of the train.
Once you reach the end of the train, place the common car in results, (which
is essentially a collection of train cars that have already been compared
to all other cars).
You can exit the loop as soon as you get to the end of the train without
merging any of the cars. This is controlled by the continueMerge variable.
This variable is only reset to True after a merge operation.
"""
# Start by creating a trainCar(i.e. a set) from each list in our listOfLists
freightTrain = [set(trainCar) for trainCar in _listsOfLists if trainCar]
# This continueMerge means that we have not yet compared all cars in the train.
continueMerge = True
while continueMerge:
# Reset the while loop trigger.
continueMerge = False
# Create a fresh empty list of cars that have already been cross checked
crossCheckedCars = []
# While there are still elements in freightTrain
while freightTrain:
# Split the freightTrain up, into first car vs all the remaining cars
commonFirstTrainCar = freightTrain[0]
remainingCars = freightTrain[1:]
# The freightTrain is now empty
freightTrain = []
# Iterate over all the remaining traincars
for currentTrainCar in remainingCars:
# If there are not any common integers with the first car...
if currentTrainCar.isdisjoint(commonFirstTrainCar):
# Add each train car back onto the freightTrain
freightTrain.append(currentTrainCar)
# But if they share a common integer...
else:
# Trigger the reset switch to continueMerging cars
continueMerge = True
                    # and join the cars together
commonFirstTrainCar |= currentTrainCar
# Once we have checked commonFirstTrainCar, remove it from the
# freightTrain and place it in crossCheckedCars
crossCheckedCars.append(commonFirstTrainCar)
# Now we have compared the first car to all subsequent cars
# (... but we aren't finished because the 5th and 7th cars might have
# had a common integer with each other... but only 1st and 5th cars
# shared an integer the first time around... so the 1st and 5th cars
# were merged, but the 7th car is still alone!)
# Reset the system by creating a new freightTrain
freightTrain = crossCheckedCars
# Post-process the freight train to turn it into lists instead of sets
listsForReturnTripHome = []
for processedTraincar in freightTrain:
listsForReturnTripHome.append(list(processedTraincar))
return listsForReturnTripHome
My solution works well on small lists and is quite readable, with no dependencies.
def merge_list(starting_list):
final_list = []
for i,v in enumerate(starting_list[:-1]):
if set(v)&set(starting_list[i+1]):
starting_list[i+1].extend(list(set(v) - set(starting_list[i+1])))
else:
final_list.append(v)
final_list.append(starting_list[-1])
return final_list
Benchmarking it:
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
%timeit merge_list(lists)
100000 loops, best of 3: 4.9 µs per loop
This can be solved in O(n) by using the union-find algorithm. Given the first two rows of your data, edges to use in the union-find are the following pairs:
(0,1),(1,3),(1,0),(0,3),(3,4),(4,5),(5,10)
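A minimal union-find sketch of that idea (illustrative code, not the poster's implementation): union consecutive elements of each list, then group the rows by the root of their first element.
def merge_union_find(lists):
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]   # path halving
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb

    for lst in lists:
        for a, b in zip(lst, lst[1:]):      # edges such as (0,1), (1,3), ...
            union(a, b)

    groups = {}
    for lst in lists:
        if lst:
            groups.setdefault(find(lst[0]), set()).update(lst)
    return list(groups.values())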
Use a flag to ensure you get the final, mutually exclusive results:
def merge(lists):
while(1):
flag=0
for i in range(0,len(lists)):
for j in range(i+1,len(lists)):
                if set(lists[i]) & set(lists[j]):
                    lists[i] = list(set(lists[i]) | set(lists[j]))
lists.remove(lists[j])
flag+=1
break
if flag==0:
break
return lists
from itertools import combinations
def merge(elements_list):
d = {index: set(elements) for index, elements in enumerate(elements_list)}
while any(not set.isdisjoint(d[i], d[j]) for i, j in combinations(d.keys(), 2)):
merged = set()
for i, j in combinations(d.keys(), 2):
if not set.isdisjoint(d[i], d[j]):
d[i] = set.union(d[i], d[j])
merged.add(j)
for k in merged:
d.pop(k)
return [v for v in d.values() if v]
lst = [[65, 17, 5, 30, 79, 56, 48, 62],
[6, 97, 32, 93, 55, 14, 70, 32],
[75, 37, 83, 34, 9, 19, 14, 64],
[43, 71],
[],
[89, 49, 1, 30, 28, 3, 63],
[35, 21, 68, 94, 57, 94, 9, 3],
[16],
[29, 9, 97, 43],
[17, 63, 24]]
print(merge(lst))