Sorting multiple lists together in place - python

I have lists a,b,c,... of equal length. I'd like to sort all of them in the order obtained by sorting a, i.e., I could do the decorate-sort-undecorate pattern
a, b, c = map(list, zip(*sorted(zip(a, b, c))))
or something like that. However, I'd like that the lists are sorted in place (I assume that sorted pulls everything from the temporary iterator passed to it to a temporary list, and then zip stuff into three output lists, so every datum in the input is copied twice unnecessarily) without creating temporary objects. So what I don't mean is:
a_sorted, b_sorted, c_sorted = map(list, zip(*sorted(zip(a, b, c))))
a[:] = a_sorted
b[:] = b_sorted
c[:] = c_sorted
How can I achieve that?

I think "without creating temporary objects" is impossible, especially since "everything is an object" in Python.
You could get O(1) space / number of objects if you implement some sorting algorithm yourself, though if you want O(n log n) time and stability, it's difficult. If you don't care about stability (seems likely, since you say you want to sort by a but then actually sort by a, b and c), heapsort is reasonably easy:
def sort_together_heapsort(a, b, c):
    """Sort a, b and c in place, ordered by the values of a only.

    Heapsort driven by comparisons on ``a``; every swap is mirrored in
    ``b`` and ``c`` so corresponding elements stay aligned. The sort is
    not stable. Lists of length 0 or 1 are left untouched.
    """
    n = len(a)  # size of the heap prefix; shrinks during extraction

    def swap(i, j):
        # Exchange positions i and j in all three lists at once.
        a[i], a[j] = a[j], a[i]
        b[i], b[j] = b[j], b[i]
        c[i], c[j] = c[j], c[i]

    def siftdown(i):
        # Move a[i] down until neither child inside the heap is larger.
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                return
            swap(i, imax)
            i = imax

    # Build a max-heap bottom-up.
    for i in range(n // 2)[::-1]:
        siftdown(i)

    # Repeatedly move the maximum behind the shrinking heap.
    # `n > 1` (instead of `while n := n - 1`) keeps n from becoming -1,
    # which is truthy, when the lists are empty.
    while n > 1:
        n -= 1
        swap(0, n)
        siftdown(0)
Anyway, if someone's interested in just saving some amount of memory, that can be done by decorating in-place (building tuples and storing them in a):
def sort_together_decorate_in_a(a, b, c):
    """Sort a, b and c together in place, using a itself as scratch space.

    Each a[i] is temporarily replaced by the tuple (a[i], b[i], c[i]), the
    tuples are sorted, and the fields are written back to the three lists.
    Rows are compared as whole tuples, so ties in a are broken by b, then c.

    Raises whatever ``list.sort`` raises (e.g. TypeError for tie-breakers
    that cannot be ordered); the lists are undecorated even in that case,
    so rows stay aligned and a never keeps the temporary tuples.
    """
    # Decorate in place: zip reads a[i] before it is overwritten.
    for i, row in enumerate(zip(a, b, c)):
        a[i] = row
    try:
        a.sort()
    finally:
        # Undecorate: a[i] is unpacked before being reassigned.
        for i, (a[i], b[i], c[i]) in enumerate(a):
            pass
Or if you trust that list.sort will ask for keys for the elements in order (at least in CPython it does, already did so when the key parameter was introduced 18 years ago, and I suspect will keep doing so):
def sort_together_iter_key(a, b, c):
    """Sort b and c in place by the corresponding values of a, then sort a.

    Relies on ``list.sort`` requesting the key of each element in list
    order, so the i-th key handed out is simply a[i].
    """
    for follower in (b, c):
        next_key = iter(a).__next__
        follower.sort(key=lambda _item: next_key())
    # a must be sorted last: it supplies the keys for the other lists.
    a.sort()
Testing memory and time with three lists of 100,000 elements:
15,072,520 bytes 152 ms sort_together_sorted_zip
15,072,320 bytes 166 ms sort_together_sorted_zip_2
14,272,576 bytes 152 ms sort_together_sorted_zip_X
6,670,708 bytes 126 ms sort_together_decorate_in_a
6,670,772 bytes 177 ms sort_together_decorate_in_first_X
5,190,212 bytes 342 ms sort_multi_by_a_guest_X
1,597,400 bytes 100 ms sort_together_iter_key
1,597,448 bytes 102 ms sort_together_iter_key_X
744 bytes 1584 ms sort_together_heapsort
704 bytes 1663 ms sort_together_heapsort_X
168 bytes 1326 ms sort_together_heapsort_opti
188 bytes 1512 ms sort_together_heapsort_opti_X
Note:
The second solution is a shortened/improved version of yours, no need for temporary variables and conversions to lists.
The solutions with _X suffix are versions that take arbitrarily many lists as parameters.
The a_guest solution (sort_multi_by_a_guest_X) is from @a_guest's answer. Runtime-wise it currently benefits from my data being random, as that doesn't expose that solution's worst case complexity O(m * n²), where m is the number of lists and n is the length of each list.
Testing memory and time with ten lists of 100,000 elements:
19,760,808 bytes 388 ms sort_together_sorted_zip_X
12,159,100 bytes 425 ms sort_together_decorate_in_first_X
5,190,292 bytes 1249 ms sort_multi_by_a_guest_X
1,597,528 bytes 393 ms sort_together_iter_key_X
704 bytes 4186 ms sort_together_heapsort_X
188 bytes 4032 ms sort_together_heapsort_opti_X
The whole code (Try it online!):
import tracemalloc as tm
from random import random
from timeit import timeit
def sort_together_sorted_zip(a, b, c):
    """Sort a, b and c together in place via decorate-sort-undecorate.

    Rows (a[i], b[i], c[i]) are sorted as tuples and the columns are copied
    back with slice assignment. If any list is empty, all three end up
    empty, matching zip's truncation to the shortest input.
    """
    if a and b and c:
        a_sorted, b_sorted, c_sorted = map(list, zip(*sorted(zip(a, b, c))))
    else:
        # zip(*[]) yields no columns, so the 3-way unpacking above would
        # raise ValueError for empty input.
        a_sorted, b_sorted, c_sorted = [], [], []
    a[:] = a_sorted
    b[:] = b_sorted
    c[:] = c_sorted
def sort_together_sorted_zip_2(a, b, c):
    """Sort a, b and c together in place; shortened sorted-zip variant.

    Assigns the sorted columns straight into the lists' full slices. If
    any list is empty, all three end up empty, matching zip's truncation
    to the shortest input.
    """
    if a and b and c:
        a[:], b[:], c[:] = zip(*sorted(zip(a, b, c)))
    else:
        # zip(*[]) yields nothing to unpack; avoid the ValueError.
        a[:] = b[:] = c[:] = ()
def sort_together_sorted_zip_X(*lists):
    """Sort any number of lists together in place (sorted-zip approach).

    Rows are sorted as tuples; each resulting column replaces the full
    contents of the corresponding list.
    """
    columns = zip(*sorted(zip(*lists)))
    for target, column in zip(lists, columns):
        target[:] = column
def sort_together_decorate_in_a(a, b, c):
    """Sort a, b and c together in place, using a itself as scratch space.

    Each a[i] is temporarily replaced by the tuple (a[i], b[i], c[i]), the
    tuples are sorted, and the fields are written back to the three lists.
    Rows are compared as whole tuples, so ties in a are broken by b, then c.

    Raises whatever ``list.sort`` raises (e.g. TypeError for tie-breakers
    that cannot be ordered); the lists are undecorated even in that case,
    so rows stay aligned and a never keeps the temporary tuples.
    """
    # Decorate in place: zip reads a[i] before it is overwritten.
    for i, row in enumerate(zip(a, b, c)):
        a[i] = row
    try:
        a.sort()
    finally:
        # Undecorate: a[i] is unpacked before being reassigned.
        for i, (a[i], b[i], c[i]) in enumerate(a):
            pass
def sort_together_decorate_in_first_X(*lists):
    """Sort any number of lists together in place, decorating the first.

    The first list temporarily holds one tuple per row, is sorted, and the
    row fields are then written back to every list. Rows are compared as
    whole tuples, so ties in the first list are broken by the later lists.

    Raises whatever ``list.sort`` raises; the lists are undecorated even in
    that case, so the first list never keeps the temporary tuples.
    """
    first = lists[0]
    # Decorate in place: zip reads first[i] before it is overwritten.
    for i, row in enumerate(zip(*lists)):
        first[i] = row
    try:
        first.sort()
    finally:
        # `values` is fetched from first[i] before first[i] is reassigned.
        for i, values in enumerate(first):
            for lst, value in zip(lists, values):
                lst[i] = value
def sort_together_iter_key(a, b, c):
    """Sort b and c in place by the corresponding values of a, then sort a.

    Relies on ``list.sort`` requesting the key of each element in list
    order, so the i-th key handed out is simply a[i].
    """
    for follower in (b, c):
        next_key = iter(a).__next__
        follower.sort(key=lambda _item: next_key())
    # a must be sorted last: it supplies the keys for the other lists.
    a.sort()
def sort_together_iter_key_X(*lists):
    """Sort every list in place by the values of the first list.

    Each later list is sorted with keys drawn one by one from the first
    list (this relies on ``list.sort`` asking for keys in element order);
    the first list is sorted last.
    """
    leader = lists[0]
    for follower in lists[1:]:
        next_key = iter(leader).__next__
        follower.sort(key=lambda _item: next_key())
    leader.sort()
def sort_together_heapsort(a, b, c):
    """Sort a, b and c in place, ordered by the values of a only.

    Heapsort driven by comparisons on ``a``; every swap is mirrored in
    ``b`` and ``c`` so corresponding elements stay aligned. The sort is
    not stable. Lists of length 0 or 1 are left untouched.
    """
    n = len(a)  # size of the heap prefix; shrinks during extraction

    def swap(i, j):
        # Exchange positions i and j in all three lists at once.
        a[i], a[j] = a[j], a[i]
        b[i], b[j] = b[j], b[i]
        c[i], c[j] = c[j], c[i]

    def siftdown(i):
        # Move a[i] down until neither child inside the heap is larger.
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                return
            swap(i, imax)
            i = imax

    # Build a max-heap bottom-up.
    for i in range(n // 2)[::-1]:
        siftdown(i)

    # Repeatedly move the maximum behind the shrinking heap.
    # `n > 1` (instead of `while n := n - 1`) keeps n from becoming -1,
    # which is truthy, when the lists are empty.
    while n > 1:
        n -= 1
        swap(0, n)
        siftdown(0)
def sort_together_heapsort_X(*lists):
    """Sort any number of lists in place, ordered by the first list only.

    Heapsort driven by comparisons on ``lists[0]``; every swap is applied
    to all lists so corresponding elements stay aligned. The sort is not
    stable. Lists of length 0 or 1 are left untouched.
    """
    a = lists[0]
    n = len(a)  # size of the heap prefix; shrinks during extraction

    def swap(i, j):
        # Exchange positions i and j in every list.
        for lst in lists:
            lst[i], lst[j] = lst[j], lst[i]

    def siftdown(i):
        # Move a[i] down until neither child inside the heap is larger.
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                return
            swap(i, imax)
            i = imax

    # Build a max-heap bottom-up.
    for i in range(n // 2)[::-1]:
        siftdown(i)

    # `n > 1` (instead of `while n := n - 1`) keeps n from becoming -1,
    # which is truthy, when the lists are empty.
    while n > 1:
        n -= 1
        swap(0, n)
        siftdown(0)
def sort_together_heapsort_opti(a, b, c):
    """Sort a, b and c in place by a only; heapsort without helper objects.

    Same algorithm as the heapsort with inner functions, but with the
    sift-down loop written out twice. Not stable. Lists of length 0 or 1
    are left untouched.
    """
    # Avoid inner functions and range-loop to minimize memory.
    # Makes it faster, too. But duplicates code. Not recommended.
    n = len(a)

    # Phase 1: build a max-heap bottom-up.
    i0 = n // 2 - 1
    while i0 >= 0:
        i = i0
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                break
            a[i], a[imax] = a[imax], a[i]
            b[i], b[imax] = b[imax], b[i]
            c[i], c[imax] = c[imax], c[i]
            i = imax
        i0 -= 1

    # Phase 2: move the maximum behind the shrinking heap and re-sift.
    # `n > 1` (instead of `while n := n - 1`) keeps n from becoming -1,
    # which is truthy, when the lists are empty.
    while n > 1:
        n -= 1
        a[0], a[n] = a[n], a[0]
        b[0], b[n] = b[n], b[0]
        c[0], c[n] = c[n], c[0]
        i = 0
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                break
            a[i], a[imax] = a[imax], a[i]
            b[i], b[imax] = b[imax], b[i]
            c[i], c[imax] = c[imax], c[i]
            i = imax
def sort_together_heapsort_opti_X(*lists):
    """Sort any number of lists in place by the first list only.

    Heapsort without helper functions; the sift-down loop is written out
    twice. Not stable. Lists of length 0 or 1 are left untouched.
    """
    # Avoid inner functions and range-loop to minimize memory.
    # Makes it faster, too. But duplicates code. Not recommended.
    a = lists[0]
    n = len(a)

    # Phase 1: build a max-heap bottom-up.
    i0 = n // 2 - 1
    while i0 >= 0:
        i = i0
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                break
            for lst in lists:
                lst[i], lst[imax] = lst[imax], lst[i]
            i = imax
        i0 -= 1

    # Phase 2: move the maximum behind the shrinking heap and re-sift.
    # `n > 1` (instead of `while n := n - 1`) keeps n from becoming -1,
    # which is truthy, when the lists are empty.
    while n > 1:
        n -= 1
        for lst in lists:
            lst[0], lst[n] = lst[n], lst[0]
        i = 0
        while (kid := 2*i+1) < n:
            imax = kid if a[kid] > a[i] else i
            kid += 1
            if kid < n and a[kid] > a[imax]:
                imax = kid
            if imax == i:
                break
            for lst in lists:
                lst[i], lst[imax] = lst[imax], lst[i]
            i = imax
def sort_multi_by_a_guest_X(a, *lists):
    """Sort a in place and reorder every other list the same way.

    A stable argsort of a is computed once; each remaining list is then
    permuted in place by following that index list.
    """
    order = sorted(range(len(a)), key=a.__getitem__)
    a.sort()
    for lst in lists:
        for dest in range(len(order)):
            src = order[dest]
            # Positions before dest were already swapped away; follow the
            # chain until the element's current location is found.
            while src < dest:
                src = order[src]
            lst[dest], lst[src] = lst[src], lst[dest]
# Every implementation under test; each takes three lists and sorts them
# together in place.
funcs = [
    sort_together_sorted_zip,
    sort_together_sorted_zip_2,
    sort_together_sorted_zip_X,
    sort_together_decorate_in_a,
    sort_together_decorate_in_first_X,
    sort_multi_by_a_guest_X,
    sort_together_iter_key,
    sort_together_iter_key_X,
    sort_together_heapsort,
    sort_together_heapsort_X,
    sort_together_heapsort_opti,
    sort_together_heapsort_opti_X,
]

# Random keys; b0 and c0 are shifted copies, so all three share one order.
n = 100000
a0 = [random() for _ in range(n)]
b0 = [x + 1 for x in a0]
c0 = [x + 2 for x in a0]

# Three rounds; each round reports peak traced memory and wall time per
# function, followed by a blank line.
for _ in range(3):
    for func in funcs:
        # Timed run on fresh copies, then verify the result.
        a, b, c = (original[:] for original in (a0, b0, c0))
        time = timeit(lambda: func(a, b, c), number=1)
        for result, original in ((a, a0), (b, b0), (c, c0)):
            assert result == sorted(original)

        # Separate run on fresh copies under tracemalloc for peak memory.
        a, b, c = (original[:] for original in (a0, b0, c0))
        tm.start()
        func(a, b, c)
        _, memory = tm.get_traced_memory()
        tm.stop()

        print(f'{memory:10,} bytes {int(time * 1e3):4} ms {func.__name__}')
    print()

The following function uses a memory overhead that is independent of the number of lists to sort. It is stable w.r.t. the first list.
def sort_multi(a, *lists):
    """Sort a in place and apply the same reordering to all other lists.

    Only one index list of len(a) is allocated, regardless of how many
    lists are passed. The index sort is stable with respect to a.
    """
    order = list(range(len(a)))
    order.sort(key=a.__getitem__)
    a.sort()
    for lst in lists:
        for dest, src in enumerate(order):
            # Earlier positions were already swapped; chase the chain to
            # where the wanted element currently sits.
            while src < dest:
                src = order[src]
            lst[dest], lst[src] = lst[src], lst[dest]

Related

How can I convert pseudocode to mergesort algorithm?

I need to convert pseudocode into a merge sort algorithm that mirrors that pseudocode. I am new to pseudocode so I'm having trouble with this. Can anyone tell me what is wrong with my algorithm? Please note that the arrays in the pseudocode are 1-indexed.
PSEUDOCODE:
MergeSort(A[1 .. n]):
if n > 1
m ← ⌊n/2⌋
MergeSort(A[1 .. m])
MergeSort(A[m + 1 .. n])
Merge(A[1 .. n], m)
Merge(A[1 .. n], m):
i ← 1; j ← m + 1
for k ← 1 to n
if j > n
B[k] ← A[i]; i ← i + 1
else if i > m
B[k] ← A[j]; j ← j + 1
else if A[i] < A[ j]
B[k] ← A[i]; i ← i + 1
else
B[k] ← A[j]; j ← j + 1
for k ← 1 to n
A[k] ← B[k]
MY CODE
# Question's attempt, kept verbatim (indentation was lost in extraction).
# NOTE(review): arr[:m] and arr[m:] build new lists, so the two recursive
# calls sort copies and leave arr itself unchanged; merge() then runs on
# halves of arr that were never sorted.
def mergeSort(arr):
n = len(arr)
if n > 1:
m = n//2
mergeSort(arr[:m])
mergeSort(arr[m:])
merge(arr, m)
# Merge arr[0:m] and arr[m:n] into scratch list b, then copy b back into arr.
# 0-based translation of the pseudocode: `j >= n` stands for `j > n` and
# `i > m-1` for `i > m`.
def merge(arr, m):
n = len(arr)
i = 0
j = m
b = [0] * n
for k in range(n):
# Right half exhausted: take from the left.
if j >= n:
b[k] = arr[i]
i += 1
# Left half exhausted: take from the right.
elif i > m-1:
b[k] = arr[j]
j += 1
elif arr[i] < arr[j]:
b[k] = arr[i]
i += 1
else:
b[k] = arr[j]
j += 1
for k in range(n):
arr[k] = b[k]
The thing is that in the pseudo code version, the notation A[1..m] is supposed to mean a partition of the array, but in-place, not as a new array (slice): it is like a window on a part of the array, with its own indexing, but not copied.
The translation to list slicing in Python does not reflect this. arr[:m] creates a new list, and so whatever mergeSort(arr[:m]) does with that new list, it doesn't touch arr itself: all that work is for nothing, as it doesn't mutate arr, but a sliced copy of it, which we lose access to.
A solution is to not create slices, but to pass the start/end indices of the intended partition to the function call.
Here is the adapted code:
def mergeSort(arr):
    """Sort arr in place with top-down merge sort; returns None."""
    mergeSortRec(arr, 0, len(arr))


def mergeSortRec(arr, start, end):
    """Sort the window arr[start:end] in place without slicing arr."""
    size = end - start
    if size <= 1:
        return
    m = start + size // 2
    mergeSortRec(arr, start, m)
    mergeSortRec(arr, m, end)
    merge(arr, start, m, end)


def merge(arr, start, m, end):
    """Merge the sorted runs arr[start:m] and arr[m:end] back into arr."""
    merged = []
    left, right = start, m
    for _ in range(end - start):
        # Take from the left run when the right run is exhausted, or when
        # both runs have items and the left one is strictly smaller.
        if right >= end or (left < m and arr[left] < arr[right]):
            merged.append(arr[left])
            left += 1
        else:
            merged.append(arr[right])
            right += 1
    # Copy the merged run back over the window it came from.
    for offset, value in enumerate(merged):
        arr[start + offset] = value
This code is working fine for me, however your operations are being applied in place, so you just need to call the function with the array to sort rather than getting the return value (which will always be None, because you provide no return in the function mergeSort)
arr = np.random.uniform(1, 10, 10)
print(arr)
[2.10748505 9.47408117 5.4620788 1.5585025 9.57387679 4.13719947
1.28671255 4.150946 2.84044402 6.56294717]
mergeSort(arr)
print(arr)
[1.28671255 1.5585025 2.10748505 2.84044402 4.13719947 4.150946
5.4620788 6.56294717 9.47408117 9.57387679]

Wrong number of permutations nPr(5,3) on a list

The goal of this program is to make as many permutations of x in size 3 (nPr(5,3), hence the iterations of (i, j, k)).
My effort on trying to achieve the permutations nPr(5,3), where 5 is the length of the list x and 3 is the length of the tuple (i,j,k):
# x will never have any repeats
# Question's attempt, kept verbatim (indentation was lost in extraction).
x = [1, 2, 3, 4, 5]
# nPr(5,3) = 60
y = []
# The three loops each run over all of x independently, so every (i, j, k)
# is appended, including tuples that repeat a value such as (1, 1, 1):
# 5 * 5 * 5 = 125 entries.
for i in x:
for j in x:
for k in x:
y.append((i, j, k))
print(f"Len of y = {len(y)}")
I'm expecting len(y) to be 60, as nPr(5,3) = 60. But i get the output Len of y = 125. Also, making y = set() does not fix this issue
What have I done wrong?
How do I fix this code to work (without using itertools)
Answer TL;DR: I was allowing duplicates (1,1,1)
You are allowing repeats (for example, [1,1,1] and [2,2,2]). The value of 60 is for permutations without repeats. You do that by checking that you aren't repeating a value.
NOTE that this code only works if there are no repeats in x. If there are duplicates, then you would have to use indexes instead (that is, for i in range(len(x)):).
x = [1, 2, 3, 4, 5]
# Every ordered triple of pairwise different values of x, in loop order.
# Values are compared, so this assumes x has no repeats.
y = [
    (i, j, k)
    for i in x
    for j in x
    if i != j
    for k in x
    if i != k and j != k
]
print(y)
As pointed out by Tim Roberts, I was adding repeats of i,j or k, (1,1,1). My fix is to just make sure i,j and k are different:
for i in x:
for j in x:
for k in x:
# If i,j,k are different
if len(set((i, j, k))) == 3:
y.append((i, j, k))
set((i, j, k)) removes any duplicates from the tuple (i, j, k), so its length equals 3 only when all three values are different. This is also helpful if I need to make this recursive for nPr(n,r), as the check generalizes to len(set((i, j, k, ...))) == r.
This will work, though it's a bit too deeply nested for my taste:
y = []
for i in x:
for j in x:
if i != j:
for k in x:
if i != k and j != k:
y.append((i, j, k))
assert list(itertools.permutations(x, 3)) == y
Negating the conditions and using continue increases readability:
y = []
for i in x:
for j in x:
if i == j:
continue
for k in x:
if i == k or j == k:
continue
y.append((i, j, k))
assert list(itertools.permutations(x, 3)) == y
But this will only work if all members of x are unique. Better would be to check that the indices are different:
y = []
for i in range(len(x)):
for j in range(len(x)):
if i == j:
continue
for k in range(len(x)):
if i == k or j == k:
continue
y.append((x[i], x[j], x[k]))
assert list(itertools.permutations(x, 3)) == y
We could also do the job with recursion, parameterizing r (number of items in each permutation) in the process, though without dynamic programming, this approach will do a lot of extra work for large x and r:
# if x were hashable, i.e. a tuple in this case, we could use the
# @functools.cache decorator to avoid repeated work
def permutations(x, r):
    """Return all r-length permutations of sequence x as a list of tuples.

    Elements are distinguished by position, not by value. x must support
    slicing and concatenation (e.g. a list or a tuple).
    """
    if not r:
        return [()]
    # Pick each element as the head, then permute what is left of x.
    return [
        (head,) + tail
        for i, head in enumerate(x)
        for tail in permutations(x[:i] + x[i + 1:], r - 1)
    ]
assert permutations(x, 3) == list(itertools.permutations(x, 3))
But probably the best way of all is to steal the answer directly from the docs:
def permutations(iterable, r=None):
    """Yield successive r-length permutations of the items of iterable.

    r defaults to the number of items. Nothing is yielded when r exceeds
    the number of items.
    """
    # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
    # permutations(range(3)) --> 012 021 102 120 201 210
    pool = tuple(iterable)
    n = len(pool)
    if r is None:
        r = n
    if r > n:
        return
    indices = list(range(n))
    cycles = list(range(n, n - r, -1))
    yield tuple(map(pool.__getitem__, indices[:r]))
    while n:
        for pos in range(r - 1, -1, -1):
            cycles[pos] -= 1
            if cycles[pos]:
                # Swap in the next candidate for this position and emit.
                other = -cycles[pos]
                indices[pos], indices[other] = indices[other], indices[pos]
                yield tuple(map(pool.__getitem__, indices[:r]))
                break
            # Counter for this position ran out: rotate its index to the
            # end, reset the counter, and carry on with the position to
            # the left.
            indices[pos:] = indices[pos + 1:] + indices[pos:pos + 1]
            cycles[pos] = n - pos
        else:
            # Every position rolled over: all permutations were produced.
            return

Merge sort in python: slicing vs iterating - impact on complexity

I want to check that my understanding of how python handles slices is correct.
Here's my implementation of merge sort:
# Question's implementation, kept verbatim (indentation was lost in
# extraction).
def merge_sort(L):
# Local helper: merge two sorted lists a and b into a new list c.
def merge(a, b):
i, j = 0, 0
c = []
while i < len(a) and j < len(b):
if a[i] < b[j]:
c.append(a[i])
i += 1
elif b[j] < a[i]:
c.append(b[j])
j += 1
# NOTE(review): when a[i] == b[j] neither branch above runs, so i and j
# never advance and the while loop does not terminate.
# Each slice below is built twice: once for the test, once for extend().
if a[i:]:
c.extend(a[i:])
if b[j:]:
c.extend(b[j:])
return c
if len(L) <= 1:
return L
else:
mid = len(L) // 2
left = merge_sort(L[:mid])
right = merge_sort(L[mid:])
return merge(left, right)
Am I right in thinking that I could replace this:
if a[i:]:
c.extend(a[i:])
if b[j:]:
c.extend(b[j:])
With this:
while i < len(a):
c.append(a[i])
i += 1
while j < len(b):
c.append(b[j])
j += 1
And have the exact same level of complexity? My understanding of slicing is that its complexity is equivalent to slice length? Is that correct?
Does the fact that I'm calling a slice twice (first in the condition, second time inside of it) make it 2x complexity?
Your implementation of mergesort has problems:
in the merge function's main loop, you do nothing if the values in a[i] and b[j] are equal, or more precisely if you have neither a[i] < b[j] nor a[i] > b[j]. This causes an infinite loop.
there is no need to define merge as a local function, actually there is no need to make it a separate function, you could inline the code and save the overhead of a function call.
Here is a modified version:
def merge_sort(L):
    """Return the items of L in ascending order.

    Lists of length 0 or 1 are returned as is; longer inputs produce a new
    list and leave L unmodified. Equal items keep their relative order.
    """
    if len(L) <= 1:
        return L
    half = len(L) // 2
    left = merge_sort(L[:half])
    right = merge_sort(L[half:])
    merged = []
    li = ri = 0
    while li < len(left) and ri < len(right):
        if left[li] <= right[ri]:
            merged.append(left[li])
            li += 1
        else:
            merged.append(right[ri])
            ri += 1
    # Exactly one side still has items; append that remainder.
    merged.extend(left[li:] or right[ri:])
    return merged
Regarding performance, slicing or iterating has no impact on complexity since both operations have linear time cost.
Regarding performance, here are directions to try:
replace the test if a[i:] with if i < len(a). Creating the slice twice is costly.
perform the sort in place, avoiding the append operations
restructure the main loop to have a single test per iteration
Here is a modified version:
def merge_sort(L):
    """Sort L in place and return it.

    The two halves are copied, sorted recursively, and merged back into L.
    Equal items keep their relative order.
    """
    if len(L) <= 1:
        return L
    half = len(L) // 2
    left = merge_sort(L[:half])
    right = merge_sort(L[half:])
    n_left, n_right = len(left), len(right)
    li = ri = out = 0
    # Both halves are non-empty here. The loop ends by returning as soon
    # as one half runs out, after copying the rest of the other half.
    while True:
        if left[li] <= right[ri]:
            L[out] = left[li]
            out += 1
            li += 1
            if li == n_left:
                L[out:] = right[ri:]
                return L
        else:
            L[out] = right[ri]
            out += 1
            ri += 1
            if ri == n_right:
                L[out:] = left[li:]
                return L

Count number of comparisons for QuickSort

I want to count the number of comparisons in quicksort. In order to do so, I introduced a counting variable c. Although I think the implementation is correct, the counter is significantly higher than with insertion sort, which should not be the case. Have I done something wrong?
Here is my code.
def quick_sort(a):
    """Sort a in place with quicksort and return the comparison counter."""
    return quickSortImpl(a, 0, len(a) - 1, 0)


def quickSortImpl(a, l, r, c):
    """Sort a[l..r] (inclusive) in place; return the updated counter c."""
    if r <= l:
        return c
    k, c = partition(a, l, r, c)
    c = quickSortImpl(a, l, k - 1, c)
    return quickSortImpl(a, k + 1, r, c)


def partition(a, l, r, c):
    """Partition a[l..r] around the pivot a[r].

    Returns (final pivot index, updated counter). The counter grows by one
    per evaluation of each scan-loop condition, including the evaluation
    that ends the scan.
    """
    pivot = a[r]
    lo, hi = l, r - 1
    while True:
        # Scan right over items <= pivot.
        c += 1
        while lo < r and a[lo] <= pivot:
            lo += 1
            c += 1
        # Scan left over items >= pivot.
        c += 1
        while hi > l and a[hi] >= pivot:
            hi -= 1
            c += 1
        if lo >= hi:
            break
        a[lo], a[hi] = a[hi], a[lo]
    # Put the pivot between the two partitions.
    a[r], a[lo] = a[lo], pivot
    return lo, c
Click here for comparison between Insertion Sort vs Quick Sort

Quick Sort algorithm with three way partition

I am new to algorithms and was working on implementing the Quick Sort algorithm with a 3-way partition such that it works fast even on sequences containing many equal elements. The following was my implementation:
def randomized_quick_sort(a, l, r):
    """Sort a[l..r] (inclusive) in place using a random pivot."""
    if l < r:
        # Move a randomly chosen pivot to the front of the range.
        pivot_at = random.randint(l, r)
        a[l], a[pivot_at] = a[pivot_at], a[l]
        # use partition3: only the ranges outside m1..m2 are sorted further
        m1, m2 = partition3(a, l, r)
        randomized_quick_sort(a, l, m1 - 1)
        randomized_quick_sort(a, m2 + 1, r)
# Question's faulty 3-way partition, kept verbatim (indentation was lost in
# extraction). Pivot is x = a[l]; j and t are the two boundaries it returns.
def partition3(a, l, r):
x, j, t = a[l], l, r
# NOTE(review): range(l + 1, t+1) is evaluated once, so the later `t-=1`
# does not shorten the loop; i keeps advancing into the part beyond t.
for i in range(l + 1, t+1):
if a[i] < x:
j += 1
a[i], a[j] = a[j], a[i]
elif a[i]>x:
a[i],a[t]=a[t],a[i]
# NOTE(review): this decrement has no lasting effect. The for statement
# rebinds i on the next iteration, so the item just swapped in from a[t]
# is never examined.
i-=1
t-=1
a[l], a[j] = a[j], a[l]
return j,t
It does not generate correctly sorted lists. I found the correct implementation of the partition code here in Stack Overflow.
def partition3(a, l, r):
    """Three-way partition of a[l..r] (inclusive) around the pivot a[l].

    Rearranges the range in place into items < pivot, items == pivot and
    items > pivot, and returns (first, last) index of the == pivot run.
    """
    pivot = a[l]
    lt, gt = l, r
    i = l
    while i <= gt:
        if a[i] < pivot:
            a[lt], a[i] = a[i], a[lt]
            lt += 1
            i += 1
        elif a[i] > pivot:
            a[gt], a[i] = a[i], a[gt]
            gt -= 1
            # i stays put: the item swapped in has not been examined yet.
        else:
            i += 1
    return lt, gt
Can someone please explain to me how the incorrect partition implementation was failing?
Thanks in advance

Categories