merge sort python implementation methodology

merge sort python implementation methodology - python

I'm learning the well known sorting algorithms and implementing them myself. I have recently done merge sort and the code I have is:
def merge(l, r, direction):
# print('merging')
# print(l, r)
# adding infinity to end of list so we know when we've hit the bottom of one pile
l.append(inf)
r.append(inf)
A = []
i, j = 0, 0
while (i < len(l)) and (j < len(r)):
if l[i] <= r[j]:
A.append(l[i])
i += 1
else:
A.append(r[j])
j += 1
# removing infinity from end of list
A.pop()
return(A)
def merge_sort(num_lst, direction='increasing', level=0):
if len(num_lst) > 1:
mid = len(num_lst)//2
l = num_lst[:mid]
r = num_lst[mid:]
l = merge_sort(l, level=level + 1)
r = merge_sort(r, level=level + 1)
num_lst = merge(l, r, direction)
return num_lst
What I have seen in other implementations that differs from my own is in the merging of lists. Where I just create a blank list and append elements in numerical order other pass the preexisting into merge and overwrite each element to create a list in numerical order. Something like:
def merge(arr, l, m, r):
n1 = m - l + 1
n2 = r- m
# create temp arrays
L = [0] * (n1)
R = [0] * (n2)
# Copy data to temp arrays L[] and R[]
for i in range(0 , n1):
L[i] = arr[l + i]
for j in range(0 , n2):
R[j] = arr[m + 1 + j]
# Merge the temp arrays back into arr[l..r]
i = 0 # Initial index of first subarray
j = 0 # Initial index of second subarray
k = l # Initial index of merged subarray
while i < n1 and j < n2 :
if L[i] <= R[j]:
arr[k] = L[i]
i += 1
else:
arr[k] = R[j]
j += 1
k += 1
# Copy the remaining elements of L[], if there
# are any
while i < n1:
arr[k] = L[i]
i += 1
k += 1
# Copy the remaining elements of R[], if there
# are any
while j < n2:
arr[k] = R[j]
j += 1
k += 1
I'm curious about the following:
Problem
Is using a append() on a blank list a bad idea? By my understanding when python creates a list it grabs a certain size chunk of memory and if our list grows beyond that copies the list to a different and larger section of memory (which seems it would be pretty high cost if it happened even once for a large list let alone repeatedly).
Is there just a higher cost to using append() compared to accessing a list by index? It would seemed to me append would be able to do things at a fairly low cost.

When you instantiate a list Python will allocate the memory necessary for holding the items plus some extra memory for future appends / extends. When you put too much extra stuff in the list it eventually needs to be reallocated which potentially slows down the program (see this part of the source code). The reallocation happens here and the size is calculated as:
new_allocated = (size_t)newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6);
As the comment states the growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ....
So if the size of the final list is known in advance and allocated from the beginning this will save you some extra reallocations (and thus compute time).

from time import time
append_arr = []
index_arr = [None] * 10240*1024
t0 = time()
for x in range(10240*1024):
append_arr.append(x)
t1 = time()
t2 = time()
for i in range(10240*1024):
index_arr[i] = i
t3 = time()
print(str(t1-t0))
print(str(t3-t2))
Looks like appending is slower.

Related

merge, heap, and quick sort counts are not coming out properly

import random, timeit
#Qucik sort
def quick_sort(A,first,last):
global Qs,Qc
if first>=last: return
left, right= first+1, last
pivot = A[first]
while left <= right:
while left <=last and A[left]<pivot:
Qc= Qc+1
left= left + 1
while right > first and A[right] >= pivot:
Qc=Qc+1
right = right -1
if left <= right:
A[left],A[right]=A[right],A[left]
Qs = Qs+1
left= left +1
right= right-1
A[first],A[right]=A[right],A[first]
Qs=Qs+1
quick_sort(A,first,right-1)
quick_sort(A,right+1,last)
#Merge sort
def merge_sort(A, first, last): # merge sort A[first] ~ A[last]
global Ms,Mc
if first >= last: return
middle = (first+last)//2
merge_sort(A, first, middle)
merge_sort(A, middle+1, last)
B = []
i = first
j = middle+1
while i <= middle and j <= last:
Mc=Mc+1
if A[i] <= A[j]:
B.append(A[i])
i += 1
else:
B.append(A[j])
j += 1
for i in range(i, middle+1):
B.append(A[i])
Ms=Ms+1
for j in range(j, last+1):
B.append(A[j])
for k in range(first, last+1): A[k] = B[k-first]
#Heap sort
def heap_sort(A):
global Hs, Hc
n = len(A)
for i in range(n - 1, -1, -1):
while 2 * i + 1 < n:
left, right = 2 * i + 1, 2 * i + 2
if left < n and A[left] > A[i]:
m = left
Hc += 1
else:
m = i
Hc += 1
if right < n and A[right] > A[m]:
m = right
Hc += 1
if m != i:
A[i], A[m] = A[m], A[i]
i = m
Hs += 1
else:
break
for i in range(n - 1, -1, -1):
A[0], A[i] = A[i], A[0]
n -= 1
k = 0
while 2 * k + 1 < n:
left, right = 2 * k + 1, 2 * k + 2
if left < n and A[left] > A[k]:
m = left
Hc += 1
else:
m = k
Hc += 1
if right < n and A[right] > A[m]:
m = right
Hc += 1
if m != k:
A[k], A[m] = A[m], A[k]
k = m
Hs += 1
else:
break
#
def check_sorted(A):
for i in range(n-1):
if A[i] > A[i+1]: return False
return True
#
#
Qc, Qs, Mc, Ms, Hc, Hs = 0, 0, 0, 0, 0, 0
n = int(input())
random.seed()
A = []
for i in range(n):
A.append(random.randint(-1000,1000))
B = A[:]
C = A[:]
print("")
print("Quick sort:")
print("time =", timeit.timeit("quick_sort(A, 0, n-1)", globals=globals(), number=1))
print(" comparisons = {:10d}, swaps = {:10d}\n".format(Qc, Qs))
print("Merge sort:")
print("time =", timeit.timeit("merge_sort(B, 0, n-1)", globals=globals(), number=1))
print(" comparisons = {:10d}, swaps = {:10d}\n".format(Mc, Ms))
print("Heap sort:")
print("time =", timeit.timeit("heap_sort(C)", globals=globals(), number=1))
print(" comparisons = {:10d}, swaps = {:10d}\n".format(Hc, Hs))
assert(check_sorted(A))
assert(check_sorted(B))
assert(check_sorted(C))
I made the code that tells how much time it takes to sort list size n(number input) with 3 ways of sorts. However, I found that my result is quite unexpected.
Quick sort:
time = 0.0001289689971599728
comparisons = 474, swaps = 168
Merge sort:
time = 0.00027709499408956617
comparisons = 541, swaps = 80
Heap sort:
time = 0.0002578190033091232
comparisons = 744, swaps = 478
Quick sort:
time = 1.1767549149953993
comparisons = 3489112, swaps = 352047
Merge sort:
time = 0.9040642600011779
comparisons = 1536584, swaps = 77011
Heap sort:
time = 1.665754442990874
comparisons = 2227949, swaps = 1474542
Quick sort:
time = 4.749891302999458
comparisons = 11884246, swaps = 709221
Merge sort:
time = 3.1966246420051903
comparisons = 3272492, swaps = 154723
Heap sort:
time = 6.2041203819972
comparisons = 4754829, swaps = 3148479
as you see, my results are very different from what I learned. Can you please tell me why quick sort is not the fastest in my code? and why merge is the fastest one.

I can see that you are choosing the first element of the array as the pivot in quicksort. Now, consider the order of the elements of the unsorted array. Is it random? How do you generate the input array?
You see, if the pivot was either the min or max value of the aray, or somewhere close to the mind/max value, the running time of quicksort in that case (worst case) will be in the order of O(n^2). That is because on each iteration, you are partitioning the arry by breaking off only one element.
For optimal quicksort performance of O(n log n), your pivot should be as close to the median value as possible. In order to increase the likelihood of that being the case, consider initially picking 3 values at random in from the array, and use the median value as the pivot. Obviously, the more values you choose the median from initially the better the probability that your pivot is more efficient, but you are adding extra moves by choosing those values to begin with, so it's a trade off. I imagine one would even be able to calculate exactly how many elements should be selected in relation to the size of the array for optimal performance.
Merge sort on the other hand, always has the complexity in the order of O(n log n) irrespective of input, which is why you got consistent results with it over different samples.
TL:DR my guess is that the input array's first element is very close to being the smallest or largest value of that array, and it ends up being the pivot of your quicksort algorithm.

I was asked to make a program in python to check whether a sorting algorithm is stable or not. Need some help regarding that

So I decided to approach this problem by creating another array (lets say arraycount) with the same length of the input array(lets say arraynumber) which is to be sorted. For the program to really check the stability of the sorting function, I decided to use test cases of integer array of repeated occurrences eg arraynumber = {10,8,7,8,2,2,2,1,6}. I then use a function to calculate the occurrences of the number in the array and store them in the arraycount[] which will result as arraycount={1,1,1,2,1,2,3,1,1}.
Naturally after sorting in increasing order they become arraynumber={1,2,2,2,6,7,8,8,10} and the result of the arraycount decides that its stable or not. Hence if its stable arraycount={1,1,2,3,1,1,1,2,1} otherwise it won't be same.
This is the code I have been working on. Help and feedback is appreciated. Main function and input is not taken yet. I will take another array copy of the original arraycount which will be compared afterward to check if it's retained the original position. Sorry if I made any trivial mistakes. Thank you.
def merge(arr1, l, m, r,arr2):
n1 = m - l + 1
n2 = r- m
# create temp arrays
L = [0] * (n1)
R = [0] * (n2)
# Copy data to temp arrays L[] and R[]
for i in range(0 , n1):
L[i] = arr1[l + i]
for j in range(0 , n2):
R[j] = arr1[m + 1 + j]
# Merge the temp arrays back into arr[l..r]
i = 0 # Initial index of first subarray
j = 0 # Initial index of second subarray
k = l # Initial index of merged subarray
while i < n1 and j < n2 :
if L[i] <= R[j]:
arr1[k] = L[i]
arr2[k]=L[i]
i += 1
else:
arr1[k] = R[j]
arr2[k]=R[j]
j += 1
k += 1
# Copy the remaining elements of L[], if there
# are any
while i < n1:
arr1[k] = L[i]
arr2[k]=L[i]
i += 1
k += 1
# Copy the remaining elements of R[], if there
# are any
while j < n2:
arr1[k] = R[j]
arr2[k]=R[j]
j += 1
k += 1
# l is for left index and r is right index of the
# sub-array of arr to be sorted
def mergeSort(arr1,l,r,arr2):
if l < r:
# Same as (l+r)/2, but avoids overflow for
# large l and h
m = (l+(r-1))/2
# Sort first and second halves
mergeSort(arr1, l, m,arr2)
mergeSort(arr1, m+1, r,arr2)
merge(arr1, l, m, r,arr2)
# This function takes last element as pivot, places
# the pivot element at its correct position in sorted
# array, and places all smaller (smaller than pivot)
# to left of pivot and all greater elements to right
# of pivot
def partition(arr1,low,high,arr2):
i = ( low-1 ) # index of smaller element
pivot = arr1[high] # pivot
for j in range(low , high):
# If current element is smaller than or
# equal to pivot
if arr1[j] <= pivot:
# increment index of smaller element
i = i+1
arr1[i],arr1[j] = arr1[j],arr1[i]
arr2[i],arr2[j]=arr2[j],arr2[i]
arr1[i+1],arr1[high] = arr1[high],arr1[i+1]
arr2[i+1],arr2[high]=arr2[high],arr2[i+1]
return ( i+1 )
# The main function that implements QuickSort
# arr1[] --> Array to be sorted,
# arr2[] --> Second Array of the count to be sorted along with first array
# low --> Starting index,
# high --> Ending index
# Function to do Quick sort
def quickSort(arr1,low,high,arr2):
if low < high:
# pi is partitioning index, arr[p] is now
# at right place
pi = partition(arr1,low,high,arr2)
# Separately sort elements before
# partition and after partition
quickSort(arr1, low, pi-1,arr2)
quickSort(arr1, pi+1, high,arr2)
def countArray(arr1,arr2):
i=0,j=1,k=0
while i<range(len(arr1[])):
while j<range(len(arr1[])):
if arr1[i]==arr1[j]:
arr2[i]=k+1
j=j+1
k=0
i=i+1
arraylist=[]
arraycount=[]
def isStable(arr1,arr2):
k=0
while i < range(len(arr1[])):
if arr[i]==arr2[i]:
k=k+1
i=i+1
if k==len(arr2):
print("Stable sort")
else:
print("Unstable sort")

How to use multithreading with divide and conquer?

I am new at Python and have been trying to use multithreading. There already exists an in-depth comment on Stackoverflow on the topic, but I still have some questions.
The goal of my program is to create and populate an array (although I guess that technically it would have to be called a "list" in Python) and have it sorted by a "divide and conquer" algorithm. Unfortunately, the terms "list" and "array" seem to be conflated by many a user, even though they are not the same. If "array" is used in my comment, please bear in mind that I have posted different code from various resources and have, for the sake of respecting the original author/s, not changed its content.
My code for populating the list count was quite simple
#!/usr/bin/env python3
count = []
i = 149
while i >= 0:
count.append(i)
print(i)
i -= 1
After that I used this very handy guide on the topic of "divide and conquer" to create two lists for sorting, which were merged later on. My main concern is now how to use those lists correctly with multithreading.
In the earlier mentioned post it was argued that, basically, just a few lines of code were needed to use multithreading:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(4)
as well as
results = pool.starmap(function, zip(list_a, list_b))
to pass multiple lists.
I have tried to adapt the code but failed. My function's parameters are def merge(count, l, m, r) (used to divide the list count into a left and a right part) and the two temporarily created lists are called L and R.
def merge(arr, l, m, r):
n1 = m - l + 1
n2 = r- m
# create temp arrays
L = [0] * (n1)
R = [0] * (n2)
But every time I run the program, it just responds with the following error message:
Traceback (most recent call last):
File "./DaCcountdownTEST.py", line 71, in <module>
results = pool.starmap(merge,zip(L,R))
NameError: name 'L' is not defined
I don't know the cause for my problem.
Any help is greatly appreciated!

I'm not sure exactly what's going wrong with your code, but here's a complete working example of a multi-threaded version of the mergeSort code you linked to:
from multiprocessing.dummy import Pool as ThreadPool
# Merges two subarrays of arr[].
# First subarray is arr[l..m]
# Second subarray is arr[m+1..r]
def merge(arr, l, m, r):
n1 = m - l + 1
n2 = r- m
# create temp arrays
L = [0] * (n1)
R = [0] * (n2)
# Copy data to temp arrays L[] and R[]
for i in range(0 , n1):
L[i] = arr[l + i]
for j in range(0 , n2):
R[j] = arr[m + 1 + j]
# Merge the temp arrays back into arr[l..r]
i = 0 # Initial index of first subarray
j = 0 # Initial index of second subarray
k = l # Initial index of merged subarray
while i < n1 and j < n2 :
if L[i] <= R[j]:
arr[k] = L[i]
i += 1
else:
arr[k] = R[j]
j += 1
k += 1
# Copy the remaining elements of L[], if there
# are any
while i < n1:
arr[k] = L[i]
i += 1
k += 1
# Copy the remaining elements of R[], if there
# are any
while j < n2:
arr[k] = R[j]
j += 1
k += 1
# l is for left index and r is right index of the
# sub-array of arr to be sorted
def mergeSort(arr,l=0,r=None):
if r is None:
r = len(arr) - 1
if l < r:
# Same as (l+r)/2, but avoids overflow for
# large l and h
m = (l+(r-1))//2
# Sort first and second halves
pool = ThreadPool(2)
pool.starmap(mergeSort, zip((arr, arr), (l, m+1), (m, r)))
pool.close()
pool.join()
merge(arr, l, m, r)
Here's a short test of the code:
arr = np.random.randint(0,100,10)
print(arr)
mergeSort(arr)
print(arr)
which produces the following output:
[93 56 55 60 0 28 17 77 84 2]
[ 0 2 17 28 55 56 60 77 84 93]
Sadly, it does seem to be quite a bit slower than the single threaded version. However, this kind of slowdown is par for the course when it comes to multithreading compute-bound tasks in Python.

What will be the big-O notation for this code?

Will it be O(n) or greater?
n = length of list
a is a list of integers that is very long
final_count=0
while(n>1):
i=0
j=1
while(a[i+1]==a[i]):
i=i+1
j=j+1
if i==n-1:
break
for k in range(j):
a.pop(0)
final_count=final_count+j*(n-j)
n=n-j

The way I see it, your code would be O(n) if it wasn’t for the a.pop(0) part. Since lists are implemented as arrays in memory, removing an element at the top means that all elements in the array need to moved as well. So removing from a list is O(n). You do that in a loop over j and as far as I can tell, in the end, the sum of all js will be the same as n, so you are removing the item n times from the list, making this part quadratic (O(n²)).
You can avoid this though by not modifying your list, and just keeping track of the initial index. This not only removes the need for the pop, but also the loop over j, making the complexity calculation a bit more straight-forward:
final_count = 0
offset = 0
while n > 1:
i = 0
j = 1
while a[offset + i + 1] == a[offset + i]:
i += 1
j += 1
if i == n - 1:
break
offset += j
final_count += j * (n - j)
n = n - j
Btw. it’s not exactly clear to me why you keep track of j, since j = i + 1 at every time, so you can get rid of that, making the code a bit simpler. And at the very end j * (n - j) is exactly j * n if you first adjust n:
final_count = 0
offset = 0
while n > 1:
i = 0
while a[offset + i + 1] == a[offset + i]:
i += 1
if i == n - 1:
break
offset += i + 1
n = n - (i + 1)
final_count += (i + 1) * n
One final note: As you may notice, offset is now counting from zero to the length of your list, while n is doing the reverse, counting from the length of your list to zero. You could probably combine this, so you don’t need both.

Merge Sort Index Error

def merge (seq, p, q, r):
# n1: length of sub-array [p..q]
n1 = q - p + 1
# n2: length of sub-array [q+1 ..r]
n2 = r - q
# Left and Right arrays
left_arr = []
right_arr = []
for i in xrange(0, n1):
left_arr.append( seq[p+i] )
for j in xrange(0, n2):
right_arr.append( seq[q+j+1] )
j=0
i=0
for k in xrange(p,r+1):
if left_arr[i]<= right_arr[j]:
seq[k]=left_arr[i]
i+=1
else:
seq[k]=right_arr[j]
j+=1
return seq
s = [2,4,5,7,1,2,3,6]
p = 0
q = 3
r = 7
print merge(s,p,q,r)
How it works:
A unsorted sequence s is taken, along with the index numbers where the sequence has to be merged. (p=initial, r = final, q=middle)
Now, left_arr and right_arr are [p,q], [q+1, r] respectively
we take left_arr and right_arr initial values (i=0, j=0). We iterate over the sequence seq...
while iterating we are replacing the values of seq with the sorted values...
The above function is able to sort till last number "7".. at the end, its showing "IndexError". I can use exception handling to catch it and fix but I think... for merge sort, its too much.. Can anyone help me in fixing the code as easy as possible.
I am learning Algorithms.. (following book: Introduction to Algorithms by Thomas H. Cormen)

the problem is that at the last iteration i will be equal to 4 and you are trying to access left_arr[5] which does not exist. you should add a stopping condition when i or j become larger then the size of the corresponding array, and then add all the remaining elements in the other array to seq.
Here is a code that works:
def merge (seq, p, q, r):
# n1: length of sub-array [p..q]
n1 = q - p + 1
# n2: length of sub-array [q+1 ..r]
n2 = r - q
# Left and Right arrays
left_arr = seq[p:n1] #here
right_arr = seq[n1:r+1] #here
j=0
i=0
for k in xrange(p, r+1):
if left_arr[i]<= right_arr[j]:
seq[k]=left_arr[i]
i+=1
if i > n1-1: #here
break
else:
seq[k]=right_arr[j] #here
j+=1
if j > n2-1:
break
if i >= len(left_arr): #from here down
seq[k+1:] = right_arr[j:]
elif j >= len(right_arr):
seq[k+1:] = left_arr[i:]
return seq
s = [2,4,5,7,1,1,1,1]
p = 0
q = 3
r = 7
print merge(s,p,q,r)
I have marked with comments the places where I have edited your code.

The problem with your code is that you're looping over xrange(p, r+1), so during that loop in some iteration the value of either i or j might become equal the value of len(left) or len(right), causing the IndexError eventually.
def merge(seq,p,q,r):
left=seq[p:q+1] #a better way to fetch the list
right=seq[q+1:]
i=0
j=0
#you shuldn't loop over the length of seq as that might make the value of either i or j
# greater than the length of left or right lists respectively.
# so you should only loop until one of the lists is fully exhausted
while i<len(left) and j<len(right):
if left[i] <= right[j] :
seq[i+j]=left[i]
i+=1
else:
seq[i+j]=right[j]
j+=1
#now the while loop is over so either right or left is fully traversed, which can be
#find out my matching the value of j and i with lenghts of right and left lists respectively
if j == len(right):
seq[i+j:]=left[i:] #if right is fully traversed then paste the remaining left list into seq
elif i==len(left): #this is just vice-versa of the above step
seq[i+j:]=right[j:]
print seq
s = [2,4,5,7,1,2,3,6]
p = 0
q = 3
r = 7
merge(s,p,q,r)
output:
[1, 2, 2, 3, 4, 5, 6, 7]

You didn't check the array index while iterating left_arr and right_arr.
You should merge the left part of either array when another array ends.
for k in xrange(p,r+1):
if left_arr[i]<= right_arr[j]:
seq[k]=left_arr[i]
i+=1
else:
seq[k]=right_arr[j]
j+=1
# --------------- add this ----------------
# if any array ends, merge the left elements of the other array
if i >= len(left_arr):
seq[k:] = right_arr[j:]
break
elif j >= len(right_arr):
seq[k:] = left_arr[i:]
break
# --------------- end ----------------
return seq

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.