Code is taking too much time - python

I wrote code that arranges numbers taken from user input so that the sum of every pair of adjacent numbers is prime. The code works fine for inputs up to 10; beyond that the system hangs. Please let me know how to optimize it.
Example input: 8
Answer should be: (1, 2, 3, 4, 7, 6, 5, 8)
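For clarity, a quick check that the expected ordering really has prime adjacent sums (a small sketch, separate from the code below):

answer = (1, 2, 3, 4, 7, 6, 5, 8)
def check_prime(n):
    return n > 1 and all(n % d for d in range(2, int(n**0.5) + 1))
print(all(check_prime(a + b) for a, b in zip(answer, answer[1:])))   # True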
Code as follows....
import itertools

x = raw_input("please enter a number")
range_x = range(int(x)+1)
del range_x[0]
result = list(itertools.permutations(range_x))

def prime(x):
    for i in xrange(1, x, 2):
        if i == 1:
            i = i + 1
        if x % i == 0 and i < x:
            return False
    else:
        return True

def is_prime(a):
    for i in xrange(len(a)):
        print a
        if i < len(a)-1:
            if prime(a[i]+a[i+1]):
                pass
            else:
                return False
        else:
            return True

for i in xrange(len(result)):
    if i < len(result)-1:
        if is_prime(result[i]):
            print 'result is:'
            print result[i]
            break
    else:
        print 'result is'
        print result[i-1]

For posterity ;-), here's one more based on finding a Hamiltonian path. It's Python3 code. As written, it stops upon finding the first path, but can easily be changed to generate all paths. On my box, it finds a solution for all n in 1 through 900 inclusive in about one minute total. For n somewhat larger than 900, it exceeds the maximum recursion depth.
The prime generator (psieve()) is vast overkill for this particular problem, but I had it handy and didn't feel like writing another ;-)
The path finder (ham()) is a recursive backtracking search, using what's often (but not always) a very effective ordering heuristic: of all the vertices adjacent to the last vertex in the path so far, look first at those with the fewest remaining exits. For example, this is "the usual" heuristic applied to solving Knights Tour problems. In that context, it often finds a tour with no backtracking needed at all. Your problem appears to be a little tougher than that.
def psieve():
    import itertools
    yield from (2, 3, 5, 7)
    D = {}
    ps = psieve()
    next(ps)
    p = next(ps)
    assert p == 3
    psq = p*p
    for i in itertools.count(9, 2):
        if i in D:      # composite
            step = D.pop(i)
        elif i < psq:   # prime
            yield i
            continue
        else:           # composite, = p*p
            assert i == psq
            step = 2*p
            p = next(ps)
            psq = p*p
        i += step
        while i in D:
            i += step
        D[i] = step
def build_graph(n):
    primes = set()
    for p in psieve():
        if p > 2*n:
            break
        else:
            primes.add(p)
    np1 = n+1
    adj = [set() for i in range(np1)]
    for i in range(1, np1):
        for j in range(i+1, np1):
            if i+j in primes:
                adj[i].add(j)
                adj[j].add(i)
    return set(range(1, np1)), adj
def ham(nodes, adj):
    class EarlyExit(Exception):
        pass

    def inner(index):
        if index == n:
            raise EarlyExit
        avail = adj[result[index-1]] if index else nodes
        for i in sorted(avail, key=lambda j: len(adj[j])):
            # Remove vertex i from the graph. If this isolates
            # more than 1 vertex, no path is possible.
            result[index] = i
            nodes.remove(i)
            nisolated = 0
            for j in adj[i]:
                adj[j].remove(i)
                if not adj[j]:
                    nisolated += 1
                    if nisolated > 1:
                        break
            if nisolated < 2:
                inner(index + 1)
            nodes.add(i)
            for j in adj[i]:
                adj[j].add(i)

    n = len(nodes)
    result = [None] * n
    try:
        inner(0)
    except EarlyExit:
        return result

def solve(n):
    nodes, adj = build_graph(n)
    return ham(nodes, adj)
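A minimal usage sketch (assuming the functions above are defined in the same module): solve(n) returns one valid ordering of 1..n as a list, or None if the search fails.

if __name__ == "__main__":
    print(solve(8))   # e.g. [1, 2, 3, 4, 7, 6, 5, 8], or another valid ordering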

This answer is based on @Tim Peters' suggestion about Hamiltonian paths.
There are many possible solutions. To avoid excessive memory consumption for intermediate solutions, a random path can be generated. It also makes it easy to utilize multiple CPUs (each CPU generates its own paths in parallel).
import multiprocessing as mp
import sys

def main():
    number = int(sys.argv[1])
    # directed graph, vertices: 1..number (including ends)
    # there is an edge between i and j if (i+j) is prime
    vertices = range(1, number+1)
    G = {}  # vertex -> adjacent vertices
    is_prime = sieve_of_eratosthenes(2*number+1)
    for i in vertices:
        G[i] = []
        for j in vertices:
            if is_prime[i + j]:
                G[i].append(j)  # there is an edge from i to j in the graph

    # utilize multiple cpus
    q = mp.Queue()
    for _ in range(mp.cpu_count()):
        p = mp.Process(target=hamiltonian_random, args=[G, q])
        p.daemon = True  # do not survive the main process
        p.start()
    print(q.get())

if __name__ == "__main__":
    main()
where Sieve of Eratosthenes is:
def sieve_of_eratosthenes(limit):
    is_prime = [True]*limit
    is_prime[0] = is_prime[1] = False  # zero and one are not primes
    for n in range(int(limit**.5 + .5)):
        if is_prime[n]:
            for composite in range(n*n, limit, n):
                is_prime[composite] = False
    return is_prime
and:
import random

def hamiltonian_random(graph, result_queue):
    """Build random paths until Hamiltonian path is found."""
    vertices = list(graph.keys())
    while True:
        # build random path
        path = [random.choice(vertices)]  # start with a random vertex
        while True:  # until path can be extended with a random adjacent vertex
            neighbours = graph[path[-1]]
            random.shuffle(neighbours)
            for adjacent_vertex in neighbours:
                if adjacent_vertex not in path:
                    path.append(adjacent_vertex)
                    break
            else:  # can't extend path
                break
        # check whether it is hamiltonian
        if len(path) == len(vertices):
            assert set(path) == set(vertices)
            result_queue.put(path)  # found hamiltonian path
            return
Example
$ python order-adjacent-prime-sum.py 20
Output
[19, 18, 13, 10, 1, 4, 9, 14, 5, 6, 17, 2, 15, 16, 7, 12, 11, 8, 3, 20]
The output is a random sequence that satisfies the conditions:
- it is a permutation of the range from 1 to 20 (inclusive)
- the sum of adjacent numbers is prime
Time performance
It takes around 10 seconds on average to get a result for n = 900; extrapolating the time as an exponential function, it should take around 20 seconds for n = 1000:
The image is generated using this code:
import numpy as np

figname = 'hamiltonian_random_noset-noseq-900-900'
Ns, Ts = np.loadtxt(figname+'.xy', unpack=True)

# use polyfit to fit the data
#   y = c*a**n
#   log y = log (c * a ** n)
#   log Ts = log c + Ns * log a
coeffs = np.polyfit(Ns, np.log2(Ts), deg=1)
poly = np.poly1d(coeffs, variable='Ns')

# use curve_fit to fit the data
from scipy.optimize import curve_fit
def func(x, a, c):
    return c*a**x
popt, pcov = curve_fit(func, Ns, Ts)
aa, cc = popt
a, c = 2**coeffs

# plot it
import matplotlib.pyplot as plt
plt.figure()
plt.plot(Ns, np.log2(Ts), 'ko', label='time measurements')
plt.plot(Ns, np.polyval(poly, Ns), 'r-',
         label=r'$time = %.2g\times %.4g^N$' % (c, a))
plt.plot(Ns, np.log2(func(Ns, *popt)), 'b-',
         label=r'$time = %.2g\times %.4g^N$' % (cc, aa))
plt.xlabel('N')
plt.ylabel('log2(time in seconds)')
plt.legend(loc='upper left')
plt.show()
Fitted values:
>>> c*a**np.array([900, 1000])
array([ 11.37200806, 21.56029156])
>>> func([900, 1000], *popt)
array([ 14.1521409 , 22.62916398])

Dynamic programming, to the rescue:
def is_prime(n):
    return all(n % i != 0 for i in range(2, n))

def order(numbers, current=[]):
    if not numbers:
        return current
    for i, n in enumerate(numbers):
        if current and not is_prime(n + current[-1]):
            continue
        result = order(numbers[:i] + numbers[i + 1:], current + [n])
        if result:
            return result
    return False

result = order(range(500))
for i in range(len(result) - 1):
    assert is_prime(result[i] + result[i + 1])
You can force it to work for even larger lists by increasing the maximum recursion depth.
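For example, a minimal sketch of that (my addition; how long the backtracking itself takes for a given size is a separate question):

import sys
sys.setrecursionlimit(5000)           # CPython's default limit is around 1000
result = order(list(range(2000)))     # an explicit list so slicing/concatenation also work on Python 3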

Here's my take on a solution. As Tim Peters pointed out, this is a Hamiltonian path problem.
So the first step is to generate the graph in some form.
Well, the zeroth step in this case is to generate prime numbers. I'm going to use a sieve, but whatever prime test is fine. We need primes up to 2 * n since that is the largest sum of any two numbers.
m = 8
n = m + 1  # Just so I don't have to worry about zero indexes and random +/- 1's

primelen = 2 * m
prime = [True] * primelen
prime[0] = prime[1] = False
for i in range(4, primelen, 2):
    prime[i] = False
for i in range(3, primelen, 2):
    if not prime[i]:
        continue
    for j in range(i * i, primelen, i):
        prime[j] = False
Ok, now we can test for primality with prime[i]. Now it's easy to make the graph edges: if I have a number i, what numbers can come next? I'll also make use of the fact that i and j have opposite parity.
pairs = [set(j for j in range(i%2+1, n, 2) if prime[i+j])
         for i in range(n)]
So here pairs[i] is a set object whose elements are integers j such that i+j is prime.
Now we need to walk the graph. This is really where the time-consuming part is, and all further optimizations will be done here.
chains = [
    ([], set(range(1, n)))
]
chains is going to keep track of the valid paths as we walk them. The first element in the tuple will be your result. The second element is all the unused numbers, or unvisited nodes. The idea is to take one chain out of the queue, take a step down the path and put it back.
while chains:
    chain, unused = chains.pop()

    if not chain:
        # we haven't even started, all unused are valid
        valid_next = unused
    else:
        # We need numbers that are both unused and paired with the last node
        # Using sets makes this easy
        valid_next = unused & pairs[chain[-1]]

    for num in valid_next:
        # Take a step to the new node and add the new path back to chains
        # Reminder, it's important not to mutate anything here, always make new objs
        newchain = chain + [num]
        newunused = unused - set([num])
        chains.append( (newchain, newunused) )

        # are we done?
        if not newunused:
            print newchain
            chains = False
Notice that if there is no valid next step, the path is removed without a replacement.
This is really memory inefficient, but runs in a reasonable time. The biggest performance bottleneck is walking the graph, so the next optimization would be popping and inserting paths in intelligent places to prioritize the most likely paths. It might be helpful to use a collections.deque or different container for your chains in that case.
EDIT
Here is an example of how you can implement your path priority. We will assign each path a score and keep the chains list sorted by this score. For a simple example I will suggest that paths containing "harder to use" nodes are worth more; that is, for each step on a path the score will increase by n - len(valid_next). The modified code will look something like this.
import bisect

chains = ...
chains_score = [0]
while chains:
    chain, unused = chains.pop()
    score = chains_score.pop()
    ...
    for num in valid_next:
        newchain = chain + [num]
        newunused = unused - set([num])

        newscore = score + n - len(valid_next)
        index = bisect.bisect(chains_score, newscore)
        chains.insert(index, (newchain, newunused))
        chains_score.insert(index, newscore)
Remember that insertion is O(n), so the overhead of adding this can be rather large. It's worth doing some analysis on your score algorithm to keep the queue length len(chains) manageable.
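As a variation (a sketch of my own, not part of the original answer), the same priority idea can be expressed with heapq, which avoids the O(n) list insertions. The unique counter is only a tie-breaker so the heap never has to compare chains or sets; n and pairs are the ones defined earlier in this answer.

import heapq
import itertools

tie = itertools.count()   # unique tie-breaker for equal scores
# heapq is a min-heap, so we store the negated score: popping the smallest
# negated score gives the highest-scoring ("hardest nodes first") path,
# matching the bisect version's pop() from the sorted end.
heap = [(0, next(tie), [], set(range(1, n)))]
while heap:
    negscore, _, chain, unused = heapq.heappop(heap)
    valid_next = unused if not chain else unused & pairs[chain[-1]]
    for num in valid_next:
        newchain = chain + [num]
        newunused = unused - set([num])
        if not newunused:
            print(newchain)
            heap = []
            break
        heapq.heappush(heap, (negscore - (n - len(valid_next)),
                              next(tie), newchain, newunused))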

Related

Subset problem in python - fix the number of addends that sum to the target

I'm trying to implement a simple program that aims to solve the subset sum problem in Python.
I've found the following code, which involves dynamic programming:
def subset_sum(numbers, target, partial=[]):
    s = sum(partial)
    # check if the partial sum is equals to target
    if s == target:
        print("sum(%s)=%s" % (partial, target))
    if s >= target:
        return  # if we reach the number why bother to continue
    for i in range(len(numbers)):
        n = numbers[i]
        remaining = numbers[i+1:]
        subset_sum(remaining, target, partial + [n])
The code works, but it finds all the possible combinations, while I'm only interested in subsets with a maximum number of addends that I want to specify from time to time.
In other words, I'd like to set a maximum number of internal loops. To do that I've written the following code, which considers all the possible combinations of at most 4 numbers that sum to the target (i.e. 3 internal loops):
import numpy as np

def subset_sum(n_list, tot):
    # n_list is expected to be a NumPy array so that boolean indexing works
    cnd = n_list[n_list < tot]
    s = np.sort(cnd)
    n_max = len(cnd)
    possibilities = []
    for i1 in range(n_max):
        i2 = i1+1
        while (i2<n_max)and(s[i1]+s[i2]<=tot):
            if (s[i1]+s[i2]==tot):
                possibilities.append([s[i1],s[i2]])
            i3 = i2+1
            while (i3<n_max)and(s[i1]+s[i2]+s[i3]<=tot):
                if (s[i1]+s[i2]+s[i3]==tot):
                    possibilities.append([s[i1],s[i2],s[i3]])
                i4 = i3+1
                while (i4<n_max)and(s[i1]+s[i2]+s[i3]+s[i4]<=tot):
                    if (s[i1]+s[i2]+s[i3]+s[i4]==tot):
                        possibilities.append([s[i1],s[i2],s[i3],s[i4]])
                    i4+=1
                i3+=1
            i2+=1
    return possibilities
This code works pretty well and can also be sped up with numba (while the first code cannot), but I cannot fix the maximum number of addends.
Is there a way to implement the function subset_sum with an extra argument that fixes the maximum number of addends that sum to the target?
Since you are adding a number in each recursion you can just limit the recursion depth. To do so, you need to add a new parameter to control the maximum depth (a.k.a. the maximum number of addends).
Here is the code:
def subset_sum(numbers, target, num_elems, partial=[]):
    # Check if the partial sum is equals to target
    s = sum(partial)
    if s == target:
        print("sum(%s)=%s" % (partial, target))
    # If we have surpassed the number there is no point to continue
    if s >= target:
        return
    # If we have surpassed the number of elements there is no point to continue
    if len(partial) >= num_elems:
        return
    # Otherwise go through the remaining numbers
    for i in range(len(numbers)):
        n = numbers[i]
        remaining = numbers[i+1:]
        subset_sum(remaining, target, num_elems, partial + [n])
You can run it with:
if __name__ == "__main__":
    nums = [1, 2, 3, 4, 5]
    num_elems = 3
    target = 10
    p = []
    subset_sum(nums, target, num_elems, p)
And the output will be:
sum([1, 4, 5])=10
sum([2, 3, 5])=10
Notice that the combination of 4 elements ([1, 2, 3, 4]) is not shown.
EDIT:
To speed up the above code with Numba you need to build an iterative version of it. Since you are basically computing the combinations of numbers in sets of num_elements size, you can check the iterative implementation of itertools.combinations (more details here). Based on that implementation you can obtain the following code:
def subset_sum_iter(numbers, target, num_elements):
    # General: we iterate among the indexes and build each solution by taking the values in those indexes

    # Initialize solutions list
    solutions = []

    # Build first index by taking the first num_elements from the numbers
    indices = list(range(num_elements))
    solution = [numbers[i] for i in indices]
    if sum(solution) == target:
        solutions.append(solution)

    # We iterate over the rest of the indices until we have tried all combinations
    while True:
        for i in reversed(range(num_elements)):
            if indices[i] != i + len(numbers) - num_elements:
                break
        else:
            # No combinations left
            break

        # Increase current index and all its following ones
        indices[i] += 1
        for j in range(i + 1, num_elements):
            indices[j] = indices[j - 1] + 1

        # Check current solution
        solution = [numbers[i] for i in indices]
        if sum(solution) == target:
            solutions.append(solution)

    # Print all valid solutions
    for sol in solutions:
        print("sum(" + str(sol) + ")=" + str(target))
Which can be run with:
if __name__ == "__main__":
    nums = [1, 2, 3, 4, 5]
    num_elems = 3
    target = 10
    # Calling iterative subset
    subset_sum_iter(nums, target, num_elems)
And outputs:
sum([1, 4, 5])=10
sum([2, 3, 5])=10
As in the previous case, notice that only the combinations with 3 elements are shown.
I am not sure whether you prefer combinations or permutations here, but you could try this:
import itertools

# number_list and target are assumed to be defined as in the question
limit = 1                 # current number of addends
possibilities = 0
combinations = []
not_possibilities = 0
number_addends = 4
while limit <= number_addends:
    for comb in itertools.combinations(number_list, limit):
        if sum(comb) == target:
            possibilities += 1
            combinations.append(comb)
        else:
            not_possibilities += 1
    limit += 1
total_combi = possibilities + not_possibilities  # check if actually all possibilities were done
If you need permutations just change itertools.combinations to itertools.permutations.
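For reference, a hedged sketch of the same idea written directly as a comprehension (number_list and target here are the example values used earlier in this thread, not values from this answer):

import itertools

number_list = [1, 2, 3, 4, 5]
target = 10
number_addends = 4
# all combinations of at most number_addends values that hit the target
combos = [c for r in range(1, number_addends + 1)
          for c in itertools.combinations(number_list, r)
          if sum(c) == target]
print(combos)   # [(1, 4, 5), (2, 3, 5), (1, 2, 3, 4)]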

How to verify that a shuffling algorithm is uniform?

I have a simple Python implementation of Knuth's shuffling algorithm:
import random

def knuth_shuffle(ar):
    num = len(ar)
    for i in range(num):
        index = random.randint(0, i)
        ar[i], ar[index] = ar[index], ar[i]
    return ar
How is it possible to test (using scipy or whatever other package) that the shuffling is indeed uniform? I have found a couple of related posts (1, 2) but they don't answer my question. It would be great to understand how to perform such checks in general.
EDIT:
As Paul Hankin pointed out in the comments, my original test only checked the probability of each element falling into each position, but not that all permutations are equally likely, which is a stronger requirement. The snippet below counts the frequency of each permutation, which is what we should be looking at:
import math
import random

def knuth_shuffle(ar):
    num = len(ar)
    for i in range(num):
        index = random.randint(0, i)
        ar[i], ar[index] = ar[index], ar[i]
    return ar

# This function computes a unique index for a given permutation
# Adapted from https://www.jaapsch.net/puzzles/compindx.htm#perm
def permutation_index(permutation):
    n = len(permutation)
    t = 0
    for i in range(n):
        t = t * (n - i)
        for j in range(i + 1, n):
            if permutation[i] > permutation[j]:
                t += 1
    return t

N = 6     # Test list size
T = 1000  # Trials / number of permutations

random.seed(100)
n_perm = math.factorial(N)
trials = T * n_perm
ar = list(range(N))
freq = [0] * n_perm
for _ in range(trials):
    ar_shuffle = ar.copy()
    knuth_shuffle(ar_shuffle)
    freq[permutation_index(ar_shuffle)] += 1
If the shuffle is uniform, the values of the resulting freq vector should be distributed according to a binomial distribution with T * N! trials and probability of success 1 / (N!). Here is a plot of a distribution estimation for the previous example (done with Seaborn), where frequency values should be around 1000:
Which I think looks good, but again, for a quantitative result you would need a deeper statistical analysis, such as Pearson's chi-squared test, as suggested by David Eisenstat.
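A minimal sketch of that quantitative check (my addition; it assumes SciPy is available and uses freq and T from the snippet above):

from scipy import stats

# Null hypothesis: all N! permutations are equally likely, i.e. each bin of
# freq has expected count T. chisquare defaults to a uniform expectation.
chi2, p_value = stats.chisquare(freq)
print(chi2, p_value)   # a tiny p-value would be evidence of a non-uniform shuffle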
ORIGINAL ANSWER:
I'm going to put here some basic ideas, but I don't have the strongest background on statistics so someone may want to complement or correct anything that is wrong.
You can make a matrix of frequencies of each value falling into each position for a number of trials:
import random

def knuth_shuffle(ar):
    num = len(ar)
    for i in range(num):
        index = random.randint(0, i)
        ar[i], ar[index] = ar[index], ar[i]
    return ar

N = 100    # Test list size
T = 10000  # Number of trials

ar = list(range(N))
freq = [[0] * N for _ in range(N)]
for _ in range(T):
    ar_shuffle = ar.copy()
    knuth_shuffle(ar_shuffle)
    for i, j in enumerate(ar_shuffle):
        freq[i][j] += 1
Once you can do that, there are several approaches you can take. A simple idea is, if the shuffle is uniform, freq / T should tend to 1 / N as T tends to infinity. So you can just use a "very big" value of T and see that those values are "close enough". Or check that the standard deviation of freq / T - 1 / N is "small enough".
These "close enough" and "small enough" though are not very solid concepts. The grounded analysis requires more statistical tools. I think you would need to test the hipothesis that each frequency value is sampled from a binomial distribution with T trials a 1 / N success probability. As I said do not have the background for a full explanation of that and this is probably not the place for it but if you really need a thorough analysis you can read up on the topic.
You can check this exactly, by injecting all possible sequences of random numbers into knuth_shuffle, and then verifying you get each permutation exactly once.
This code does that:
import collections
import itertools
import random

def knuth_shuffle(ar, R=random.randint):
    num = len(ar)
    for i in range(num):
        index = R(0, i)
        ar[i], ar[index] = ar[index], ar[i]
    return ar

def fact(i):
    r = 1
    while i > 1:
        r *= i
        i -= 1
    return r

def all_random_seqs(N):
    for r in range(fact(N)):
        seq = []
        for i in range(N):
            seq.append(r % (i+1))
            r //= (i+1)
        it = iter(seq)
        yield lambda x, y: next(it)

for N in range(1, 6):
    print N
    results = collections.Counter()
    for R in all_random_seqs(N):
        a = list('ABCDEFG'[:N])
        knuth_shuffle(a, R)
        results[''.join(a)] += 1
    print 'checking...'
    if len(results) != fact(N):
        print 'N=%d. Not enough results. %s' % (N, results)
    if any(c > 1 for c in results.itervalues()):
        print 'N=%d. Not all permutations unique. %s' % (N, results)
    if any(sorted(c) != list('ABCDEFG'[:N]) for c in results.iterkeys()):
        print 'N=%d. Some permutations are illegal. %s' % (N, results)
This code checks exact correctness for input lists of size 1, 2, 3, 4, 5. You can probably go a little further before N! gets too large.
You will also want to perform sanity checks on the version of the code using random.randint (for example generating 500 shuffles of 'ABCD', and making sure you get each permutation at least once).
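A hedged sketch of that sanity check (my addition, reusing the randint-based knuth_shuffle defined in the question):

import collections
import itertools

counts = collections.Counter()
for _ in range(500):
    a = list('ABCD')
    knuth_shuffle(a)
    counts[''.join(a)] += 1

all_perms = {''.join(p) for p in itertools.permutations('ABCD')}
print(all_perms <= set(counts))   # True if every one of the 24 permutations showed up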
If you randomly shuffle the same items from a given fixed order, then the count of each item in one fixed position in the shuffled items should tend to the same value.
Below I shuffle the list 0..9 a few times and print the output:
from random import shuffle  # Uses Fisher-Yates

tries = 1_000_000
intcount = 10
ints = range(intcount)
first_position_counts = {n: 0 for n in ints}
for _ in range(tries):
    lst = list(ints)  # [0, 1, ... 9] in that order
    shuffle(lst)
    first_position_counts[lst[0]] += 1

print(f'{tries} shuffles of the ints 0..{intcount-1} should have each int\n'
      f'appear in the first position {tries/intcount} times.')
for item in first_position_counts.items():
    print(' %i: %5i' % item)
Run once you might get something like:
0: 99947
1: 100522
2: 99828
3: 100123
4: 99582
5: 99635
6: 99991
7: 100108
8: 100172
9: 100092
And again:
0: 100049
1: 99918
2: 100053
3: 100285
4: 100293
5: 100034
6: 99861
7: 99584
8: 100055
9: 99868
Now if you have thousands of items to shuffle, then they should end up in one of n! permutations, but n! grows large quickly; once it becomes comparable to, and certainly greater than, the possible number of states of your random number generator, the test breaks down.
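To put a rough number on that (a sketch under the assumption that random.shuffle is backed by CPython's Mersenne Twister, whose period is 2**19937 - 1):

# Find the smallest n for which n! exceeds the period of the Mersenne Twister,
# i.e. where there are more permutations than internal generator states.
period = 2**19937 - 1
f, n = 1, 1
while f <= period:
    n += 1
    f *= n
print(n)   # on the order of a couple of thousand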

Optimal Search Tree Using Python - Code Analysis

First of all, sorry about the naive question, but I couldn't find help elsewhere.
I'm trying to create an Optimal Search Tree using Dynamic Programing in Python that receives two lists (a set of keys and a set of frequencies) and returns two answers:
1 - The smallest path cost.
2 - The generated tree for that smallest cost.
I basically need to create a tree organized by the most accessed items on top (most accessed item it's the root), and return the smallest path cost from that tree, by using the Dynamic Programming solution.
I have implemented the following code in Python:
import sys

def optimalSearchTree(keys, freq, n):
    # Create an auxiliary 2D matrix to store results of subproblems
    cost = [[0 for x in xrange(n)] for y in xrange(n)]

    # For a single key, cost is equal to frequency of the key
    #for i in xrange (0,n):
    #    cost[i][i] = freq[i]

    # Now we need to consider chains of length 2, 3, ... .
    # L is chain length.
    for L in xrange (2,n):
        for i in xrange(0,n-L+1):
            j = i+L-1
            cost[i][j] = sys.maxint
            for r in xrange (i,j):
                if (r > i):
                    c = cost[i][r-1] + sum(freq, i, j)
                elif (r < j):
                    c = cost[r+1][j] + sum(freq, i, j)
                elif (c < cost[i][j]):
                    cost[i][j] = c
    return cost[0][n-1]

def sum(freq, i, j):
    s = 0
    k = i
    for k in xrange (k,j):
        s += freq[k]
    return s

keys = [10,12,20]
freq = [34,8,50]
n = sys.getsizeof(keys)/sys.getsizeof(keys[0])
print(optimalSearchTree(keys, freq, n))
I'm trying to output answer 1. The smallest cost for that tree should be 142 (the value stored in matrix position [0][n-1], according to the dynamic programming solution), but unfortunately it returns 0. I couldn't find any issues in that code. What's going wrong?
You have several very questionable statements in your code, definitely inspired by C/Java programming practices. For instance,
keys = [10,12,20]
freq = [34,8,50]
n=sys.getsizeof(keys)/sys.getsizeof(keys[0])
I think you think you calculate the number of items in the list. However, n is not 3:
>>> sys.getsizeof(keys)/sys.getsizeof(keys[0])
3.142857142857143
What you need is this:
n = len(keys)
One more find: elif (r < j) is always True, because r is in the range between i (inclusive) and j (exclusive). The elif (c < cost[i][j]) condition is never checked, so the cost matrix is never updated in the loop - that's why you always end up with a 0.
Another suggestion: do not overwrite the built-in function sum(). Your namesake function calculates the sum of all items in a slice of a list:
sum(freq[i:j])
import sys

def optimalSearchTree(keys, freq):
    # Create an auxiliary 2D matrix to store results of subproblems
    n = len(keys)
    cost = [[0 for x in range(n)] for y in range(n)]
    storeRoot = [[0 for i in range(n)] for i in range(n)]

    # For a single key, cost is equal to frequency of the key
    for i in range (0,n):
        cost[i][i] = freq[i]

    # Now we need to consider chains of length 2, 3, ... .
    # L is chain length.
    for L in range (2,n+1):
        for i in range(0,n-L+1):
            j = i + L - 1
            cost[i][j] = sys.maxsize
            for r in range (i,j+1):
                c = (cost[i][r-1] if r > i else 0)
                c += (cost[r+1][j] if r < j else 0)
                c += sum(freq[i:j+1])
                if (c < cost[i][j]):
                    cost[i][j] = c
                    storeRoot[i][j] = r
    return cost[0][n-1], storeRoot

if __name__ == "__main__" :
    keys = [10,12,20]
    freq = [34,8,50]
    print(optimalSearchTree(keys, freq))
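As a quick cross-check (my addition), running the corrected version on the question's data gives the minimum cost of 142 expected in the question:

cost, roots = optimalSearchTree([10, 12, 20], [34, 8, 50])
print(cost)   # 142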

Faster algorithm for finding number of paths between two nodes

I am trying to answer a question on an online judge in Python, but I am exceeding both the time limit and memory limit. The question is pretty much asking for the number of all paths from a start node to an end node. Full question specifications can be seen here.
This is my code:
import sys

lines = sys.stdin.read().strip().split('\n')
n = int(lines[0])
dict1 = {}
for i in xrange(1, n+1):
    dict1[i] = []
for i in xrange(1, len(lines) - 1):
    numbers = map(int, lines[i].split())
    num1 = numbers[0]
    num2 = numbers[1]
    dict1[num2].append(num1)

def pathfinder(start, graph, count):
    new = []
    if start == []:
        return count
    for i in start:
        numList = graph[i]
        for j in numList:
            if j == 1:
                count += 1
            else:
                new.append(j)
    return pathfinder(new, graph, count)

print pathfinder([n], dict1, 0)
What the code does is start at the end node and work its way up to the top by exploring all neighboring nodes. I essentially made a breadth-first search algorithm, but it's taking up too much space and time. How can I improve this code to make it more efficient? Is my approach wrong, and how should I fix it?
Since the graph is acyclic, there is a topological ordering, which we can immediately see to be 1, 2, ..., n. So we can use dynamic programming the same way it is used to solve the longest path problem. In a list paths, the element paths[i] stores how many paths there are from 1 to i. The update is simple - for each edge (i, j), where i is taken in topological order, we do paths[j] += paths[i].
from collections import defaultdict

graph = defaultdict(list)
n = int(input())
while True:
    tokens = input().split()
    a, b = int(tokens[0]), int(tokens[1])
    if a == 0:
        break
    graph[a].append(b)

paths = [0] * (n+1)
paths[1] = 1
for i in range(1, n+1):
    for j in graph[i]:
        paths[j] += paths[i]
print(paths[n])
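A tiny example of the I/O this expects (the exact format is defined by the judge; this is just an assumption inferred from the code above):

Input:
    4
    1 2
    1 3
    2 4
    3 4
    0 0
Output:
    2
(the two paths from node 1 to node 4 are 1-2-4 and 1-3-4)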
Note that what you are implementing is not actually BFS, since you don't mark which vertices you've visited, making your start list grow out of proportion.
Test the graph
for i in range(1, n+1):
    dict1[i] = list(range(i-1, 0, -1))
If you print the size of start you can see that the maximum value it reaches for a given n grows as binomial(n, floor(n/2)), which is on the order of 2^n/sqrt(n). Note also that BFS is not what you want, since it is not possible to count the number of paths that way.
import sys
from collections import defaultdict

def build_matrix(filename, x):
    # A[i] stores number of paths from node x to node i.
    # O(n) to build parents_of_node
    parents_of_node = defaultdict(list)
    with open(filename) as infile:
        num_nodes = int(infile.readline())
        A = [0] * (num_nodes + 1)  # A[0] is dummy variable. Not used.
        for line in infile:
            if line.strip() == "0 0":
                break
            u, v = map(int, line.strip().split())
            parents_of_node[v].append(u)
            # Initialize all direct descendants of x to 1
            if u == x:
                A[v] = 1
    # Number of paths from x to i = sum(number of paths from x to parent of i)
    for i in xrange(1, num_nodes + 1):  # O(n)
        A[i] += sum(A[p] for p in parents_of_node[i])  # O(max fan-in of graph), assuming O(1) for accessing dict.
    # Total time complexity to build A is O(n * (max fan-in of graph))
    return A

def main():
    filename = sys.argv[1]
    x = 1  # Find number of paths from x
    y = 4  # to y
    A = build_matrix(filename, x)
    print(A[y])

if __name__ == "__main__":
    main()
What you are doing is a DFS (not a BFS) in that code...
Here's a link to a good solution...
EDITED:
Use this approach instead...
http://www.geeksforgeeks.org/find-paths-given-source-destination/

find second largest in the list of numbers

I have to find the largest and the second largest numbers in a list using a divide and conquer algorithm. Everything is right except the part where I use indices like a and b; I use them because it works faster and costs less. There is no need to rewrite the code or send other code and approaches, please just help me fix it if you can. Any help or ideas are welcome. Thanks.
#!/usr/local/bin/python2.7

def two_max(arr,a,b):
    n = len(arr)
    if n == 2:
        if arr[0] < arr[1]:
            return (arr[1], arr[0])
        else:
            return (arr[0], arr[1])
    (greatest_left, sec_greatest_left) = two_max(arr,a (a+b)/2)
    (greatest_right, sec_greatest_right) = two_max(arr,(a+b)/2,b)
    if greatest_left < greatest_right:
        greatest = greatest_right
        if greatest_left < sec_greatest_left:
            return (greatest, sec_greatest_left)
        else:
            return (greatest, greatest_left)
    else:
        greatest = greatest_left
        if greatest_right < sec_greatest_right:  # Line 4
            return (greatest, sec_greatest_right)
        else:
            return (greatest, greatest_right)
The biggest problem is that you never get any closer to your recursive base case.
The base case is len(arr) == 2. But every time you call yourself, you just pass arr as-is:
(greatest_left, sec_greatest_left) = two_max(arr,a,(a+b)/2)
(greatest_right, sec_greatest_right) = two_max(arr,(a+b)/2,b)
(Note that I'm guessing on the comma in the first one, because as you posted it, you're actually calling the number a as a function, which is unlikely to do anything useful…)
So, either your base case should take a and b into account, like this:
if b-a == 2:
    if arr[a] < arr[a+1]:
        return (arr[a+1], arr[a])
    else:
        return (arr[a], arr[a+1])
… or you should send a slice of arr instead of the whole thing—in which case you don't need a and b in the first place:
(greatest_left, sec_greatest_left) = two_max(arr[:len(arr)/2])
(greatest_right, sec_greatest_right) = two_max(arr[len(arr)/2:])
Either one will fix your first problem. Of course the function still doesn't work for most inputs. In fact, it only works if the length of the list is a power of two.
If that isn't a good enough hint for how to fix it: What happens if b-a is 3? Obviously you can't split it into two halves, both of which are of size 2 or greater. So, you'll need to write another base case for b-a == 1, and return something that will make the rest of the algorithm work.
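For illustration, here is one way that hint could be fleshed out (a sketch of my own, not the original poster's code; it also uses a straightforward merge of the two halves rather than the question's comparison logic):

def two_max(arr, a, b):
    # works on the half-open range arr[a:b]
    if b - a == 1:
        return (arr[a], float('-inf'))   # single element: no real second-largest
    if b - a == 2:
        return (max(arr[a], arr[a+1]), min(arr[a], arr[a+1]))
    mid = (a + b) // 2
    greatest_left, sec_left = two_max(arr, a, mid)
    greatest_right, sec_right = two_max(arr, mid, b)
    if greatest_left < greatest_right:
        return (greatest_right, max(greatest_left, sec_right))
    return (greatest_left, max(greatest_right, sec_left))

print(two_max([3, 7, 1, 9, 4], 0, 5))   # (9, 7)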
Why don't you do it that way:
>>> def getIlargest(arr, i):
...     if (i <= len(arr) and i > 0):
...         return sorted(arr)[-i]
...
>>> a = [1,3,51,4,6,23,53,2,532,5,2,6,7,5,4]
>>> getIlargest(a, 2)
53
I took it one step further and tested 3 methods:
- Using counting sort - getIlargestVer2
- Using the Python sorted function - getIlargestVer1
- Using a heap - heapIlargest, as @abarnert suggested
The results: for arrays of sizes from 1 to ~5000, sorted is the best; for larger arrays the heapq.nlargest usage is the winner.
Plot for array sizes between [1*150, 55*150]:
Full scan for array sizes between [1*150, 300*150]:
The code I used is the following; the implementation of the 3 methods is in the setup string:
setup = """
import heapq, random
a = random.sample(xrange(1<<30), 150)
a = a * factor
class ILargestFunctions:
# taken from [wiki][3] and was rewriting it.
def counting_sort(self, array, maxval):
m = maxval + 1
count = {}
for a in array:
if count.get(a, None) is None:
count[a] = 1
else:
count[a] += 1
i = 0
for key in count.keys():
for c in range(count[key]):
array[i] = key
i += 1
return array
def getIlargestVer1(self, arr, i):
if (i <= len(arr) and i > 0):
return sorted(arr)[-i]
def getIlargestVer2(self, arr, i):
if (i <= len(arr) and i > 0):
return self.counting_sort(arr, max(arr))[-i]
def heapIlargest(self, arr, i):
if (i <= len(arr) and i > 0):
return heapq.nlargest(i,arr)
n = ILargestFunctions()
"""
And the main line triggers the performance counting and plots the collected data is in:
import timeit
import numpy as np
import matplotlib.pyplot as plt
if __name__ == "__main__":
results = {}
r1 = []; r2 = []; r3 = [];
x = np.arange(1,300,1)
for i in xrange(1,300,1):
print i
factorStr = "factor = " + str(i) + ";"
newSetupStr = factorStr + setup
r1.append(timeit.timeit('n.getIlargestVer1(a, 100)', number=200, setup=newSetupStr))
r2.append(timeit.timeit('n.getIlargestVer2(a, 100)', number=200, setup=newSetupStr))
r3.append(timeit.timeit('n.heapIlargest(a, 100)', number=200, setup=newSetupStr))
results[i] = (r1,r2,r3)
p1 = plt.plot(x, r1, 'r', label = "getIlargestVer1")
p2 = plt.plot(x, r2, 'b' , label = "getIlargestVer2")
p3 = plt.plot(x, r3, 'g' , label = "heapIlargest")
plt.legend(bbox_to_anchor=(1.05, 1), loc=1, borderaxespad=0.)
plt.show()
@0x90 has the right idea, but he got it reversed.
def find_i_largest_element(seq, i):
    if (i <= len(seq) and i > 0):
        s = sorted(seq, reverse=True)
        return s[i-1]
By the way, is this a homework assignment? If so, what's the whole idea behind the algorithm you have to use?
