Optimizing python DFS (for loop is inefficient) - python

Given the following function, what would be the correct and pythonic way to archiving the same (and faster) result?
My code is not efficient and I believe I'm missing something that is staring at me.
The idea is to find a pattern that is [[A,B],[A,C],[C,B]] without having to generate additional permutations (since this will result in a higher processing time for the comparisons).
The length of the dictionary fed into find_path in real-life would be approximately 10,000, so having to iterate over that amount with the current code version below is not efficient.
from time import perf_counter
from typing import List, Generator, Dict
def find_path(data: Dict) -> Generator:
for first_pair in data:
pair1: List[str] = first_pair.split("/")
for second_pair in data:
pair2: List[str] = second_pair.split("/")
if pair2[0] == pair1[0] and pair2[1] != pair1[1]:
for third_pair in data:
pair3: List[str] = third_pair.split("/")
if pair3[0] == pair2[1] and pair3[1] == pair1[1]:
amount_pair_1: int = data.get(first_pair)[
"amount"
]
id_pair_1: int = data.get(first_pair)["id"]
amount_pair_2: int = data.get(second_pair)[
"amount"
]
id_pair_2: int = data.get(second_pair)["id"]
amount_pair_3: int = data.get(third_pair)[
"amount"
]
id_pair_3: int = data.get(third_pair)["id"]
yield (
pair1,
amount_pair_1,
id_pair_1,
pair2,
amount_pair_2,
id_pair_2,
pair3,
amount_pair_3,
id_pair_3,
)
raw_data = {
"EZ/TC": {"id": 1, "amount": 9},
"LM/TH": {"id": 2, "amount": 8},
"CD/EH": {"id": 3, "amount": 7},
"EH/TC": {"id": 4, "amount": 6},
"LM/TC": {"id": 5, "amount": 5},
"CD/TC": {"id": 6, "amount": 4},
"BT/TH": {"id": 7, "amount": 3},
"BT/TX": {"id": 8, "amount": 2},
"TX/TH": {"id": 9, "amount": 1},
}
processed_data = list(find_path(raw_data))
for i in processed_data:
print(("The path to traverse is:", i))
>> ('The path to traverse is:', (['CD', 'TC'], 4, 6, ['CD', 'EH'], 7, 3, ['EH', 'TC'], 6, 4))
>> ('The path to traverse is:', (['BT', 'TH'], 3, 7, ['BT', 'TX'], 2, 8, ['TX', 'TH'], 1, 9))
>> ('Time to complete', 5.748599869548343e-05)
# Timing for a simple ref., as mentioned above, the raw_data is a dict containing about 10,000 keys

You can't do that with this representation of the graph. This algorithm has O(|E|^3) time complexity. It is a good idea to store edges as array of lists, each list will store only adjacent vertexes. And then it is easy to do what you need. Fortunately, you can re-represent graph in O(|E|) time.
How to do that
We will store graph as array of vertices (but in this case because of string vertex-values we take a dictionary). We want to access in all neighbours by a vertex. Let's do that -- we will store in the array lists of all neighbours of the given vertex.
Now we just need to construct our structure by set of edges (aka row_data).
How to add an edge in graph? Easy! We should find a vertex from in our array and add a vertex to to the list of it's neighbours
So, the construct_graph function could be like:
def construct_graph(raw_data): # here we will change representation
graph = defaultdict(list) # our graph
for pair in raw_data: # go through every edge
u, v = pair.split("/") # get from and to vertexes
graph[u].append(v) # and add this edge in our structure
return graph # return our new graph to other functions
How to find path length 2
We will use dfs on our graph.
def dfs(g, u, dist): # this is a simple dfs function
if dist == 2: # we has a 'dist' from our start
return [u] # and if we found already answer, return it
for v in g.get(u, []): # otherwise check all neighbours of current vertex
ans = dfs(g, v, dist + 1) # run dfs in every neighbour with dist+1
if ans: # and if that dfs found something
ans.append(u) # store it in ouy answer
return ans # and return it
return [] # otherwise we found nothing
And then we just try it for every vertex.
def main():
graph = construct_graph(raw_data)
for v in graph.keys(): # here we will try to find path
ans = dfs(graph, v, 0) # starting with 0 dist
if ans: # and if we found something
print(list(reversed(ans))) # return it, but answer will be reversed

Related

Monotonic Shortest Path of a directed graph in python

I am trying to get the following code working. After every for ends heappop gives me an integer instead of Vertex. In addition when I got it working, with changing the Vertex in the priority queue with integer. I have wrong result. Please help.
Thanks in advance
import heapq
class Vertex:
def __init__(self, id):
self.id = id
self.adjList = []
self.adjWeights = []
def shortestPath(vertices, N, source, destination):
distTo = [float('inf') for _ in range(N+1)]
edgeTo = [float('inf') for _ in range(N+1)]
# Set initial distance from source
# to the highest value
distTo[source] = 0.0
edgeTo[source] = float('inf')
pq = [vertices[source]]
heapq.heapify(pq)
while True:
closest = heapq.heappop(pq)
for i in range(len(closest.adjList)):
# Checks if the edges are decreasing and
# whether the current directed edge will
# create a shorter path
if closest.adjWeights[i] < edgeTo[closest.id] and distTo[closest.id] + closest.adjWeights[i] < distTo[closest.adjList[i]]:
edgeTo[closest.adjList[i]] = closest.adjWeights[i]
distTo[closest.adjList[i]] = closest.adjWeights[i] + distTo[closest.id];
heapq.heappush(pq, closest.adjList[i])
print(distTo)
print(distTo[destination])
def main ()
N = 6
M = 9
'''
edges = {{0, 2, 1.1}, {0, 4, 2}, {0, 5, 3.3}, {1, 4, 2.7},
{2, 3, 2}, {2, 4, 1.1}, {3, 1, 2.3}, {4, 5, 2.4}, {5, 1, 3}}
'''
# Create an array of vertices
vertices = [Vertex(i) for i in range(0, N)]
i=0
vertices[0].adjList.append(2)
vertices[0].adjWeights.append(1.1)
vertices[0].adjList.append(4)
vertices[0].adjWeights.append(2.0)
vertices[0].adjList.append(5)
vertices[0].adjWeights.append(3.3)
vertices[1].adjList.append(4)
vertices[1].adjWeights.append(2.7)
vertices[2].adjList.append(3)
vertices[2].adjWeights.append(2.0)
vertices[2].adjList.append(4)
vertices[2].adjWeights.append(1.1)
vertices[3].adjList.append(1)
vertices[3].adjWeights.append(2.3)
vertices[4].adjList.append(5)
vertices[4].adjWeights.append(2.4)
vertices[5].adjList.append(1)
vertices[5].adjWeights.append(3.0)
# Source and destination vertices
src = 0
target = 1
print(shortestPath(vertices, N, src, target))

Which is the most efficient way to compairing all values (sets) of a dict in python?

I'm trying to compare every pair of values in a python dict. The values associated to each key is a list, and i have to obtain the common elements between every pair of lists. The dict structure is:
fruits = {
'orange': [3, 5, 7, 2, 11],
'apple': [4, 7, 2, 9, 1],
'grape': [18, 6, 8, 4]
}
And the result i want:
{
'orange': { 'apple': 2 },
'apple': { 'orange': 2, 'grape': 1 },
'grape': { 'apple': 1 }
}
I have tried to do it using dict comprehesion and deque data structure, but still not so efficient:
deque:
deque_fruits = deque(fruits.keys())
fruits_copy = fruits.copy()
result = {}
while len(deque_fruits) > 0:
fruit = deque_fruits.popleft()
values = fruits_copy[fruit]
del fruit__copy[fruit]
for aux_fruit, aux_values in fruits_copy.items():
intersection = len(set(values)&set(aux_values))
if intersection > 0:
result[fruit][aux_fruit] = intersection
result[aux_fruit][fruit] = intersection
dict comprehesion
result = {
fruit: {
aux_fruit: len(set(values)&set(aux_values))
for aux_fruit, aux_values in fruits.items() if fruit != aux_fruit and len(set(values)&set(aux_values)) > 0
}
for fruit, valuesin fruits.items()
}
Any idea of how to speed up this? Got ~84000 elems and each one a list of ~100-200 values.
Thank you in advance!
Here is a faster implementation:
# Precompute the sets and the items
fruitItems = {name: set(value) for name,value in fruits.items()}.items()
result = dict()
for name1, value1 in fruitItems:
tmpRes = dict()
for name2, value2 in fruitItems:
count = len(value1 & value2)
if name1 != name2 and count > 0:
tmpRes[name2] = count
result[name1] = tmpRes
This code is 4 times faster with 50 fruits and 10 integers/list.
Note that using PyPy or Cython improve a bit the execution time.
If all list integers are quite small and bounded, there is a much faster implementation.
#Jérôme Richard's solution computes N^2 intersections and handles N^2 - N items, although intersection(set1, set2) == intersection(set2, set1). This can be cut down to N*(N-1)/2 operations, which is 25% to 50% (for large N) of the original effort. So to say, it's traversing only the upper half of an NxN matrix, minus the N diagonal elements. defaultdict is needed as the assignment of the counterpart is not in sequence of occurrance.
from collections import defaultdict
def compare_sets():
fruits = {
'orange': [3, 5, 7, 2, 11],
'apple': [4, 7, 2, 9, 1],
'grape': [18, 6, 8, 4],
'pear': [4, 16, 7, 5]
}
# precompute the sets and the items
fruit_items = {name: set(set) for name,set in fruits.items()}.items()
# create dict of dicts
result = defaultdict(dict)
n = len(fruit_items)
# working with indices instead of items to avoid duplicate pairings
for i in range(n-1):
name1, set1 = fruit_items[i]
tmp_result = dict()
for j in range(i+1, n):
name2, set2 = fruit_items[j]
count = len(set1 & set2) # intersection
if count > 0:
tmp_result[name2] = count
result[name2][name1] = count # counterpart
result[name1] = tmp_result

Updating priority queue python Dijkstras algorithm

I would like to understand in the following WORKING AND FINISHED code, why when updating pq_update, it is written as pq_update[neighbour][1].
Instead of writing pq_update[neighbour] (which is how I did it), it does not seem to change anything so why is it included ?
Thank you
import heapq
def dijkstra(graph, start):
distances = {vertex:float('inf') for vertex in graph}
pq = []
pq_update = {}
distances[start] = 0
for vertex, value in distances.items():
entry = [vertex, value]
heapq.heappush(pq, entry)
pq_update[vertex] = entry
while pq:
getmin = heapq.heappop(pq)[0]
for neighbour, distance_neigh in graph[getmin].items():
dist = distances[getmin] + distance_neigh
if dist < distances[neighbour]:
distances[neighbour] = dist
pq_update[neighbour][1] = dist # THIS LINE !!!
print(distances)
return distances
if __name__ == '__main__':
example_graph = {
'U': {'V': 2, 'W': 5, 'X': 1},
'V': {'U': 2, 'X': 2, 'W': 3},
'W': {'V': 3, 'U': 5, 'X': 3, 'Y': 1, 'Z': 5},
'X': {'U': 1, 'V': 2, 'W': 3, 'Y': 1},
'Y': {'X': 1, 'W': 1, 'Z': 1},
'Z': {'W': 5, 'Y': 1},
}
dijkstra(example_graph, 'X')
Note: the implementation you have is broken and doesn't correctly implement Dijkstra. More on that below.
The pq_update dictionary contains lists, each with two entries:
for vertex, value in distances.items():
entry = [vertex, value]
heapq.heappush(pq, entry)
pq_update[vertex] = entry
So pq_update[neighbour] is a list with both the vertex and the distance. You want to update the distance, not replace the [vertex, value] list, so pq_update[neighbour][1] is used.
Note that the entry list is also shared wit the heapq. The pq heap has a reference to the same list object, so changes to pq_update[neightbor][1] will also be visible in entries still to be processed on heap!
When you assign directly to pq_update[neighbour], you remove that connection.
The reason you don't see any difference is because the implementation of the algorithm is actually broken, as the heap is not used correctly. The heap is sorted by first by the first value in the list items you pushed in. In your code that's the node name, not the distance, and the heapq order of items is never updated when the distances in the list items are altered. Because the heapq is not used correctly, you always traverse the nodes in alphabetical order.
To use the heapq correctly, you need to put the edge length first, and you don't alter the values on the heap; if you use tuples you can't accidentally do this. You only need to push nodes onto the heap that you reached, really; you'll end up with multiple entries for some of the nodes (reached by multiple paths), but the heapq will still present the shortest path to that node first. Just keep a set of visited nodes so you know to skip any longer paths. The point is that you visit the shorter path to a given node before the longer path, and you don't need to alter the heapq items in-place to achieve that.
You could re-write your function (with better variable names) to:
def dijkstra(graph, start):
"""Visit all nodes and calculate the shortest paths to each from start"""
queue = [(0, start)]
distances = {start: 0}
visited = set()
while queue:
_, node = heapq.heappop(queue) # (distance, node), ignore distance
if node in visited:
continue
visited.add(node)
dist = distances[node]
for neighbour, neighbour_dist in graph[node].items():
if neighbour in visited:
continue
neighbour_dist += dist
if neighbour_dist < distances.get(neighbour, float('inf')):
heapq.heappush(queue, (neighbour_dist, neighbour))
distances[neighbour] = neighbour_dist
return distances

Finding the shortest path in a cyclical graph using Dijkstra

I have a cyclical directed graph. Below is the representation of the graph as a python dict
graph = {
'A': {'B': 5, 'D': 5, 'E': 7 },
'B': {'C': 4},
'C': {'D': 8, 'E': 2},
'D': {'C': 8, 'E': 6},
'E': {'B': 3}
}
I have wrote a simple implementation of a Dijkstra's shortest path. Which seems to work for given two points. Below is my implementation.
def shortestpath(self, start, end, visited=[],distances={},predecessors={}):
# initialize a big number
maxint = 10000
if start==end:
path=[]
while end != None:
path.append(end)
end=predecessors.get(end, None)
return distances[start], path[::-1]
# detect if it's the first time through, set current distance to zero
if not visited: distances[start]=0
# process neighbors as per algorithm, keep track of predecessors
for neighbor in self.graph[start]:
if neighbor not in visited:
neighbordist = distances.get(neighbor,maxint)
tentativedist = distances[start] + self.graph[start][neighbor]
if tentativedist < neighbordist:
distances[neighbor] = tentativedist
predecessors[neighbor]=start
# neighbors processed, now mark the current node as visited
visited.append(start)
# finds the closest unvisited node to the start
unvisiteds = dict((k, distances.get(k,maxint)) for k in self.graph if k not in visited)
closestnode = min(unvisiteds, key=unvisiteds.get)
# now we can take the closest node and recurse, making it current
return self.shortestpath(closestnode,end,visited,distances,predecessors)
now this simple implementation seems to work. For example if I do somthing like this
shortestpath('A', 'C')
it will give me the path and shortest weight
(9, ['A', 'B', 'C'])
in this case.
However, whenever I shortestpath('B', 'B') the program will break.
Now there is a shortest path from B to B since it is a cyclic graph the path is B-C-E-B. I just don't know how to check for that and modify the Dijktra's algorithm accordingly to have it check for cyclic cases like this one. Any suggestion is greatly appreciated. Thanks :)

How to automate dictionary creation in Python

I am trying to write a python code that solves a Sudoku puzzle. My code starts by making a list of each row/column combination, or the coordinates of each box. Next, I want to find a way to, for each box, reference its location.
This is my current code:
boxes = []
for i in range(1, 10):
for x in range(1,10):
boxes = boxes + ['r'+str(i)+'c'+str(x)]
for box in boxes:
Next, I was going to create a dictionary for each one, but I would want each to be named by the list item. The dictionaries would be, for example, r1c1 = {'row': '1', 'Column': 1}.
What is the best way to separate and store this information?
You don't need to create all those dictionaries. You already have your coordinates, just don't lock them up in strings:
boxes = []
for i in range(1, 10):
for x in range(1,10):
boxes.append((i, x))
would create a list of (row, column) tuples instead, and you then wouldn't have to map them back.
Even if you needed to associate strings with data, you could do so in a nested dictionary:
coordinates = {
'r1c1': {'row': 1, 'column': 1},
# ...
}
but you could also parse that string and extract the numbers after r and c to produce the row and column numbers again.
In fact, I wrote a Sudoku checker on the same principles once; in the following code block_indices, per9() and zip(*per9(s)) produce indices for each block, row or column of a puzzle, letting you verify that you have 9 unique values in each. The only difference is that instead of a matrix, I used one long list to represent a puzzle, all elements from row to row included in sequence:
from itertools import product
block_indices = [[x + y + s for s in (0, 1, 2, 9, 10, 11, 18, 19, 20)]
for x, y in product(range(0, 81, 27), range(0, 9, 3))]
def per9(iterable):
# group iterable in chunks of 9
return zip(*([iter(iterable)] * 9))
def is_valid_sudoku(s):
return (
# rows
all(len(set(r)) == 9 for r in per9(s)) and
# columns
all(len(set(c)) == 9 for c in zip(*per9(s))) and
# blocks
all(len(set(s[i] for i in ix)) == 9 for ix in block_indices)
)
So row 1, column 4 is 1 * 9 + 4 = index 13 in a flat list.
While Martijn's answer is probably better from a "what you should do" perspective, for completeness, you could build that structure pretty easily using dictionary comprehension:
The below annotated code will output your desire data structure:
boxes = {
"r%sc%s"%(i,j): # build the keys in the form "r1c2"
{'row':i,"column":j} # build the dictionary of values - {'row':1, 'column': 2}
for i in range(1,10) # first level of loop
for j in range(1,10) # second level of loop
}
print boxes
This will output in your desired format:
{ 'r1c1': { 'column': 1, 'row': 1},
'r1c2': { 'column': 2, 'row': 1},
'r1c3': { 'column': 3, 'row': 1},
'r1c4': { 'column': 4, 'row': 1},
....
}

Categories