Skip to main content
2 of 9
edited title

Find all shortest paths in a directed, unweighted, SQL graph

I want to find all shortest paths in a unweighted graph stored in a SQL database. The graph has about 460,000,000 edges and 5,600,000 nodes. My approach is to use a bidirectional BFS to find all the shortest paths.

I implemented the algorithm as follows: A search is started from the source node outwards to the target node and one is started on the target node and searches in the direction of the source. If there is a intersection between the nodes visited by the BFS from the source to the target and the BFS from target to source a path has been found. Since I wish to find all the paths the current distance from the source is saved and the current distance from the target is saved. The search then continues as long as there are more nodes with the same distance from the source or same distance from the target. Now all paths have been found but not just the shortes but also longer once. The last step is therefore to remove the paths that are not the shortest.

The SQL database looks like this:

CREATE TABLE edges (
     edge_from UNSIGNED int(10) NOT NULL,
     edge_target UNSIGNED int(10) NOT NULL
);
CREATE INDEX edges_from_index ON edges (edge_from);
CREATE INDEX edges_target_index ON edges (edge_target);

Running the function on my computer takes a few seconds even if the path is pritty short. A quick look at the time spent in each function with cProfile reveals that what takes the longest are the SQL-lookup. Is there anything I can do to imporve the time complexity and therby decrease the lookups or can I improve my SQL-database/SQL-query to make it any faster? I also apreciate any comments on my code in general. Here is my code:

import sqlite3
import collections
import itertools

def bidirectional_BFS(connection, source, target):
    """
    Returns all the shortest paths between 'source' and 'target'.
    """
    source_queue = collections.deque((source,))
    target_queue = collections.deque((target,))

    source_visited = {source: 0} # Maps a node to it's distance from the source
    target_visited = {target: 0} # Maps a node to it's distance from the target

    # Keeps track all the ways to get to a given node
    source_parent = collections.defaultdict(set)
    target_parent = collections.defaultdict(set)

    source_parent[source].add(None)
    target_parent[target].add(None)

    found_path    = False
    source_deapth = 0
    target_deapth = 0

    # The set of all intersections between the two searches
    intersections = set()

    while source_queue and target_queue:
        if found_path and (source_visited[source_queue[0]] > source_deapth and target_visited[target_queue[0]] > target_deapth):
            # We are done. All nodes at the current deapth has been explored
            intersections = filter_intersections(source_visited, target_visited, intersections)
            return construct_path(source_parent, target_parent, source, target, intersections)

        if found_path and source_visited[source_queue[0]] > source_deapth:
            # Found a path but the BFS from the target still has more nodes to explore
            target_added, t_deapth = BFS_target(target_queue, target_visited, target_parent, connection)
            intersections |= target_added & source_visited.keys()
        elif found_path and target_visited[target_queue[0]] > target_deapth:
            # Found a path but the BFS from the source still has more nodes to explore
            source_added, s_deapth = BFS_source(source_queue, source_visited, source_parent, connection)
            intersections |= source_added & target_visited.keys()
        else:
            source_added, s_deapth = BFS_source(source_queue, source_visited, source_parent, connection)
            target_added, t_deapth = BFS_target(target_queue, target_visited, target_parent, connection)

            intersections |= source_added & target_visited.keys()
            intersections |= target_added & source_visited.keys()

        if not found_path and intersections:
            # We found a path so we look the search deapth to the current deapth
            found_path = True
            source_deapth = s_deapth
            target_deapth = t_deapth

    if found_path:
        return construct_path(source_parent, target_parent, source, target, intersections)
    else:
        return None

def filter_intersections(source_visited, target_visited, intersections):
    """
    Returns only the intersections where the combined distance from source
    to the intersection and from target to the intersect are the smallest
    """
    filterd = set()
    shortest = float('inf')
    for intersection in intersections:
        if source_visited[intersection] + target_visited[intersection] < shortest:
            shortest = source_visited[intersection] + target_visited[intersection]
            filterd = {intersection}
        elif source_visited[intersection] + target_visited[intersection] == shortest:
            filterd.add(intersection)
    return filterd

def construct_path(source_parent, target_parent, source, target, intersections):
    """
    Constructs all paths and returns a list of list where each list is one path
    """
    paths = set()

    for intersection in intersections:
        from_source_to_inter = construct_path_from_to(source_parent, source, intersection)
        from_inter_to_target = construct_path_from_to(target_parent, target, intersection, reverse=True)
        for path in itertools.product(from_source_to_inter, from_inter_to_target):
            paths.add(tuple(path[0] + path[1][1:]))
    return paths
        
def construct_path_from_to(source_parent, target, start, reverse=False):
    """
    Constructs all paths between start and target recursivly. If reverse is true
    then all the paths are reversed.
    """
    if start == target:
        return [[target]]

    paths = []
    for parent in source_parent[start]:
        for path in construct_path_from_to(source_parent, target, parent, reverse):
            if reverse:
                path.insert(0, start)
            else:
                path.append(start)
            paths.append(path)

    return paths

def BFS_source(queue, visited, parent, connection):
    """
    Runs one iteration of the BFS from source to the target and then returns
    the edges explored during the iteration and the current deapth of the search
    """
    current = queue.popleft()

    added = set()

    for (neighbor,) in connection.execute('SELECT edge_target FROM edges WHERE edge_from= ?;', (current,)):
        if neighbor not in visited:
            parent[neighbor].add(current)
            visited[neighbor] = visited[current] + 1
            queue.append(neighbor)
            added.add(neighbor)
        elif visited[current] + 1 == visited[neighbor]:
            parent[neighbor].add(current)

    return added, visited[current]

def BFS_target(queue, visited, parent, connection):
    """
    Runs one iteration of the BFS from target to source and then returns
    the edges explored during the iteration and the current deapth of the search
    """
    current = queue.popleft()

    added = set()

    for (neighbor,) in connection.execute('SELECT edge_from FROM pagelinks WHERE edge_target = ?;', (current,)):
        if neighbornot in visited:
            parent[neighbor].add(current)
            visited[neighbor] = visited[current] + 1
            queue.append(neighbor)
            added.add(neighbor)
        elif visited[current] + 1 == visited[neighbor]:
            parent[neighbor].add(current)

    return added, visited[current]