Source code for superblockify.metrics.distances

"""Distance calculation for the network metrics."""

from datetime import timedelta
from os import environ
from time import time

import numpy as np
from networkx import to_scipy_sparse_array
from osmnx.projection import is_projected
from psutil import virtual_memory
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import dijkstra

from .plot import plot_distance_distributions
from ..config import logger
from ..utils import has_pairwise_overlap

_AVG_EARTH_RADIUS_M = 6.3781e6  # in meters, arXiv:1510.07674 [astro-ph.SR]


[docs] def calculate_path_distance_matrix( graph, weight=None, unit_symbol=None, node_order=None, plot_distributions=False, log_debug=True, ): """Calculate the distance matrix for the partitioning. Use cythonized scipy.sparse.csgraph functions to calculate the distance matrix. Generally Dijkstra's algorithm with a Fibonacci heap is used. It's approximate computational cost is ``O[N(N*k + N*log(N))]`` where ``N`` is the number of nodes and ``k`` is the average number of edges per node. We use this, because our graphs are usually sparse. For dense graphs, the Floyd-Warshall algorithm can be implemented with ``O[N^3]`` computational cost. [1]_ Runtime comparison: - Scheveningen, NL (N = 1002, E = 2329): - Dijkstra: 172ms - Floyd-Warshall: 193ms - Liechtenstein, LI (N = 1797, E = 4197): - Dijkstra: 498ms - Floyd-Warshall: 917ms - Copenhagen, DK (N = 7806, E = 19565): - Dijkstra: 14.65s - Floyd-Warshall: 182.03s - Barcelona, ES (N = 8812, E = 16441): - Dijkstra: 18.21s - Floyd-Warshall: 134.69s (simple, one-time execution) The input graph will be converted to a scipy sparse matrix in CSR format. Compressed Sparse Row format is a sparse matrix format that is efficient for arithmetic operations. [2]_ Parameters ---------- graph : networkx.Graph The graph to calculate the distance matrix for weight : str, optional The edge attribute to use as weight. If None, all edge weights are 1. unit_symbol : str, optional The unit symbol to use for the distance matrix, like 's' for seconds. node_order : list, optional The order of the nodes in the distance matrix. If None, the ordering is produced by graph.nodes(). plot_distributions : bool, optional If True, a histogram of the distribution of the shortest path lengths is plotted. log_debug : bool, optional If True, log runtime and graph information at debug level. Raises ------ ValueError If the graph has negative edge weights. Returns ------- dist_matrix : ndarray The distance matrix for the partitioning. dist_matrix[i, j] is the shortest path length from node i to node j. predecessors : ndarray The predecessors for the distance matrix. predecessors[i, j] is the node before node j on the shortest path from node i to node j. [1]_ References ---------- .. [1] SciPy 1.10.0 Reference Guide, scipy.sparse.csgraph.shortest_path https://docs.scipy.org/doc/scipy-1.10.0/reference/generated/scipy.sparse.csgraph.shortest_path.html (accessed February 21, 2023) .. [2] SciPy 1.10.0 Reference Guide, scipy.sparse.csr_matrix https://docs.scipy.org/doc/scipy-1.10.0/reference/generated/scipy.sparse.csr_matrix.html (accessed February 21, 2023) """ if weight is not None and any( w < 0 for (u, v, w) in graph.edges.data(weight, default=0) ): # For this case Johnson's algorithm could be used, but none of our graphs # should have negative edge weights. raise ValueError("Graph has negative edge weights.") # First get N x N array of distances representing the input graph. graph_matrix = to_scipy_sparse_array( graph, weight=weight, format="csr", nodelist=node_order ) # Try to downcast indices to int32 if graph_matrix.indices.dtype != np.int32: logger.debug("Downcasting indices to int32.") downcasted_indices = graph_matrix.indices.astype(np.int32) if np.array_equal(downcasted_indices, graph_matrix.indices): graph_matrix.indices = downcasted_indices else: logger.warning("Downcasting indices to int32 failed.") # pragma: no cover # Try to downcast indptr to int32 if graph_matrix.indptr.dtype != np.int32: logger.debug("Downcasting indptr to int32.") downcasted_indptr = graph_matrix.indptr.astype(np.int32) if np.array_equal(downcasted_indptr, graph_matrix.indptr): graph_matrix.indptr = downcasted_indptr else: logger.warning("Downcasting indptr to int32 failed.") # pragma: no cover start_time = time() dist_full_graph, predecessors = dijkstra( graph_matrix, directed=True, return_predecessors=True, unweighted=False ) # Convert to single-precision - half-precision would encounter overflow dist_full_graph = dist_full_graph.astype(np.float32) if log_debug: logger.debug( "All-pairs shortest path lengths for graph with %s nodes and %s edges " "calculated in %s.", graph.number_of_nodes(), graph.number_of_edges(), timedelta(seconds=time() - start_time), ) if plot_distributions: if node_order is None: node_order = graph.nodes() # Where `dist_full_graph` is inf, replace with 0 plot_distance_distributions( dist_full_graph[dist_full_graph != np.inf], dist_title="Distribution of shortest path lengths on full graph", coords=( [graph.nodes[node]["x"] for node in node_order], [graph.nodes[node]["y"] for node in node_order], ), coord_title="Projected coordinates of nodes", labels=("x (m)", "y (m)"), distance_unit=unit_symbol, ) return dist_full_graph, predecessors
[docs] def calculate_euclidean_distance_matrix_projected( graph, node_order=None, plot_distributions=False ): """Calculate the Euclidean distances between all nodes in the graph. Uses the x and y coordinates of the nodes of a projected graph. The coordinates are in meters. Parameters ---------- graph : networkx.Graph The graph to calculate the distance matrix for. The graph should be projected. node_order : list, optional The order of the nodes in the distance matrix. If None, the ordering is produced by graph.nodes(). plot_distributions : bool, optional If True, plot the distributions of the Euclidean distances and coordinates. Sanity check for the coordinate values. Returns ------- dist_matrix : ndarray The distance matrix for the partitioning. dist_matrix[i, j] is the Euclidean distance between node i and node j. Raises ------ ValueError If the graph is not projected. """ # Find CRS from graph's metadata if "crs" not in graph.graph or not is_projected(graph.graph["crs"]): raise ValueError("Graph is not projected.") # Get the node order if node_order is None: node_order = list(graph.nodes()) # Get the coordinates of the nodes x_coord = np.array([graph.nodes[node]["x"] for node in node_order]) y_coord = np.array([graph.nodes[node]["y"] for node in node_order]) # Check that all values are float or int and not inf or nan if not np.issubdtype(x_coord.dtype, np.number) or not np.issubdtype( y_coord.dtype, np.number ): raise ValueError("Graph has non-numeric coordinates.") if np.any(np.isinf(x_coord)) or np.any(np.isinf(y_coord)): raise ValueError("Graph has infinite coordinates.") # Calculate the Euclidean distances between all nodes dist_matrix = np.sqrt( np.square(x_coord[:, np.newaxis] - x_coord[np.newaxis, :]) + np.square(y_coord[:, np.newaxis] - y_coord[np.newaxis, :]) ) # Convert to single-precision dist_matrix = dist_matrix.astype(np.float32) if plot_distributions: plot_distance_distributions( dist_matrix, dist_title="Distribution of Euclidean distances", coords=(x_coord, y_coord), coord_title="Scatter plot of projected coordinates", labels=("x (m)", "y (m)"), ) return dist_matrix
[docs] def calculate_euclidean_distance_matrix_haversine( graph, node_order=None, plot_distributions=False ): """Calculate the Euclidean distances between all nodes in the graph. Uses the **Haversine formula** to calculate the distances between all nodes in the graph. The coordinates are in degrees. Parameters ---------- graph : networkx.Graph The graph to calculate the distance matrix for node_order : list, optional The order of the nodes in the distance matrix. If None, the ordering is produced by graph.nodes(). plot_distributions : bool, optional If True, plot the distributions of the Euclidean distances and coordinates. Sanity check for the coordinate values. Returns ------- dist_matrix : ndarray The distance matrix for the partitioning. dist_matrix[i, j] is the Euclidean distance between node i and node j. Raises ------ ValueError If coordinates are not numeric or not in the range [-90, 90] for latitude and [-180, 180] for longitude. """ if node_order is None: node_order = list(graph.nodes()) start_time = time() # Calculate the Euclidean distances between all nodes # Do vectorized calculation for all nodes lat = np.array([graph.nodes[node]["lat"] for node in node_order]) lon = np.array([graph.nodes[node]["lon"] for node in node_order]) # Check that all values are float or int and proper lat/lon values if not np.issubdtype(lat.dtype, np.number) or not np.issubdtype( lon.dtype, np.number ): raise ValueError("Latitude and longitude values must be numeric.") if np.any(lat > 90) or np.any(lat < -90): raise ValueError("Latitude values are not in the range [-90, 90].") if np.any(lon > 180) or np.any(lon < -180): raise ValueError("Longitude values are not in the range [-180, 180].") node1_lat = np.expand_dims(lat, axis=0) node1_lon = np.expand_dims(lon, axis=0) node2_lat = np.expand_dims(lat, axis=1) node2_lon = np.expand_dims(lon, axis=1) # Calculate haversine distance, # see https://en.wikipedia.org/wiki/Haversine_formula # and https://github.com/mapado/haversine/blob/master/haversine/haversine.py lat = node2_lat - node1_lat lon = node2_lon - node1_lon hav = ( np.sin(lat / 2) ** 2 + np.cos(node1_lat) * np.cos(node2_lat) * np.sin(lon / 2) ** 2 ) dist_matrix = 2 * _AVG_EARTH_RADIUS_M * np.arcsin(np.sqrt(hav)) logger.debug( "Euclidean distances for graph with %s nodes and %s edges " "calculated in %s. " "Min/max lat/lon values: %s, %s, %s, %s; Difference: %s, %s", graph.number_of_nodes(), graph.number_of_edges(), timedelta(seconds=time() - start_time), np.min(node1_lat), np.max(node1_lat), np.min(node1_lon), np.max(node1_lon), np.max(node1_lat) - np.min(node1_lat), np.max(node1_lon) - np.min(node1_lon), ) if plot_distributions: # Plot distribution of distances and scatter plot of lat/lon plot_distance_distributions( dist_matrix, dist_title="Distribution of Euclidean distances", coords=(node1_lon, node1_lat), coord_title="Scatter plot of unprojected coordinates", labels=("Longitude [°]", "Latitude [°]"), ) return dist_matrix
[docs] def calculate_partitioning_distance_matrix( partitioner, weight=None, unit_symbol=None, node_order=None, plot_distributions=False, check_overlap=True, max_mem_factor=0.2, ): """Calculate the distance matrix for the partitioning. This is the pairwise distance between all pairs of nodes, where the shortest paths are only allowed to traverse edges in the start and goal partitions and unpartitioned edges. For this we calculate the distances and predecessors on the sparsified graph and the subgraphs separately. Then we combine the distances and predecessors to get a full distance matrix. We cannot do one big calculation, because the restrictions, to only enter/leave, are dynamic and depend on the start and goal node. Parameters ---------- partitioner : BasePartitioner The partitioner to calculate the distance matrix for weight : str, optional The edge attribute to use as weight. If None, all edges have weight 1. unit_symbol : str, optional The unit symbol to use for the weight. node_order : list, optional The order of the nodes in the distance matrix. If None, the ordering is produced by graph.nodes(). plot_distributions : bool, optional If True, plot the distributions of the Euclidean distances and coordinates. check_overlap : bool, optional If True, check that the partitions do not overlap node-wise. max_mem_factor : float, optional The maximal memory factor to use for filling up the restricted distance matrices as tensor. Otherwise, using vectorized version. Raises ------ ValueError If partitions don't have unique names. ValueError If the partitions overlap node-wise. For nodes considered to be in the partition, see :func:`BasePartitioner.get_partition_nodes() <superblockify.partitioning.partitioner.BasePartitioner.get_partition_nodes()>`. Returns ------- dist_matrix : ndarray The distance matrix for the partitioning. dist_matrix[i, j] is the distance between node i and node j for the given rules of the partitioning. In the order of `node_order` if given, otherwise as produced by `graph.nodes()`. predecessors : ndarray The predecessor matrix for the partitioning. predecessors[i, j] is the predecessor of node j on the shortest path from node i for the given rules of the partitioning. """ logger.debug("Preparing to calculate partitioning distance matrix.") if node_order is None: node_order = list(partitioner.graph.nodes) partitions = partitioner.get_partition_nodes() # Check that the partitions have unique names if len(partitions) != len({part["name"] for part in partitions}): raise ValueError("Partitions must have unique names.") partitions = { part["name"]: { "subgraph": part["subgraph"], "nodes": list(part["nodes"]), # exclusive nodes inside the subgraph "nodelist": list(part["subgraph"]), # also nodes shared with the # sparsified graph or on partition boundaries } for part in partitions } # Check that none of the partitions overlap by checking that the intersection # of the nodes in each partition is empty. if check_overlap: pairwise_overlap = has_pairwise_overlap( [list(part["nodes"]) for part in partitions.values()] ) # Check if any element in the pairwise overlap matrix is True, except the # diagonal if np.any(pairwise_overlap[np.triu_indices_from(pairwise_overlap, k=1)]): raise ValueError("The partitions overlap node-wise. This is not allowed.") logger.debug("Checked that the partitions do not overlap node-wise.") partitions["sparsified"] = { "subgraph": partitioner.sparsified, "nodes": list(partitioner.sparsified.nodes), "nodelist": list(partitioner.sparsified.nodes), } dist_matrix, pred_matrix = shortest_paths_restricted( partitioner.graph, partitions, weight, node_order, max_mem_factor, ) if plot_distributions: # Where `dist_full_graph` is inf, replace with 0 plot_distance_distributions( dist_matrix[dist_matrix != np.inf], dist_title="Distribution of shortest path distances for the " "partitioning", coords=( [partitioner.graph.nodes[node]["x"] for node in node_order], [partitioner.graph.nodes[node]["y"] for node in node_order], ), coord_title="Projected coordinates of nodes", labels=("x (m)", "y (m)"), distance_unit=unit_symbol, ) return dist_matrix, pred_matrix
[docs] def shortest_paths_restricted( graph, partitions, weight, node_order, max_mem_factor=0.2, ): """Calculate restricted shortest paths. Shortest distances and predecessors with restrictions of not passing through partitions. The passed partitions is a dictionary with the partition names as keys and the partition as value. The partition is a dictionary with the keys `subgraph`, `nodes` and `nodelist`. The dict needs to have a special partition, called `sparsified`, which is the sparsified graph. The `nodes` key is a list of nodes that are exclusive to the partition, i.e. nodes that are not shared with the sparsified graph or on partition boundaries. The `nodelist` key is a list of nodes that are exclusive to the partition and nodes that are shared with the sparsified graph. The subgraphs all need to be graphs views of a shared graph. The Partitioner class produces such partitions, after passing the integrated :func:`is_valid_partitioning() <superblockify.partitioning.checks.is_valid_partitioning>` checks. Parameters ---------- graph : networkx.MultiDiGraph The graph to calculate the shortest paths for. partitions : dict The partitions to calculate the shortest paths for. The keys are the partition names and the values are dictionaries with the keys `subgraph`, `nodes` and `nodelist`. weight : str or None, optional The edge attribute to use as weight. If None, all edge weights are 1. node_order : list The order of the nodes in the distance matrix. max_mem_factor : float, optional The maximum fraction to use in the fully vectorized calculation. Defaults to 0.2. Otherwise, the calculation is iterated over the partition indices. Returns ------- dist_matrix : ndarray The distance matrix for the partitioning. pred_matrix : ndarray The predecessor matrix for the partitioning. Notes ----- For usage with a :class:`Partitioner <superblockify.partitioning.partitioner.BasePartitioner>`, see :func:`calculate_partitioning_distance_matrix() <superblockify.metrics.distances.calculate_partitioning_distance_matrix>`. """ # pylint: disable=too-many-locals, too-many-statements # node_order indices # filtered indices: sparse/partition n_sparse_indices = [node_order.index(n) for n in partitions["sparsified"]["nodes"]] part_name_order = [name for name in partitions.keys() if name != "sparsified"] n_partition_indices_separate = [ [node_order.index(n) for n in partitions[name]["nodes"]] for name in part_name_order ] n_partition_indices = [n for part in n_partition_indices_separate for n in part] # Semipermeable graphs logger.debug("Preparing semipermeable graphs.") # Construct Compressed Sparse Row matrix # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html g_leaving = to_scipy_sparse_array( graph, nodelist=node_order, weight=weight, format="coo" ) data, row, col = g_leaving.data, g_leaving.row, g_leaving.col mask_to = np.logical_and( np.isin(row, n_sparse_indices), np.isin(col, n_partition_indices) ) mask_from = np.logical_and( np.isin(row, n_partition_indices), np.isin(col, n_sparse_indices) ) # mask the edges inside each partition and inside the sparse graph mask_intra = np.logical_and( np.isin(row, n_sparse_indices), np.isin(col, n_sparse_indices) ) for n_ind in n_partition_indices_separate: mask_intra = np.logical_or( mask_intra, np.logical_and(np.isin(row, n_ind), np.isin(col, n_ind)) ) mask_to = np.logical_or(mask_to, mask_intra) mask_from = np.logical_or(mask_from, mask_intra) g_leaving = csr_matrix( (data[mask_to], (row[mask_to], col[mask_to])), shape=(len(node_order), len(node_order)), ) g_entering = csr_matrix( (data[mask_from], (row[mask_from], col[mask_from])), shape=(len(node_order), len(node_order)), ) # Calculate the combinations logger.debug("Calculating 2 semipermeable distance matrices.") start_time = time() dist_entering, pred_entering = dijkstra( g_entering, directed=True, indices=None, # all nodes, sorted as in sparse_matrix return_predecessors=True, unweighted=False, ) logger.debug( "Finished calculating the first distance matrix in %s. Starting the second.", timedelta(seconds=time() - start_time), ) dist_le, pred_le = dijkstra( g_leaving, directed=True, indices=None, # all nodes, sorted as in sparse_matrix return_predecessors=True, unweighted=False, ) logger.debug( "Finished calculating distance matrices in %s. Combining the results.", timedelta(seconds=time() - start_time), ) min_mask = dist_le > dist_entering dist_le[min_mask] = dist_entering[min_mask] pred_le[min_mask] = pred_entering[min_mask] del dist_entering, pred_entering # Fill up paths n_partition_intersect_indices = [ list( set(partitions["sparsified"]["nodelist"]).intersection( partitions[name]["nodelist"] ) ) for name in part_name_order ] n_partition_intersect_indices = [ [node_order.index(n) for n in part_indices] for part_indices in n_partition_intersect_indices ] start_time = time() dist_step = np.full_like(dist_le, np.inf, dtype=np.float32) pred_step = np.full_like(pred_le, -9999, dtype=np.int32) logger.debug("Prepared distance matrices") tensorized = 0 vectorized = 0 for part_idx, part_intersect in zip( n_partition_indices_separate, n_partition_intersect_indices ): # Add distances from i in part_idx to all j in n_partition_indices # to the same j to all k in n_partition_indices # In one step if small enough (max_mem_factor) # shape (len(part_idx), len(part_intersect), len(n_partition_indices)) # Use $SLURM_MEM_PER_NODE if available mem = float( environ.get( # different types of memory "SLURM_MEM_PER_NODE", # already in MB - total memory per node # https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES virtual_memory().available / 1024 / 1024, # convert from bytes to MB # available memory on the machine ) ) needed_mem = ( len(part_idx) * len(part_intersect) * len(n_partition_indices) * 8 / 1024 / 1024 ) if needed_mem < mem * max_mem_factor: # logger.debug( # "There is enough memory (%d/%d) to calculate the distances in one " # "step.", # needed_mem, # int(free_mem*max_mem_factor), # ) tensorized += 1 # There is a different amount of i-j than j-k, calculate all combinations # and then mask out the invalid ones # add to a new dimension dists = ( dist_le[np.ix_(part_idx, part_intersect)][:, :, np.newaxis] + dist_le[np.ix_(part_intersect, n_partition_indices)][np.newaxis, :, :] ) # get index of minimum distance for the predecessors, use new axis min_idx = np.argmin(dists, axis=1) # write minima into dist_step dist_step[np.ix_(part_idx, n_partition_indices)] = dists[ np.arange(dists.shape[0])[:, np.newaxis], min_idx, np.arange(dists.shape[-1]), ] del dists # free memory # write predecessors into pred_step # predecessor i-k is the same as predecessor k-j # into pred_step, write the predecessor of k-j for n_p, i in enumerate(part_idx): for m_p, j in enumerate(n_partition_indices): if i == j: continue pred_step[i, j] = pred_le[part_intersect[min_idx[n_p, m_p]], j] # might be vectorized del min_idx else: vectorized += 1 # logger.debug( # "There is not enough memory (%d/%d) to calculate the distances in one" # " vectorized step. Looping over each partition index (%d).", # needed_mem, # int(free_mem*max_mem_factor), # len(part_idx), # ) # Loop over each partition index for i in part_idx: # Calculate distances from i to all j in n_partition_indices # to the same j to all k in n_partition_indices # shape (len(part_intersect), len(n_partition_indices)) dists = ( dist_le[i, part_intersect][:, np.newaxis] + dist_le[np.ix_(part_intersect, n_partition_indices)] ) # get index of minimum distance for the predecessors min_idx = np.argmin(dists, axis=0) # write minima into dist_step dist_step[i, n_partition_indices] = dists[ min_idx, np.arange(dists.shape[-1]) ] del dists # write predecessors into pred_step # predecessor i-k is the same as predecessor k-j # into pred_step, write the predecessor of k-j for m_p, j in enumerate(n_partition_indices): if i == j: continue pred_step[i, j] = pred_le[part_intersect[min_idx[m_p]], j] del min_idx mask = dist_step < dist_le dist_le = np.where(mask, dist_step, dist_le) pred_le = np.where(mask, pred_step, pred_le) logger.debug( "Finished filling up paths in %s. Used %d tensorized and %d vectorized steps.", timedelta(seconds=time() - start_time), tensorized, vectorized, ) return dist_le, pred_le