about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/networkx/algorithms/community/label_propagation.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/networkx/algorithms/community/label_propagation.py')
-rw-r--r--.venv/lib/python3.12/site-packages/networkx/algorithms/community/label_propagation.py338
1 files changed, 338 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/networkx/algorithms/community/label_propagation.py b/.venv/lib/python3.12/site-packages/networkx/algorithms/community/label_propagation.py
new file mode 100644
index 00000000..74880286
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/networkx/algorithms/community/label_propagation.py
@@ -0,0 +1,338 @@
+"""
+Label propagation community detection algorithms.
+"""
+
+from collections import Counter, defaultdict, deque
+
+import networkx as nx
+from networkx.utils import groups, not_implemented_for, py_random_state
+
+__all__ = [
+    "label_propagation_communities",
+    "asyn_lpa_communities",
+    "fast_label_propagation_communities",
+]
+
+
+@py_random_state("seed")
+@nx._dispatchable(edge_attrs="weight")
+def fast_label_propagation_communities(G, *, weight=None, seed=None):
+    """Returns communities in `G` as detected by fast label propagation.
+
+    The fast label propagation algorithm is described in [1]_. The algorithm is
+    probabilistic and the found communities may vary in different executions.
+
+    The algorithm operates as follows. First, the community label of each node is
+    set to a unique label. The algorithm then repeatedly updates the labels of
+    the nodes to the most frequent label in their neighborhood. In case of ties,
+    a random label is chosen from the most frequent labels.
+
+    The algorithm maintains a queue of nodes that still need to be processed.
+    Initially, all nodes are added to the queue in a random order. Then the nodes
+    are removed from the queue one by one and processed. If a node updates its label,
+    all its neighbors that have a different label are added to the queue (if not
+    already in the queue). The algorithm stops when the queue is empty.
+
+    Parameters
+    ----------
+    G : Graph, DiGraph, MultiGraph, or MultiDiGraph
+        Any NetworkX graph.
+
+    weight : string, or None (default)
+        The edge attribute representing a non-negative weight of an edge. If None,
+        each edge is assumed to have weight one. The weight of an edge is used in
+        determining the frequency with which a label appears among the neighbors of
+        a node (edge with weight `w` is equivalent to `w` unweighted edges).
+
+    seed : integer, random_state, or None (default)
+        Indicator of random number generation state. See :ref:`Randomness<randomness>`.
+
+    Returns
+    -------
+    communities : iterable
+        Iterable of communities given as sets of nodes.
+
+    Notes
+    -----
+    Edge directions are ignored for directed graphs.
+    Edge weights must be non-negative numbers.
+
+    References
+    ----------
+    .. [1] Vincent A. Traag & Lovro Šubelj. "Large network community detection by
+       fast label propagation." Scientific Reports 13 (2023): 2701.
+       https://doi.org/10.1038/s41598-023-29610-z
+    """
+
+    # Queue of nodes to be processed.
+    nodes_queue = deque(G)
+    seed.shuffle(nodes_queue)
+
+    # Set of nodes in the queue.
+    nodes_set = set(G)
+
+    # Assign unique label to each node.
+    comms = {node: i for i, node in enumerate(G)}
+
+    while nodes_queue:
+        # Remove next node from the queue to process.
+        node = nodes_queue.popleft()
+        nodes_set.remove(node)
+
+        # Isolated nodes retain their initial label.
+        if G.degree(node) > 0:
+            # Compute frequency of labels in node's neighborhood.
+            label_freqs = _fast_label_count(G, comms, node, weight)
+            max_freq = max(label_freqs.values())
+
+            # Always sample new label from most frequent labels.
+            comm = seed.choice(
+                [comm for comm in label_freqs if label_freqs[comm] == max_freq]
+            )
+
+            if comms[node] != comm:
+                comms[node] = comm
+
+                # Add neighbors that have different label to the queue.
+                for nbr in nx.all_neighbors(G, node):
+                    if comms[nbr] != comm and nbr not in nodes_set:
+                        nodes_queue.append(nbr)
+                        nodes_set.add(nbr)
+
+    yield from groups(comms).values()
+
+
+def _fast_label_count(G, comms, node, weight=None):
+    """Computes the frequency of labels in the neighborhood of a node.
+
+    Returns a dictionary keyed by label to the frequency of that label.
+    """
+
+    if weight is None:
+        # Unweighted (un)directed simple graph.
+        if not G.is_multigraph():
+            label_freqs = Counter(map(comms.get, nx.all_neighbors(G, node)))
+
+        # Unweighted (un)directed multigraph.
+        else:
+            label_freqs = defaultdict(int)
+            for nbr in G[node]:
+                label_freqs[comms[nbr]] += len(G[node][nbr])
+
+            if G.is_directed():
+                for nbr in G.pred[node]:
+                    label_freqs[comms[nbr]] += len(G.pred[node][nbr])
+
+    else:
+        # Weighted undirected simple/multigraph.
+        label_freqs = defaultdict(float)
+        for _, nbr, w in G.edges(node, data=weight, default=1):
+            label_freqs[comms[nbr]] += w
+
+        # Weighted directed simple/multigraph.
+        if G.is_directed():
+            for nbr, _, w in G.in_edges(node, data=weight, default=1):
+                label_freqs[comms[nbr]] += w
+
+    return label_freqs
+
+
+@py_random_state(2)
+@nx._dispatchable(edge_attrs="weight")
+def asyn_lpa_communities(G, weight=None, seed=None):
+    """Returns communities in `G` as detected by asynchronous label
+    propagation.
+
+    The asynchronous label propagation algorithm is described in
+    [1]_. The algorithm is probabilistic and the found communities may
+    vary on different executions.
+
+    The algorithm proceeds as follows. After initializing each node with
+    a unique label, the algorithm repeatedly sets the label of a node to
+    be the label that appears most frequently among that nodes
+    neighbors. The algorithm halts when each node has the label that
+    appears most frequently among its neighbors. The algorithm is
+    asynchronous because each node is updated without waiting for
+    updates on the remaining nodes.
+
+    This generalized version of the algorithm in [1]_ accepts edge
+    weights.
+
+    Parameters
+    ----------
+    G : Graph
+
+    weight : string
+        The edge attribute representing the weight of an edge.
+        If None, each edge is assumed to have weight one. In this
+        algorithm, the weight of an edge is used in determining the
+        frequency with which a label appears among the neighbors of a
+        node: a higher weight means the label appears more often.
+
+    seed : integer, random_state, or None (default)
+        Indicator of random number generation state.
+        See :ref:`Randomness<randomness>`.
+
+    Returns
+    -------
+    communities : iterable
+        Iterable of communities given as sets of nodes.
+
+    Notes
+    -----
+    Edge weight attributes must be numerical.
+
+    References
+    ----------
+    .. [1] Raghavan, Usha Nandini, Réka Albert, and Soundar Kumara. "Near
+           linear time algorithm to detect community structures in large-scale
+           networks." Physical Review E 76.3 (2007): 036106.
+    """
+
+    labels = {n: i for i, n in enumerate(G)}
+    cont = True
+
+    while cont:
+        cont = False
+        nodes = list(G)
+        seed.shuffle(nodes)
+
+        for node in nodes:
+            if not G[node]:
+                continue
+
+            # Get label frequencies among adjacent nodes.
+            # Depending on the order they are processed in,
+            # some nodes will be in iteration t and others in t-1,
+            # making the algorithm asynchronous.
+            if weight is None:
+                # initialising a Counter from an iterator of labels is
+                # faster for getting unweighted label frequencies
+                label_freq = Counter(map(labels.get, G[node]))
+            else:
+                # updating a defaultdict is substantially faster
+                # for getting weighted label frequencies
+                label_freq = defaultdict(float)
+                for _, v, wt in G.edges(node, data=weight, default=1):
+                    label_freq[labels[v]] += wt
+
+            # Get the labels that appear with maximum frequency.
+            max_freq = max(label_freq.values())
+            best_labels = [
+                label for label, freq in label_freq.items() if freq == max_freq
+            ]
+
+            # If the node does not have one of the maximum frequency labels,
+            # randomly choose one of them and update the node's label.
+            # Continue the iteration as long as at least one node
+            # doesn't have a maximum frequency label.
+            if labels[node] not in best_labels:
+                labels[node] = seed.choice(best_labels)
+                cont = True
+
+    yield from groups(labels).values()
+
+
+@not_implemented_for("directed")
+@nx._dispatchable
+def label_propagation_communities(G):
+    """Generates community sets determined by label propagation
+
+    Finds communities in `G` using a semi-synchronous label propagation
+    method [1]_. This method combines the advantages of both the synchronous
+    and asynchronous models. Not implemented for directed graphs.
+
+    Parameters
+    ----------
+    G : graph
+        An undirected NetworkX graph.
+
+    Returns
+    -------
+    communities : iterable
+        A dict_values object that contains a set of nodes for each community.
+
+    Raises
+    ------
+    NetworkXNotImplemented
+       If the graph is directed
+
+    References
+    ----------
+    .. [1] Cordasco, G., & Gargano, L. (2010, December). Community detection
+       via semi-synchronous label propagation algorithms. In Business
+       Applications of Social Network Analysis (BASNA), 2010 IEEE International
+       Workshop on (pp. 1-8). IEEE.
+    """
+    coloring = _color_network(G)
+    # Create a unique label for each node in the graph
+    labeling = {v: k for k, v in enumerate(G)}
+    while not _labeling_complete(labeling, G):
+        # Update the labels of every node with the same color.
+        for color, nodes in coloring.items():
+            for n in nodes:
+                _update_label(n, labeling, G)
+
+    clusters = defaultdict(set)
+    for node, label in labeling.items():
+        clusters[label].add(node)
+    return clusters.values()
+
+
+def _color_network(G):
+    """Colors the network so that neighboring nodes all have distinct colors.
+
+    Returns a dict keyed by color to a set of nodes with that color.
+    """
+    coloring = {}  # color => set(node)
+    colors = nx.coloring.greedy_color(G)
+    for node, color in colors.items():
+        if color in coloring:
+            coloring[color].add(node)
+        else:
+            coloring[color] = {node}
+    return coloring
+
+
+def _labeling_complete(labeling, G):
+    """Determines whether or not LPA is done.
+
+    Label propagation is complete when all nodes have a label that is
+    in the set of highest frequency labels amongst its neighbors.
+
+    Nodes with no neighbors are considered complete.
+    """
+    return all(
+        labeling[v] in _most_frequent_labels(v, labeling, G) for v in G if len(G[v]) > 0
+    )
+
+
+def _most_frequent_labels(node, labeling, G):
+    """Returns a set of all labels with maximum frequency in `labeling`.
+
+    Input `labeling` should be a dict keyed by node to labels.
+    """
+    if not G[node]:
+        # Nodes with no neighbors are themselves a community and are labeled
+        # accordingly, hence the immediate if statement.
+        return {labeling[node]}
+
+    # Compute the frequencies of all neighbors of node
+    freqs = Counter(labeling[q] for q in G[node])
+    max_freq = max(freqs.values())
+    return {label for label, freq in freqs.items() if freq == max_freq}
+
+
+def _update_label(node, labeling, G):
+    """Updates the label of a node using the Prec-Max tie breaking algorithm
+
+    The algorithm is explained in: 'Community Detection via Semi-Synchronous
+    Label Propagation Algorithms' Cordasco and Gargano, 2011
+    """
+    high_labels = _most_frequent_labels(node, labeling, G)
+    if len(high_labels) == 1:
+        labeling[node] = high_labels.pop()
+    elif len(high_labels) > 1:
+        # Prec-Max
+        if labeling[node] not in high_labels:
+            labeling[node] = max(high_labels)