Source code for gtda.mapper.nerve
"""Construct the nerve of a refined Mapper cover."""
# License: GNU AGPLv3
from collections import defaultdict
from itertools import combinations, filterfalse
import numpy as np
from igraph import Graph
from sklearn.base import BaseEstimator, TransformerMixin
def _limit_mapping(mapping):
"""Given a 1D array interpreted as a function
:math:`f : \\{0, \\ldots, n - 1\\}} \to \\{0, \\ldots, n - 1\\}}`, such
that :math:`f^{(k)} = f^{(k + 1)}` for some :math:`k`, find the 1D array
corresponding to :math:`f^{(k)}`."""
terminal_states = np.empty_like(mapping)
for i, initial_target_idx in enumerate(mapping):
temp_target_idx = i
next_target_idx = initial_target_idx
while temp_target_idx != next_target_idx:
temp_target_idx = mapping[temp_target_idx]
next_target_idx = mapping[mapping[temp_target_idx]]
terminal_states[i] = temp_target_idx
return terminal_states
[docs]class Nerve(BaseEstimator, TransformerMixin):
"""1-skeleton of the nerve of a refined Mapper cover, i.e. the Mapper
graph.
This transformer is the final step in the
:class:`gtda.mapper.pipeline.MapperPipeline` objects created
by :func:`gtda.mapper.make_mapper_pipeline`. It corresponds the last two
arrows in `this diagram <../../../../_images/mapper_pipeline.svg>`_.
Parameters
----------
min_intersection : int, optional, default: ``1``
Minimum size of the intersection, between data subsets associated to
any two Mapper nodes, required to create an edge between the nodes in
the Mapper graph. Must be positive.
store_edge_elements : bool, optional, default: ``False``
Whether the indices of data elements associated to Mapper edges (i.e.
in the intersections allowed by `min_intersection`) should be stored in
the :class:`igraph.Graph` object output by :meth:`fit_transform`. When
``True``, might lead to a large :class:`igraph.Graph` object.
contract_nodes : bool, optional, default: ``False``
If ``True``, any node representing a cluster which is a strict subset
of the cluster corresponding to another node is eliminated, and only
one maximal node is kept.
Attributes
----------
graph_ : :class:`igraph.Graph` object
Mapper graph obtained from the input data. Created when :meth:`fit` is
called.
"""
[docs] def __init__(self, min_intersection=1, store_edge_elements=False,
contract_nodes=False):
self.min_intersection = min_intersection
self.store_edge_elements = store_edge_elements
self.contract_nodes = contract_nodes
[docs] def fit(self, X, y=None):
"""Compute the Mapper graph as in :meth:`fit_transform`, but store the
graph as :attr:`graph_` and return the estimator.
Parameters
----------
X : list of list of tuple
See :meth:`fit_transform`.
y : None
There is no need for a target in a transformer, yet the pipeline
API requires this parameter.
Returns
-------
self : object
"""
self.graph_ = self.fit_transform(X, y=y)
return self
[docs] def fit_transform(self, X, y=None):
"""Construct a Mapper graph from a refined Mapper cover.
Parameters
----------
X : ndarray of shape (n_samples,)
Cluster labels describing a refined cover of a dataset produced by
the clustering step of a :class:`gtda.mapper.MapperPipeline`,
as depicted in
`this diagram <../../../../_images/mapper_pipeline.svg>`_. Each
entry in `X` is a tuple of pairs of the form
``(pullback_set_label, partial_cluster_label)`` where
``partial_cluster_label`` is a cluster label within the pullback
cover set identified by ``pullback_set_label``. The unique pairs
correspond to nodes in the output graph.
y : None
There is no need for a target in a transformer, yet the pipeline
API requires this parameter.
Returns
-------
graph : :class:`igraph.Graph` object
Undirected Mapper graph according to `X` and `min_intersection`.
Each node is an :class:`igraph.Vertex` object with attributes
``"pullback_set_label"``, ``"partial_cluster_label"`` and
``"node_elements"``. Each edge is an :class:`igraph.Edge` object
with a ``"weight"`` attribute which is equal to the size of the
intersection between the data subsets associated to its two nodes.
If `store_edge_elements` is ``True`` each edge also has an
additional attribute ``"edge_elements"``.
"""
# TODO: Include a validation step for X
# Graph construction -- vertices with their metadata
labels_to_indices = defaultdict(list)
for i, sample in enumerate(X):
for node_id_pair in sample:
labels_to_indices[node_id_pair].append(i)
labels_to_indices = {key: np.array(value)
for key, value in labels_to_indices.items()}
n_nodes = len(labels_to_indices)
graph = Graph(n_nodes)
# labels_to_indices is a dictionary of, say, N key-value pairs of the
# form (pullback_set_label, partial_cluster_label): node_elements.
# Hence, zip(*labels_to_indices) generates two tuples of length N, each
# corresponding to a type of node attribute in the final graph.
node_attributes = zip(*labels_to_indices)
graph.vs["pullback_set_label"] = next(node_attributes)
graph.vs["partial_cluster_label"] = next(node_attributes)
graph.vs["node_elements"] = [*labels_to_indices.values()]
# Graph construction -- edges with weights given by intersection sizes.
# In general, we need all information in `nodes` to narrow down the set
# of combinations to check, especially when `contract_nodes` is True.
nodes = zip(*zip(*labels_to_indices), labels_to_indices.values())
node_index_pairs, weights, intersections, mapping = \
self._generate_edge_data(nodes, n_nodes)
graph.es["weight"] = 1
graph.add_edges(node_index_pairs)
graph.es["weight"] = weights
if self.store_edge_elements:
graph.es["edge_elements"] = intersections
if self.contract_nodes:
# Due to the order in which itertools.combinations produces pairs,
# and to the preference given to node 1 in the if-elif-else clause
# in `_subset_check_metadata_append`, `mapping` is guaranteed to
# send everything to one of its fixed points after sufficiently
# many repeated applications and, by construction, no two pairs of
# indices in `_limit_mapping(mapping)` can correspond to data
# subsets which are in a subset relation. Thus the nodes are
# correctly contracted by `_limit_mapping(mapping)`.
limit_mapping = _limit_mapping(mapping)
graph.contract_vertices(limit_mapping,
combine_attrs="first")
graph.delete_vertices([i for i in graph.vs.indices
if i != limit_mapping[i]])
return graph
def _generate_edge_data(self, nodes, n_nodes):
def _in_same_pullback_set(_node_tuple):
return _node_tuple[0][1][0] == _node_tuple[1][1][0]
def _do_nothing(*args):
pass
def _intersections_append(_intersection):
return intersections.append(_intersection)
def _metadata_append(
_node_1_idx, _node_2_idx, _intersection_size, _intersection,
*args
):
if _intersection_size >= self.min_intersection:
# Add edge (as a node tuple) to list of node index pairs
node_index_pairs.append((_node_1_idx, _node_2_idx))
weights.append(_intersection_size)
intersection_behavior(_intersection)
def _subset_check_metadata_append(
_node_1_idx, _node_2_idx, _intersection_size, _intersection,
_node_1_elements, _node_2_elements
):
if _intersection_size == len(_node_2_elements):
# Node 2 is contained in node 1 and we remove it in favour of
# node 1.
mapping[_node_2_idx] = _node_1_idx
elif _intersection_size == len(_node_1_elements):
# Node 1 is strictly contained in node 2 and we remove it in
# favour of node 2.
mapping[_node_1_idx] = _node_2_idx
else:
# Edge exists provided `_intersection_size` is large enough
_metadata_append(_node_1_idx, _node_2_idx, _intersection_size,
_intersection)
node_tuples = combinations(enumerate(nodes), 2)
node_index_pairs = []
weights = []
intersections = []
# Choose whether intersections are stored or not.
# `intersection_behavior` is in scope for `_metadata_append` and
# `_subset_check_metadata_append`.
if self.store_edge_elements:
intersection_behavior = _intersections_append
else:
intersection_behavior = _do_nothing
if self.contract_nodes:
mapping = np.arange(n_nodes)
behavior = _subset_check_metadata_append
else:
mapping = None
behavior = _metadata_append
# No need to check for intersections within each pullback set as the
# input is assumed to be a refined Mapper cover
for node_tuple in filterfalse(_in_same_pullback_set, node_tuples):
((node_1_idx, (_, _, node_1_elements)),
(node_2_idx, (_, _, node_2_elements))) = node_tuple
intersection = np.intersect1d(node_1_elements, node_2_elements)
intersection_size = len(intersection)
if intersection_size:
behavior(node_1_idx, node_2_idx, intersection_size,
intersection, node_1_elements, node_2_elements)
else:
continue
return node_index_pairs, weights, intersections, mapping