Source code for gtda.diagrams.preprocessing

"""Persistence diagram preprocessing."""
# License: GNU AGPLv3

from numbers import Real
from types import FunctionType

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from ._metrics import _AVAILABLE_AMPLITUDE_METRICS, _parallel_amplitude
from ._utils import _filter, _bin, _homology_dimensions_to_sorted_ints
from ..base import PlotterMixin
from ..plotting.persistence_diagrams import plot_diagram
from ..utils._docs import adapt_fit_transform_docs
from ..utils.intervals import Interval
from ..utils.validation import check_diagrams, validate_params


[docs]@adapt_fit_transform_docs class ForgetDimension(BaseEstimator, TransformerMixin, PlotterMixin): """Replaces all homology dimensions in persistence diagrams with ``numpy.inf``. Useful when downstream tasks require the use of topological features all at once -- and not separated between different homology dimensions. See also -------- PairwiseDistance, Amplitude, Scaler, Filtering """
[docs] def __init__(self): pass
[docs] def fit(self, X, y=None): """Do nothing and return the estimator unchanged. This method is here to implement the usual scikit-learn API and hence work in pipelines. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- self : object """ check_diagrams(X) self._is_fitted = True return self
[docs] def transform(self, X, y=None): """Replace all homology dimensions in `X` with ``numpy.inf``. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_features, 3) Output persistence diagram. """ check_is_fitted(self, '_is_fitted') Xt = check_diagrams(X, copy=True) Xt[:, :, 2] = np.inf # TODO: for plotting, replace the dimension with a tag return Xt
[docs] @staticmethod def plot(Xt, sample=0, plotly_params=None): """Plot a sample from a collection of persistence diagrams. Parameters ---------- Xt : ndarray of shape (n_samples, n_points, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. sample : int, optional, default: ``0`` Index of the sample in `Xt` to be plotted. plotly_params : dict or None, optional, default: ``None`` Custom parameters to configure the plotly figure. Allowed keys are ``"traces"`` and ``"layout"``, and the corresponding values should be dictionaries containing keyword arguments as would be fed to the :meth:`update_traces` and :meth:`update_layout` methods of :class:`plotly.graph_objects.Figure`. Returns ------- fig : :class:`plotly.graph_objects.Figure` object Plotly figure. """ return plot_diagram( Xt[sample], homology_dimensions=[np.inf], plotly_params=plotly_params )
[docs]@adapt_fit_transform_docs class Scaler(BaseEstimator, TransformerMixin, PlotterMixin): """Linear scaling of persistence diagrams. A positive scale factor :attr:`scale_` is calculated during :meth:`fit` by considering all available persistence diagrams partitioned according to homology dimensions. During :meth:`transform`, all birth-death pairs are divided by :attr:`scale_`. The value of :attr:`scale_` depends on two things: - A way of computing, for each homology dimension, the :ref:`amplitude <vectorization_amplitude_and_kernel>` in that dimension of a persistence diagram consisting of birth-death-dimension triples [b, d, q]. Together, `metric` and `metric_params` define this in the same way as in :class:`Amplitude`. - A scalar-valued function which is applied to the resulting two-dimensional array of amplitudes (one per diagram and homology dimension) to obtain :attr:`scale_`. **Important note**: - Input collections of persistence diagrams for this transformer must satisfy certain requirements, see e.g. :meth:`fit`. Parameters ---------- metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'betti'`` | \ ``'landscape'`` |``'silhouette'`` | ``'heat'`` | \ ``'persistence_image'``, optional, default: ``'bottleneck'`` See the corresponding parameter in :class:`Amplitude`. metric_params : dict or None, optional, default: ``None`` See the corresponding parameter in :class:`Amplitude`. function : callable, optional, default: ``numpy.max`` Function used to extract a positive scalar from the collection of amplitude vectors in :meth:`fit`. Must map 2D arrays to scalars. n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. Attributes ---------- effective_metric_params_ : dict Dictionary containing all information present in `metric_params` as well as relevant quantities computed in :meth:`fit`. homology_dimensions_ : tuple Homology dimensions seen in :meth:`fit`, sorted in ascending order. scale_ : float Value by which to rescale diagrams. See also -------- PairwiseDistance, ForgetDimension, Filtering, Amplitude Notes ----- When `metric` is ``'bottleneck'`` and `function` is ``numpy.max``, :meth:`fit_transform` has the effect of making the lifetime of the most persistent point across all diagrams and homology dimensions equal to 2. To compute scaling factors without first splitting the computation between different homology dimensions, data should be first transformed by an instance of :class:`ForgetDimension`. """ _hyperparameters = { 'metric': {'type': str, 'in': _AVAILABLE_AMPLITUDE_METRICS.keys()}, 'metric_params': {'type': (dict, type(None))}, 'function': {'type': (FunctionType, type(None))} }
[docs] def __init__(self, metric='bottleneck', metric_params=None, function=np.max, n_jobs=None): self.metric = metric self.metric_params = metric_params self.function = function self.n_jobs = n_jobs
[docs] def fit(self, X, y=None): """Store all observed homology dimensions in :attr:`homology_dimensions_` and compute :attr:`scale_`. Then, return the estimator. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). It is important that, for each possible homology dimension, the number of triples for which q equals that homology dimension is constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- self : object """ X = check_diagrams(X) validate_params( self.get_params(), self._hyperparameters, exclude=['n_jobs']) if self.metric_params is None: self.effective_metric_params_ = {} else: self.effective_metric_params_ = self.metric_params.copy() validate_params(self.effective_metric_params_, _AVAILABLE_AMPLITUDE_METRICS[self.metric]) # Find the unique homology dimensions in the 3D array X passed to `fit` # assuming that they can all be found in its zero-th entry homology_dimensions_fit = np.unique(X[0, :, 2]) self.homology_dimensions_ = \ _homology_dimensions_to_sorted_ints(homology_dimensions_fit) self.effective_metric_params_['samplings'], \ self.effective_metric_params_['step_sizes'] = \ _bin(X, self.metric, **self.effective_metric_params_) if self.metric == 'persistence_image': weight_function = self.effective_metric_params_.get( 'weight_function', None ) weight_function = \ np.ones_like if weight_function is None else weight_function self.effective_metric_params_['weight_function'] = weight_function amplitude_array = _parallel_amplitude(X, self.metric, self.effective_metric_params_, self.homology_dimensions_, self.n_jobs) self.scale_ = self.function(amplitude_array) return self
[docs] def transform(self, X, y=None): """Divide all birth and death values in `X` by :attr:`scale_`. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). It is important that, for each possible homology dimension, the number of triples for which q equals that homology dimension is constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xs : ndarray of shape (n_samples, n_features, 3) Rescaled diagrams. """ check_is_fitted(self) Xs = check_diagrams(X, copy=True) Xs[:, :, :2] /= self.scale_ return Xs
[docs] def inverse_transform(self, X): """Scale back the data to the original representation. Multiplies by the scale found in :meth:`fit`. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Data to apply the inverse transform to, c.f. :meth:`transform`. Returns ------- Xs : ndarray of shape (n_samples, n_features, 3) Rescaled diagrams. """ check_is_fitted(self) Xs = check_diagrams(X, copy=True) Xs[:, :, :2] *= self.scale_ return Xs
[docs] def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. Parameters ---------- Xt : ndarray of shape (n_samples, n_points, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. sample : int, optional, default: ``0`` Index of the sample in `Xt` to be plotted. homology_dimensions : list, tuple or None, optional, default: ``None`` Which homology dimensions to include in the plot. ``None`` is equivalent to passing :attr:`homology_dimensions_`. plotly_params : dict or None, optional, default: ``None`` Custom parameters to configure the plotly figure. Allowed keys are ``"traces"`` and ``"layout"``, and the corresponding values should be dictionaries containing keyword arguments as would be fed to the :meth:`update_traces` and :meth:`update_layout` methods of :class:`plotly.graph_objects.Figure`. Returns ------- fig : :class:`plotly.graph_objects.Figure` object Plotly figure. """ if homology_dimensions is None: _homology_dimensions = self.homology_dimensions_ else: _homology_dimensions = homology_dimensions return plot_diagram( Xt[sample], homology_dimensions=_homology_dimensions, plotly_params=plotly_params )
[docs]@adapt_fit_transform_docs class Filtering(BaseEstimator, TransformerMixin, PlotterMixin): """Filtering of persistence diagrams. Filtering a diagram means discarding all points [b, d, q] representing non-trivial topological features whose lifetime d - b is less than or equal to a cutoff value. Points on the diagonal (i.e. for which b and d are equal) may still appear in the output for padding purposes, but carry no information. **Important note**: - Input collections of persistence diagrams for this transformer must satisfy certain requirements, see e.g. :meth:`fit`. Parameters ---------- homology_dimensions : list, tuple, or None, optional, default: ``None`` When set to ``None``, subdiagrams corresponding to all homology dimensions seen in :meth:`fit` will be filtered. Otherwise, it contains the homology dimensions (as non-negative integers) at which filtering should occur. epsilon : float, optional, default: ``0.01`` The cutoff value controlling the amount of filtering. Attributes ---------- homology_dimensions_ : tuple If `homology_dimensions` is set to ``None``, contains the homology dimensions seen in :meth:`fit`, sorted in ascending order. Otherwise, it is a similarly sorted version of `homology_dimensions`. See also -------- PairwiseDistance, ForgetDimension, Scaler, Amplitude """ _hyperparameters = { 'homology_dimensions': { 'type': (list, tuple, type(None)), 'of': {'type': int, 'in': Interval(0, np.inf, closed='left')} }, 'epsilon': {'type': Real, 'in': Interval(0, np.inf, closed='left')} }
[docs] def __init__(self, homology_dimensions=None, epsilon=0.01): self.homology_dimensions = homology_dimensions self.epsilon = epsilon
[docs] def fit(self, X, y=None): """Store relevant homology dimensions in :attr:`homology_dimensions_`. Then, return the estimator. This method is here to implement the usual scikit-learn API and hence work in pipelines. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). It is important that, for each possible homology dimension, the number of triples for which q equals that homology dimension is constants across the entries of `X`. y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- self : object """ X = check_diagrams(X) validate_params( self.get_params(), self._hyperparameters) if self.homology_dimensions is None: # Find the unique homology dimensions in the 3D array X passed to # `fit` assuming that they can all be found in its zero-th entry homology_dimensions = np.unique(X[0, :, 2]) else: homology_dimensions = self.homology_dimensions self.homology_dimensions_ = \ _homology_dimensions_to_sorted_ints(homology_dimensions) return self
[docs] def transform(self, X, y=None): """Filter all relevant persistence subdiagrams. Parameters ---------- X : ndarray of shape (n_samples, n_features, 3) Input data. Array of persistence diagrams, each a collection of triples [b, d, q] representing persistent topological features through their birth (b), death (d) and homology dimension (q). It is important that, for each possible homology dimension, the number of triples for which q equals that homology dimension is constants across the entries of X. y : None There is no need for a target in a transformer, yet the pipeline API requires this parameter. Returns ------- Xt : ndarray of shape (n_samples, n_features_filtered, 3) Filtered persistence diagrams. Only the subdiagrams corresponding to dimensions in :attr:`homology_dimensions_` are filtered. ``n_features_filtered`` is less than or equal to ``n_features``. """ check_is_fitted(self) X = check_diagrams(X) Xt = _filter(X, self.homology_dimensions_, self.epsilon) return Xt
[docs] def plot(self, Xt, sample=0, homology_dimensions=None, plotly_params=None): """Plot a sample from a collection of persistence diagrams, with homology in multiple dimensions. Parameters ---------- Xt : ndarray of shape (n_samples, n_points, 3) Collection of persistence diagrams, such as returned by :meth:`transform`. sample : int, optional, default: ``0`` Index of the sample in `Xt` to be plotted. homology_dimensions : list, tuple or None, optional, default: ``None`` Which homology dimensions to include in the plot. ``None`` is equivalent to passing :attr:`homology_dimensions_`. plotly_params : dict or None, optional, default: ``None`` Custom parameters to configure the plotly figure. Allowed keys are ``"traces"`` and ``"layout"``, and the corresponding values should be dictionaries containing keyword arguments as would be fed to the :meth:`update_traces` and :meth:`update_layout` methods of :class:`plotly.graph_objects.Figure`. Returns ------- fig : :class:`plotly.graph_objects.Figure` object Plotly figure. """ if homology_dimensions is None: _homology_dimensions = self.homology_dimensions_ else: _homology_dimensions = homology_dimensions return plot_diagram( Xt[sample], homology_dimensions=_homology_dimensions, plotly_params=plotly_params )