"""Rescaling methods for persistent homology."""
# License: GNU AGPLv3
import itertools
from numbers import Real
from types import FunctionType
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array, check_is_fitted
from ..base import PlotterMixin
from ..plotting import plot_heatmap
from ..utils._docs import adapt_fit_transform_docs
from ..utils.intervals import Interval
from ..utils.validation import validate_params
[docs]@adapt_fit_transform_docs
class ConsistentRescaling(BaseEstimator, TransformerMixin, PlotterMixin):
"""Rescaling of distances between pairs of points by the geometric mean
of the distances to the respective :math:`k`-th nearest neighbours.
Based on ideas in [1]_. The computation during :meth:`transform` depends on
the nature of the array `X`. If each entry in `X` along axis 0 represents a
distance matrix :math:`D`, then the corresponding entry in the transformed
array is the distance matrix
:math:`D'_{i,j} = D_{i,j}/\\sqrt{D_{i,k_i}D_{j,k_j}}`, where :math:`k_i` is
the index of the :math:`k`-th largest value in row :math:`i` (and similarly
for :math:`j`). If the entries in `X` represent point clouds, their
distance matrices are first computed, and then rescaled according to the
same formula.
Parameters
----------
metric : string or callable, optional, default: ``'euclidean'``
If set to ``'precomputed'``, each entry in `X` along axis 0 is
interpreted to be a distance matrix. Otherwise, entries are
interpreted as feature arrays, and `metric` determines a rule with
which to calculate distances between pairs of instances (i.e. rows)
in these arrays.
If `metric` is a string, it must be one of the options allowed by
:func:`scipy.spatial.distance.pdist` for its metric parameter, or a
metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`,
including "euclidean", "manhattan" or "cosine".
If `metric` is a callable function, it is called on each pair of
instances and the resulting value recorded. The callable should take
two arrays from the entry in `X` as input, and return a value
indicating the distance between them.
metric_params : dict or None, optional, default: ``None``
Additional keyword arguments for the metric function.
neighbor_rank : int, optional, default: ``1``
Rank of the neighbors used to modify the metric structure according
to the "consistent rescaling" procedure.
n_jobs : int or None, optional, default: ``None``
The number of jobs to use for the computation. ``None`` means 1
unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
using all processors.
Attributes
----------
effective_metric_params_ : dict
Dictionary containing all information present in `metric_params`.
If `metric_params` is ``None``, it is set to the empty dictionary.
Examples
--------
>>> import numpy as np
>>> from gtda.point_clouds import ConsistentRescaling
>>> X = np.array([[[0, 0], [1, 2], [5, 6]]])
>>> cr = ConsistentRescaling()
>>> X_rescaled = cr.fit_transform(X)
>>> print(X_rescaled.shape)
(1, 3, 3)
See also
--------
ConsecutiveRescaling
References
----------
.. [1] T. Berry and T. Sauer, "Consistent manifold representation for
topological data analysis"; *Foundations of data analysis* **1**,
pp. 1--38, 2019; doi: `10.3934/fods.2019001
<http://dx.doi.org/10.3934/fods.2019001>`_.
"""
_hyperparameters = {
'metric': {'type': (str, FunctionType)},
'metric_params': {'type': (dict, type(None))},
'neighbor_rank': {
'type': int, 'in': Interval(1, np.inf, closed='left')}
}
[docs] def __init__(self, metric='euclidean', metric_params=None, neighbor_rank=1,
n_jobs=None):
self.metric = metric
self.metric_params = metric_params
self.neighbor_rank = neighbor_rank
self.n_jobs = n_jobs
def _consistent_rescaling(self, X):
Xm = pairwise_distances(X, metric=self.metric, n_jobs=1,
**self.effective_metric_params_)
indices_k_neighbor = np.argsort(Xm)[:, self.neighbor_rank]
distance_k_neighbor = Xm[np.arange(X.shape[0]),
indices_k_neighbor]
# Only calculate metric for upper triangle
Xc = np.zeros(Xm.shape)
iterator = itertools.combinations(range(Xm.shape[0]), 2)
for i, j in iterator:
Xc[i, j] = Xm[i, j] / (np.sqrt(distance_k_neighbor[i] *
distance_k_neighbor[j]))
return Xc + Xc.T
[docs] def fit(self, X, y=None):
"""Calculate :attr:`effective_metric_params_`. Then, return the
estimator.
This method is here to implement the usual scikit-learn API and hence
work in pipelines.
Parameters
----------
X : ndarray of shape (n_samples, n_points, n_points) or (n_samples, \
n_points, n_dimensions)
Input data. If ``metric == 'precomputed'``, the input should be an
ndarray whose each entry along axis 0 is a distance matrix of shape
``(n_points, n_points)``. Otherwise, each such entry will be
interpreted as an array of ``n_points`` row vectors in
``n_dimensions``-dimensional space.
y : None
There is no need for a target in a transformer, yet the pipeline
API requires this parameter.
Returns
-------
self : object
"""
check_array(X, allow_nd=True)
validate_params(
self.get_params(), self._hyperparameters, exclude=['n_jobs'])
if self.metric_params is None:
self.effective_metric_params_ = {}
else:
self.effective_metric_params_ = self.metric_params.copy()
return self
[docs] @staticmethod
def plot(Xt, sample=0, colorscale='blues'):
"""Plot a sample from a collection of distance matrices.
Parameters
----------
Xt : ndarray of shape (n_samples, n_points, n_points)
Collection of distance matrices, such as returned by
:meth:`transform`.
sample : int, optional, default: ``0``
Index of the sample to be plotted.
colorscale : str, optional, default: ``'blues'``
Color scale to be used in the heat map. Can be anything allowed by
:class:`plotly.graph_objects.Heatmap`.
"""
return plot_heatmap(Xt[sample], colorscale=colorscale)
[docs]@adapt_fit_transform_docs
class ConsecutiveRescaling(BaseEstimator, TransformerMixin, PlotterMixin):
"""Rescaling of distances between consecutive pairs of points by a fixed
factor.
The computation during :meth:`transform` depends on the nature of the array
`X`. If each entry in `X` along axis 0 represents a distance matrix
:math:`D`, then the corresponding entry in the transformed array is the
distance matrix :math:`D'_{i,i+1} = \\alpha D_{i,i+1}` where
:math:`\\alpha` is a positive factor. If the entries in `X` represent point
clouds, their distance matrices are first computed, and then rescaled
according to the same formula.
Parameters
----------
metric : string or callable, optional, default: ``'euclidean'``
If set to ``'precomputed'``, each entry in `X` along axis 0 is
interpreted to be a distance matrix. Otherwise, entries are
interpreted as feature arrays, and `metric` determines a rule with
which to calculate distances between pairs of instances (i.e. rows)
in these arrays.
If `metric` is a string, it must be one of the options allowed by
:func:`scipy.spatial.distance.pdist` for its metric parameter, or a
metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`,
including "euclidean", "manhattan" or "cosine".
If `metric` is a callable function, it is called on each pair of
instances and the resulting value recorded. The callable should take
two arrays from the entry in `X` as input, and return a value
indicating the distance between them.
metric_params : dict or None, optional, default: ``None``
Additional keyword arguments for the metric function.
factor : float, optional, default: ``0.``
Factor by which to multiply the distance between consecutive
points.
n_jobs : int or None, optional, default: ``None``
The number of jobs to use for the computation. ``None`` means 1
unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
using all processors.
Attributes
----------
effective_metric_params_ : dict
Dictionary containing all information present in `metric_params`.
If `metric_params` is ``None``, it is set to the empty dictionary.
Examples
--------
>>> import numpy as np
>>> from gtda.point_clouds import ConsecutiveRescaling
>>> X = np.array([[[0, 0], [1, 2], [5, 6]]])
>>> cr = ConsecutiveRescaling()
>>> X_rescaled = cr.fit_transform(X)
>>> print(X_rescaled.shape)
(1, 3, 3)
See also
--------
ConsistentRescaling
"""
_hyperparameters = {
'metric': {'type': (str, FunctionType)},
'metric_params': {'type': (dict, type(None))},
'factor': {
'type': Real, 'in': Interval(0, np.inf, closed='both')}
}
[docs] def __init__(self, metric='euclidean', metric_params=None, factor=0.,
n_jobs=None):
self.metric = metric
self.metric_params = metric_params
self.factor = factor
self.n_jobs = n_jobs
[docs] def fit(self, X, y=None):
"""Calculate :attr:`effective_metric_params_`. Then, return the
estimator.
This method is here to implement the usual scikit-learn API and hence
work in pipelines.
Parameters
----------
X : ndarray of shape (n_samples, n_points, n_points) or (n_samples, \
n_points, n_dimensions)
Input data. If ``metric == 'precomputed'``, the input should be an
ndarray whose each entry along axis 0 is a distance matrix of shape
``(n_points, n_points)``. Otherwise, each such entry will be
interpreted as an array of ``n_points`` row vectors in
``n_dimensions``-dimensional space.
y : None
There is no need for a target in a transformer, yet the pipeline
API requires this parameter.
Returns
-------
self : object
"""
check_array(X, allow_nd=True)
validate_params(
self.get_params(), self._hyperparameters, exclude=['n_jobs'])
if self.metric_params is None:
self.effective_metric_params_ = {}
else:
self.effective_metric_params_ = self.metric_params.copy()
return self
[docs] @staticmethod
def plot(Xt, sample=0, colorscale='blues'):
"""Plot a sample from a collection of distance matrices.
Parameters
----------
Xt : ndarray of shape (n_samples, n_points, n_points)
Collection of distance matrices, such as returned by
:meth:`transform`.
sample : int, optional, default: ``0``
Index of the sample to be plotted.
colorscale : str, optional, default: ``'blues'``
Color scale to be used in the heat map. Can be anything allowed by
:class:`plotly.graph_objects.Heatmap`.
"""
return plot_heatmap(Xt[sample], colorscale=colorscale)