Source code for gtda.mapper.filter

"""Filter functions commonly used with Mapper."""
# License: GNU AGPLv3

import warnings

import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.stats import entropy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

from ..utils._docs import adapt_fit_transform_docs


@adapt_fit_transform_docs
class Eccentricity(BaseEstimator, TransformerMixin):
    """Eccentricities of points in a point cloud or abstract metric space.

    Let `D` be a square matrix representing distances between points in a
    point cloud, or directly defining an abstract metric (or metric-like)
    space. The eccentricity of point `i` in the point cloud or abstract
    metric space is the `p`-norm (for some `p`) of row `i` in `D`.

    Parameters
    ----------
    exponent : int or float, optional, default: ``2``
        `p`-norm exponent used to calculate eccentricities from the distance
        matrix.

    metric : str or function, optional, default: ``'euclidean'``
        Metric to use to compute the distance matrix if point cloud data is
        passed as input, or ``'precomputed'`` to specify that the input is
        already a distance matrix. If not ``'precomputed'``, it may be
        anything allowed by :func:`scipy.spatial.distance.pdist`.

    metric_params : dict, optional, default: ``{}``
        Additional keyword arguments for the metric function.

    """
    def __init__(self, exponent=2, metric='euclidean', metric_params={}):
        self.exponent = exponent
        self.metric = metric
        self.metric_params = metric_params
    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method exists to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_samples, \
            n_samples)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        self : object

        """
        # TODO: Consider making this transformer stateful so that the
        # eccentricities of new points relative to the data seen in fit
        # may be computed. May be useful for supervised tasks with Mapper?
        # Evaluate performance impact of doing this.
        check_array(X)
        self._is_fitted = True
        return self
    def transform(self, X, y=None):
        """Compute the eccentricities of points (i.e. rows) in `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_samples, \
            n_samples)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, 1)
            Column vector of eccentricities of points in `X`.

        """
        check_is_fitted(self, '_is_fitted')
        Xt = check_array(X)
        if self.metric != 'precomputed':
            Xt = squareform(
                pdist(Xt, metric=self.metric, **self.metric_params))
        Xt = np.linalg.norm(Xt, axis=1, ord=self.exponent, keepdims=True)
        return Xt
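
# A minimal usage sketch, not part of the library source: with the default
# Euclidean metric and ``exponent=2``, each output entry is the 2-norm of
# the corresponding row of the pairwise distance matrix. The toy array
# below is made up for illustration.
if __name__ == "__main__":
    cloud = np.array([[0., 0.], [1., 0.], [0., 1.], [5., 5.]])
    ecc = Eccentricity(exponent=2, metric='euclidean')
    # The outlier [5., 5.] is far from the other points, so its row of the
    # distance matrix has the largest 2-norm. Output shape: (4, 1).
    print(ecc.fit_transform(cloud))
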
@adapt_fit_transform_docs
class Entropy(BaseEstimator, TransformerMixin):
    """Entropy of rows in a two-dimensional array.

    The rows of the array are interpreted as probability vectors, after
    taking absolute values if necessary and normalizing. Then, their
    (base 2) Shannon entropies are computed and returned.

    """
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method exists to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        self : object

        """
        check_array(X)
        self._is_fitted = True
        return self
    def transform(self, X, y=None):
        """For each row in the array, take absolute values of any negative
        entry, normalize, and compute the Shannon entropy.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, 1)
            Array of Shannon entropies.

        """
        # TODO: The following is a crude method to ensure each row vector
        # consists of "probabilities" that sum to one. Consider
        # normalization in terms of bin counts?
        check_is_fitted(self, '_is_fitted')
        Xt = check_array(X)
        if np.any(Xt < 0):
            warnings.warn("Negative values detected in X! Taking absolute "
                          "value to calculate probabilities.")
            Xt = np.abs(Xt)
        Xt = entropy(Xt, base=2, axis=1)[:, None]
        return Xt
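
# A minimal usage sketch, not part of the library source: rows are
# renormalized internally by :func:`scipy.stats.entropy`, so raw
# non-negative counts can be passed directly. The toy values are made up.
if __name__ == "__main__":
    counts = np.array([[1., 1., 1., 1.],   # uniform over four bins
                       [4., 0., 0., 0.]])  # all mass in one bin
    # Base-2 entropies: 2.0 bits for the uniform row, 0.0 for the
    # concentrated one. Output shape: (2, 1).
    print(Entropy().fit_transform(counts))
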
@adapt_fit_transform_docs
class Projection(BaseEstimator, TransformerMixin):
    """Projection onto specified columns.

    In practice, this simply means returning a selection of columns of the
    data.

    Parameters
    ----------
    columns : int or list of int, optional, default: ``0``
        The column indices of the array to project onto.

    """
    def __init__(self, columns=0):
        self.columns = columns
    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method exists to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        self : object

        """
        check_array(X)
        self._is_fitted = True
        return self
    def transform(self, X, y=None):
        """Return selected columns of the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_columns)
            Output array, where ``n_columns`` is the number of indices in
            `columns` (1 if `columns` is a single integer).

        """
        check_is_fitted(self, '_is_fitted')
        # Simple duck typing to handle the case of pandas dataframe input
        if hasattr(X, 'columns'):
            # NB in this case we do not check the health of other columns
            Xt = check_array(X[self.columns], ensure_2d=False, copy=True)
        else:
            Xt = check_array(X, copy=True)
            Xt = Xt[:, self.columns]
        Xt = Xt.reshape(len(Xt), -1)
        return Xt
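
# A minimal usage sketch, not part of the library source: passing a list of
# indices keeps those columns, while passing a single integer still yields a
# two-dimensional output thanks to the final reshape. The toy array is made
# up for illustration.
if __name__ == "__main__":
    data = np.arange(12.).reshape(4, 3)
    print(Projection(columns=[0, 2]).fit_transform(data))  # shape (4, 2)
    print(Projection(columns=0).fit_transform(data))       # shape (4, 1)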