Source code for gtda.time_series.preprocessing

"""Resampling and stationarization of time series data."""
# License: GNU AGPLv3

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array, column_or_1d
from sklearn.utils.validation import check_is_fitted

from ..base import TransformerResamplerMixin
from ..utils._docs import adapt_fit_transform_docs
from ..utils.intervals import Interval
from ..utils.validation import validate_params


@adapt_fit_transform_docs
class Resampler(BaseEstimator, TransformerResamplerMixin):
    """Time series resampling at regular intervals.

    Starting from the first one, one sample every `period` is kept along
    the first axis of the input; all other samples are discarded.

    Parameters
    ----------
    period : int, default: ``2``
        The sampling period, i.e. one point every period will be kept.

    Examples
    --------
    >>> import numpy as np
    >>> from gtda.time_series import Resampler
    >>> # Create a noisy signal
    >>> signal = np.asarray([np.sin(x / 40) + np.random.random()
    ...                      for x in range(0, 300)])
    >>> # Set up the Resampler
    >>> period = 10
    >>> periodic_sampler = Resampler(period=period)
    >>> # Fit and transform the signal
    >>> signal_resampled = periodic_sampler.fit_transform(signal)
    >>> print(signal_resampled.shape)
    (30,)

    """

    # Constraints checked by `validate_params` in `fit`: `period` must be an
    # int lying in the left-closed interval starting at 1.
    _hyperparameters = {
        'period': {'type': int, 'in': Interval(1, np.inf, closed='left')}
    }

    def __init__(self, period=2):
        self.period = period

    def fit(self, X, y=None):
        """Validate the input and the hyperparameters, and return the
        estimator.

        Nothing is learned from `X`. This method is here to implement the
        usual scikit-learn API and hence work in pipelines.

        Parameters
        ----------
        X : ndarray of shape (n_samples,) or (n_samples, ...)
            Input data.

        y : None
            Ignored.

        Returns
        -------
        self : object

        """
        check_array(X, ensure_2d=False, allow_nd=True)
        validate_params(self.get_params(), self._hyperparameters)

        # Marker attribute looked up by `check_is_fitted` in `transform` and
        # `resample`.
        self._is_fitted = True
        return self

    def transform(self, X, y=None):
        """Resample `X`.

        Parameters
        ----------
        X : ndarray of shape (n_samples,) or (n_samples, ...)
            Input data.

        y : None
            There is no need for a target, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples_new,) or (n_samples_new, ...)
            Resampled array, with the same number of dimensions as `X`.
            Samples ``0, period, 2 * period, ...`` are kept, so that
            ``n_samples_new = ceil(n_samples / period)``.

        """
        check_is_fitted(self, '_is_fitted')
        Xt = check_array(X, ensure_2d=False, allow_nd=True, copy=True)

        # Strided slicing along the first axis works for any number of
        # dimensions, so 1D input needs no special handling and stays 1D.
        return Xt[::self.period]

    def resample(self, y, X=None):
        """Resample `y`.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target.

        X : None
            There is no need for input data,
            yet the pipeline API requires this parameter.

        Returns
        -------
        yr : ndarray of shape (n_samples_new,)
            Resampled target. Entries ``0, period, 2 * period, ...`` are
            kept, so that ``n_samples_new = ceil(n_samples / period)``.

        """
        check_is_fitted(self, '_is_fitted')
        yr = column_or_1d(y)
        return yr[::self.period]
class Stationarizer(BaseEstimator, TransformerResamplerMixin):
    """Methods for stationarizing time series data.

    Time series may be stationarized to remove or reduce linear or
    exponential trends.

    Parameters
    ----------
    operation : ``'return'`` | ``'log-return'``, default: ``'return'``
        The type of stationarization operation to perform. It can have
        two values:

        - ``'return'``:
          This option transforms the time series :math:`{X_t}_t` into the
          time series of relative returns, i.e. the ratio :math:`(X_t-X_{
          t-1})/X_t`.

        - ``'log-return'``:
          This option transforms the time series :math:`{X_t}_t` into the
          time series of relative log-returns, i.e. :math:`\\log(X_t/X_{
          t-1})`.

    Examples
    --------
    >>> import numpy as np
    >>> from gtda.time_series import Stationarizer
    >>> # Create a noisy signal
    >>> signal = np.asarray([np.sin(x / 40) + 5 + np.random.random()
    ...                      for x in range(0, 300)]).reshape(-1, 1)
    >>> # Initialize the stationarizer
    >>> stationarizer = Stationarizer(operation='return')
    >>> # Fit and transform the signal
    >>> signal_stationarized = stationarizer.fit_transform(signal)
    >>> print(signal_stationarized.shape)
    (299, 1)

    """

    # Constraints checked by `validate_params` in `fit`.
    _hyperparameters = {
        'operation': {'type': str, 'in': ['return', 'log-return']}
    }

    def __init__(self, operation='return'):
        self.operation = operation

    def fit(self, X, y=None):
        """Validate the input and the hyperparameters, and return the
        estimator.

        Nothing is learned from `X`. This method is here to implement the
        usual scikit-learn API and hence work in pipelines.

        Parameters
        ----------
        X : ndarray of shape (n_samples,) or (n_samples, ...)
            Input data.

        y : None
            Ignored.

        Returns
        -------
        self : object

        """
        check_array(X, ensure_2d=False, allow_nd=True)
        validate_params(self.get_params(), self._hyperparameters)

        # Marker attribute looked up by `check_is_fitted` in `transform` and
        # `resample`.
        self._is_fitted = True
        return self

    def transform(self, X, y=None):
        """Stationarize `X` by applying the procedure given by `operation`.

        Parameters
        ----------
        X : ndarray of shape (n_samples,) or (n_samples, ...)
            Input data.

        y : None
            There is no need for a target, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray of shape (n_samples_new, ...)
            Stationarized array. ``n_samples_new = n_samples - 1``. 1D
            input is treated as a single column, so the output then has
            shape (n_samples_new, 1).

        """
        check_is_fitted(self, '_is_fitted')
        Xt = check_array(X, ensure_2d=False, allow_nd=True)

        # Promote 1D input to a single column, so the output always has at
        # least two dimensions.
        if Xt.ndim == 1:
            Xt = Xt[:, None]

        if self.operation == 'return':
            # (X_t - X_{t-1}) / X_t: each difference is divided by the
            # *later* of the two samples it was computed from.
            return np.diff(Xt, n=1, axis=0) / Xt[1:]

        # Any other value is treated as 'log-return':
        # log(X_t) - log(X_{t-1}) = log(X_t / X_{t-1}).
        return np.diff(np.log(Xt), n=1, axis=0)

    def resample(self, y, X=None):
        """Resample `y`.

        The first entry of `y` is dropped, so that the result has the same
        number of samples as the output of :meth:`transform`.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target.

        X : None
            There is no need for input data,
            yet the pipeline API requires this parameter.

        Returns
        -------
        yr : ndarray of shape (n_samples_new,)
            Resampled target. ``n_samples_new = n_samples - 1``.

        """
        check_is_fitted(self, '_is_fitted')
        y = column_or_1d(y)
        return y[1:]