Source code for ripser.ripser

"""
MIT License

Copyright (c) 2018 Christopher Tralie and Nathaniel Saul

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import warnings

from scipy import sparse

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics.pairwise import pairwise_distances

from pyRipser import doRipsFiltrationDM as DRFDM
from pyRipser import doRipsFiltrationDMSparse as DRFDMSparse


def dpoint2pointcloud(X, i, metric):
    """
    Return the distance from the ith point in a Euclidean point cloud
    to the rest of the points
    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of data
    i: int
        The index of the point from which to return all distances
    metric: string or callable
        The metric to use when calculating distance between instances in a
        feature array
    """
    ds = pairwise_distances(X, X[i, :][None, :], metric=metric).flatten()
    ds[i] = 0
    return ds


def get_greedy_perm(X, n_perm=None, distance_matrix=False, metric="euclidean"):
    """
    Compute a furthest point sampling permutation of a set of points
    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix
    distance_matrix: bool
        Indicator that X is a distance matrix, if not we compute
        distances in X using the chosen metric.
    n_perm: int
        Number of points to take in the permutation
    metric: string or callable
        The metric to use when calculating distance between instances in a
        feature array
    Returns
    -------
    idx_perm: ndarray(n_perm)
        Indices of points in the greedy permutation
    lambdas: ndarray(n_perm)
        Covering radii at different points
    dperm2all: ndarray(n_perm, n_samples)
        Distances from points in the greedy permutation to points
        in the original point set
    """
    if not n_perm:
        n_perm = X.shape[0]
    # By default, takes the first point in the list to be the
    # first point in the permutation, but could be random
    idx_perm = np.zeros(n_perm, dtype=np.int64)
    lambdas = np.zeros(n_perm)
    if distance_matrix:

        def dpoint2all(i):
            return X[i, :]
    else:

        def dpoint2all(i):
            return dpoint2pointcloud(X, i, metric)

    ds = dpoint2all(0)
    dperm2all = [ds]
    for i in range(1, n_perm):
        idx = np.argmax(ds)
        idx_perm[i] = idx
        lambdas[i - 1] = ds[idx]
        dperm2all.append(dpoint2all(idx))
        ds = np.minimum(ds, dperm2all[-1])
    lambdas[-1] = np.max(ds)
    dperm2all = np.array(dperm2all)
    return (idx_perm, lambdas, dperm2all)



[docs]
def ripser(
    X,
    maxdim=1,
    thresh=np.inf,
    coeff=2,
    distance_matrix=False,
    do_cocycles=False,
    metric="euclidean",
    n_perm=None,
):
    """Compute persistence diagrams for X.

    X can be a data set of points or a distance matrix. When using a data set
    as X it will be converted to a distance matrix using the metric specified.

    Parameters
    ----------

    X : ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix (also pass `distance_matrix=True`). Can also be a sparse distance matrix of type scipy.sparse

    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions lower than and equal to this value.  For 1, H_0 and H_1 will be computed.

    thresh: float, default infinity
        Maximum distances considered when constructing filtration.  If infinity, compute the entire filtration.

    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.

    distance_matrix: bool, optional, default False
        When True the input matrix X will be considered a distance matrix.

    do_cocycles: bool, optional, default False
        Computed cocycles will be available in the `cocycles` value
        of the return dictionary.

    metric: string or callable, optional, default "euclidean"
        Use this metric to compute distances between rows of X.

        "euclidean", "manhattan" and "cosine" are already provided metrics
        to choose from by using their name.

        You can provide a callable function and it will be used with two
        rows as arguments, it will be called once for each pair of rows in X.

        The computed distance will be available in the result dictionary under
        the key `dperm2all`.

    n_perm: int, optional, default None
        The number of points to subsample in a "greedy permutation,"
        or a furthest point sampling of the points.  These points
        will be used in lieu of the full point cloud for a faster
        computation, at the expense of some accuracy, which can
        be bounded as a maximum bottleneck distance to all diagrams
        on the original point set

    Returns
    -------
    dict
        The result of the computation.

        .. note::
            Each list in `dgms` has a relative list in `cocycles`.

            >>> r = ripser(...)

            For each dimension ``d`` and index ``k`` then ``r['dgms'][d][k]``
            is the barcode associated to the representative cocycle
            ``r['cocycles'][d][k]``.

        The keys available in the dictionary are the:

            * ``dgms``: list (size maxdim) of ndarray (n_pairs, 2)
                For each dimension less than ``maxdim`` a list of persistence diagrams.
                Each persistent diagram is a pair (birth time, death time).
            * ``cocycles``: list (size maxdim) of list of ndarray
                For each dimension less than ``maxdim`` a list of representative cocycles.
                Each representative cocycle in dimension ``d`` is represented as a
                ndarray of ``(k,d+1)`` elements. Each non zero value of the cocycle
                is laid out in a row, first the ``d`` indices of the simplex and then
                the value of the cocycle on the simplex.  The indices of the simplex
                reference the original point cloud, even if a greedy permutation was used.
            * ``num_edges``: int
                The number of edges added during the computation
            * ``dperm2all``: ndarray(n_samples, n_samples) or ndarray (n_perm, n_samples) if n_perm
                The distance matrix used during the computation. When ``n_perm``
                is not None the distance matrix will only refers to the subsampled
                dataset.
            * ``idx_perm``: ndarray(n_perm) if ``n_perm`` > 0
                Index into the original point cloud of the points used
                as a subsample in the greedy permutation

                    >>> r = ripser(X, n_perm=k)
                    >>> subsampling = X[r['idx_perm']]

            * 'r_cover': float
                Covering radius of the subsampled points.
                If ``n_perm <= 0``, then the full point cloud was used and this is 0

    Examples
    --------

    .. code:: python

        from ripser import ripser, plot_dgms
        from sklearn import datasets
        from persim import plot_diagrams

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data)['dgms']
        plot_diagrams(dgms, show = True)

    Raises
    ------

    ValueError
        If the distance matrix is not square.

    ValueError
        When using both a greedy permutation and a sparse distance matrix.

    ValueError
        When `n_perm` value is bigger than the number of rows in the matrix.

    ValueError
        When `n_perm` is non positive.

    Warns
    ----

        When using a square matrix without toggling `distance_matrix` to True.

        When there are more columns than rows (as each row is a different data point).

    """

    if distance_matrix:
        if not (X.shape[0] == X.shape[1]):
            raise ValueError("Distance matrix is not square")
    else:
        if X.shape[0] == X.shape[1]:
            warnings.warn(
                "The input matrix is square, but the distance_matrix "
                + "flag is off.  Did you mean to indicate that "
                + "this was a distance matrix?"
            )
        elif X.shape[0] < X.shape[1]:
            warnings.warn(
                "The input point cloud has more columns than rows; "
                + "did you mean to transpose?"
            )

    if n_perm and distance_matrix and sparse.issparse(X):
        raise ValueError(
            "Greedy permutation is not supported for sparse distance matrices"
        )
    if n_perm and n_perm > X.shape[0]:
        raise ValueError(
            "Number of points in greedy permutation is greater"
            + " than number of points in the point cloud"
        )
    if n_perm and n_perm < 0:
        raise ValueError(
            "Should be a strictly positive number of points in the greedy permutation"
        )

    idx_perm = np.arange(X.shape[0])
    r_cover = 0.0
    doing_permutation = False
    if n_perm and n_perm < X.shape[0]:
        doing_permutation = True
        idx_perm, lambdas, dperm2all = get_greedy_perm(
            X, n_perm=n_perm, distance_matrix=distance_matrix, metric=metric
        )
        r_cover = lambdas[-1]
        dm = dperm2all[:, idx_perm]
    else:
        if distance_matrix:
            dm = X
        else:
            dm = pairwise_distances(X, metric=metric)
        dperm2all = dm

    n_points = dm.shape[0]
    if not sparse.issparse(dm) and np.sum(np.abs(dm.diagonal()) > 0) > 0:
        # If any of the diagonal elements are nonzero,
        # convert to sparse format, because currently
        # that's the only format that handles nonzero
        # births
        dm = sparse.coo_matrix(dm)

    if sparse.issparse(dm):
        if sparse.isspmatrix_coo(dm):
            # If the matrix is already COO, we need to order the row and column indices
            # lexicographically to avoid errors. See issue #103
            row, col, data = dm.row, dm.col, dm.data
            lex_sort_idx = np.lexsort((col, row))
            row, col, data = row[lex_sort_idx], col[lex_sort_idx], data[lex_sort_idx]
        else:
            # Lexicographic ordering is performed by scipy upon conversion to COO
            coo = dm.tocoo()
            row, col, data = coo.row, coo.col, coo.data
        res = DRFDMSparse(
            row.astype(dtype=np.int32, order="C"),
            col.astype(dtype=np.int32, order="C"),
            np.array(data, dtype=np.float32, order="C"),
            n_points,
            maxdim,
            thresh,
            coeff,
            do_cocycles,
        )
    else:
        row_grid, col_grid = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[row_grid > col_grid], dtype=np.float32)
        res = DRFDM(DParam, maxdim, thresh, coeff, do_cocycles)

    # Unwrap persistence diagrams
    dgms = res["births_and_deaths_by_dim"]
    for dim in range(len(dgms)):
        N = int(len(dgms[dim]) / 2)
        dgms[dim] = np.reshape(np.array(dgms[dim]), [N, 2])

    # Unwrap cocycles
    cocycles = []
    for dim in range(len(res["cocycles_by_dim"])):
        cocycles.append([])
        for j in range(len(res["cocycles_by_dim"][dim])):
            ccl = res["cocycles_by_dim"][dim][j]
            n = int(len(ccl) / (dim + 2))
            ccl = np.reshape(np.array(ccl, dtype=np.int64), [n, dim + 2])
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            if doing_permutation:
                # Retain original indices in the point cloud
                ccl[:, 0:-1] = idx_perm[ccl[:, 0:-1]]
            cocycles[dim].append(ccl)

    ret = {
        "dgms": dgms,
        "cocycles": cocycles,
        "num_edges": res["num_edges"],
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover,
    }
    return ret



def correlation_to_distance(C, check_input=True):
    """
    Converts a Pearson/Spearman correlation matrix into a metric space matrix.

    Using the transformation :math:`D = \\sqrt{2(1 - C)}`, this maps a correlation
    matrix into a valid Euclidean/chordal metric space on a hypersphere. It is
    ideal for downstream persistent homology workflows, satisfying the triangle
    inequality.

    Parameters
    ----------
    C: ndarray (n_samples, n_samples).
        A symmetric correlation matrix with values in the range [-1,1]
    check_input: bool, default=True.
        If True, validate the input matrix as square, symmetric,
        bound-compliant and has a valid diagonal

    Returns
    ----------
    D: ndarray (n_samples, n_samples).
        A symmetric distance matrix with values in the range [0,2].
        Diagonal is zeroed out.

    Raises
    ----------
    ValueError
        If the input is not 2D, is not square, is asymmetric, or contains
        values outside [-1,1] by > 0.05

    Warns
    ----------
    RuntimeWarning
        If the input contains values outside [-1,1] by < 0.05,
        If the input diagonal has entries outside of 1.0 by < 1e-3

    Examples
    --------
    >>> import numpy as np
    >>> from ripser import ripser, correlation_to_distance
    >>> # Generate a dummy symmetric correlation matrix
    >>> C = np.array([[1.0, 0.4, -0.2], [0.4, 1.0, 0.1], [-0.2, 0.1, 1.0]])
    >>> D = correlation_to_distance(C)
    >>> print(D)
    [[0.         1.09544512 1.54919334]
     [1.09544512 0.         1.34164079]
     [1.54919334 1.34164079 0.        ]]
    >>> # Feed straight into ripser
    >>> result = ripser(D, distance_matrix=True)
    """
    C = np.asarray(C, dtype=float)
    if check_input:  # user MAY bypass checks; np.allclose is potential bottleneck
        # errors
        if C.ndim != 2:
            raise ValueError(
                f"Input must be a 2D matrix, got a {C.ndim}D array instead."
            )
        if C.shape[0] != C.shape[1]:
            raise ValueError(f"Matrix must be square (n x n), got shape {C.shape}.")
        if not np.allclose(C, C.T, rtol=1e-5, atol=1e-5):
            raise ValueError("Matrix must be symmetric; C_ij must equal C_ji")
        if np.any(
            (C < -1.05) | (C > 1.05)
        ):  # severe deviations often mean covar. or unscaled data
            raise ValueError(
                "Values are significantly outside the valid correlation bounds [-1,1]. "
                "Ensure you are passing a correlation matrix, not a covariance or raw data matrix."
            )
        # warnings
        if np.any((C < -1.0) | (C > 1.0)):  # moderate deviations will be rounded
            warnings.warn(
                "Some values are slightly outside the valid correlation bounds [-1,1]. "
                "For floating-point precision, these values have been clipped.",
                RuntimeWarning,
            )
        if not np.allclose(np.diag(C), 1.0, atol=1e-3):
            warnings.warn(
                "The matrix diagonal elements deviate from 1.0. Ensure this is a correlation matrix.",
                RuntimeWarning,
            )

    D = 2.0 * (1.0 - np.clip(C, -1.0, 1.0))
    np.sqrt(D, out=D)
    np.fill_diagonal(D, 0.0)
    return D


def lower_star_img(img):
    """
    Construct a lower star filtration on an image

    Parameters
    ----------
    img: ndarray (M, N)
        An array of single channel image data

    Returns
    -------
    I: ndarray (K, 2)
        A 0-dimensional persistence diagram corresponding to the sublevelset filtration
    """
    m, n = img.shape

    idxs = np.arange(m * n).reshape((m, n))

    row_indices = idxs.flatten()
    col_indices = idxs.flatten()
    values = img.flatten()

    # Connect 8 spatial neighbors
    tidxs = np.ones((m + 2, n + 2), dtype=np.int64) * np.nan
    tidxs[1:-1, 1:-1] = idxs

    tD = np.ones_like(tidxs) * np.nan
    tD[1:-1, 1:-1] = img

    for di in [-1, 0, 1]:
        for dj in [-1, 0, 1]:
            if di == 0 and dj == 0:
                continue

            thisJ = np.roll(np.roll(tidxs, di, axis=0), dj, axis=1)
            thisD = np.roll(np.roll(tD, di, axis=0), dj, axis=1)
            thisD = np.maximum(thisD, tD)

            # Deal with boundaries
            boundary = ~np.isnan(thisD)
            thisI = tidxs[boundary]
            thisJ = thisJ[boundary]
            thisD = thisD[boundary]

            row_indices = np.concatenate((row_indices, thisI.flatten()))
            col_indices = np.concatenate((col_indices, thisJ.flatten()))
            values = np.concatenate((values, thisD.flatten()))

    sparseDM = sparse.coo_matrix(
        (values, (row_indices, col_indices)), shape=(idxs.size, idxs.size)
    )

    return ripser(sparseDM, distance_matrix=True, maxdim=0)["dgms"][0]



[docs]
class Rips(TransformerMixin):
    """sklearn style class interface for :code:`ripser` with :code:`fit` and :code:`transform` methods..

    Parameters
    ----------
    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.
        For 1, H_0 and H_1 will be computed.

    thresh: float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.

    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.

    do_cocycles: bool
        Indicator of whether to compute cocycles, if so, we compute and store
        cocycles in the `cocycles_` dictionary Rips member variable

    n_perm: int
        The number of points to subsample in a "greedy permutation,"
        or a furthest point sampling of the points.  These points
        will be used in lieu of the full point cloud for a faster
        computation, at the expense of some accuracy, which can
        be bounded as a maximum bottleneck distance to all diagrams
        on the original point set

    verbose: boolean
        Whether to print out information about this object
        as it is constructed

    Attributes
    ----------
    `dgm_`: list of ndarray, each shape (n_pairs, 2)
        After `transform`, `dgm_` contains computed persistence diagrams in
        each dimension

    cocycles_: list (size maxdim) of list of ndarray
        A list of representative cocycles in each dimension.  The list
        in each dimension is parallel to the diagram in that dimension;
        that is, each entry of the list is a representative cocycle of
        the corresponding point expressed as an ndarray(K, d+1), where K is
        the number of nonzero values of the cocycle and d is the dimension
        of the cocycle.  The first d columns of each array index into
        the simplices of the (subsampled) point cloud, and the last column
        is the value of the cocycle at that simplex

    dperm2all_: ndarray(n_samples, n_samples) or ndarray (n_perm, n_samples) if n_perm
        The distance matrix used in the computation if n_perm is none.
        Otherwise, the distance from all points in the permutation to
        all points in the dataset

    metric_: string or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        specified in pairwise_distances, including "euclidean", "manhattan",
        or "cosine". Alternatively, if metric is a callable function, it is
        called on each pair of instances (rows) and the resulting value
        recorded. The callable should take two arrays from X as input and
        return a value indicating the distance between them.

    num_edges: int
        The number of edges added during the computation

    idx_perm: ndarray(n_perm) if n_perm > 0
        Index into the original point cloud of the points used
        as a subsample in the greedy permutation

    r_cover: float
        Covering radius of the subsampled points.
        If n_perm <= 0, then the full point cloud was used and this is 0
    Examples
    --------
     .. code:: python

        from ripser import Rips
        import tadasets

        data = tadasets.dsphere(n=110, d=2)[0]
        rips = Rips()
        rips.transform(data)
        rips.plot()
    """


[docs]
    def __init__(
        self,
        maxdim=1,
        thresh=np.inf,
        coeff=2,
        do_cocycles=False,
        n_perm=None,
        verbose=True,
    ):
        self.maxdim = maxdim
        self.thresh = thresh
        self.coeff = coeff
        self.do_cocycles = do_cocycles
        self.n_perm = n_perm
        self.verbose = verbose

        # Internal variables
        self.dgms_ = None
        self.cocycles_ = None
        self.dperm2all_ = None  # Distance matrix
        self.metric_ = None
        self.num_edges_ = None  # Number of edges added
        self.idx_perm_ = None
        self.r_cover_ = 0.0

        if self.verbose:
            print(
                "Rips(maxdim={}, thresh={}, coeff={}, do_cocycles={}, n_perm = {}, verbose={})".format(
                    maxdim, thresh, coeff, do_cocycles, n_perm, verbose
                )
            )


    def transform(self, X, distance_matrix=False, metric="euclidean"):
        result = ripser(
            X,
            maxdim=self.maxdim,
            thresh=self.thresh,
            coeff=self.coeff,
            do_cocycles=self.do_cocycles,
            distance_matrix=distance_matrix,
            metric=metric,
            n_perm=self.n_perm,
        )
        self.dgms_ = result["dgms"]
        self.num_edges_ = result["num_edges"]
        self.dperm2all_ = result["dperm2all"]
        self.idx_perm_ = result["idx_perm"]
        self.cocycles_ = result["cocycles"]
        self.r_cover_ = result["r_cover"]
        return self.dgms_

    def fit_transform(self, X, distance_matrix=False, metric="euclidean"):
        """
        Compute persistence diagrams for X data array and return the diagrams.

        Parameters
        ----------
        X: ndarray (n_samples, n_features)
            A numpy array of either data or distance matrix.

        distance_matrix: bool
            Indicator that X is a distance matrix, if not we compute a
            distance matrix from X using the chosen metric.

        metric: string or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string, it must be one of the options
            specified in pairwise_distances, including "euclidean", "manhattan",
            or "cosine". Alternatively, if metric is a callable function, it is
            called on each pair of instances (rows) and the resulting value
            recorded. The callable should take two arrays from X as input and
            return a value indicating the distance between them.

        Returns
        -------
        dgms: list (size maxdim) of ndarray (n_pairs, 2)
            A list of persistence diagrams, one for each dimension less
            than maxdim. Each diagram is an ndarray of size (n_pairs, 2) with
            the first column representing the birth time and the second column
            representing the death time of each pair.
        """
        self.transform(X, distance_matrix, metric)
        return self.dgms_

    def plot(self, diagrams=None, *args, **kwargs):
        """A helper function to plot persistence diagrams.

        Parameters
        ----------
        diagrams: ndarray (n_pairs, 2) or list of diagrams
            A diagram or list of diagrams as returned from self.fit.
            If diagram is None, we use `self.dgm_` for plotting.
            If diagram is a list of diagrams, then plot all on the same plot
            using different colors.

        plot_only: list of numeric
            If specified, an array of only the diagrams that should be plotted.

        title: string, default is None
            If title is defined, add it as title of the plot.

        xy_range: list of numeric [xmin, xmax, ymin, ymax]
            User provided range of axes. This is useful for comparing
            multiple persistence diagrams.

        labels: string or list of strings
            Legend labels for each diagram.
            If none are specified, we use H_0, H_1, H_2,... by default.

        colormap: string, default is 'default'
            Any of matplotlib color palettes.
            Some options are 'default', 'seaborn', 'sequential'.
            See all available styles with

            .. code:: python

                import matplotlib as mpl
                print(mpl.styles.available)

        size: numeric, default is 20
            Pixel size of each point plotted.

        ax_color: any valid matplitlib color type.
            See [https://matplotlib.org/api/colors_api.html](https://matplotlib.org/api/colors_api.html) for complete API.

        diagonal: bool, default is True
            Plot the diagonal x=y line.

        lifetime: bool, default is False. If True, diagonal is turned to False.
            Plot life time of each point instead of birth and death.
            Essentially, visualize (x, y-x).

        legend: bool, default is True
            If true, show the legend.

        show: bool, default is True
            Call plt.show() after plotting.
            If you are using self.plot() as part of a subplot,
            set show=False and call plt.show() only once at the end.
        """
        import persim

        if diagrams is None:
            # Allow using transformed diagrams as default
            diagrams = self.dgms_

        persim.plot_diagrams(diagrams, *args, **kwargs)



__all__ = ["Rips", "ripser", "lower_star_img"]