"""
MIT License
Copyright (c) 2018 Christopher Tralie and Nathaniel Saul
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
from itertools import cycle
import warnings
from scipy import sparse
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics.pairwise import pairwise_distances
from pyRipser import doRipsFiltrationDM as DRFDM
from pyRipser import doRipsFiltrationDMSparse as DRFDMSparse
def dpoint2pointcloud(X, i, metric):
    """
    Compute the distance from the ith point of a Euclidean point cloud
    to every point in the cloud (including itself).

    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of data
    i: int
        The index of the point from which to return all distances
    metric: string or callable
        The metric to use when calculating distance between instances in a
        feature array

    Returns
    -------
    ds: ndarray (n_samples,)
        Distances from point i to every point in X
    """
    # Compare the single point X[i] (as a 1-row matrix) against all rows of X
    dists = pairwise_distances(X, X[i, :][None, :], metric=metric)
    dists = dists.flatten()
    # Force an exact zero self-distance, guarding against numerical noise
    dists[i] = 0
    return dists
def get_greedy_perm(X, n_perm=None, distance_matrix=False, metric="euclidean"):
    """
    Compute a furthest point sampling permutation of a set of points.

    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix
    n_perm: int
        Number of points to take in the permutation
    distance_matrix: bool
        Indicator that X is a distance matrix, if not we compute
        distances in X using the chosen metric.
    metric: string or callable
        The metric to use when calculating distance between instances in a
        feature array

    Returns
    -------
    idx_perm: ndarray(n_perm)
        Indices of points in the greedy permutation
    lambdas: ndarray(n_perm)
        Covering radii at different points
    dperm2all: ndarray(n_perm, n_samples)
        Distances from points in the greedy permutation to points
        in the original point set
    """
    n_perm = n_perm or X.shape[0]
    # row_dists(j) gives distances from point j to every point
    if distance_matrix:
        row_dists = lambda j: X[j, :]
    else:
        row_dists = lambda j: dpoint2pointcloud(X, j, metric)
    # Seed the permutation with point 0 (any start point yields valid bounds,
    # though a random one could be used instead)
    idx_perm = np.zeros(n_perm, dtype=np.int64)
    lambdas = np.zeros(n_perm)
    cover = row_dists(0)
    dperm2all = [cover]
    for k in range(1, n_perm):
        # Greedily pick the point furthest from everything chosen so far
        farthest = np.argmax(cover)
        idx_perm[k] = farthest
        lambdas[k - 1] = cover[farthest]
        dperm2all.append(row_dists(farthest))
        # Shrink the covering distances with the newly added point
        cover = np.minimum(cover, dperm2all[-1])
    lambdas[-1] = np.max(cover)
    return (idx_perm, lambdas, np.array(dperm2all))
def ripser(
    X,
    maxdim=1,
    thresh=np.inf,
    coeff=2,
    distance_matrix=False,
    do_cocycles=False,
    metric="euclidean",
    n_perm=None,
):
    """Compute persistence diagrams for X.

    X can be a data set of points or a distance matrix. When X is a data set
    it is first converted to a distance matrix using the chosen metric.

    Parameters
    ----------
    X : ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix (also pass
        `distance_matrix=True`). Can also be a sparse distance matrix of type
        scipy.sparse
    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions lower
        than and equal to this value. For 1, H_0 and H_1 will be computed.
    thresh: float, default infinity
        Maximum distances considered when constructing filtration. If
        infinity, compute the entire filtration.
    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.
    distance_matrix: bool, optional, default False
        When True the input matrix X will be considered a distance matrix.
    do_cocycles: bool, optional, default False
        Computed cocycles will be available in the `cocycles` value
        of the return dictionary.
    metric: string or callable, optional, default "euclidean"
        Use this metric to compute distances between rows of X.
        "euclidean", "manhattan" and "cosine" are already provided metrics
        to choose from by using their name. You can also provide a callable
        that is invoked with two rows as arguments, once per pair of rows.
    n_perm: int, optional, default None
        The number of points to subsample in a "greedy permutation," or a
        furthest point sampling of the points. These points will be used in
        lieu of the full point cloud for a faster computation, at the expense
        of some accuracy, which can be bounded as a maximum bottleneck
        distance to all diagrams on the original point set.

    Returns
    -------
    dict
        The result of the computation, with keys:

        * ``dgms``: list (size maxdim) of ndarray (n_pairs, 2)
          One persistence diagram per dimension; each row is a
          (birth time, death time) pair.
        * ``cocycles``: list (size maxdim) of list of ndarray
          Representative cocycles per dimension (when ``do_cocycles``).
          Each cocycle in dimension ``d`` is an ndarray of ``(k, d+1)``
          elements: the ``d`` simplex indices followed by the value of the
          cocycle on that simplex. Simplex indices reference the original
          point cloud, even when a greedy permutation was used.
        * ``num_edges``: int
          The number of edges added during the computation.
        * ``dperm2all``: ndarray(n_samples, n_samples), or
          ndarray(n_perm, n_samples) if ``n_perm``
          The distance matrix used during the computation; with ``n_perm``
          it only refers to the subsampled dataset.
        * ``idx_perm``: ndarray(n_perm) if ``n_perm`` > 0
          Index into the original point cloud of the points used as a
          subsample in the greedy permutation.
        * ``r_cover``: float
          Covering radius of the subsampled points; 0 when the full point
          cloud was used.

    Raises
    ------
    ValueError
        If the distance matrix is not square; when using both a greedy
        permutation and a sparse distance matrix; when `n_perm` is bigger
        than the number of rows in the matrix; or when `n_perm` is
        non-positive.

    Warns
    -----
    When using a square matrix without toggling `distance_matrix` to True,
    and when there are more columns than rows (as each row is a different
    data point).
    """
    # ---- Validate inputs --------------------------------------------------
    if distance_matrix:
        if X.shape[0] != X.shape[1]:
            raise ValueError("Distance matrix is not square")
    elif X.shape[0] == X.shape[1]:
        warnings.warn(
            "The input matrix is square, but the distance_matrix "
            "flag is off. Did you mean to indicate that "
            "this was a distance matrix?"
        )
    elif X.shape[0] < X.shape[1]:
        warnings.warn(
            "The input point cloud has more columns than rows; "
            "did you mean to transpose?"
        )

    if n_perm and distance_matrix and sparse.issparse(X):
        raise ValueError(
            "Greedy permutation is not supported for sparse distance matrices"
        )
    if n_perm and n_perm > X.shape[0]:
        raise ValueError(
            "Number of points in greedy permutation is greater"
            " than number of points in the point cloud"
        )
    if n_perm and n_perm < 0:
        raise ValueError(
            "Should be a strictly positive number of points in the greedy permutation"
        )

    # ---- Optional greedy subsampling --------------------------------------
    idx_perm = np.arange(X.shape[0])
    r_cover = 0.0
    doing_permutation = bool(n_perm) and n_perm < X.shape[0]
    if doing_permutation:
        idx_perm, lambdas, dperm2all = get_greedy_perm(
            X, n_perm=n_perm, distance_matrix=distance_matrix, metric=metric
        )
        r_cover = lambdas[-1]
        # Restrict the distance matrix to the subsampled points
        dm = dperm2all[:, idx_perm]
    else:
        dm = X if distance_matrix else pairwise_distances(X, metric=metric)
        dperm2all = dm
    n_points = dm.shape[0]

    # Nonzero diagonal entries encode nonzero birth times, which currently
    # only the sparse code path supports, so promote to sparse in that case.
    if not sparse.issparse(dm) and np.sum(np.abs(dm.diagonal()) > 0) > 0:
        dm = sparse.coo_matrix(dm)

    # ---- Run the C++ filtration -------------------------------------------
    if sparse.issparse(dm):
        if sparse.isspmatrix_coo(dm):
            # An existing COO matrix must have its (row, col) indices ordered
            # lexicographically to avoid errors. See issue #103
            order = np.lexsort((dm.col, dm.row))
            row, col, data = dm.row[order], dm.col[order], dm.data[order]
        else:
            # Lexicographic ordering is performed by scipy upon conversion to COO
            coo = dm.tocoo()
            row, col, data = coo.row, coo.col, coo.data
        res = DRFDMSparse(
            row.astype(dtype=np.int32, order="C"),
            col.astype(dtype=np.int32, order="C"),
            np.array(data, dtype=np.float32, order="C"),
            n_points,
            maxdim,
            thresh,
            coeff,
            do_cocycles,
        )
    else:
        # Dense path: pass the flattened strict lower triangle
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        res = DRFDM(DParam, maxdim, thresh, coeff, do_cocycles)

    # ---- Unwrap persistence diagrams --------------------------------------
    # Each diagram arrives as a flat [b0, d0, b1, d1, ...] list
    dgms = res["births_and_deaths_by_dim"]
    for dim, flat in enumerate(dgms):
        n_pairs = len(flat) // 2
        dgms[dim] = np.reshape(np.array(flat), [n_pairs, 2])

    # ---- Unwrap cocycles ---------------------------------------------------
    # Each raw cocycle is a flat list of (d+1 simplex indices, value) groups
    cocycles = []
    for dim, raw_cocycles in enumerate(res["cocycles_by_dim"]):
        unpacked = []
        for raw in raw_cocycles:
            n_entries = len(raw) // (dim + 2)
            ccl = np.reshape(np.array(raw, dtype=np.int64), [n_entries, dim + 2])
            # Reduce cocycle values into the coefficient field
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            if doing_permutation:
                # Retain original indices in the point cloud
                ccl[:, 0:-1] = idx_perm[ccl[:, 0:-1]]
            unpacked.append(ccl)
        cocycles.append(unpacked)

    return {
        "dgms": dgms,
        "cocycles": cocycles,
        "num_edges": res["num_edges"],
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover,
    }
def lower_star_img(img):
    """
    Construct a lower star filtration on an image.

    Parameters
    ----------
    img: ndarray (M, N)
        An array of single channel image data

    Returns
    -------
    I: ndarray (K, 2)
        A 0-dimensional persistence diagram corresponding to the sublevelset
        filtration
    """
    m, n = img.shape
    idxs = np.arange(m * n).reshape((m, n))

    # Every pixel is a vertex whose filtration (birth) value is its intensity
    rows = idxs.flatten()
    cols = idxs.flatten()
    vals = img.flatten()

    # Pad index/value grids with NaN so rolled neighbor lookups that fall
    # off the image can be masked out afterwards
    tidxs = np.ones((m + 2, n + 2), dtype=np.int64) * np.nan
    tidxs[1:-1, 1:-1] = idxs
    tD = np.ones_like(tidxs) * np.nan
    tD[1:-1, 1:-1] = img

    # Connect each pixel to its 8 spatial neighbors; the edge's filtration
    # value is the max of the two pixel values (lower-star convention)
    for di in [-1, 0, 1]:
        for dj in [-1, 0, 1]:
            if di == 0 and dj == 0:
                continue
            thisJ = np.roll(np.roll(tidxs, di, axis=0), dj, axis=1)
            thisD = np.roll(np.roll(tD, di, axis=0), dj, axis=1)
            thisD = np.maximum(thisD, tD)
            # NaN marks pairs involving padding; keep only interior edges
            inside = ~np.isnan(thisD)
            rows = np.concatenate((rows, tidxs[inside].flatten()))
            cols = np.concatenate((cols, thisJ[inside].flatten()))
            vals = np.concatenate((vals, thisD[inside].flatten()))

    sparseDM = sparse.coo_matrix((vals, (rows, cols)), shape=(idxs.size, idxs.size))
    return ripser(sparseDM, distance_matrix=True, maxdim=0)["dgms"][0]
class Rips(TransformerMixin):
    """sklearn style class interface for :code:`ripser` with :code:`fit` and :code:`transform` methods.

    Parameters
    ----------
    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.
        For 1, H_0 and H_1 will be computed.
    thresh: float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.
    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.
    do_cocycles: bool
        Indicator of whether to compute cocycles, if so, we compute and store
        cocycles in the `cocycles_` dictionary Rips member variable
    n_perm: int
        The number of points to subsample in a "greedy permutation,"
        or a furthest point sampling of the points. These points
        will be used in lieu of the full point cloud for a faster
        computation, at the expense of some accuracy, which can
        be bounded as a maximum bottleneck distance to all diagrams
        on the original point set
    verbose: boolean
        Whether to print out information about this object
        as it is constructed

    Attributes
    ----------
    dgms_: list of ndarray, each shape (n_pairs, 2)
        After `transform`, `dgms_` contains computed persistence diagrams in
        each dimension
    cocycles_: list (size maxdim) of list of ndarray
        A list of representative cocycles in each dimension. The list
        in each dimension is parallel to the diagram in that dimension;
        that is, each entry of the list is a representative cocycle of
        the corresponding point expressed as an ndarray(K, d+1), where K is
        the number of nonzero values of the cocycle and d is the dimension
        of the cocycle. The first d columns of each array index into
        the simplices of the (subsampled) point cloud, and the last column
        is the value of the cocycle at that simplex
    dperm2all_: ndarray(n_samples, n_samples) or ndarray (n_perm, n_samples) if n_perm
        The distance matrix used in the computation if n_perm is none.
        Otherwise, the distance from all points in the permutation to
        all points in the dataset
    metric_: string or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        specified in pairwise_distances, including "euclidean", "manhattan",
        or "cosine". Alternatively, if metric is a callable function, it is
        called on each pair of instances (rows) and the resulting value
        recorded. The callable should take two arrays from X as input and
        return a value indicating the distance between them.
    num_edges_: int
        The number of edges added during the computation
    idx_perm_: ndarray(n_perm) if n_perm > 0
        Index into the original point cloud of the points used
        as a subsample in the greedy permutation
    r_cover_: float
        Covering radius of the subsampled points.
        If n_perm <= 0, then the full point cloud was used and this is 0

    Examples
    --------
    .. code:: python

        from ripser import Rips
        import tadasets

        data = tadasets.dsphere(n=110, d=2)[0]
        rips = Rips()
        rips.transform(data)
        rips.plot()
    """

    def __init__(
        self,
        maxdim=1,
        thresh=np.inf,
        coeff=2,
        do_cocycles=False,
        n_perm=None,
        verbose=True,
    ):
        self.maxdim = maxdim
        self.thresh = thresh
        self.coeff = coeff
        self.do_cocycles = do_cocycles
        self.n_perm = n_perm
        self.verbose = verbose
        # Internal variables, populated by transform()
        self.dgms_ = None
        self.cocycles_ = None
        self.dperm2all_ = None  # Distance matrix
        self.metric_ = None
        self.num_edges_ = None  # Number of edges added
        self.idx_perm_ = None
        self.r_cover_ = 0.0
        if self.verbose:
            print(
                "Rips(maxdim={}, thresh={}, coeff={}, do_cocycles={}, n_perm = {}, verbose={})".format(
                    maxdim, thresh, coeff, do_cocycles, n_perm, verbose
                )
            )

    def transform(self, X, distance_matrix=False, metric="euclidean"):
        """Compute persistence diagrams for X, store all results on the
        instance (``dgms_``, ``cocycles_``, ``dperm2all_``, ``idx_perm_``,
        ``num_edges_``, ``r_cover_``), and return the diagrams.

        Parameters
        ----------
        X: ndarray (n_samples, n_features)
            A numpy array of either data or distance matrix.
        distance_matrix: bool
            Indicator that X is a distance matrix, if not we compute a
            distance matrix from X using the chosen metric.
        metric: string or callable
            The metric to use when calculating distance between instances in a
            feature array.

        Returns
        -------
        dgms: list (size maxdim) of ndarray (n_pairs, 2)
            A list of persistence diagrams, one for each dimension less
            than maxdim.
        """
        result = ripser(
            X,
            maxdim=self.maxdim,
            thresh=self.thresh,
            coeff=self.coeff,
            do_cocycles=self.do_cocycles,
            distance_matrix=distance_matrix,
            metric=metric,
            n_perm=self.n_perm,
        )
        self.dgms_ = result["dgms"]
        self.num_edges_ = result["num_edges"]
        self.dperm2all_ = result["dperm2all"]
        self.idx_perm_ = result["idx_perm"]
        self.cocycles_ = result["cocycles"]
        self.r_cover_ = result["r_cover"]
        return self.dgms_

    def fit_transform(self, X, distance_matrix=False, metric="euclidean"):
        """
        Compute persistence diagrams for X data array and return the diagrams.

        Parameters
        ----------
        X: ndarray (n_samples, n_features)
            A numpy array of either data or distance matrix.
        distance_matrix: bool
            Indicator that X is a distance matrix, if not we compute a
            distance matrix from X using the chosen metric.
        metric: string or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string, it must be one of the options
            specified in pairwise_distances, including "euclidean", "manhattan",
            or "cosine". Alternatively, if metric is a callable function, it is
            called on each pair of instances (rows) and the resulting value
            recorded. The callable should take two arrays from X as input and
            return a value indicating the distance between them.

        Returns
        -------
        dgms: list (size maxdim) of ndarray (n_pairs, 2)
            A list of persistence diagrams, one for each dimension less
            than maxdim. Each diagram is an ndarray of size (n_pairs, 2) with
            the first column representing the birth time and the second column
            representing the death time of each pair.
        """
        self.transform(X, distance_matrix, metric)
        return self.dgms_

    def plot(
        self,
        diagrams=None,
        *args,
        **kwargs
    ):
        """A helper function to plot persistence diagrams.

        Parameters
        ----------
        diagrams: ndarray (n_pairs, 2) or list of diagrams
            A diagram or list of diagrams as returned from self.fit.
            If diagram is None, we use `self.dgms_` for plotting.
            If diagram is a list of diagrams, then plot all on the same plot
            using different colors.
        plot_only: list of numeric
            If specified, an array of only the diagrams that should be plotted.
        title: string, default is None
            If title is defined, add it as title of the plot.
        xy_range: list of numeric [xmin, xmax, ymin, ymax]
            User provided range of axes. This is useful for comparing
            multiple persistence diagrams.
        labels: string or list of strings
            Legend labels for each diagram.
            If none are specified, we use H_0, H_1, H_2,... by default.
        colormap: string, default is 'default'
            Any of matplotlib color palettes.
            Some options are 'default', 'seaborn', 'sequential'.
            See all available styles with

            .. code:: python

                import matplotlib as mpl
                print(mpl.styles.available)

        size: numeric, default is 20
            Pixel size of each point plotted.
        ax_color: any valid matplotlib color type.
            See [https://matplotlib.org/api/colors_api.html](https://matplotlib.org/api/colors_api.html) for complete API.
        diagonal: bool, default is True
            Plot the diagonal x=y line.
        lifetime: bool, default is False. If True, diagonal is turned to False.
            Plot life time of each point instead of birth and death.
            Essentially, visualize (x, y-x).
        legend: bool, default is True
            If true, show the legend.
        show: bool, default is True
            Call plt.show() after plotting.
            If you are using self.plot() as part of a subplot,
            set show=False and call plt.show() only once at the end.
        """
        # Deferred imports: plotting dependencies are only needed here
        import matplotlib.pyplot as plt
        import matplotlib as mpl
        import persim

        if diagrams is None:
            # Allow using transformed diagrams as default
            diagrams = self.dgms_
        persim.plot_diagrams(
            diagrams,
            *args,
            **kwargs
        )
# Public API of this module
__all__ = ["Rips", "ripser", "lower_star_img"]