Source code for jina.math.distance

import numpy as np

from typing import Union, TYPE_CHECKING

if TYPE_CHECKING:
    import scipy

# Annotation alias for the scipy sparse containers referenced by the sparse
# distance functions in this module. The members are written as string
# forward references because `scipy` is only imported under `TYPE_CHECKING`
# and is therefore not a runtime import of this module.
_SPARSE_SCIPY_TYPES = Union[
    'scipy.sparse.csr_matrix',
    'scipy.sparse.csc_matrix',
    'scipy.sparse.bsr_matrix',
    'scipy.sparse.coo_matrix',
]


def pdist(
    x_mat: Union['np.ndarray', _SPARSE_SCIPY_TYPES],
    metric: str,
    is_sparse: bool = False,
) -> 'np.ndarray':
    """Compute the distance between every pair of rows of a single matrix.

    This is `cdist` with `x_mat` used for both operands.

    :param x_mat: numpy or scipy sparse array of ndim 2, one observation per row
    :param metric: string naming the metric, forwarded unchanged to `cdist`
    :param is_sparse: boolean stating if computation must be done with sparse data
    :return: np.ndarray of ndim 2
    """
    return cdist(x_mat=x_mat, y_mat=x_mat, metric=metric, is_sparse=is_sparse)
def cdist(
    x_mat: Union['np.ndarray', _SPARSE_SCIPY_TYPES],
    y_mat: Union['np.ndarray', _SPARSE_SCIPY_TYPES],
    metric: str,
    is_sparse: bool = False,
) -> 'np.ndarray':
    """Computes the pairwise distance between each row of X and each row on Y according to `metric`.

    - Let `n_x = x_mat.shape[0]`
    - Let `n_y = y_mat.shape[0]`
    - Returns a matrix `dist` of shape `(n_x, n_y)` with `dist[i,j] = metric(x_mat[i], y_mat[j])`.

    Supported metrics are `'cosine'`, `'sqeuclidean'` and `'euclidean'`.

    :param x_mat: numpy or scipy array of ndim 2
    :param y_mat: numpy or scipy array of ndim 2
    :param metric: string describing the metric type
    :param is_sparse: boolean describing if data type is sparse
    :return: np.ndarray of ndim 2
    :raises ValueError: if `metric` is not one of the supported names
    """
    if metric not in ('cosine', 'sqeuclidean', 'euclidean'):
        raise ValueError(f'Input metric={metric} not valid')

    if metric == 'cosine':
        return sparse_cosine(x_mat, y_mat) if is_sparse else cosine(x_mat, y_mat)

    if is_sparse:
        sq_dists = sparse_sqeuclidean(x_mat, y_mat)
    else:
        sq_dists = sqeuclidean(x_mat, y_mat)

    if metric == 'sqeuclidean':
        return sq_dists

    # The squared distances come from the expansion |x|^2 + |y|^2 - 2 x.y, which
    # can land a hair below zero for (near-)identical rows because of floating
    # point cancellation. Clamp before the root so those entries become 0
    # instead of NaN; genuine NaNs in the input still propagate.
    return np.sqrt(np.maximum(sq_dists, 0))
def cosine(x_mat: 'np.ndarray', y_mat: 'np.ndarray', eps: float = 1e-7) -> 'np.ndarray':
    """Cosine distance between each row in x_mat and each row in y_mat.

    The similarity is `(x . y + eps) / (|x| * |y| + eps)`, clipped to `[-1, 1]`,
    and the returned distance is one minus that similarity.

    :param x_mat: np.ndarray with ndim=2
    :param y_mat: np.ndarray with ndim=2
    :param eps: a small jitter to avoid divide by zero
    :return: np.ndarray with ndim=2
    """
    numerator = np.dot(x_mat, y_mat.T) + eps
    x_norms = np.linalg.norm(x_mat, axis=1)
    y_norms = np.linalg.norm(y_mat, axis=1)
    denominator = np.outer(x_norms, y_norms) + eps
    similarity = np.clip(numerator / denominator, -1, 1)
    return 1 - similarity
def sqeuclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
    """Squared Euclidean distance between each row in x_mat and each row in y_mat.

    Computed through the expansion `|x|^2 + |y|^2 - 2 x.y` so the whole matrix
    is produced by a single matrix product.

    :param x_mat: np.ndarray with ndim=2
    :param y_mat: np.ndarray with ndim=2
    :return: np.ndarray with ndim=2 and shape `(x_mat.shape[0], y_mat.shape[0])`,
        with no negative entries
    """
    # Column vector of squared row norms of x, row vector for y: broadcasting
    # their sum yields the (n_x, n_y) grid.
    x_sq = np.sum(x_mat ** 2, axis=1)[:, np.newaxis]
    y_sq = np.sum(y_mat ** 2, axis=1)
    dists = y_sq + x_sq - 2 * np.dot(x_mat, y_mat.T)
    # Floating point cancellation can leave tiny negative values where the true
    # distance is (close to) zero; a squared distance is never negative.
    return np.maximum(dists, 0)
def sparse_cosine(
    x_mat: _SPARSE_SCIPY_TYPES, y_mat: _SPARSE_SCIPY_TYPES
) -> 'np.ndarray':
    """Cosine distance between each row in x_mat and each row in y_mat.

    No jitter is added to the denominator here, so a row with zero norm
    produces NaN entries in the result.

    :param x_mat: scipy.sparse like array with ndim=2
    :param y_mat: scipy.sparse like array with ndim=2
    :return: np.ndarray with ndim=2
    """
    from scipy.sparse.linalg import norm

    dots = x_mat.dot(y_mat.T)
    # Densify the dot products explicitly rather than relying on what
    # `sparse / dense` returns (a np.matrix or a sparse container, depending on
    # the scipy version); the division below is then plain ndarray arithmetic.
    if hasattr(dots, 'toarray'):
        dots = dots.toarray()
    norms = np.outer(norm(x_mat, axis=1), norm(y_mat, axis=1))
    return 1 - np.clip(np.asarray(dots) / norms, -1, 1)
def sparse_sqeuclidean(
    x_mat: _SPARSE_SCIPY_TYPES, y_mat: _SPARSE_SCIPY_TYPES
) -> 'np.ndarray':
    """Squared Euclidean distance between each row in x_mat and each row in y_mat.

    Computed through the expansion `|x|^2 + |y|^2 - 2 x.y`.

    :param x_mat: scipy.sparse like array with ndim=2
    :param y_mat: scipy.sparse like array with ndim=2
    :return: np.ndarray with ndim=2 and shape `(x_mat.shape[0], y_mat.shape[0])`,
        with no negative entries
    """
    # `sum(axis=1)` gives a 2-D np.matrix for sparse matrices but may give a 1-D
    # array for other sparse containers; convert and reshape explicitly so the
    # broadcast below does not depend on which one was returned.
    x_sq = np.asarray(x_mat.power(2).sum(axis=1)).reshape(-1, 1)
    y_sq = np.asarray(y_mat.power(2).sum(axis=1)).reshape(1, -1)

    cross = x_mat.dot(y_mat.T)
    if hasattr(cross, 'toarray'):
        cross = cross.toarray()

    dists = y_sq + x_sq - 2 * np.asarray(cross)
    # Floating point cancellation can leave tiny negative values where the true
    # distance is (close to) zero; a squared distance is never negative.
    return np.maximum(dists, 0)