"""Collection of functions for calculating vector-vector similarities."""
import numba
import numpy as np
[docs]
def jaccard_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
"""Returns matrix of jaccard indices between all-vs-all vectors of references
and queries.
Parameters
----------
references
Reference vectors as 2D numpy array. Expects that vector_i corresponds to
references[i, :].
queries
Query vectors as 2D numpy array. Expects that vector_i corresponds to
queries[i, :].
Returns
-------
scores
Matrix of all-vs-all similarity scores. scores[i, j] will contain the score
between the vectors references[i, :] and queries[j, :].
"""
# The trick to fast inference is to use float32 since it allows using BLAS
references = np.array(references, dtype=np.float32) # R,N
queries = np.array(queries, dtype=np.float32) # Q,N
intersection = references @ queries.T # R,N @ N,Q -> R,Q
union = np.sum(references, axis=1, keepdims=True) + np.sum(queries,axis=1, keepdims=True).T # R,1+1,Q -> R,Q
union -= intersection
jaccard = np.nan_to_num(intersection / union) # R,Q
return jaccard
[docs]
def dice_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
"""Returns matrix of dice similarity scores between all-vs-all vectors of references
and queries.
Parameters
----------
references
Reference vectors as 2D numpy array. Expects that vector_i corresponds to
references[i, :].
queries
Query vectors as 2D numpy array. Expects that vector_i corresponds to
queries[i, :].
Returns
-------
scores
Matrix of all-vs-all similarity scores. scores[i, j] will contain the score
between the vectors references[i, :] and queries[j, :].
"""
references = np.array(references, dtype=np.float32) # R,N
queries = np.array(queries, dtype=np.float32) # Q,N
intersection = references @ queries.T # R,N @ N,Q -> R,Q
union = np.sum(references, axis=1, keepdims=True) + np.sum(queries, axis=1, keepdims=True).T # R,1 + 1,Q -> R, Q
dice = 2 * np.nan_to_num(intersection / union) # R,Q
return dice
[docs]
def cosine_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
"""Returns matrix of cosine similarity scores between all-vs-all vectors of
references and queries.
Parameters
----------
references
Reference vectors as 2D numpy array. Expects that vector_i corresponds to
references[i, :].
queries
Query vectors as 2D numpy array. Expects that vector_i corresponds to
queries[i, :].
Returns
-------
scores
Matrix of all-vs-all similarity scores. scores[i, j] will contain the score
between the vectors references[i, :] and queries[j, :].
"""
references = np.array(references, dtype=np.float32) # R,N
queries = np.array(queries, dtype=np.float32) # Q,N
cosine = references @ queries.T # R,N @ N,Q -> R,Q
r_norm = np.sum(references ** 2, axis=1, keepdims=True) # R,N -> R,1
q_norm = np.sum(queries**2, axis=1, keepdims=True) # Q,N -> Q,1
norm = r_norm @ q_norm.T # R,N @ N,Q -> R,Q
cosine = np.nan_to_num(cosine * norm ** -.5) # R,Q
return cosine
[docs]
@numba.njit
def jaccard_index(u: np.ndarray, v: np.ndarray) -> np.float64:
r"""Computes the Jaccard-index (or Jaccard similarity coefficient) of two boolean
1-D arrays.
The Jaccard index between 1-D boolean arrays `u` and `v`,
is defined as
.. math::
J(u,v) = \\frac{u \cap v}
{u \cup v}
Parameters
----------
u :
Input array. Expects boolean vector.
v :
Input array. Expects boolean vector.
Returns
-------
jaccard_similarity
The Jaccard similarity coefficient between vectors `u` and `v`.
"""
u_or_v = np.bitwise_or(u != 0, v != 0)
u_and_v = np.bitwise_and(u != 0, v != 0)
jaccard_score = 0
if u_or_v.sum() != 0:
jaccard_score = np.float64(u_and_v.sum()) / np.float64(u_or_v.sum())
return jaccard_score
[docs]
@numba.njit
def dice_similarity(u: np.ndarray, v: np.ndarray) -> np.float64:
r"""Computes the Dice similarity coefficient (DSC) between two boolean 1-D arrays.
The Dice similarity coefficient between `u` and `v`, is
.. math::
DSC(u,v) = \\frac{2|u \cap v|}
{|u| + |v|}
Parameters
----------
u
Input array. Expects boolean vector.
v
Input array. Expects boolean vector.
Returns
-------
dice_similarity
The Dice similarity coefficient between 1-D arrays `u` and `v`.
"""
u_and_v = np.bitwise_and(u != 0, v != 0)
u_abs_and_v_abs = np.abs(u).sum() + np.abs(v).sum()
dice_score = 0
if u_abs_and_v_abs != 0:
dice_score = 2.0 * np.float64(u_and_v.sum()) / np.float64(u_abs_and_v_abs)
return dice_score
[docs]
@numba.njit
def cosine_similarity(u: np.ndarray, v: np.ndarray) -> np.float64:
"""Calculate cosine similarity score.
Parameters
----------
u
Input vector.
v
Input vector.
Returns
-------
cosine_similarity
The Cosine similarity score between vectors `u` and `v`.
"""
assert u.shape[0] == v.shape[0], "Input vector must have same shape."
uv = 0
uu = 0
vv = 0
for i in range(u.shape[0]):
uv += u[i] * v[i]
uu += u[i] * u[i]
vv += v[i] * v[i]
cosine_score = 0
if uu != 0 and vv != 0:
cosine_score = uv / np.sqrt(uu * vv)
return np.float64(cosine_score)