Source code for matchms.similarity.vector_similarity_functions
"""Collection of functions for calculating vector-vector similarities."""
import numba
import numpy as np
@numba.njit
def jaccard_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Compute the Jaccard index for every (reference, query) pair of vectors.

    Parameters
    ----------
    references
        Reference vectors as 2D numpy array, one vector per row, i.e.
        vector_i is references[i, :].
    queries
        Query vectors as 2D numpy array, one vector per row, i.e.
        vector_j is queries[j, :].

    Returns
    -------
    scores
        2D array with one row per reference and one column per query.
        scores[i, j] holds the Jaccard index between references[i, :]
        and queries[j, :].
    """
    n_references = references.shape[0]
    n_queries = queries.shape[0]
    scores = np.zeros((n_references, n_queries))
    for ref_idx in range(n_references):
        # Slice the reference row once and reuse it against every query.
        reference_vector = references[ref_idx, :]
        for query_idx in range(n_queries):
            query_vector = queries[query_idx, :]
            scores[ref_idx, query_idx] = jaccard_index(reference_vector, query_vector)
    return scores
@numba.njit
def dice_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Compute the Dice similarity for every (reference, query) pair of vectors.

    Parameters
    ----------
    references
        Reference vectors as 2D numpy array, one vector per row, i.e.
        vector_i is references[i, :].
    queries
        Query vectors as 2D numpy array, one vector per row, i.e.
        vector_j is queries[j, :].

    Returns
    -------
    scores
        2D array with one row per reference and one column per query.
        scores[i, j] holds the Dice similarity between references[i, :]
        and queries[j, :].
    """
    n_references = references.shape[0]
    n_queries = queries.shape[0]
    scores = np.zeros((n_references, n_queries))
    for ref_idx in range(n_references):
        # Slice the reference row once and reuse it against every query.
        reference_vector = references[ref_idx, :]
        for query_idx in range(n_queries):
            query_vector = queries[query_idx, :]
            scores[ref_idx, query_idx] = dice_similarity(reference_vector, query_vector)
    return scores
@numba.njit
def cosine_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Compute the cosine similarity for every (reference, query) pair of vectors.

    Parameters
    ----------
    references
        Reference vectors as 2D numpy array, one vector per row, i.e.
        vector_i is references[i, :].
    queries
        Query vectors as 2D numpy array, one vector per row, i.e.
        vector_j is queries[j, :].

    Returns
    -------
    scores
        2D array with one row per reference and one column per query.
        scores[i, j] holds the cosine similarity between references[i, :]
        and queries[j, :].
    """
    n_references = references.shape[0]
    n_queries = queries.shape[0]
    scores = np.zeros((n_references, n_queries))
    for ref_idx in range(n_references):
        # Slice the reference row once and reuse it against every query.
        reference_vector = references[ref_idx, :]
        for query_idx in range(n_queries):
            query_vector = queries[query_idx, :]
            scores[ref_idx, query_idx] = cosine_similarity(reference_vector, query_vector)
    return scores
@numba.njit
def jaccard_index(u: np.ndarray, v: np.ndarray) -> np.float64:
    r"""Computes the Jaccard-index (or Jaccard similarity coefficient) of two boolean
    1-D arrays.

    The Jaccard index between 1-D boolean arrays `u` and `v`,
    is defined as

    .. math::

        J(u,v) = \frac{|u \cap v|}
                      {|u \cup v|}

    Every non-zero entry is treated as a set bit, so integer or float vectors
    holding 0/1 values are handled the same way as boolean vectors.

    Parameters
    ----------
    u :
        Input array. Expects boolean vector.
    v :
        Input array. Expects boolean vector.

    Returns
    -------
    jaccard_similarity
        The Jaccard similarity coefficient between vectors `u` and `v`.
        Returns 0.0 when neither vector has a non-zero entry.
    """
    # Binarize each input once and reuse the masks for union and intersection.
    u_set = u != 0
    v_set = v != 0
    union_count = np.bitwise_or(u_set, v_set).sum()
    # Start from a float so every return path yields a floating point score.
    jaccard_score = 0.0
    if union_count != 0:
        intersection_count = np.bitwise_and(u_set, v_set).sum()
        jaccard_score = np.float64(intersection_count) / np.float64(union_count)
    return jaccard_score
@numba.njit
def dice_similarity(u: np.ndarray, v: np.ndarray) -> np.float64:
    r"""Computes the Dice similarity coefficient (DSC) between two boolean 1-D arrays.

    The Dice similarity coefficient between `u` and `v`, is

    .. math::

        DSC(u,v) = \frac{2|u \cap v|}
                        {|u| + |v|}

    Every non-zero entry is treated as a set bit, both for the intersection
    and for the set sizes `|u|` and `|v|`, so the score always lies in [0, 1].

    Parameters
    ----------
    u
        Input array. Expects boolean vector.
    v
        Input array. Expects boolean vector.

    Returns
    -------
    dice_similarity
        The Dice similarity coefficient between 1-D arrays `u` and `v`.
        Returns 0.0 when neither vector has a non-zero entry.
    """
    # Binarize each input once; the same masks feed numerator and denominator
    # so that both are counted on identical terms.
    u_set = u != 0
    v_set = v != 0
    total_count = u_set.sum() + v_set.sum()
    # Start from a float so every return path yields a floating point score.
    dice_score = 0.0
    if total_count != 0:
        shared_count = np.bitwise_and(u_set, v_set).sum()
        dice_score = 2.0 * np.float64(shared_count) / np.float64(total_count)
    return dice_score
@numba.njit
def cosine_similarity(u: np.ndarray, v: np.ndarray) -> np.float64:
    """Calculate cosine similarity score.

    The dot product and both squared norms are accumulated in a single pass
    using float64 arithmetic, independent of the dtype of the inputs.

    Parameters
    ----------
    u
        Input vector. Expects a real-valued 1-D array.
    v
        Input vector. Expects a real-valued 1-D array of the same length as `u`.

    Returns
    -------
    cosine_similarity
        The Cosine similarity score between vectors `u` and `v`.
        Returns 0.0 when either vector has zero norm.

    Raises
    ------
    AssertionError
        If `u` and `v` do not have the same length.
    """
    assert u.shape[0] == v.shape[0], "Input vector must have same shape."
    dot_uv = 0.0
    norm_sq_u = 0.0
    norm_sq_v = 0.0
    for i in range(u.shape[0]):
        # Convert before multiplying so integer or narrow dtypes cannot wrap.
        u_i = np.float64(u[i])
        v_i = np.float64(v[i])
        dot_uv += u_i * v_i
        norm_sq_u += u_i * u_i
        norm_sq_v += v_i * v_i
    cosine_score = 0.0
    # A zero-norm vector has no direction; report 0 instead of dividing by zero.
    if norm_sq_u != 0 and norm_sq_v != 0:
        cosine_score = dot_uv / np.sqrt(norm_sq_u * norm_sq_v)
    return np.float64(cosine_score)