Source code for matchms.similarity.FingerprintSimilarity

from collections.abc import Sequence
import numpy as np
import scipy.sparse as sp
from chemap.metrics import (
    tanimoto_similarity_matrix,
)
from matchms.Fingerprints import Fingerprints
from matchms.Scores import Scores
from matchms.typing import SpectrumType
from .BaseSimilarity import BaseSimilarity
from .vector_similarity_functions import cosine_similarity_matrix


class FingerprintSimilarity(BaseSimilarity):
    """Calculate similarity between molecules based on molecular fingerprints.

    Fingerprints can either be provided explicitly as
    :class:`~matchms.Fingerprints` objects or computed internally from input
    spectra.

    This class no longer expects fingerprints to be stored directly in
    spectrum metadata. Instead, it uses a :class:`~matchms.Fingerprints`
    container.

    Currently supported similarity measures are:

    - ``"cosine"``
    - ``"tanimoto"``

    Notes
    -----
    - Tanimoto is used in its generalized form and therefore also works for
      count/weighted fingerprints.
    - Fingerprints may be stored densely (NumPy) or sparsely (CSR).
    """

    is_commutative = True
    score_datatype = np.float64
    score_fields = ("score",)

    # Similarity measures accepted by ``__init__``.
    _SUPPORTED_MEASURES = ("cosine", "tanimoto")

    def __init__(
        self,
        fingerprint_generator,
        similarity_measure: str = "tanimoto",
        set_empty_scores: float | int | str = "nan",
        ignore_stereochemistry: bool = False,
        count: bool = False,
        folded: bool = True,
        return_csr: bool = False,
        invalid_policy: str = "raise",
        **fingerprint_config_kwargs,
    ):
        """
        Parameters
        ----------
        fingerprint_generator
            A chemap-compatible fingerprint generator.
        similarity_measure
            Choose similarity measure from ``"cosine"`` or ``"tanimoto"``.
            The default is ``"tanimoto"``.
        set_empty_scores
            Define what should be returned instead of a similarity score in
            cases where fingerprints are missing. The default is ``"nan"``.
            NOTE(review): the value is only stored on the instance; none of
            the methods defined in this class read it, so it currently has no
            effect here -- confirm whether it should be applied.
        ignore_stereochemistry
            Passed to internally created :class:`~matchms.Fingerprints` objects.
        count
            Passed to internally created :class:`~matchms.Fingerprints` objects.
        folded
            Passed to internally created :class:`~matchms.Fingerprints` objects.
        return_csr
            Passed to internally created :class:`~matchms.Fingerprints` objects.
        invalid_policy
            Passed to internally created :class:`~matchms.Fingerprints` objects.
        **fingerprint_config_kwargs
            Additional keyword arguments passed to internally created
            :class:`~matchms.Fingerprints` objects.

        Raises
        ------
        AssertionError
            If `similarity_measure` is not one of the supported measures.
        """
        # Raised explicitly (instead of via an ``assert`` statement) so the
        # check is not stripped when Python runs with ``-O``. The exception
        # type is kept as AssertionError for backward compatibility.
        if similarity_measure not in self._SUPPORTED_MEASURES:
            raise AssertionError("Unknown similarity measure.")

        self.fingerprint_generator = fingerprint_generator
        self.similarity_measure = similarity_measure
        self.set_empty_scores = set_empty_scores
        self.ignore_stereochemistry = ignore_stereochemistry
        self.count = count
        self.folded = folded
        self.return_csr = return_csr
        self.invalid_policy = invalid_policy
        self.fingerprint_config_kwargs = fingerprint_config_kwargs

    def pair(self, spectrum_1: SpectrumType, spectrum_2: SpectrumType):
        """Pairwise fingerprint similarity is not supported in this API.

        FingerprintSimilarity works on precomputed Fingerprints containers or
        computes fingerprints internally for collections of spectra in
        `matrix()`. Use `matrix(...)` instead.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError(
            "FingerprintSimilarity.pair() is not supported. "
            "Use matrix(...) with spectra or Fingerprints objects instead."
        )

    def matrix(
        self,
        spectra_1: Sequence[SpectrumType] | None = None,
        spectra_2: Sequence[SpectrumType] | None = None,
        fingerprints_1: Fingerprints | None = None,
        fingerprints_2: Fingerprints | None = None,
        score_fields: Sequence[str] | None = None,
        progress_bar: bool = True,
    ) -> Scores:
        """Calculate matrix of fingerprint-based similarity scores.

        Parameters
        ----------
        spectra_1
            First collection of spectra. Used only if `fingerprints_1` is not
            given.
        spectra_2
            Second collection of spectra. Used only if `fingerprints_2` is not
            given. If None and `fingerprints_2` is None, compare the first
            input against itself.
        fingerprints_1
            Optional precomputed Fingerprints object for the first input.
        fingerprints_2
            Optional precomputed Fingerprints object for the second input.
            If None, compare the first input against itself.
        score_fields
            Requested score fields. Only ``("score",)`` is supported.
        progress_bar
            Included for API compatibility. Not used here.

        Returns
        -------
        Scores
            ``Scores`` object holding the similarity matrix under the key
            ``"score"``, cast to ``score_datatype``.

        Raises
        ------
        NotImplementedError
            If the resolved score fields are anything other than
            ``("score",)``.
        ValueError
            If the spectra / fingerprints arguments are missing or given
            redundantly, or if a fingerprint container is empty.
        """
        del progress_bar  # accepted for API compatibility only

        selected_fields = self._resolve_score_fields(score_fields)
        if selected_fields != ("score",):
            raise NotImplementedError(
                "FingerprintSimilarity.matrix() supports only score_fields=('score',)."
            )

        fingerprints_1, fingerprints_2, is_symmetric = self._prepare_fingerprint_inputs(
            spectra_1=spectra_1,
            spectra_2=spectra_2,
            fingerprints_1=fingerprints_1,
            fingerprints_2=fingerprints_2,
        )

        # `_fingerprint_matrix` raises ValueError for empty containers, so no
        # additional emptiness check is needed after these two calls.
        X1 = self._fingerprint_matrix(fingerprints_1)
        X2 = self._fingerprint_matrix(fingerprints_2)

        similarity_matrix = self._compute_similarity_matrix(X1, X2)

        if is_symmetric and similarity_matrix.shape[0] == similarity_matrix.shape[1]:
            # Average with the transpose so the self-comparison result is
            # exactly symmetric (presumably to cancel floating-point noise).
            similarity_matrix = 0.5 * (similarity_matrix + similarity_matrix.T)

        return Scores({"score": similarity_matrix.astype(self.score_datatype, copy=False)})

    # -------------------------------------------------------------------------
    # Helpers
    # -------------------------------------------------------------------------
    def _prepare_fingerprint_inputs(
        self,
        spectra_1: Sequence[SpectrumType] | None,
        spectra_2: Sequence[SpectrumType] | None,
        fingerprints_1: Fingerprints | None,
        fingerprints_2: Fingerprints | None,
    ) -> tuple[Fingerprints, Fingerprints, bool]:
        """Normalize spectra / fingerprints inputs into two Fingerprints objects.

        Returns
        -------
        tuple
            ``(fingerprints_1, fingerprints_2, is_symmetric)``. When no second
            input is given at all, the first Fingerprints object is returned
            twice and ``is_symmetric`` is True; otherwise it is False.

        Raises
        ------
        ValueError
            If neither first input is given, or if spectra and fingerprints
            are both given for the same side.
        """
        if fingerprints_1 is None and spectra_1 is None:
            raise ValueError("Either spectra_1 or fingerprints_1 must be provided.")
        if fingerprints_1 is not None and spectra_1 is not None:
            raise ValueError("Provide either spectra_1 or fingerprints_1, not both.")
        if fingerprints_2 is not None and spectra_2 is not None:
            raise ValueError("Provide either spectra_2 or fingerprints_2, not both.")

        if fingerprints_1 is None:
            fingerprints_1 = self._compute_fingerprints_from_spectra(spectra_1)

        # No second input at all: compare the first input against itself.
        if fingerprints_2 is None and spectra_2 is None:
            return fingerprints_1, fingerprints_1, True

        if fingerprints_2 is None:
            fingerprints_2 = self._compute_fingerprints_from_spectra(spectra_2)

        return fingerprints_1, fingerprints_2, False

    def _compute_fingerprints_from_spectra(self, spectra: Sequence[SpectrumType]) -> Fingerprints:
        """Compute fingerprints from spectra using the configured Fingerprints container.

        A new :class:`~matchms.Fingerprints` object is built from the settings
        stored in ``__init__`` and filled via ``compute_fingerprints``.
        """
        fingerprints = Fingerprints(
            fingerprint_generator=self.fingerprint_generator,
            ignore_stereochemistry=self.ignore_stereochemistry,
            count=self.count,
            folded=self.folded,
            return_csr=self.return_csr,
            invalid_policy=self.invalid_policy,
            **self.fingerprint_config_kwargs,
        )
        fingerprints.compute_fingerprints(list(spectra))
        return fingerprints

    @staticmethod
    def _fingerprint_matrix(fingerprints: Fingerprints):
        """Return stored fingerprint matrix.

        Raises
        ------
        ValueError
            If the container holds no fingerprint matrix or its
            ``fingerprint_count`` is zero.
        """
        if fingerprints.fingerprints is None or fingerprints.fingerprint_count == 0:
            raise ValueError("Fingerprint container is empty.")
        return fingerprints.fingerprints

    @staticmethod
    def _as_dense_float32(fingerprint_matrix) -> np.ndarray:
        """Return the matrix as a dense float32 NumPy array (densifying if sparse)."""
        if sp.issparse(fingerprint_matrix):
            fingerprint_matrix = fingerprint_matrix.toarray()
        return np.asarray(fingerprint_matrix, dtype=np.float32)

    def _compute_similarity_matrix(self, fingerprints_1, fingerprints_2) -> np.ndarray:
        """Compute similarity block between two fingerprint matrices.

        Cosine always runs on dense float32 arrays. Tanimoto uses the sparse
        code path only when both inputs are sparse; otherwise both inputs are
        converted to dense float32 arrays first.

        Raises
        ------
        NotImplementedError
            If ``self.similarity_measure`` is neither ``"cosine"`` nor
            ``"tanimoto"`` (only possible if the attribute was changed after
            construction).
        """
        if self.similarity_measure == "cosine":
            dense_1 = self._as_dense_float32(fingerprints_1)
            dense_2 = self._as_dense_float32(fingerprints_2)
            return cosine_similarity_matrix(dense_1, dense_2)

        if self.similarity_measure == "tanimoto":
            if sp.issparse(fingerprints_1) and sp.issparse(fingerprints_2):
                kind = "sparse"
            else:
                # Mixed or fully dense input: bring both to dense float32.
                kind = "dense"
                fingerprints_1 = self._as_dense_float32(fingerprints_1)
                fingerprints_2 = self._as_dense_float32(fingerprints_2)
            return tanimoto_similarity_matrix(
                fingerprints_1,
                fingerprints_2,
                kind=kind,
            )

        raise NotImplementedError(
            f"Unsupported similarity measure: {self.similarity_measure!r}."
        )