Source code for matchms.similarity.CosineGreedy

import numpy as np
from matchms.typing import SpectrumType
from .BaseSimilarity import BaseSimilarityWithSparse
from .spectrum_similarity_functions import collect_peak_pairs, filter_noise, score_best_matches



[docs]
class CosineGreedy(BaseSimilarityWithSparse):
    """Calculate 'cosine similarity score' between two spectra.

    The cosine score aims at quantifying the similarity between two mass spectra.
    The score is calculated by finding best possible matches between peaks
    of two spectra. Two peaks are considered a potential match if their
    m/z ratios lie within the given 'tolerance'.
    The underlying peak assignment problem is here solved in a 'greedy' way.
    This can perform notably faster, but does occasionally deviate slightly from
    a fully correct solution (as with the Hungarian algorithm, see
    :class:`~matchms.similarity.CosineHungarian`). In practice this will rarely
    affect similarity scores notably, in particular for smaller tolerances.

    For example

    .. testcode::

        import numpy as np
        from matchms import Spectrum
        from matchms.similarity import CosineGreedy

        spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
                             intensities=np.array([0.7, 0.2, 0.1]),
                             metadata={"precursor_mz": 200.0})
        spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]),
                         intensities=np.array([0.4, 0.2, 0.1]),
                         metadata={"precursor_mz": 190.0})

        # Use factory to construct a similarity function
        cosine_greedy = CosineGreedy(tolerance=0.2)

        score = cosine_greedy.pair(spectrum_1, spectrum_2)

        print(f"Cosine score is {score['score']:.2f} with {score['matches']} matched peaks")

    Should output

    .. testoutput::

        Cosine score is 0.83 with 1 matched peaks

    Unlike in matchms < 1.0, this method also applies a noise filter by default,
    which removes peaks with intensity below a certain cutoff. This is typically
    highly beneficial for the performance of the greedy algorithm, and for most
    applications the results are very similar to the exact assignment variant.
    If you want to disable this noise filtering, you can set ``noise_cutoff`` to 0 or None.
    """

    # Set key characteristics as class attributes (see BaseSimilarity for details).
    is_commutative = True
    score_datatype = [("score", np.float64), ("matches", "int")]
    score_fields =("score", "matches")



[docs]
    def __init__(
            self, tolerance: float = 0.1,
            mz_power: float = 0.0,
            intensity_power: float = 1.0,
            noise_cutoff: float = 0.01,
            ):
        """
        Parameters
        ----------
        tolerance:
            Peaks will be considered a match when <= tolerance apart. Default is 0.1.
        mz_power:
            The power to raise m/z to in the cosine function. The default is 0, in which
            case the peak intensity products will not depend on the m/z ratios.
        intensity_power:
            The power to raise intensity to in the cosine function. The default is 1.
        noise_cutoff:
            Minimum relative intensity for a peak to be considered. Default is 0.01.
        """
        self.tolerance = tolerance
        self.mz_power = mz_power
        self.intensity_power = intensity_power
        self.noise_cutoff = noise_cutoff



[docs]
    def pair(self, spectrum_1: SpectrumType, spectrum_2: SpectrumType) -> tuple[float, int]:
        """Calculate cosine score between two spectra.

        Parameters
        ----------
        spectrum_1
            First spectrum.
        spectrum_2
            Second spectrum.

        Returns
        -------
        Score
            Tuple with cosine score and number of matched peaks.
            The score can be access as `score["score"]` and the number of matched peaks as `score["matches"]`.
        """

        def get_matching_pairs():
            """Get pairs of peaks that match within the given tolerance."""
            matching_pairs = collect_peak_pairs(
                spec1, spec2, self.tolerance, shift=0.0,
                mz_power=self.mz_power, intensity_power=self.intensity_power
            )
            if matching_pairs is None:
                return None
            matching_pairs = matching_pairs[np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1], :]
            return matching_pairs

        spec1 = spectrum_1.peaks.to_numpy
        spec2 = spectrum_2.peaks.to_numpy
        # Filter noise from spectra by removing peaks with intensity below a certain cutoff.
        if self.noise_cutoff and self.noise_cutoff > 0.0:
            spec1 = np.stack(filter_noise(spec1[:, 0], spec1[:, 1], self.noise_cutoff), axis=-1)
            spec2 = np.stack(filter_noise(spec2[:, 0], spec2[:, 1], self.noise_cutoff), axis=-1)

        matching_pairs = get_matching_pairs()
        if matching_pairs is None:
            return np.asarray((float(0), 0), dtype=self.score_datatype)
        score = score_best_matches(matching_pairs, spec1, spec2, self.mz_power, self.intensity_power)
        return np.asarray(score, dtype=self.score_datatype)