# Source code for matchms.similarity.ModifiedCosineGreedy

import logging
import numpy as np
from matchms.typing import SpectrumType
from ._precursor_validation import get_valid_precursor_mz
from .BaseSimilarity import BaseSimilarityWithSparse
from .CosineGreedy import CosineGreedy
from .spectrum_similarity_functions import collect_peak_pairs, filter_noise, score_best_matches


logger = logging.getLogger("matchms")



# [docs]
class ModifiedCosineGreedy(BaseSimilarityWithSparse):
    """Calculate an approximate modified cosine score between mass spectra.

    This implementation solves the peak assignment in a greedy way and is therefore
    an approximation. See :class:`~matchms.similarity.ModifiedCosineHungarian` for
    the exact assignment variant.

    The modified cosine score aims at quantifying the similarity between two
    mass spectra. Two peaks are considered a potential match if their m/z ratios
    lie within the given ``tolerance``, or if their m/z ratios lie within the
    tolerance once a mass-shift is applied. The mass shift is the difference in
    precursor m/z between the two spectra.

    If the absolute mass shift itself is not larger than ``tolerance``, only
    unshifted peak matches are considered, so the score reduces to a greedy
    cosine score.

    See Watrous et al. [PNAS, 2012, https://www.pnas.org/content/109/26/E1743]
    for further details.

    Unlike in matchms < 1.0, this method also applies a noise filter by default,
    which removes peaks with intensity below a certain cutoff. This is typically
    highly beneficial for the performance of the greedy algorithm, and for most
    applications the results are very similar to the exact assignment variant.
    If you want to disable this noise filtering, you can set ``noise_cutoff`` to 0 or None.
    The noise filter is applied in the same way for every spectrum pair,
    regardless of the size of the mass shift.
    """

    # Scores do not depend on the order of the two spectra.
    is_commutative = True
    # Layout of the structured score: the similarity value and the number of matched peaks.
    score_datatype = [("score", np.float64), ("matches", "int")]
    score_fields = ("score", "matches")

    def __init__(
            self, tolerance: float = 0.1,
            mz_power: float = 0.0,
            intensity_power: float = 1.0,
            noise_cutoff: float = 0.01,
            ):
        """Initialize approximate modified cosine.

        Parameters
        ----------
        tolerance:
            Peaks will be considered a match when <= tolerance apart. Default is 0.1.
        mz_power:
            The power to raise mz to in the cosine function. The default is 0, in which
            case the peak intensity products will not depend on the m/z ratios.
        intensity_power:
            The power to raise intensity to in the cosine function. The default is 1.
        noise_cutoff:
            Minimum relative intensity for a peak to be considered. Default is 0.01.
            Set to 0 or None to disable noise filtering.
        """
        self.tolerance = tolerance
        self.mz_power = mz_power
        self.intensity_power = intensity_power
        self.noise_cutoff = noise_cutoff

    def pair(self, spectrum_1: SpectrumType, spectrum_2: SpectrumType) -> tuple[float, int]:
        """Calculate approximate modified cosine score between two spectra.

        Parameters
        ----------
        spectrum_1:
            First spectrum (reference). Must provide a valid precursor m/z.
        spectrum_2:
            Second spectrum (query). Must provide a valid precursor m/z.

        Returns
        -------
        Structured numpy value of dtype ``score_datatype`` holding the fields
        ``score`` and ``matches``. If no peak pair matches, ``score`` is 0.0 and
        ``matches`` is 0.
        """
        # Precursor validation is delegated to the helper, which receives the
        # module logger (NOTE(review): presumably it raises or logs on missing /
        # invalid precursor m/z -- confirm against _precursor_validation).
        precursor_mz_ref = get_valid_precursor_mz(spectrum_1, logger)
        precursor_mz_query = get_valid_precursor_mz(spectrum_2, logger)
        mass_shift = precursor_mz_ref - precursor_mz_query

        # Apply the noise filter on every code path. Previously the near-zero
        # mass-shift case was handed to CosineGreedy without ``noise_cutoff``,
        # so this instance's noise setting was silently ignored there.
        peaks_1 = self._denoised_peaks(spectrum_1)
        peaks_2 = self._denoised_peaks(spectrum_2)

        # Unshifted matches are always candidates. Shifted matches are only
        # added when the shift exceeds the tolerance; otherwise they would
        # largely duplicate the unshifted ones and the score is a plain
        # greedy cosine.
        shifts = [0.0]
        if abs(mass_shift) > self.tolerance:
            shifts.append(mass_shift)

        matching_pairs = self._ranked_peak_pairs(peaks_1, peaks_2, shifts)
        if matching_pairs.shape[0] == 0:
            return np.asarray((float(0), 0), dtype=self.score_datatype)
        score = score_best_matches(matching_pairs, peaks_1, peaks_2, self.mz_power, self.intensity_power)
        return np.asarray(score, dtype=self.score_datatype)

    def _denoised_peaks(self, spectrum: SpectrumType) -> np.ndarray:
        """Return the spectrum's peak array, noise-filtered if ``noise_cutoff`` is active."""
        peaks = spectrum.peaks.to_numpy
        # A cutoff of None or 0 (or any non-positive value) disables filtering.
        if self.noise_cutoff and self.noise_cutoff > 0.0:
            # Column 0 (m/z) and column 1 (intensity) are filtered together and
            # re-assembled into a two-column array.
            peaks = np.stack(filter_noise(peaks[:, 0], peaks[:, 1], self.noise_cutoff), axis=-1)
        return peaks

    def _ranked_peak_pairs(self, peaks_1: np.ndarray, peaks_2: np.ndarray, shifts) -> np.ndarray:
        """Collect candidate peak pairs for every shift, sorted by descending column 2."""
        candidates = []
        for shift in shifts:
            pairs = collect_peak_pairs(
                peaks_1, peaks_2, self.tolerance, shift=shift,
                mz_power=self.mz_power, intensity_power=self.intensity_power
            )
            # collect_peak_pairs may return None; substitute an empty
            # three-column array so the results can be concatenated.
            candidates.append(np.zeros((0, 3)) if pairs is None else pairs)

        matching_pairs = np.concatenate(candidates, axis=0)
        if matching_pairs.shape[0] > 0:
            # Stable sort on column 2, then reversed, so the greedy scoring
            # sees the highest-valued pairs first (presumably column 2 is the
            # pair's score contribution -- confirm against collect_peak_pairs).
            order = np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1]
            matching_pairs = matching_pairs[order, :]
        return matching_pairs