Source code for matchms.similarity.NeutralLossesCosine

import logging
from typing import Tuple
import numpy as np
from matchms.typing import SpectrumType
from ._precursor_validation import get_valid_precursor_mz
from .BaseSimilarity import BaseSimilarity
from .spectrum_similarity_functions import collect_peak_pairs, score_best_matches


logger = logging.getLogger("matchms")


[docs]
class NeutralLossesCosine(BaseSimilarity):
    """Calculate 'neutral losses cosine score' between mass spectra.

    The neutral losses cosine score aims at quantifying the similarity between two
    mass spectra. The score is calculated by finding the best possible matches
    between peaks of two spectra. Two peaks are considered a potential match if
    their m/z ratios lie within the given 'tolerance' once a mass shift is applied.
    The mass shift is the difference in precursor-m/z between the two spectra.

    In general, `ModifiedCosineGreedy` is recommended over `NeutralLossesCosine`
    because it will on average deliver more reliable results.
    """

    # Set key characteristics as class attributes
    is_commutative = True
    # Set output data type, e.g. ("score", "float") or [("score", "float"), ("matches", "int")]
    score_datatype = [("score", np.float64), ("matches", "int")]

    def __init__(self, tolerance: float = 0.1, mz_power: float = 0.0,
                 intensity_power: float = 1.0, ignore_peaks_above_precursor: bool = True):
        """
        Parameters
        ----------
        tolerance:
            Peaks will be considered a match when <= tolerance apart. Default is 0.1.
        mz_power:
            The power to raise mz to in the cosine function. The default is 0, in which
            case the peak intensity products will not depend on the m/z ratios.
        intensity_power:
            The power to raise intensity to in the cosine function. The default is 1.
        ignore_peaks_above_precursor:
            By default this is set to True, meaning that peaks with m/z values larger
            than the precursor-m/z will be ignored (since those would correspond to negative
            "neutral losses").
        """
        self.tolerance = tolerance
        self.mz_power = mz_power
        self.intensity_power = intensity_power
        self.ignore_peaks_above_precursor = ignore_peaks_above_precursor

    def _get_matching_pairs(self, spec1, spec2, mass_shift: float) -> np.ndarray:
        """Find all pairs of peaks that match within the given tolerance.

        Parameters
        ----------
        spec1
            Peak array of the first spectrum, handed on to `collect_peak_pairs`.
        spec2
            Peak array of the second spectrum, handed on to `collect_peak_pairs`.
        mass_shift
            Shift passed to `collect_peak_pairs` as ``shift``.

        Returns
        -------

        The array produced by `collect_peak_pairs`, with its rows reordered by
        descending value of column 2, or None if `collect_peak_pairs` returned None.
        """
        matching_pairs = collect_peak_pairs(
            spec1, spec2, self.tolerance,
            shift=mass_shift, mz_power=self.mz_power,
            intensity_power=self.intensity_power
        )
        if matching_pairs is None:
            return None
        if matching_pairs.shape[0] > 0:
            # Stable ascending sort on column 2, then reversed so that the largest
            # values come first.
            # NOTE(review): column 2 is presumably the weighted peak product used
            # for scoring -- confirm against collect_peak_pairs.
            order = np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1]
            matching_pairs = matching_pairs[order, :]
        return matching_pairs

    def pair(self, reference: SpectrumType, query: SpectrumType) -> Tuple[float, int]:
        """Calculate neutral losses cosine score between two spectra.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------

        Cosine score and number of matched peaks, packed into a zero-dimensional
        numpy array of dtype `score_datatype` (fields "score" and "matches").
        The result is (0.0, 0) when no matching peak pairs were collected.
        """
        precursor_mz_ref = get_valid_precursor_mz(reference, logger)
        precursor_mz_query = get_valid_precursor_mz(query, logger)
        # The mass shift applied during peak matching is the difference in
        # precursor-m/z between the two spectra.
        mass_shift = precursor_mz_ref - precursor_mz_query

        spec1 = reference.peaks.to_numpy
        spec2 = query.peaks.to_numpy

        if self.ignore_peaks_above_precursor:
            # Column 0 is compared against the precursor-m/z; only rows strictly
            # below it are kept (larger values would be negative "neutral losses").
            spec1 = spec1[spec1[:, 0] < precursor_mz_ref]
            spec2 = spec2[spec2[:, 0] < precursor_mz_query]

        matching_pairs = self._get_matching_pairs(spec1, spec2, mass_shift)
        if matching_pairs is None:
            return np.asarray((float(0), 0), dtype=self.score_datatype)
        score = score_best_matches(matching_pairs, spec1, spec2, self.mz_power, self.intensity_power)
        return np.asarray(score, dtype=self.score_datatype)