import logging
import numpy as np
from matchms.typing import SpectrumType
from ._precursor_validation import get_valid_precursor_mz
from .BaseSimilarity import BaseSimilarityWithSparse
from .CosineGreedy import CosineGreedy
from .spectrum_similarity_functions import collect_peak_pairs, filter_noise, score_best_matches
logger = logging.getLogger("matchms")
[docs]
class ModifiedCosineGreedy(BaseSimilarityWithSparse):
"""Calculate an approximate modified cosine score between mass spectra.
This implementation solves the peak assignment in a greedy way and is therefore
an approximation. See :class:`~matchms.similarity.ModifiedCosineHungarian` for
the exact assignment variant.
The modified cosine score aims at quantifying the similarity between two
mass spectra. Two peaks are considered a potential match if their m/z ratios
lie within the given ``tolerance``, or if their m/z ratios lie within the
tolerance once a mass-shift is applied. The mass shift is the difference in
precursor m/z between the two spectra.
See Watrous et al. [PNAS, 2012, https://www.pnas.org/content/109/26/E1743]
for further details.
Unlike in matchms < 1.0, this method also applies a noise filter by default,
which removes peaks with intensity below a certain cutoff. This is typically
highly beneficial for the performance of the greedy algorithm, and for most
applications the results are very similar to the exact assignment variant.
If you want to disable this noise filtering, you can set ``noise_cutoff`` to 0 or None.
"""
is_commutative = True
score_datatype = [("score", np.float64), ("matches", "int")]
score_fields = ("score", "matches")
[docs]
def __init__(
self, tolerance: float = 0.1,
mz_power: float = 0.0,
intensity_power: float = 1.0,
noise_cutoff: float = 0.01,
):
"""Initialize approximate modified cosine.
Parameters
----------
tolerance:
Peaks will be considered a match when <= tolerance apart. Default is 0.1.
mz_power:
The power to raise mz to in the cosine function. The default is 0, in which
case the peak intensity products will not depend on the m/z ratios.
intensity_power:
The power to raise intensity to in the cosine function. The default is 1.
noise_cutoff:
Minimum relative intensity for a peak to be considered. Default is 0.01.
"""
self.tolerance = tolerance
self.mz_power = mz_power
self.intensity_power = intensity_power
self.noise_cutoff = noise_cutoff
[docs]
def pair(self, spectrum_1: SpectrumType, spectrum_2: SpectrumType) -> tuple[float, int]:
"""Calculate approximate modified cosine score between two spectra."""
precursor_mz_ref = get_valid_precursor_mz(spectrum_1, logger)
precursor_mz_query = get_valid_precursor_mz(spectrum_2, logger)
mass_shift = precursor_mz_ref - precursor_mz_query
if abs(mass_shift) <= self.tolerance:
return CosineGreedy(
tolerance=self.tolerance,
mz_power=self.mz_power,
intensity_power=self.intensity_power,
).pair(spectrum_1, spectrum_2)
def get_matching_pairs():
"""Find all pairs of peaks that match within the given tolerance."""
zero_pairs = collect_peak_pairs(
spec1, spec2, self.tolerance, shift=0.0,
mz_power=self.mz_power, intensity_power=self.intensity_power
)
nonzero_pairs = collect_peak_pairs(
spec1, spec2, self.tolerance, shift=mass_shift,
mz_power=self.mz_power, intensity_power=self.intensity_power
)
if zero_pairs is None:
zero_pairs = np.zeros((0, 3))
if nonzero_pairs is None:
nonzero_pairs = np.zeros((0, 3))
matching_pairs = np.concatenate((zero_pairs, nonzero_pairs), axis=0)
if matching_pairs.shape[0] > 0:
matching_pairs = matching_pairs[np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1], :]
return matching_pairs
spec1 = spectrum_1.peaks.to_numpy
spec2 = spectrum_2.peaks.to_numpy
# Filter noise from spectra by removing peaks with intensity below a certain cutoff.
if self.noise_cutoff and self.noise_cutoff > 0.0:
spec1 = np.stack(filter_noise(spec1[:, 0], spec1[:, 1], self.noise_cutoff), axis=-1)
spec2 = np.stack(filter_noise(spec2[:, 0], spec2[:, 1], self.noise_cutoff), axis=-1)
matching_pairs = get_matching_pairs()
if matching_pairs.shape[0] == 0:
return np.asarray((float(0), 0), dtype=self.score_datatype)
score = score_best_matches(matching_pairs, spec1, spec2, self.mz_power, self.intensity_power)
return np.asarray(score, dtype=self.score_datatype)