Source code for matchms.similarity.ModifiedCosineGreedy

import logging
import numpy as np
from matchms.typing import SpectrumType
from ._precursor_validation import get_valid_precursor_mz
from .BaseSimilarity import BaseSimilarityWithSparse
from .CosineGreedy import CosineGreedy
from .spectrum_similarity_functions import collect_peak_pairs, filter_noise, score_best_matches


logger = logging.getLogger("matchms")


[docs] class ModifiedCosineGreedy(BaseSimilarityWithSparse): """Calculate an approximate modified cosine score between mass spectra. This implementation solves the peak assignment in a greedy way and is therefore an approximation. See :class:`~matchms.similarity.ModifiedCosineHungarian` for the exact assignment variant. The modified cosine score aims at quantifying the similarity between two mass spectra. Two peaks are considered a potential match if their m/z ratios lie within the given ``tolerance``, or if their m/z ratios lie within the tolerance once a mass-shift is applied. The mass shift is the difference in precursor m/z between the two spectra. See Watrous et al. [PNAS, 2012, https://www.pnas.org/content/109/26/E1743] for further details. Unlike in matchms < 1.0, this method also applies a noise filter by default, which removes peaks with intensity below a certain cutoff. This is typically highly beneficial for the performance of the greedy algorithm, and for most applications the results are very similar to the exact assignment variant. If you want to disable this noise filtering, you can set ``noise_cutoff`` to 0 or None. """ is_commutative = True score_datatype = [("score", np.float64), ("matches", "int")] score_fields = ("score", "matches")
[docs] def __init__( self, tolerance: float = 0.1, mz_power: float = 0.0, intensity_power: float = 1.0, noise_cutoff: float = 0.01, ): """Initialize approximate modified cosine. Parameters ---------- tolerance: Peaks will be considered a match when <= tolerance apart. Default is 0.1. mz_power: The power to raise mz to in the cosine function. The default is 0, in which case the peak intensity products will not depend on the m/z ratios. intensity_power: The power to raise intensity to in the cosine function. The default is 1. noise_cutoff: Minimum relative intensity for a peak to be considered. Default is 0.01. """ self.tolerance = tolerance self.mz_power = mz_power self.intensity_power = intensity_power self.noise_cutoff = noise_cutoff
[docs] def pair(self, spectrum_1: SpectrumType, spectrum_2: SpectrumType) -> tuple[float, int]: """Calculate approximate modified cosine score between two spectra.""" precursor_mz_ref = get_valid_precursor_mz(spectrum_1, logger) precursor_mz_query = get_valid_precursor_mz(spectrum_2, logger) mass_shift = precursor_mz_ref - precursor_mz_query if abs(mass_shift) <= self.tolerance: return CosineGreedy( tolerance=self.tolerance, mz_power=self.mz_power, intensity_power=self.intensity_power, ).pair(spectrum_1, spectrum_2) def get_matching_pairs(): """Find all pairs of peaks that match within the given tolerance.""" zero_pairs = collect_peak_pairs( spec1, spec2, self.tolerance, shift=0.0, mz_power=self.mz_power, intensity_power=self.intensity_power ) nonzero_pairs = collect_peak_pairs( spec1, spec2, self.tolerance, shift=mass_shift, mz_power=self.mz_power, intensity_power=self.intensity_power ) if zero_pairs is None: zero_pairs = np.zeros((0, 3)) if nonzero_pairs is None: nonzero_pairs = np.zeros((0, 3)) matching_pairs = np.concatenate((zero_pairs, nonzero_pairs), axis=0) if matching_pairs.shape[0] > 0: matching_pairs = matching_pairs[np.argsort(matching_pairs[:, 2], kind="mergesort")[::-1], :] return matching_pairs spec1 = spectrum_1.peaks.to_numpy spec2 = spectrum_2.peaks.to_numpy # Filter noise from spectra by removing peaks with intensity below a certain cutoff. if self.noise_cutoff and self.noise_cutoff > 0.0: spec1 = np.stack(filter_noise(spec1[:, 0], spec1[:, 1], self.noise_cutoff), axis=-1) spec2 = np.stack(filter_noise(spec2[:, 0], spec2[:, 1], self.noise_cutoff), axis=-1) matching_pairs = get_matching_pairs() if matching_pairs.shape[0] == 0: return np.asarray((float(0), 0), dtype=self.score_datatype) score = score_best_matches(matching_pairs, spec1, spec2, self.mz_power, self.intensity_power) return np.asarray(score, dtype=self.score_datatype)