Source code for matchms.similarity.CosineLinear

from collections.abc import Sequence
import numpy as np
from tqdm import tqdm  # type: ignore[import-untyped]
from matchms.Scores import Scores
from matchms.typing import SpectrumType
from .BaseSimilarity import BaseSimilarity
from .cosine_linear_functions import linear_cosine_score, sirius_merge_close_peaks


class CosineLinear(BaseSimilarity):
    """Linear-time cosine similarity between two spectra.

    This is the CosineLinear similarity from SIRIUS (BOECKER lab). It reaches
    O(n+m) time complexity by requiring that spectra are "well-separated",
    i.e. consecutive peaks are more than 2x tolerance apart. That invariant is
    enforced by a preprocessing step (sirius_merge_close_peaks), which greedily
    merges close peaks in descending intensity order.

    For example

    .. testcode::

        import numpy as np
        from matchms import Spectrum
        from matchms.similarity import CosineLinear

        reference = Spectrum(mz=np.array([100, 150, 200.]),
                             intensities=np.array([0.7, 0.2, 0.1]))
        query = Spectrum(mz=np.array([100, 140, 190.]),
                         intensities=np.array([0.4, 0.2, 0.1]))

        cosine_linear = CosineLinear(tolerance=0.2)
        score = cosine_linear.pair(reference, query)

        print(f"CosineLinear score is {score['score']:.2f} with {score['matches']} matched peaks")

    Should output

    .. testoutput::

        CosineLinear score is 0.83 with 1 matched peaks

    """

    # The score does not depend on the order of the two spectra; matrix()
    # uses this flag to compute only one triangle for symmetric input.
    is_commutative = True
    # Structured dtype of a single result: the score and the matched-peak count.
    score_datatype = [("score", np.float64), ("matches", "int")]  # type: ignore[assignment]
    score_fields = ("score", "matches")

    def __init__(self, tolerance: float = 0.1, mz_power: float = 0.0, intensity_power: float = 1.0):
        """
        Parameters
        ----------
        tolerance:
            Two peaks count as a match when they are <= tolerance apart.
            Default is 0.1. Peaks closer than 2 * tolerance are merged
            before scoring.
        mz_power:
            Exponent applied to m/z in the cosine function. With the default
            of 0 the peak intensity products do not depend on the m/z ratios.
        intensity_power:
            Exponent applied to intensity in the cosine function.
            The default is 1.
        """
        self.tolerance = tolerance
        self.mz_power = mz_power
        self.intensity_power = intensity_power

    def pair(self, reference: SpectrumType, query: SpectrumType) -> tuple[float, int]:
        """Compute the linear cosine score for one pair of spectra.

        Both spectra are first passed through sirius_merge_close_peaks and
        the merged peak arrays are then scored with linear_cosine_score.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        Score
            Cosine score and number of matched peaks, packed into a numpy
            array of dtype ``score_datatype`` (fields "score" and "matches").
        """
        reference_peaks, query_peaks = reference.peaks.to_numpy, query.peaks.to_numpy

        # Merge close peaks so both inputs satisfy the well-separated
        # requirement described in the class docstring.
        reference_merged = sirius_merge_close_peaks(reference_peaks, self.tolerance)
        query_merged = sirius_merge_close_peaks(query_peaks, self.tolerance)

        score, matches = linear_cosine_score(
            reference_merged,
            query_merged,
            self.tolerance,
            self.mz_power,
            self.intensity_power,
        )
        return np.asarray((score, matches), dtype=self.score_datatype)

    def matrix(
        self,
        spectra_1: Sequence[SpectrumType],
        spectra_2: Sequence[SpectrumType] | None = None,
        score_fields: Sequence[str] | None = None,
        progress_bar: bool = True,
    ):
        """Score every spectrum in spectra_1 against every spectrum in spectra_2.

        Peak merging is done up front, once per spectrum (N+M calls to
        sirius_merge_close_peaks), rather than 2*N*M times as a naive double
        loop over pair() would do.

        Parameters
        ----------
        spectra_1
            Spectra that index the rows of the result.
        spectra_2
            Spectra that index the columns of the result. Resolved by
            ``_prepare_inputs``, which also reports whether the comparison is
            symmetric (presumably the case when this is omitted; see
            BaseSimilarity).
        score_fields
            Selection of score fields to store, resolved by
            ``_resolve_score_fields``.
        progress_bar
            Set to False to disable the tqdm progress bar.

        Returns
        -------
        Scores
            The dense result wrapped in a ``Scores`` object.
        """
        spectra_2, is_symmetric = self._prepare_inputs(spectra_1, spectra_2)
        selected_fields = self._resolve_score_fields(score_fields)

        n_rows = len(spectra_1)
        n_cols = len(spectra_2)
        result = self._create_dense_result(n_rows, n_cols, selected_fields)

        tolerance = self.tolerance
        merged_refs = [sirius_merge_close_peaks(spectrum.peaks.to_numpy, tolerance) for spectrum in spectra_1]
        # For a symmetric comparison the merged references double as queries.
        merged_queries = (
            merged_refs
            if is_symmetric
            else [sirius_merge_close_peaks(spectrum.peaks.to_numpy, tolerance) for spectrum in spectra_2]
        )

        # When true, only the upper triangle (diagonal included) is computed
        # and every off-diagonal score is also written to its mirrored cell.
        mirror = is_symmetric and self.is_commutative

        rows = tqdm(
            range(n_rows),
            desc="Calculating similarities",
            disable=not progress_bar,
        )
        for i_ref in rows:
            ref_peaks = merged_refs[i_ref]
            first_col = i_ref if mirror else 0
            for i_query in range(first_col, n_cols):
                score, matches = linear_cosine_score(
                    ref_peaks,
                    merged_queries[i_query],
                    self.tolerance,
                    self.mz_power,
                    self.intensity_power,
                )
                score_array = self._as_score((score, matches))
                self._store_in_dense_result(result, i_ref, i_query, score_array, selected_fields)
                if mirror and i_ref != i_query:
                    self._store_in_dense_result(result, i_query, i_ref, score_array, selected_fields)

        return Scores(result)