Source code for matchms.similarity.BinnedEmbeddingSimilarity

from collections.abc import Iterable
import numpy as np
from matchms.typing import SpectrumType
from .BaseEmbeddingSimilarity import BaseEmbeddingSimilarity


class BinnedEmbeddingSimilarity(BaseEmbeddingSimilarity):
    """Compare spectra by cosine/euclidean similarity of binned intensities.

    Spectra are converted to fixed-length vectors by summing intensities in
    equally spaced m/z bins. Each vector is normalized to its maximum bin
    intensity when that maximum is positive. Empty spectra, spectra without
    peaks in the configured m/z range, and spectra with only zero intensities
    produce a zero vector instead of NaNs.

    Parameters
    ----------
    similarity
        Similarity measure used for comparing embeddings. Supported values are
        ``"cosine"`` and ``"euclidean"``.
    max_mz
        Maximum m/z value to include. Values outside ``[0, max_mz]`` are
        ignored. Must be a finite number greater than zero.
    bin_width
        Width of each m/z bin. Must be greater than zero (NaN is rejected).
    intensity_power
        Power applied to peak intensities before binning.

    Raises
    ------
    ValueError
        If ``max_mz`` is not a finite positive number or ``bin_width`` is not
        a positive number.
    """

    def __init__(
        self,
        similarity: str = "cosine",
        max_mz: float = 1005,
        bin_width: float = 1,
        intensity_power: float = 1,
    ):
        super().__init__(similarity=similarity)

        # ``not (x > 0)`` rather than ``x <= 0``: every comparison with NaN is
        # False, so the ``<=`` form would silently accept NaN and the failure
        # would only surface later, when ``n_bins`` converts NaN to an int.
        if not max_mz > 0:
            raise ValueError("max_mz must be > 0.")
        if not bin_width > 0:
            raise ValueError("bin_width must be > 0.")
        # An infinite upper bound cannot be turned into a bin count; reject it
        # here instead of failing inside ``n_bins``. Checked last so inputs
        # that were already rejected keep raising the same message as before.
        if max_mz == np.inf:
            raise ValueError("max_mz must be finite.")

        self.max_mz = max_mz
        self.bin_width = bin_width
        self.intensity_power = intensity_power

    @property
    def n_bins(self) -> int:
        """Number of bins used for each embedding vector.

        Computed as ``floor(max_mz / bin_width) + 1`` so that a peak lying
        exactly at ``max_mz`` still has a bin of its own.
        """
        return int(np.floor(self.max_mz / self.bin_width)) + 1

    def _bin_spectrum(self, spectrum: SpectrumType) -> np.ndarray:
        """Bin a spectrum's peaks into fixed-width m/z bins.

        Parameters
        ----------
        spectrum : SpectrumType
            The spectrum to bin. Its ``peaks.mz`` and ``peaks.intensities``
            arrays are read; the spectrum itself is not modified.

        Returns
        -------
        np.ndarray
            Float64 array of length ``n_bins`` holding the summed intensities
            per bin, divided by the largest bin value when that value is
            positive. An all-zero array is returned when the spectrum has no
            peaks or no peak lies inside ``[0, max_mz]``.

        Notes
        -----
        Non-finite values produced by the power transform (for example a zero
        intensity raised to a negative ``intensity_power``) are not filtered
        and propagate into the returned vector.
        """
        mzs = spectrum.peaks.mz
        intensities = spectrum.peaks.intensities
        binned_intensities = np.zeros(self.n_bins, dtype=np.float64)

        if mzs.size == 0:
            return binned_intensities

        # Keep only peaks inside the configured range. NaN m/z values fail
        # both comparisons and are therefore dropped as well.
        valid_mask = (mzs >= 0) & (mzs <= self.max_mz)
        if not np.any(valid_mask):
            return binned_intensities

        valid_mzs = mzs[valid_mask]
        valid_intensities = intensities[valid_mask] ** self.intensity_power

        bin_indices = np.floor(valid_mzs / self.bin_width).astype(np.int64)
        # Guard against an index one past the end caused by floating-point
        # rounding in the division above.
        bin_indices = np.clip(bin_indices, 0, self.n_bins - 1)

        # ``np.add.at`` accumulates correctly when several peaks share a bin;
        # plain fancy-index ``+=`` would keep only one contribution per bin.
        np.add.at(binned_intensities, bin_indices, valid_intensities)

        # Normalize only when there is a positive maximum, so an all-zero
        # vector stays zero instead of becoming 0/0 = NaN.
        max_intensity = np.max(binned_intensities)
        if max_intensity > 0:
            binned_intensities /= max_intensity

        return binned_intensities

    def compute_embeddings(self, spectra: Iterable[SpectrumType]) -> np.ndarray:
        """Convert spectra into binned embeddings.

        Parameters
        ----------
        spectra : Iterable[SpectrumType]
            The spectra to convert into embeddings.

        Returns
        -------
        np.ndarray
            Array of shape (n_spectra, n_bins) containing the binned
            embeddings. An empty input yields an array of shape (0, n_bins).
        """
        embeddings = [self._bin_spectrum(spectrum) for spectrum in spectra]
        if not embeddings:
            # ``np.vstack`` rejects an empty sequence; return a correctly
            # shaped empty matrix instead.
            return np.zeros((0, self.n_bins), dtype=np.float64)
        return np.vstack(embeddings)