# Source code for matchms.similarity.MetadataMatch
import logging
from typing import List
import numpy as np
from sparsestack import StackedSparseArray
from matchms.similarity.spectrum_similarity_functions import (
number_matching, number_matching_symmetric)
from matchms.typing import SpectrumType
from .BaseSimilarity import BaseSimilarity
logger = logging.getLogger("matchms")
class MetadataMatch(BaseSimilarity):
    """Return True if metadata entries of a specified field match between two spectra.

    This is supposed to be used to compare a wide range of possible metadata entries and
    use this to later select related or similar spectra.

    Example to calculate scores between 2 pairs of spectrums and iterate over the scores

    .. testcode::

        import numpy as np
        from matchms import calculate_scores
        from matchms import Spectrum
        from matchms.similarity import MetadataMatch

        spectrum_1 = Spectrum(mz=np.array([]),
                              intensities=np.array([]),
                              metadata={"instrument_type": "orbitrap",
                                        "id": 1})
        spectrum_2 = Spectrum(mz=np.array([]),
                              intensities=np.array([]),
                              metadata={"instrument_type": "qtof",
                                        "id": 2})
        spectrum_3 = Spectrum(mz=np.array([]),
                              intensities=np.array([]),
                              metadata={"instrument_type": "qtof",
                                        "id": 3})
        spectrum_4 = Spectrum(mz=np.array([]),
                              intensities=np.array([]),
                              metadata={"instrument_type": "orbitrap",
                                        "id": 4})
        references = [spectrum_1, spectrum_2]
        queries = [spectrum_3, spectrum_4]

        similarity_score = MetadataMatch(field="instrument_type")
        scores = calculate_scores(references, queries, similarity_score)

        for (reference, query, score) in scores:
            print(f"Metadata match between {reference.get('id')} and {query.get('id')}" +
                  f" is {score}")

    Should output

    .. testoutput::

        Metadata match between 1 and 4 is [True]
        Metadata match between 2 and 3 is [True]

    """
    # Set key characteristics as class attributes
    is_commutative = True
    score_datatype = bool

    def __init__(self, field: str,
                 matching_type: str = "equal_match",
                 tolerance: float = 0.1):
        """
        Parameters
        ----------
        field
            Specify field name for metadata that should be compared.
        matching_type
            Specify how field entries should be matched. Can be one of
            ["equal_match", "difference"].
        tolerance
            Specify tolerance below which two values are counted as match.
            This is only applied to numerical values (matching_type "difference").

        Raises
        ------
        AssertionError
            If *matching_type* is not one of the supported values.
        """
        self.field = field
        self.tolerance = tolerance
        # Kept as an assert (rather than raising ValueError) so that callers
        # relying on AssertionError keep working.
        assert matching_type in ["equal_match", "difference"], \
            "Expected type from ['equal_match', 'difference']"
        self.matching_type = matching_type

    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Compare the metadata entry of ``self.field`` between reference and query spectrum.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        A 0-dimensional numpy array of dtype ``score_datatype`` (bool). It is
        False whenever one of the two entries is missing. For "equal_match" it
        is the result of ``==`` on the two entries; for "difference" it is True
        when both entries are int/float and differ by at most ``tolerance``.
        """
        entry_ref = reference.get(self.field)
        entry_query = query.get(self.field)
        if entry_ref is None or entry_query is None:
            return np.asarray(False, dtype=self.score_datatype)

        if self.matching_type == "equal_match":
            score = (entry_ref == entry_query)
            return np.asarray(score, dtype=self.score_datatype)

        if isinstance(entry_ref, (int, float)) and isinstance(entry_query, (int, float)):
            score = abs(entry_ref - entry_query) <= self.tolerance
            return np.asarray(score, dtype=self.score_datatype)

        logger.warning("Non-numerical entry not compatible with 'difference' method")
        return np.asarray(False, dtype=self.score_datatype)

    def matrix(self, references: List[SpectrumType], queries: List[SpectrumType],
               array_type: str = "numpy",
               is_symmetric: bool = False) -> np.ndarray:
        """Compare the metadata entries of ``self.field`` between all references and queries.

        Parameters
        ----------
        references
            List/array of reference spectrums.
        queries
            List/array of Single query spectrums.
        array_type
            Specify the output array type. Can be "numpy" or "sparse".
            Default is "numpy" and will return a numpy array. "sparse" will return a
            StackedSparseArray. Only used for matching_type "difference"; "equal_match"
            always returns a dense numpy array.
        is_symmetric
            Set to True when *references* and *queries* are identical (as for instance for an all-vs-all
            comparison). By using the fact that score[i,j] = score[j,i] the calculation will be about
            2x faster. Only used for matching_type "difference".

        Returns
        -------
        Boolean score matrix of shape (len(references), len(queries)). Missing
        entries (and, for "difference", non-numerical entries) are logged and
        never count as a match.

        Raises
        ------
        ValueError
            If matching_type is "difference" and *array_type* is neither
            "numpy" nor "sparse".
        """
        entries_ref = self._collect_entries(references)
        entries_query = self._collect_entries(queries)

        if self.matching_type == "equal_match":
            return self._equal_match_matrix(entries_ref, entries_query)

        # Validate before doing the (potentially expensive) matching.
        if array_type not in ("numpy", "sparse"):
            raise ValueError(
                f"Unknown array_type '{array_type}'. Expected 'numpy' or 'sparse'.")

        # Unusable entries become NaN: any comparison with NaN is False, so
        # they can never fall within the tolerance.
        numbers_ref = np.asarray([np.nan if entry is None else entry
                                  for entry in entries_ref])
        numbers_query = np.asarray([np.nan if entry is None else entry
                                    for entry in entries_query])

        if is_symmetric:
            rows, cols, scores = number_matching_symmetric(numbers_ref,
                                                           self.tolerance)
        else:
            rows, cols, scores = number_matching(numbers_ref, numbers_query,
                                                 self.tolerance)

        shape = (len(entries_ref), len(entries_query))
        if array_type == "numpy":
            # Allocate with the score dtype so the dense result is boolean,
            # consistent with the "equal_match" branch and ``score_datatype``.
            scores_array = np.zeros(shape, dtype=self.score_datatype)
            scores_array[rows, cols] = scores.astype(self.score_datatype)
            return scores_array

        scores_array = StackedSparseArray(*shape)
        scores_array.add_sparse_data(rows, cols, scores.astype(self.score_datatype), "")
        return scores_array

    def _collect_entries(self, spectrums: List[SpectrumType]) -> list:
        """Return the ``self.field`` entry per spectrum, with None for unusable entries.

        An entry is unusable when it is missing, or when matching_type is
        "difference" and the entry is not an int/float. Each unusable entry
        is reported through a logger warning.
        """
        entries = []
        for spectrum in spectrums:
            entry = spectrum.get(self.field)
            if entry is None:
                msg = f"No {self.field} entry found for spectrum."
                logger.warning(msg)
            elif self.matching_type == "difference" and not isinstance(entry, (int, float)):
                msg = f"Non-numerical entry ({entry}) not compatible with 'difference' method."
                logger.warning(msg)
                entry = None
            entries.append(entry)
        return entries

    def _equal_match_matrix(self, entries_ref: list, entries_query: list) -> np.ndarray:
        """Return the all-vs-all ``==`` matrix; None (missing) entries never match."""
        values_ref = self._as_object_array(entries_ref)
        values_query = self._as_object_array(entries_query)
        valid_ref = np.array([entry is not None for entry in entries_ref], dtype=bool)
        valid_query = np.array([entry is not None for entry in entries_query], dtype=bool)

        # Object dtype keeps every entry as its original Python object, so the
        # comparison is a plain ``==`` per pair (as in ``pair``) instead of a
        # comparison of values coerced to a common (e.g. string) dtype.
        equal = values_ref[:, np.newaxis] == values_query[np.newaxis, :]
        valid = valid_ref[:, np.newaxis] & valid_query[np.newaxis, :]
        scores = np.asarray(equal, dtype=bool) & valid
        return scores.astype(self.score_datatype)

    @staticmethod
    def _as_object_array(entries: list) -> np.ndarray:
        """Return a 1-D object array holding *entries* without any dtype coercion."""
        array = np.empty(len(entries), dtype=object)
        # Element-wise assignment so sequence-like entries are stored as single
        # objects rather than being broadcast by numpy.
        for i, entry in enumerate(entries):
            array[i] = entry
        return array