from typing import List, Union
import numpy as np
from sparsestack import StackedSparseArray
from matchms.typing import SpectrumType
from .BaseSimilarity import BaseSimilarity
from .vector_similarity_functions import (cosine_similarity,
cosine_similarity_matrix,
dice_similarity,
dice_similarity_matrix,
jaccard_index,
jaccard_similarity_matrix)
class FingerprintSimilarity(BaseSimilarity):
    """Calculate similarity between molecules based on their fingerprints.

    For this similarity measure to work, fingerprints are expected to be derived
    by running :meth:`~matchms.filtering.add_fingerprint`.

    Code example:

    .. testcode::

        import numpy as np
        from matchms import calculate_scores
        from matchms import Spectrum
        from matchms.filtering import add_fingerprint
        from matchms.similarity import FingerprintSimilarity

        spectrum_1 = Spectrum(mz=np.array([], dtype="float"),
                              intensities=np.array([], dtype="float"),
                              metadata={"smiles": "CCC(C)C(C(=O)O)NC(=O)CCl"})
        spectrum_2 = Spectrum(mz=np.array([], dtype="float"),
                              intensities=np.array([], dtype="float"),
                              metadata={"smiles": "CC(C)C(C(=O)O)NC(=O)CCl"})
        spectrum_3 = Spectrum(mz=np.array([], dtype="float"),
                              intensities=np.array([], dtype="float"),
                              metadata={"smiles": "C(C(=O)O)(NC(=O)O)S"})
        spectrums = [spectrum_1, spectrum_2, spectrum_3]

        # Add fingerprints
        spectrums = [add_fingerprint(x, nbits=256) for x in spectrums]

        # Specify type and calculate similarities
        similarity_measure = FingerprintSimilarity("jaccard")
        scores = calculate_scores(spectrums, spectrums, similarity_measure)
        print(np.round(scores.scores.to_array(), 3))

    Should output

    .. testoutput::

        [[1.    0.878 0.415]
         [0.878 1.    0.444]
         [0.415 0.444 1.   ]]
    """
    # Set key characteristics as class attributes
    is_commutative = True
    # Set output data type, e.g. "float" or [("score", "float"), ("matches", "int")]
    score_datatype = np.float64
    # Names accepted for the ``similarity_measure`` constructor argument.
    _SIMILARITY_MEASURES = ("cosine", "dice", "jaccard")

    def __init__(self, similarity_measure: str = "jaccard",
                 set_empty_scores: Union[float, int, str] = "nan"):
        """
        Parameters
        ----------
        similarity_measure:
            Choose similarity measure from "cosine", "dice", "jaccard".
            The default is "jaccard".
        set_empty_scores:
            Define what should be given instead of a similarity score in cases
            where fingerprints are missing. The default is "nan", which will return
            np.nan's in such cases. A float or int is used as-is; any other value
            leaves those entries at 0.0. Only :meth:`matrix` reads this setting.

        Raises
        ------
        AssertionError
            If ``similarity_measure`` is not one of the supported names.
        """
        self.set_empty_scores = set_empty_scores
        # Raised explicitly (instead of via ``assert``) so the check also runs
        # under ``python -O``; the exception type stays the same for callers.
        if similarity_measure not in self._SIMILARITY_MEASURES:
            raise AssertionError("Unknown similarity measure.")
        self.similarity_measure = similarity_measure

    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Calculate fingerprint based similarity score between two spectra.

        The "fingerprint" entries of both spectra are handed to the selected
        similarity function without checking for missing fingerprints;
        ``set_empty_scores`` is not applied here.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        The similarity score. For "jaccard" and "dice" the value returned by the
        underlying function is passed through unchanged; for "cosine" it is
        converted with ``np.asarray`` to ``score_datatype``.

        Raises
        ------
        NotImplementedError
            If ``self.similarity_measure`` is none of the supported measures.
        """
        fingerprint_ref = reference.get("fingerprint")
        fingerprint_query = query.get("fingerprint")

        if self.similarity_measure == "jaccard":
            return jaccard_index(fingerprint_ref, fingerprint_query)

        if self.similarity_measure == "dice":
            return dice_similarity(fingerprint_ref, fingerprint_query)

        if self.similarity_measure == "cosine":
            score = cosine_similarity(fingerprint_ref, fingerprint_query)
            return np.asarray(score, dtype=self.score_datatype)

        raise NotImplementedError

    @staticmethod
    def _collect_fingerprints(spectrums):
        """Return (fingerprints, indices) of the spectra that have a fingerprint."""
        indices = []
        fingerprints = []
        for index, spectrum in enumerate(spectrums):
            fingerprint = spectrum.get("fingerprint")
            if fingerprint is not None:
                indices.append(index)
                fingerprints.append(fingerprint)
        return np.asarray(fingerprints), np.asarray(indices)

    def _empty_score_matrix(self, n_references: int, n_queries: int) -> np.ndarray:
        """Return a float matrix pre-filled with the placeholder for missing scores."""
        shape = (n_references, n_queries)
        if self.set_empty_scores == "nan":
            return np.full(shape, np.nan, dtype=np.float64)
        if isinstance(self.set_empty_scores, (float, int)):
            return np.full(shape, self.set_empty_scores, dtype=np.float64)
        # Any other setting keeps the zero-initialised matrix.
        return np.zeros(shape)

    def matrix(self, references: List[SpectrumType], queries: List[SpectrumType],
               array_type: str = "numpy",
               is_symmetric: bool = False) -> Union[np.ndarray, StackedSparseArray]:
        """Calculate matrix of fingerprint based similarity scores.

        Scores are computed only between spectra that carry a fingerprint. All
        other entries keep the placeholder defined by ``set_empty_scores``.

        Parameters
        ----------
        references:
            List of reference spectrums.
        queries:
            List of query spectrums.
        array_type
            Specify the output array type. Can be "numpy" or "sparse".
            Default is "numpy" and will return a numpy array. "sparse" will
            return a StackedSparseArray.
        is_symmetric
            Accepted for interface compatibility; this implementation does not
            use it.

        Returns
        -------
        Score matrix of shape ``(len(references), len(queries))`` with dtype
        ``score_datatype``.

        Raises
        ------
        AssertionError
            If none of the references or none of the queries has a fingerprint.
        NotImplementedError
            If ``self.similarity_measure`` or ``array_type`` is not supported.
        """
        fingerprints_ref, idx_ref = self._collect_fingerprints(references)
        fingerprints_query, idx_query = self._collect_fingerprints(queries)
        if idx_ref.size == 0 or idx_query.size == 0:
            # The message used to be an accidental tuple of two strings; it is
            # now one string. The wording itself is kept as it was.
            raise AssertionError(
                "Not enouth molecular fingerprints. "
                "Apply 'add_fingerprint'filter first."
            )

        matrix_functions = {
            "jaccard": jaccard_similarity_matrix,
            "dice": dice_similarity_matrix,
            "cosine": cosine_similarity_matrix,
        }
        if self.similarity_measure not in matrix_functions:
            # Mirror pair(): never hand back a matrix without computed scores.
            raise NotImplementedError
        compute_scores = matrix_functions[self.similarity_measure]

        # Calculate similarity score matrix following specified method and
        # write it into the rows/columns of spectra that have fingerprints.
        similarity_matrix = self._empty_score_matrix(len(references), len(queries))
        similarity_matrix[np.ix_(idx_ref, idx_query)] = compute_scores(fingerprints_ref,
                                                                       fingerprints_query)

        if array_type == "sparse":
            scores_array = StackedSparseArray(len(references), len(queries))
            scores_array.add_dense_matrix(similarity_matrix.astype(self.score_datatype), "")
            return scores_array
        if array_type == "numpy":
            return similarity_matrix.astype(self.score_datatype)
        raise NotImplementedError("Output array type is not yet implemented.")