Source code for matchms.similarity.FingerprintSimilarity

```from typing import List, Union
import numpy
from matchms.typing import SpectrumType
from .BaseSimilarity import BaseSimilarity
from .vector_similarity_functions import (cosine_similarity,
cosine_similarity_matrix,
dice_similarity,
dice_similarity_matrix,
jaccard_index,
jaccard_similarity_matrix)

class FingerprintSimilarity(BaseSimilarity):
    """Calculate similarity between molecules based on their fingerprints.

    For this similarity measure to work, every spectrum is expected to carry a
    molecular fingerprint in its metadata under the key ``"fingerprint"``.
    The example below derives the fingerprints with ``add_fingerprint``.

    Code example:

    .. code-block:: python

        import numpy as np
        from matchms import calculate_scores
        from matchms import Spectrum
        from matchms.filtering import add_fingerprint
        from matchms.similarity import FingerprintSimilarity

        # NOTE: add_fingerprint needs structure information in the metadata.
        # Replace the placeholders by the SMILES of the molecules to compare.
        spectrum_1 = Spectrum(mz=np.array([], dtype="float"),
                              intensities=np.array([], dtype="float"),
                              metadata={"smiles": "<SMILES of molecule 1>"})

        spectrum_2 = Spectrum(mz=np.array([], dtype="float"),
                              intensities=np.array([], dtype="float"),
                              metadata={"smiles": "<SMILES of molecule 2>"})

        spectrum_3 = Spectrum(mz=np.array([], dtype="float"),
                              intensities=np.array([], dtype="float"),
                              metadata={"smiles": "<SMILES of molecule 3>"})

        spectrums = [spectrum_1, spectrum_2, spectrum_3]
        spectrums = [add_fingerprint(x, nbits=256) for x in spectrums]

        # Specify type and calculate similarities
        similarity_measure = FingerprintSimilarity("jaccard")
        scores = calculate_scores(spectrums, spectrums, similarity_measure)
        print(np.round(scores.scores, 3))

    The result is a matrix with one row per reference and one column per
    query. The actual values depend on the molecules; for three spectra
    compared against themselves the output looks like::

        [[1.    0.878 0.415]
         [0.878 1.    0.444]
         [0.415 0.444 1.   ]]

    """
    # Set key characteristics as class attributes
    is_commutative = True
    # Set output data type, e.g.  "float" or [("score", "float"), ("matches", "int")]
    score_datatype = numpy.float64

    def __init__(self, similarity_measure: str = "jaccard",
                 set_empty_scores: Union[float, int, str] = "nan"):
        """

        Parameters
        ----------
        similarity_measure:
            Choose similarity measure from "cosine", "dice", "jaccard".
            The default is "jaccard".
        set_empty_scores:
            Define what should be given instead of a similarity score in cases
            where fingerprints are missing. The default is "nan", which will
            return numpy.nan's in such cases.

        Raises
        ------
        AssertionError
            If ``similarity_measure`` is not one of the supported measures.
        """
        self.set_empty_scores = set_empty_scores
        # Raised explicitly (instead of via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type stays the same.
        if similarity_measure not in ("cosine", "dice", "jaccard"):
            raise AssertionError("Unknown similarity measure.")
        self.similarity_measure = similarity_measure

    def _empty_score(self):
        """Return the fill value used where no fingerprint score is available.

        ``"nan"`` maps to ``numpy.nan``, a number is used as given, and any
        other setting falls back to 0.
        """
        if isinstance(self.set_empty_scores, str) and self.set_empty_scores == "nan":
            return numpy.nan
        if isinstance(self.set_empty_scores, (float, int)):
            return self.set_empty_scores
        return 0.0

    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Calculate fingerprint based similarity score between two spectra.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        The similarity score for the selected measure. If either spectrum has
        no fingerprint, the value configured via ``set_empty_scores`` is
        returned instead.

        Raises
        ------
        NotImplementedError
            If ``self.similarity_measure`` is not a supported measure.
        """
        fingerprint_ref = reference.get("fingerprint")
        fingerprint_query = query.get("fingerprint")

        # Without both fingerprints no score can be computed; behave like
        # matrix() and hand back the configured placeholder value.
        if fingerprint_ref is None or fingerprint_query is None:
            return self.score_datatype(self._empty_score())

        if self.similarity_measure == "jaccard":
            return jaccard_index(fingerprint_ref, fingerprint_query)

        if self.similarity_measure == "dice":
            return dice_similarity(fingerprint_ref, fingerprint_query)

        if self.similarity_measure == "cosine":
            score = cosine_similarity(fingerprint_ref, fingerprint_query)
            return numpy.asarray(score, dtype=self.score_datatype)

        raise NotImplementedError

    def matrix(self, references: List[SpectrumType], queries: List[SpectrumType],
               is_symmetric: bool = False) -> numpy.ndarray:
        """Calculate matrix of fingerprint based similarity scores.

        Parameters
        ----------
        references:
            List of reference spectrums.
        queries:
            List of query spectrums.
        is_symmetric:
            Accepted for interface compatibility; this implementation does not
            make use of it.

        Returns
        -------
        Array of shape ``(len(references), len(queries))``. Entries for which
        a fingerprint is missing hold the value configured via
        ``set_empty_scores``.

        Raises
        ------
        AssertionError
            If none of the references or none of the queries has a fingerprint.
        """
        def collect_fingerprints(spectrums):
            """Collect fingerprints and indices of spectrums with fingerprints."""
            idx_fingerprints = []
            fingerprints = []
            for index, spectrum in enumerate(spectrums):
                fingerprint = spectrum.get("fingerprint")
                if fingerprint is not None:
                    idx_fingerprints.append(index)
                    fingerprints.append(fingerprint)
            return numpy.asarray(fingerprints), numpy.asarray(idx_fingerprints)

        fingerprints1, idx_fingerprints1 = collect_fingerprints(references)
        fingerprints2, idx_fingerprints2 = collect_fingerprints(queries)
        # Explicit raise keeps the check alive under ``python -O``.
        if idx_fingerprints1.size == 0 or idx_fingerprints2.size == 0:
            raise AssertionError("Not enough molecular fingerprints. "
                                 "Apply the 'add_fingerprint' filter first.")

        # Start from a matrix that holds the placeholder value everywhere.
        similarity_matrix = numpy.full((len(references), len(queries)),
                                       self._empty_score(), dtype=float)

        # Calculate similarity score matrix following specified method
        matrix_functions = {
            "jaccard": jaccard_similarity_matrix,
            "dice": dice_similarity_matrix,
            "cosine": cosine_similarity_matrix,
        }
        matrix_function = matrix_functions.get(self.similarity_measure)
        if matrix_function is not None:
            # Only the rows/columns of spectrums that have a fingerprint are
            # overwritten; all other cells keep the placeholder value.
            selection = numpy.ix_(idx_fingerprints1, idx_fingerprints2)
            similarity_matrix[selection] = matrix_function(fingerprints1,
                                                           fingerprints2)
        return similarity_matrix.astype(self.score_datatype)
```