Source code for matchms.similarity.BaseSimilarity

from abc import abstractmethod
from typing import List
import numpy as np
from sparsestack import StackedSparseArray
from matchms.typing import SpectrumType

class BaseSimilarity:
    """Similarity function base class.

    When building a custom similarity measure, inherit from this class and
    implement the desired methods. At minimum :meth:`pair` must be provided;
    :meth:`matrix` and :meth:`sparse_array` fall back to naive loops over
    :meth:`pair` unless they are overridden with something faster.

    Attributes
    ----------
    is_commutative
        Whether similarity function is commutative, which means that the order
        of spectrums does not matter (similarity(A, B) == similarity(B, A)).
        Default is True.
    score_datatype
        Output data type of the scores, e.g. "float" or
        [("score", "float"), ("matches", "int")]. Default is ``np.float64``.
    """

    # Set key characteristics as class attributes
    is_commutative = True
    # Set output data type, e.g. "float" or [("score", "float"), ("matches", "int")]
    score_datatype = np.float64

    @abstractmethod
    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Method to calculate the similarity for one input pair.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        Score as numpy array (using self.score_datatype). For instance
        returning ``np.asarray(score, dtype=self.score_datatype)``.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError

    def matrix(self, references: List[SpectrumType], queries: List[SpectrumType],
               array_type: str = "numpy",
               is_symmetric: bool = False) -> np.ndarray:
        """Optional: Provide optimized method to calculate an np.array of
        similarity scores for given reference and query spectrums.

        If no method is added here, the following naive implementation
        (i.e. a double for-loop) is used. Only scores accepted by
        :meth:`keep_score` are stored; all other positions stay zero (numpy)
        or are simply absent (sparse).

        Parameters
        ----------
        references
            List of reference objects
        queries
            List of query objects
        array_type
            Specify the output array type. Can be "numpy" or "sparse".
            Default is "numpy" and will return a numpy array. "sparse" will
            return a COO-sparse array.
        is_symmetric
            Set to True when *references* and *queries* are identical (as for
            instance for an all-vs-all comparison). By using the fact that
            score[i,j] = score[j,i] the calculation will be about 2x faster.
            The shortcut is only taken when ``self.is_commutative`` is True.
            Each computed score is also written to the mirrored position
            (j, i), so both inputs are expected to have the same length.

        Returns
        -------
        A ``(len(references), len(queries))`` numpy array of dtype
        ``self.score_datatype`` for ``array_type="numpy"``, or a
        ``StackedSparseArray`` of that shape for ``array_type="sparse"``.

        Raises
        ------
        ValueError
            If *array_type* is neither "numpy" nor "sparse".
        """
        # pylint: disable=too-many-locals
        # Fail fast: do not compute every pair only to reject the format afterwards.
        if array_type not in ("numpy", "sparse"):
            raise ValueError("array_type must be 'numpy' or 'sparse'.")

        n_rows = len(references)
        n_cols = len(queries)
        idx_row = []
        idx_col = []
        scores = []

        if is_symmetric and self.is_commutative:
            # Only the upper triangle (including the diagonal) is computed.
            for i_ref, reference in enumerate(references):
                for i_query, query in enumerate(queries[i_ref:], start=i_ref):
                    score = self.pair(reference, query)
                    if not self.keep_score(score):
                        continue
                    idx_row.append(i_ref)
                    idx_col.append(i_query)
                    scores.append(score)
                    # Mirror off-diagonal entries only; mirroring the diagonal
                    # would store the same coordinate twice.
                    if i_query != i_ref:
                        idx_row.append(i_query)
                        idx_col.append(i_ref)
                        scores.append(score)
        else:
            for i_ref, reference in enumerate(references):
                for i_query, query in enumerate(queries):
                    score = self.pair(reference, query)
                    if self.keep_score(score):
                        idx_row.append(i_ref)
                        idx_col.append(i_query)
                        scores.append(score)

        # An explicit integer dtype is required: np.array([]) is float64, which
        # numpy rejects as an index array when no score was kept.
        idx_row = np.array(idx_row, dtype=int)
        idx_col = np.array(idx_col, dtype=int)
        scores_data = np.array(scores, dtype=self.score_datatype)

        # TODO: make StackedSparseArray the default and add fixed function to
        # output different formats (with code below)
        if array_type == "numpy":
            scores_array = np.zeros(shape=(n_rows, n_cols), dtype=self.score_datatype)
            scores_array[idx_row, idx_col] = scores_data.reshape(-1)
            return scores_array

        # array_type == "sparse" (validated above)
        scores_array = StackedSparseArray(n_rows, n_cols)
        scores_array.add_sparse_data(idx_row, idx_col, scores_data, "")
        return scores_array

    def sparse_array(self, references: List[SpectrumType], queries: List[SpectrumType],
                     idx_row, idx_col, is_symmetric: bool = False):
        """Optional: Provide optimized method to calculate an sparse matrix of
        similarity scores.

        Compute similarity scores for pairs of reference and query spectrums as
        given by the indices idx_row (references) and idx_col (queries). If no
        method is added here, the following naive implementation (i.e. a
        for-loop) is used.

        Parameters
        ----------
        references
            List of reference objects
        queries
            List of query objects
        idx_row
            List/array of row indices
        idx_col
            List/array of column indices
        is_symmetric
            Set to True when *references* and *queries* are identical (as for
            instance for an all-vs-all comparison). Currently unused by this
            naive implementation.

        Returns
        -------
        1-D numpy array of dtype ``self.score_datatype`` holding one score per
        (idx_row[i], idx_col[i]) pair, in input order.
        """
        # pylint: disable=too-many-arguments
        # TODO: consider implementing faster method for symmetric cases
        # Coerce so that plain lists (as documented) work as well as arrays.
        idx_row = np.asarray(idx_row)
        idx_col = np.asarray(idx_col)
        assert idx_row.shape == idx_col.shape, "col and row indices must be of same shape"

        scores = np.zeros((len(idx_row)), dtype=self.score_datatype)  # TODO: switch to sparse matrix
        for i, (row, col) in enumerate(zip(idx_row, idx_col)):
            scores[i] = self.pair(references[row], queries[col])
        return scores

    def keep_score(self, score):
        """In the `.matrix` method scores will be collected in a sparse way.
        Overwrite this method here if values other than `False` or `0` should
        not be stored in the final collection.

        Parameters
        ----------
        score
            Single score as returned by :meth:`pair`. Plain Python numbers are
            accepted as well and converted with ``np.asarray``.

        Returns
        -------
        Truthy value if the score should be stored. For structured scores
        (more than one field) every field must be non-zero; otherwise the
        score itself must be non-zero.
        """
        score = np.asarray(score)
        if len(score.dtype) > 1:  # if structured array
            return all(score[dtype_name] != 0 for dtype_name in score.dtype.names)
        return score != 0

    def to_dict(self) -> dict:
        """Return a dictionary representation of a similarity function.

        The result contains the class name under the key ``"__Similarity__"``
        followed by all instance attributes from ``self.__dict__``.
        """
        return {"__Similarity__": self.__class__.__name__,
                **self.__dict__}