Source code for matchms.networking.networking_functions

""" Helper functions to build and handle spectral networks
"""
from typing import Tuple
import numpy as np
from matchms import Scores


def get_top_hits(scores: Scores, identifier_key: str = "spectrum_id",
                 top_n: int = 25, search_by: str = "queries",
                 score_name: str = None,
                 ignore_diagonal: bool = False) -> Tuple[dict, dict]:
    """Get top_n highest scores (and indices) for every entry.

    Parameters
    ----------
    scores
        Matchms Scores object containing all similarities.
    identifier_key
        Metadata key for unique identifier for each spectrum in scores.
        Will also be used for the naming the network nodes. Default is
        'spectrum_id'.
    top_n
        Return the indexes and scores for the top_n highest scores. Scores
        between a spectrum and itself (diagonal of scores.scores) are only
        left out when ignore_diagonal is set to True.
    search_by
        Choose between 'queries' or 'references' which decides if the top_n
        matches for every spectrum in scores.queries or in scores.references
        will be collected and returned.
    score_name
        Name of the score that should be used (if scores contains multiple
        different scores). If None (default), the name is obtained from
        ``scores._scores.guess_score_name()``.
    ignore_diagonal
        Set to True if scores.scores is symmetric (i.e. if references and
        queries were the same) and if scores between spectra with themselves
        should be excluded.

    Returns
    -------
    Two dictionaries, both keyed by the identifier_key value of each
    spectrum that was searched by: the first holds the indices of the top
    hits, the second holds the corresponding scores.

    Raises
    ------
    AssertionError
        If search_by is neither 'queries' nor 'references'.
    """
    # pylint: disable=protected-access, too-many-arguments
    if search_by not in ("queries", "references"):
        # Raised explicitly (instead of using an ``assert`` statement) so the
        # check is not stripped when Python runs with -O. The exception type
        # is kept as AssertionError for callers that already catch it.
        raise AssertionError("search_by must be 'queries' or 'references'")

    if score_name is None:
        score_name = scores._scores.guess_score_name()

    if search_by == "queries":
        return get_top_hits_by_query(scores, identifier_key, top_n,
                                     score_name, ignore_diagonal)
    return get_top_hits_by_references(scores, identifier_key, top_n,
                                      score_name, ignore_diagonal)
def get_top_hits_by_references(scores: Scores, identifier_key: str, top_n: int,
                               score_name: str,
                               ignore_diagonal: bool) -> Tuple[dict, dict]:
    """Get the top hits from the scoring by "references".

    This function differs only slightly from the one by query: it walks over
    scores.references and ranks the scores along the other axis.

    Args:
        scores (Scores): Scores from which to retrieve the references.
        identifier_key (str): Key to use as identifier for the spectra.
        top_n (int): N for the top N to receive.
        score_name (str): Score name to retrieve the top hits from.
        ignore_diagonal (bool): Whether to ignore self matches on the
            diagonal.

    Returns:
        dict, dict: Dictionaries of indices and scores, both keyed by the
            identifier_key value of each reference spectrum. Hits are
            ordered from highest to lowest score.
    """
    similars_idx = {}
    similars_scores = {}
    for i, spec in enumerate(scores.references):
        spec_id = spec.get(identifier_key)
        _, c, v = scores.scores[i, :, score_name]
        # Rank every score for this reference from high to low.
        idx = np.argsort(v)[::-1]
        if ignore_diagonal:
            # Drop the self match BEFORE truncating, otherwise a self match
            # inside the top_n would leave only top_n - 1 hits.
            idx = idx[c[idx] != i]
        idx = idx[:top_n]
        similars_idx[spec_id] = c[idx]
        similars_scores[spec_id] = v[idx]
    return similars_idx, similars_scores
def get_top_hits_by_query(scores: Scores, identifier_key: str, top_n: int,
                          score_name: str,
                          ignore_diagonal: bool) -> Tuple[dict, dict]:
    """Collect the best-scoring hits for each of the "query" spectra.

    Args:
        scores (Scores): Scores object holding the similarities to rank.
        identifier_key (str): Metadata key whose value names each query in
            the returned dictionaries.
        top_n (int): Maximum number of hits to keep per query.
        score_name (str): Name of the score to rank by.
        ignore_diagonal (bool): If True, the hit whose index equals the
            position of the query itself is left out.

    Returns:
        dict, dict: Dictionaries of indices and scores, both keyed by the
            identifier_key value of each query spectrum. Hits are ordered
            from highest to lowest score.
    """
    hit_indices = {}
    hit_scores = {}
    for position, query in enumerate(scores.queries):
        name = query.get(identifier_key)
        rows, _, values = scores.scores[:, position, score_name]
        # argsort is ascending, so reverse it to rank the best score first.
        ranking = np.argsort(values)[::-1]
        if ignore_diagonal:
            not_self = rows[ranking] != position
            ranking = ranking[not_self]
        best = ranking[:top_n]
        hit_indices[name] = rows[best]
        hit_scores[name] = values[best]
    return hit_indices, hit_scores