# Source code for matchms.networking.SimilarityNetwork

from typing import Optional
import networkx as nx
import numpy
from matchms import Scores
from .networking_functions import get_top_hits


class SimilarityNetwork:
    """Create a spectral network from spectrum similarities.

    For example

    .. testcode::

        import numpy as np
        from matchms import Spectrum, calculate_scores
        from matchms.similarity import ModifiedCosine
        from matchms.networking import SimilarityNetwork

        spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
                              intensities=np.array([0.7, 0.2, 0.1]),
                              metadata={"precursor_mz": 100.0,
                                        "test_id": "one"})
        spectrum_2 = Spectrum(mz=np.array([104.9, 140, 190.]),
                              intensities=np.array([0.4, 0.2, 0.1]),
                              metadata={"precursor_mz": 105.0,
                                        "test_id": "two"})

        # Use factory to construct a similarity function
        modified_cosine = ModifiedCosine(tolerance=0.2)
        spectrums = [spectrum_1, spectrum_2]
        scores = calculate_scores(spectrums, spectrums, modified_cosine)
        ms_network = SimilarityNetwork(identifier_key="test_id")
        ms_network.create_network(scores)

        nodes = list(ms_network.graph.nodes())
        nodes.sort()
        print(nodes)

    Should output

    .. testoutput::

        ['one', 'two']
    """

    def __init__(self, identifier_key: str = "spectrum_id",
                 top_n: int = 20,
                 max_links: int = 10,
                 score_cutoff: float = 0.7,
                 link_method: str = 'single',
                 keep_unconnected_nodes: bool = True):
        """
        Parameters
        ----------
        identifier_key
            Metadata key for the unique identifier of each spectrum in scores.
            Will also be used for naming the network nodes.
            Default is 'spectrum_id'.
        top_n
            Consider an edge between spectrumA and spectrumB if the score falls
            into top_n for spectrumA or spectrumB (link_method="single"), or
            into top_n for spectrumA and spectrumB (link_method="mutual").
            From those potential links, only max_links will be kept, so top_n
            must be >= max_links.
        max_links
            Maximum number of links to add per node. Default = 10.
            Due to incoming links, the total number of links per node can be
            higher. The links are populated by looping over the query
            spectrums.
            Important side note: The max_links restriction is strict, which
            means that if scores around max_links are equal still only
            max_links will be added. This can result in some random variations
            (sorting spectra with equal scores results in a random order of
            such elements).
        score_cutoff
            Threshold for given similarities. Edges/Links will only be made
            for similarities >= score_cutoff. Default = 0.7.
        link_method
            Choose between 'single' and 'mutual'. 'single' will add all links
            based on individual nodes. 'mutual' will only add links if that
            link appears in the given top-n list for both nodes.
        keep_unconnected_nodes
            If set to True (default) all spectra will be included as nodes
            even if they have no connections/edges to other spectra. If set
            to False all nodes without connections will be removed.
        """
        # pylint: disable=too-many-arguments
        self.identifier_key = identifier_key
        self.top_n = top_n
        self.max_links = max_links
        self.score_cutoff = score_cutoff
        self.link_method = link_method
        self.keep_unconnected_nodes = keep_unconnected_nodes
        self.graph: Optional["nx.Graph"] = None
        """NetworkX graph. Set after calling create_network()"""

    @staticmethod
    def _select_edge_score(similars_scores: dict, scores_type: numpy.dtype):
        """Choose one value if score contains multiple values (e.g. "score" and "matches").

        Parameters
        ----------
        similars_scores
            Dictionary mapping a spectrum identifier to the array of scores of
            its top candidates.
        scores_type
            Numpy dtype of the scores. For a structured dtype the "score"
            field is selected if present, otherwise the first field.

        Returns
        -------
        dict
            ``similars_scores`` itself for unstructured scores, otherwise a
            new dictionary holding one plain score array per key.
        """
        field_names = scores_type.names
        if not field_names:
            # Unstructured dtype: the values already are plain score arrays.
            return similars_scores
        # Select a *field* by name. Indexing with [0] would instead return the
        # first record of the array, which cannot be compared to a cutoff.
        field = "score" if "score" in field_names else field_names[0]
        return {key: value[field] for key, value in similars_scores.items()}

    def create_network(self, scores: "Scores"):
        """
        Function to create network from given top-n similarity values. Expects
        that similarities given in scores are from an all-vs-all comparison
        including all possible pairs.

        Parameters
        ----------
        scores
            Matchms Scores object containing all spectrums and pair
            similarities for generating a network.

        Raises
        ------
        AssertionError
            If top_n < max_links, or if queries and references of the given
            scores differ.
        ValueError
            If link_method is neither 'single' nor 'mutual'.
        """
        # Explicit raises instead of ``assert`` statements keep the validation
        # active under ``python -O``; the exception type is unchanged.
        if not self.top_n >= self.max_links:
            raise AssertionError("top_n must be >= max_links")
        # Reject an unknown link method before doing any expensive work.
        if self.link_method not in ("single", "mutual"):
            raise ValueError("Link method not kown")
        if not numpy.all(scores.queries == scores.references):
            raise AssertionError("Expected symmetric scores object with queries==references")

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the node order does not depend on set iteration order.
        unique_ids = list(dict.fromkeys(s.get(self.identifier_key) for s in scores.queries))

        # Initialize network graph, add nodes
        msnet = nx.Graph()
        msnet.add_nodes_from(unique_ids)

        # Collect location and score of the highest scoring candidates for each query
        similars_idx, similars_scores = get_top_hits(scores,
                                                     identifier_key=self.identifier_key,
                                                     top_n=self.top_n,
                                                     search_by="queries",
                                                     ignore_diagonal=True)
        similars_scores = self._select_edge_score(similars_scores, scores.scores.dtype)

        # Add edges based on global threshold (cutoff) for weights
        mutual_only = self.link_method == "mutual"
        for i, spec in enumerate(scores.queries):
            query_id = spec.get(self.identifier_key)
            query_scores = similars_scores[query_id]
            ref_candidates = numpy.array([scores.references[x].get(self.identifier_key)
                                          for x in similars_idx[query_id]])
            # Candidates passing the cutoff (self-links excluded), capped at max_links.
            idx = numpy.where((query_scores >= self.score_cutoff)
                              & (ref_candidates != query_id))[0][:self.max_links]
            if mutual_only:
                # Keep a link only if this query is also among the candidate's top hits.
                idx = [x for x in idx if i in similars_idx[ref_candidates[x]]]
            # str()/float() turn numpy scalars into plain Python values
            # (assumes identifiers are strings -- TODO confirm).
            new_edges = [(query_id, str(ref_candidates[x]), float(query_scores[x]))
                         for x in idx]
            msnet.add_weighted_edges_from(new_edges)

        if not self.keep_unconnected_nodes:
            msnet.remove_nodes_from(list(nx.isolates(msnet)))
        self.graph = msnet

    def export_to_graphml(self, filename: str):
        """Save the network as .graphml file.

        Parameters
        ----------
        filename
            Specify filename for exporting the graph.

        Raises
        ------
        ValueError
            If no network has been created yet.
        """
        # Compare against None: a graph without nodes is a valid (empty)
        # network and must not be reported as missing.
        if self.graph is None:
            raise ValueError("No network found. Make sure to first run .create_network() step")
        nx.write_graphml_lxml(self.graph, filename)