Source code for matchms.networking.SimilarityNetwork

import json
from typing import Optional
import networkx as nx
import numpy as np
from matchms import Scores
from .networking_functions import get_top_hits


class SimilarityNetwork:
    """Create a spectral network from spectrum similarities.

    For example

    .. testcode::

        import numpy as np
        from matchms import Spectrum, calculate_scores
        from matchms.similarity import ModifiedCosine
        from matchms.networking import SimilarityNetwork

        spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
                              intensities=np.array([0.7, 0.2, 0.1]),
                              metadata={"precursor_mz": 100.0,
                                        "test_id": "one"})
        spectrum_2 = Spectrum(mz=np.array([104.9, 140, 190.]),
                              intensities=np.array([0.4, 0.2, 0.1]),
                              metadata={"precursor_mz": 105.0,
                                        "test_id": "two"})

        # Use factory to construct a similarity function
        modified_cosine = ModifiedCosine(tolerance=0.2)
        spectrums = [spectrum_1, spectrum_2]
        scores = calculate_scores(spectrums, spectrums, modified_cosine)
        ms_network = SimilarityNetwork(identifier_key="test_id")
        ms_network.create_network(scores, score_name="ModifiedCosine_score")

        nodes = list(ms_network.graph.nodes())
        nodes.sort()
        print(nodes)

    Should output

    .. testoutput::

        ['one', 'two']

    """

    def __init__(self, identifier_key: str = "spectrum_id",
                 top_n: int = 20,
                 max_links: int = 10,
                 score_cutoff: float = 0.7,
                 link_method: str = 'single',
                 keep_unconnected_nodes: bool = True):
        """
        Parameters
        ----------
        identifier_key
            Metadata key for unique identifier for each spectrum in scores.
            Will also be used for naming the network nodes. Default is
            'spectrum_id'.
        top_n
            Consider edge between spectrumA and spectrumB if score falls into
            top_n for spectrumA or spectrumB (link_method="single"), or into
            top_n for spectrumA and spectrumB (link_method="mutual"). From
            those potential links, only max_links will be kept, so top_n must
            be >= max_links.
        max_links
            Maximum number of links to add per node. Default = 10.
            Due to incoming links, total number of links per node can be
            higher. The links are populated by looping over the query
            spectrums.
            Important side note: The max_links restriction is strict, which
            means that if scores around max_links are equal still only
            max_links will be added, which can result in some random
            variations (sorting spectra with equal scores results in a random
            order of such elements).
        score_cutoff
            Threshold for given similarities. Edges/Links will only be made
            for similarities >= score_cutoff. Default = 0.7.
        link_method
            Choose between 'single' and 'mutual'. 'single' will add all links
            based on individual nodes. 'mutual' will only add links if that
            link appears in the given top-n list for both nodes.
        keep_unconnected_nodes
            If set to True (default) all spectra will be included as nodes
            even if they have no connections/edges to other spectra. If set
            to False all nodes without connections will be removed.
        """
        # pylint: disable=too-many-arguments
        self.identifier_key = identifier_key
        self.top_n = top_n
        self.max_links = max_links
        self.score_cutoff = score_cutoff
        self.link_method = link_method
        self.keep_unconnected_nodes = keep_unconnected_nodes
        self.graph: Optional[nx.Graph] = None
        """NetworkX graph. Set after calling create_network()"""

    def create_network(self, scores: "Scores", score_name: Optional[str] = None):
        """
        Function to create network from given top-n similarity values.
        Expects that similarities given in scores are from an all-vs-all
        comparison including all possible pairs.

        The resulting graph is stored in ``self.graph``.

        Parameters
        ----------
        scores
            Matchms Scores object containing all spectrums and pair
            similarities for generating a network.
        score_name
            Name of the score to build the network from. If None (default),
            the name is obtained from ``scores.scores.guess_score_name()``.

        Raises
        ------
        AssertionError
            If ``top_n < max_links`` or if queries and references of the
            scores object differ.
        ValueError
            If ``link_method`` is neither 'single' nor 'mutual'.
        """
        if score_name is None:
            score_name = scores.scores.guess_score_name()
        assert self.top_n >= self.max_links, "top_n must be >= max_links"
        assert np.all(scores.queries == scores.references), \
            "Expected symmetric scores object with queries==references"
        # Fail fast: reject an unknown link method before the (potentially
        # expensive) top-hit search, and also when there are no queries.
        if self.link_method not in ("single", "mutual"):
            raise ValueError("Link method not kown")
        mutual_only = self.link_method == "mutual"

        unique_ids = list({s.get(self.identifier_key) for s in scores.queries})

        # Initialize network graph, add nodes
        msnet = nx.Graph()
        msnet.add_nodes_from(unique_ids)

        # Collect location and score of highest scoring candidates per query
        similars_idx, similars_scores = get_top_hits(scores,
                                                     identifier_key=self.identifier_key,
                                                     top_n=self.top_n,
                                                     search_by="queries",
                                                     score_name=score_name,
                                                     ignore_diagonal=True)

        # Add edges based on global threshold (cutoff) for weights
        for i, spec in enumerate(scores.queries):
            query_id = spec.get(self.identifier_key)
            query_scores = similars_scores[query_id]
            ref_candidates = np.array([scores.references[x].get(self.identifier_key)
                                       for x in similars_idx[query_id]])
            # Keep candidates that reach the cutoff and are not the query
            # itself; the cap at max_links is applied before the mutual filter.
            idx = np.where((query_scores >= self.score_cutoff)
                           & (ref_candidates != query_id))[0][:self.max_links]
            # NOTE(review): target ids are coerced to str while query_id is
            # used as-is; with non-string identifiers the edge endpoints may
            # not match the node labels added above -- confirm identifiers
            # are always strings.
            # In "mutual" mode a link is only kept if the query's index also
            # occurs among the top hits listed for the candidate.
            new_edges = [(query_id, str(ref_candidates[x]), float(query_scores[x]))
                         for x in idx
                         if not mutual_only or i in similars_idx[ref_candidates[x]]]

            msnet.add_weighted_edges_from(new_edges)

        if not self.keep_unconnected_nodes:
            msnet.remove_nodes_from(list(nx.isolates(msnet)))
        self.graph = msnet

    def export_to_file(self, filename: str, graph_format: str = "graphml"):
        """
        Save the network to a file with chosen format.

        Parameters
        ----------
        filename
            Path to file to write to.
        graph_format
            Format, in which to store the network graph. Supported formats are:
            "cyjs", "gexf", "gml", "graphml", "json". Default is "graphml".

        Raises
        ------
        ValueError
            If no network has been created yet.
        AssertionError
            If graph_format is not one of the supported formats.
        """
        # Compare against None explicitly: a graph without nodes is falsy
        # (its length is 0) but is still a valid, created network.
        if self.graph is None:
            raise ValueError("No network found. Make sure to first run .create_network() step")

        writer = self._generate_writer(graph_format)
        writer(filename)

    def _generate_writer(self, graph_format: str):
        """Return the export method registered for graph_format."""
        writer = {"cyjs": self._export_to_cyjs,
                  "gexf": self._export_to_gexf,
                  "gml": self._export_to_gml,
                  "graphml": self.export_to_graphml,
                  "json": self._export_to_node_link_json}

        assert graph_format in writer, "Format not supported.\n" \
            "Please use one of supported formats: 'cyjs', 'gexf', 'gml', 'graphml', 'json'"
        return writer[graph_format]

    def export_to_graphml(self, filename: str):
        """Save the network as .graphml file.

        Parameters
        ----------
        filename
            Specify filename for exporting the graph.

        """
        nx.write_graphml_lxml(self.graph, filename)

    def _export_to_cyjs(self, filename: str):
        """Save the network in cyjs format."""
        graph = nx.cytoscape_data(self.graph)
        return self._write_to_json(graph, filename)

    def _export_to_node_link_json(self, filename: str):
        """Save the network in node-link format."""
        graph = nx.node_link_data(self.graph)
        return self._write_to_json(graph, filename)

    @staticmethod
    def _write_to_json(graph: dict, filename: str):
        """Save the network as JSON file.

        Parameters
        ----------
        graph
            JSON-dictionary type graph to save.
        filename
            Specify filename for exporting the graph.

        """
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(graph, file)

    def _export_to_gexf(self, filename: str):
        """Save the network as .gexf file."""
        nx.write_gexf(self.graph, filename)

    def _export_to_gml(self, filename: str):
        """Save the network as .gml file."""
        nx.write_gml(self.graph, filename)