Source code for matchms.networking.SimilarityNetwork

import json
from collections.abc import Sequence
import networkx as nx
import numpy as np
from matchms import Scores
from .networking_functions import get_top_hits


[docs] class SimilarityNetwork: """Create a similarity network from all-vs-all spectrum similarities. For example .. testcode:: import numpy as np from matchms import Spectrum, calculate_scores from matchms.similarity import ModifiedCosineGreedy from matchms.networking import SimilarityNetwork spectrum_1 = Spectrum( mz=np.array([100, 150, 200.0]), intensities=np.array([0.7, 0.2, 0.1]), metadata={"precursor_mz": 100.0, "test_id": "one"}, ) spectrum_2 = Spectrum( mz=np.array([104.9, 140, 190.0]), intensities=np.array([0.4, 0.2, 0.1]), metadata={"precursor_mz": 105.0, "test_id": "two"}, ) modified_cosine = ModifiedCosineGreedy(tolerance=0.2) spectra = [spectrum_1, spectrum_2] scores = calculate_scores(spectra, spectra, modified_cosine) identifiers = [s.get("test_id") for s in spectra] ms_network = SimilarityNetwork() ms_network.create_network(scores, identifiers=identifiers, score_name="score") nodes = list(ms_network.graph.nodes()) nodes.sort() print(nodes) Should output .. testoutput:: ['one', 'two'] """
[docs] def __init__( self, top_n: int = 20, max_links: int = 10, score_cutoff: float = 0.7, link_method: str = "single", keep_unconnected_nodes: bool = True, ): """ Parameters ---------- top_n Consider an edge between node A and node B if the score falls into the top_n hits of A or B (``link_method="single"``), or into the top_n hits of both A and B (``link_method="mutual"``). From those potential links, only ``max_links`` are kept per node, so ``top_n`` must be >= ``max_links``. max_links Maximum number of outgoing links to add per node. Default is 10. Due to incoming links, total degree can be higher. score_cutoff Threshold for similarities. Edges are only created for similarities >= ``score_cutoff``. link_method Choose between ``"single"`` and ``"mutual"``. - ``"single"`` adds all eligible top-k links. - ``"mutual"`` only adds a link if both nodes rank each other within their respective top-k lists. keep_unconnected_nodes If True (default), all identifiers are included as nodes even if they have no edges. If False, isolated nodes are removed. """ self.top_n = top_n self.max_links = max_links self.score_cutoff = score_cutoff self.link_method = link_method self.keep_unconnected_nodes = keep_unconnected_nodes self.graph: nx.Graph | None = None
[docs] def create_network( self, scores: Scores, identifiers: Sequence[str], score_name: str | None = None, ) -> None: """Create a similarity network from a square all-vs-all Scores object. Parameters ---------- scores Matchms Scores object containing all-vs-all similarities. The score matrix must be square. identifiers Node identifiers corresponding to the rows/columns of the score matrix. Must have length equal to ``scores.shape[0]``. score_name Name of the score field to use. If None: - scalar Scores: the only field is used - multi-field Scores: ``"score"`` is used if present """ if self.top_n < self.max_links: raise ValueError("top_n must be >= max_links.") if self.link_method not in {"single", "mutual"}: raise ValueError("link_method must be either 'single' or 'mutual'.") n_rows, n_cols = scores.shape if n_rows != n_cols: raise TypeError("Expected square all-vs-all scores for network creation.") if len(identifiers) != n_rows: raise ValueError( f"identifiers must have length {n_rows}, but got {len(identifiers)}." ) if len(set(identifiers)) != len(identifiers): raise ValueError("identifiers must be unique.") msnet = nx.Graph() msnet.add_nodes_from(identifiers) similars_idx, similars_scores = get_top_hits( scores=scores, top_n=self.top_n, axis=1, score_name=score_name, identifiers=identifiers, ignore_diagonal=True, ) for i, source_id in enumerate(identifiers): candidate_indices = similars_idx[source_id] candidate_scores = similars_scores[source_id] if len(candidate_indices) == 0: continue target_ids = np.array([identifiers[j] for j in candidate_indices], dtype=object) keep = np.where(candidate_scores >= self.score_cutoff)[0][: self.max_links] if self.link_method == "single": new_edges = [ (source_id, str(target_ids[k]), float(candidate_scores[k])) for k in keep ] else: # mutual new_edges = [] for k in keep: target_idx = candidate_indices[k] target_id = identifiers[target_idx] if i in similars_idx[target_id][: self.top_n]: new_edges.append( (source_id, str(target_id), float(candidate_scores[k])) ) msnet.add_weighted_edges_from(new_edges) if not self.keep_unconnected_nodes: msnet.remove_nodes_from(list(nx.isolates(msnet))) self.graph = msnet
[docs] def export_to_file(self, filename: str, graph_format: str = "graphml"): """Save the network to a file. Parameters ---------- filename Path to output file. graph_format Output format. Supported formats are: ``"cyjs"``, ``"gexf"``, ``"gml"``, ``"graphml"``, ``"json"``. """ if self.graph is None: raise ValueError("No network found. Make sure to first run create_network().") writer = self._generate_writer(graph_format) writer(filename)
def _generate_writer(self, graph_format: str): writer = { "cyjs": self._export_to_cyjs, "gexf": self._export_to_gexf, "gml": self._export_to_gml, "graphml": self.export_to_graphml, "json": self._export_to_node_link_json, } if graph_format not in writer: raise ValueError( "Format not supported. Please use one of: " "'cyjs', 'gexf', 'gml', 'graphml', 'json'." ) return writer[graph_format]
[docs] def export_to_graphml(self, filename: str): """Save the network as GraphML.""" nx.write_graphml_lxml(self.graph, filename)
def _export_to_cyjs(self, filename: str): """Save the network in Cytoscape JSON format.""" graph = nx.cytoscape_data(self.graph) self._write_to_json(graph, filename) def _export_to_node_link_json(self, filename: str): """Save the network in node-link JSON format.""" graph = nx.node_link_data(self.graph, edges="links") self._write_to_json(graph, filename) @staticmethod def _write_to_json(graph: dict, filename: str): """Save the network as JSON file.""" with open(filename, "w", encoding="utf-8") as file: json.dump(graph, file) def _export_to_gexf(self, filename: str): """Save the network as GEXF.""" nx.write_gexf(self.graph, filename) def _export_to_gml(self, filename: str): """Save the network as GML.""" nx.write_gml(self.graph, filename)