Source code for matchms.importing.parsing_utils

"""Helper functions for parsing metadata.
"""
import ast
from typing import Any, Dict, Union
import numpy as np
from matchms.Spectrum import Spectrum


def find_by_key(data: Union[list, dict], target: str) -> Any:
    """Yield every value stored under ``target`` in a nested list/dict structure.

    The structure is walked depth-first, in iteration order. A value stored
    under ``target`` is yielded first and, when it is itself a dict or a list,
    it is searched afterwards as well. Input that is neither a list nor an
    object with an ``items`` attribute yields nothing.

    Parameters
    ----------
    data:
        Nested dictionary or list in which entry should be searched.
    target:
        Name of field to search for in data.
    """
    if hasattr(data, "items"):
        for name, entry in data.items():
            if name == target:
                yield entry
            # Only dict and list values are descended into; a dict is searched
            # as a whole, a list element by element.
            if isinstance(entry, dict):
                nested = (entry,)
            elif isinstance(entry, list):
                nested = entry
            else:
                nested = ()
            for item in nested:
                yield from find_by_key(item, target)
        return

    if isinstance(data, list):
        for element in data:
            yield from find_by_key(element, target)
def parse_mzml_mzxml_metadata(spectrum_dict: dict) -> dict:
    """Extract the relevant metadata entries from a mzml (or mzxml) spectrum.

    Parsing relies on fixed key names and may easily break when field key
    names vary.

    Parameters
    ----------
    spectrum_dict:
        Spectrum dictionary containing metadata fields.

    Returns
    -------
    dict
        Dictionary with the following entries:

        - "precursor_mz": float of the first "selected ion m/z" (or else
          "precursorMz") value found inside the "precursor" (or else
          "precursorMz") entries, None when nothing was found.
        - "charge": int of the first "charge state" found inside the precursor
          entries. When precursor entries exist but hold no charge state, a
          top-level "polarity" of "-" or "+" gives -1 or 1. None otherwise.
        - "title": top-level "spectrum title", None when absent.
        - "scan_number": top-level "num", None when absent.
        - "scan_start_time": list of all values found under "scan start time"
          (possibly empty).
        - "retention_time": list of all values found under "retentionTime"
          (possibly empty).
    """
    def _collect(container, field):
        # Materialize all matches so that emptiness can be tested.
        return list(find_by_key(container, field))

    precursor_mz = None
    charge = None

    precursor_entries = (_collect(spectrum_dict, "precursor")
                         or _collect(spectrum_dict, "precursorMz"))
    if precursor_entries:
        mz_hits = (_collect(precursor_entries, "selected ion m/z")
                   or _collect(precursor_entries, "precursorMz"))
        if mz_hits:
            precursor_mz = float(mz_hits[0])

        charge_hits = _collect(precursor_entries, "charge state")
        if charge_hits:
            charge = int(charge_hits[0])
        elif "polarity" in spectrum_dict:
            # No explicit charge state: derive the sign from the polarity.
            polarity = spectrum_dict["polarity"]
            if polarity == "-":
                charge = -1
            elif polarity == "+":
                charge = 1

    return {
        "charge": charge,
        "scan_number": spectrum_dict.get("num"),
        "title": spectrum_dict.get("spectrum title"),
        "precursor_mz": precursor_mz,
        "scan_start_time": _collect(spectrum_dict, "scan start time"),
        "retention_time": _collect(spectrum_dict, "retentionTime"),
    }
def sort_by_mz(mz, intensities):
    """Sort mz values and intensities by mz.

    Parameters
    ----------
    mz:
        Array (or array-like, e.g. a list) of m/z values.
    intensities:
        Array (or array-like) of intensities, aligned with ``mz``.

    Returns
    -------
    tuple
        ``(mz, intensities)`` as numpy arrays ordered by ascending m/z. Numpy
        arrays that are already in ascending order are returned unchanged.
    """
    # asarray is a no-op for numpy arrays. For plain sequences it is required:
    # `<=` between lists is a single lexicographic comparison (so [1, 3, 2]
    # would pass as "sorted") and lists cannot be indexed with an index array.
    mz = np.asarray(mz)
    intensities = np.asarray(intensities)

    if np.all(mz[:-1] <= mz[1:]):
        return mz, intensities

    # A stable sort keeps peaks with identical m/z in their input order.
    order = np.argsort(mz, kind="stable")
    return mz[order], intensities[order]
def parse_spectrum_dict(spectrum: Dict, metadata_harmonization, spectrum_type="pyteomics") -> Spectrum:
    """Parse a spectrum dict (as read from a msp file for instance) to a matchms Spectrum.

    Parameters
    ----------
    spectrum:
        Dictionary with the keys "m/z array" and "intensity array" and,
        optionally, "params" holding the metadata dictionary. When
        ``spectrum_type`` is not "pyteomics", the key "peak comments" is
        required as well. Note that the "params" dictionary is updated in place
        when peak comments are converted or added.
    metadata_harmonization:
        Passed on unchanged to the Spectrum constructor.
    spectrum_type:
        With "pyteomics" (the default), an existing metadata entry
        "peak_comments" is converted from its literal representation to a
        Python object. With any other value, ``spectrum["peak comments"]`` is
        stored as metadata entry "peak_comments" unless it is an empty dict.

    Returns
    -------
    Spectrum
        Spectrum built from the peaks (sorted by m/z) and the metadata.
    """
    metadata = spectrum.get("params")
    mz = spectrum["m/z array"]
    intensities = spectrum["intensity array"]

    if spectrum_type == "pyteomics":
        # "params" is optional; without it there is no peak comment to convert.
        if metadata is not None and "peak_comments" in metadata:
            metadata["peak_comments"] = ast.literal_eval(str(metadata["peak_comments"]))
    else:
        peak_comments = spectrum["peak comments"]
        if peak_comments != {}:
            if metadata is None:
                # No "params" given: create the metadata needed to hold the comments.
                metadata = {}
            metadata["peak_comments"] = peak_comments

    mz, intensities = sort_by_mz(mz=mz, intensities=intensities)
    return Spectrum(
        mz=mz,
        intensities=intensities,
        metadata=metadata,
        metadata_harmonization=metadata_harmonization,
    )