Source code for matchms.importing.load_from_msp

import re
from typing import Generator, List, Tuple
import numpy as np
from matchms.importing.parsing_utils import parse_spectrum_dict
from matchms.Spectrum import Spectrum



[docs]
def load_from_msp(filename: str, metadata_harmonization: bool = True) -> Generator[Spectrum, None, None]:
    """
    Lazily load the spectra stored in a .msp file.

    Every spectrum dictionary produced by :py:func:`parse_msp_file` is passed
    through ``parse_spectrum_dict`` and the result is yielded, one spectrum at
    a time.

    Parameters
    ----------
    filename:
        Path of the msp file.
    metadata_harmonization : bool, optional
        Set to False if metadata harmonization to default keys is not desired.
        The default is True.

    Yields
    ------
    One spectrum object per spectrum entry found in the msp file.


    Example:

    .. code-block:: python

        from matchms.importing import load_from_msp

        # Download msp file from MassBank of North America repository at https://mona.fiehnlab.ucdavis.edu/
        file_msp = "MoNA-export-GC-MS-first10.msp"
        spectra = list(load_from_msp(file_msp))
    """
    raw_spectra = parse_msp_file(filename)
    for raw_spectrum in raw_spectra:
        converted = parse_spectrum_dict(
            spectrum=raw_spectrum,
            metadata_harmonization=metadata_harmonization,
            spectrum_type="own",
        )
        yield converted




[docs]
def parse_msp_file(filename: str) -> Generator[dict, None, None]:
    """Read an msp file and lazily yield one dictionary per spectrum.

    Each non-empty line is either a metadata line (stored in the metadata dict
    of the current spectrum) or a peak line. Peaks are accumulated until their
    number equals the ``num peaks`` metadata entry; the spectrum is then
    yielded and all accumulators are reset for the next one.

    Parameters
    ----------
    filename:
        Path of the msp file.

    Yields
    ------
    dict
        Dictionary with the keys ``"params"`` (metadata dict), ``"m/z array"``
        and ``"intensity array"`` (numpy float arrays) and ``"peak comments"``
        (dict mapping an m/z value to the comment found on its line).

    Raises
    ------
    KeyError
        If a peak line is processed while no ``num peaks`` entry has been read
        for the current spectrum.
    """
    params = {}
    # Peaks are collected in plain lists and converted to arrays once per
    # spectrum. Calling np.append for every peak line copies the whole array
    # each time, which is quadratic in the number of peak lines.
    masses = []
    intensities = []
    peak_comments = {}

    # Number of peaks read so far for the current spectrum.
    peakscount = 0

    with open(filename, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            rline = line.rstrip()

            if not rline:
                continue

            if contains_metadata(rline):
                parse_metadata(rline, params)
                continue

            mz, ints, comment = _parse_line_with_peaks(rline)

            masses.extend(mz)
            intensities.extend(ints)

            if comment is not None:
                # The comment is attached to the last m/z value read so far.
                peak_comments[float(masses[-1])] = comment

            peakscount += len(mz)

            # The spectrum is complete once the announced number of peaks was read.
            if int(params["num peaks"]) == peakscount:
                # Handles edge cases with GOLM files where the nominal mass is written with a comma instead of a dot
                nominal_mass = params.get("mw")
                if nominal_mass and isinstance(nominal_mass, str):
                    params["mw"] = nominal_mass.replace(",", ".")

                yield {
                    "params": params,
                    "m/z array": np.array(masses, dtype=float),
                    "intensity array": np.array(intensities, dtype=float),
                    "peak comments": peak_comments,
                }

                # Start the next spectrum with fresh accumulators.
                params = {}
                masses = []
                intensities = []
                peak_comments = {}
                peakscount = 0



def _parse_line_with_peaks(rline: str) -> Tuple[List[float], List[float], str]:
    """Split a peak line into its m/z values, intensity values and optional comment.

    Args:
        rline (str): Peak line read from the MSP file.

    Returns:
        Tuple[List[float], List[float], str]: m/z values, intensity values and the
            peak comment exactly as returned by ``get_peak_comment``.
    """
    # The comment is extracted first; the numbers are then read from the
    # (possibly shortened) line that get_peak_comment hands back.
    peak_comment, peak_text = get_peak_comment(rline)
    peak_mz, peak_intensities = get_peak_values(peak_text)
    return peak_mz, peak_intensities, peak_comment



[docs]
def get_peak_values(peak: str) -> Tuple[List[float], List[float]]:
    """Extract alternating m/z and intensity numbers from a peak line.

    All unsigned numbers (integer, decimal or with exponent) found in ``peak``
    are read in order; numbers at even positions are m/z values, numbers at odd
    positions are the matching intensities.

    Raises a RuntimeError when the line holds an odd amount of numbers.
    """
    number_pattern = r"(\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)"
    numbers = [float(found.group(1)) for found in re.finditer(number_pattern, peak)]

    # Every m/z value needs an intensity partner.
    if len(numbers) % 2:
        raise RuntimeError("Wrong peak format detected!")

    return numbers[::2], numbers[1::2]




[docs]
def get_peak_comment(rline: str) -> Tuple[str, str]:
    """Get the peak comment from the line containing the peak information.

    A comment is the text between the first and the last quote character
    (single or double) on the line.

    Parameters
    ----------
    rline:
        Peak line, possibly followed by a quoted comment.

    Returns
    -------
    Tuple of the comment (``None`` when the line holds no quoted text) and the
    line with everything from the opening quote onwards removed. Without a
    comment the line is returned unchanged.
    """
    quoted = re.search(r"[\"\'](.*)[\"\']", rline)
    if quoted is None:
        return None, rline

    # Cut at the quote that actually opened the match. Searching for '"' only
    # raised ValueError on single-quoted comments.
    return quoted.group(1), rline[: quoted.start()]




[docs]
def parse_metadata(rline: str, params: dict):
    """Store the metadata found in one line of an MSP file in ``params``.

    The line is split at its first colon into a lower-cased key and a value.
    ``comments`` lines holding ``=`` and ``synon`` lines with at least two
    colons are handed to dedicated parsers; every other line is stored as a
    plain key/value pair. Lines without a colon are ignored.

    Parameters
    ----------
    rline: str
        The line of the read MSP file that contains metadata.
    params : dict
        The dict that receives the key value pairs of the metadata line.
        It is modified in place.
    """
    raw_key, colon, raw_value = rline.partition(":")
    if not colon:
        # No colon, hence nothing that could be a key/value pair.
        return

    key = raw_key.strip().lower()
    value = raw_value.strip()

    if key == "comments" and "=" in value:
        _parse_comments(value, params)
        return

    if key == "synon" and rline.count(":") >= 2:
        _parse_synon(rline, params)
        return

    # Generic "key: value" line.
    params[key] = value



def _parse_comments(value: str, params: dict):
    """Parses key-value pairs from comments line into params."""
    value = value.replace("'", '"')  # Normalize quotes
    pattern = (
        r'(\S+)="([^"]*)"|'
        r'"(\w+)=([^"]*)"|'
        r'"([^"]*)=([^"]*)"|'
        r"(\S+)=(\d+(?:\.\d*)?)"
    )
    for match in re.findall(pattern, value):
        match = [i for i in match if i]
        if len(match) == 2:
            m_key, m_value = match
            m_key = m_key.strip().lower()
            m_value = m_value.strip()
            if m_key == "smiles" and m_key in params:
                params[f"{m_key}_2"] = m_value
            else:
                params[m_key] = m_value


def _parse_synon(rline: str, params: dict):
    """Parses synon lines with multiple colons."""
    parts = rline.split(":", 2)
    synon_key = f"{parts[0].strip().lower()}: {parts[1].strip().lower()}"
    synon_value = parts[2].strip().replace(",", ".")
    if synon_key == "synon: metb n":
        params.setdefault(synon_key, []).append(synon_value)
    else:
        params[synon_key] = synon_value



[docs]
def contains_metadata(rline: str) -> bool:
    """Tell whether a line holds spectrum metadata rather than peaks.

    A metadata line contains a colon and is not a GOLM style peak line.
    """
    if ":" not in rline:
        return False
    # GOLM peak lines also contain colons and must not count as metadata.
    return not _is_golm_peak_format(rline)



def _is_golm_peak_format(rline: str) -> bool:
    """This function detects whether a line is a line containing peaks in the GOLM MSP format.

    The GOLM MSP format encodes peaks as mz:intensity - this resembles a metadata line, but actually contains peaks.
    It is therefore necessary to explicitly check this corner case when determining whether a line is peaks or metadata.

    Args:
        rline (str): Line to check whether it contains peaks from GOLM

    Returns:
        bool: Whether the line is a line with peaks from GOLM or not.
    """
    return re.match(r"(\d+:{1}\d+)", rline) is not None