Source code for matchms.importing.load_from_msp
import re
from typing import Generator, List, Tuple
import numpy as np
from matchms.importing.parsing_utils import parse_spectrum_dict
from matchms.Spectrum import Spectrum
[docs]def load_from_msp(filename: str,
metadata_harmonization: bool = True) -> Generator[Spectrum, None, None]:
"""
MSP file to a :py:class:`~matchms.Spectrum.Spectrum` objects
Function that reads a .msp file and converts the info
in :py:class:`~matchms.Spectrum.Spectrum` objects.
Parameters
----------
filename:
Path of the msp file.
metadata_harmonization : bool, optional
Set to False if metadata harmonization to default keys is not desired.
The default is True.
Yields
------
Yield a spectrum object with the data of the msp file
Example:
.. code-block:: python
from matchms.importing import load_from_msp
# Download msp file from MassBank of North America repository at https://mona.fiehnlab.ucdavis.edu/
file_msp = "MoNA-export-GC-MS-first10.msp"
spectrums = list(load_from_msp(file_msp))
"""
for spectrum in parse_msp_file(filename):
yield parse_spectrum_dict(
spectrum=spectrum,
metadata_harmonization=metadata_harmonization,
spectrum_type = "own"
)
[docs]def parse_msp_file(filename: str) -> Generator[dict, None, None]:
"""Read msp file and parse info in List of spectrum dictionaries."""
# Lists/dicts that will contain all params, masses and intensities of each molecule
params = {}
masses = np.array([])
intensities = np.array([])
peak_comments = {}
# Peaks counter. Used to track and count the number of peaks
peakscount = 0
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
rline = line.rstrip()
if len(rline) == 0:
continue
if contains_metadata(rline):
parse_metadata(rline, params)
continue
mz, ints, comment = _parse_line_with_peaks(rline)
masses = np.append(masses, mz)
intensities = np.append(intensities, ints)
if comment is not None:
peak_comments.update({masses[-1]: comment})
peakscount += len(mz)
# Obtaining the masses and intensities
if int(params['num peaks']) == peakscount:
peakscount = 0
yield {
'params': (params),
'm/z array': masses,
'intensity array': intensities,
'peak comments': peak_comments
}
params = {}
masses = []
intensities = []
peak_comments = {}
def _parse_line_with_peaks(rline: str) -> Tuple[List[float], List[float], str]:
"""Parse a line containing peaks consisting of mz and intensity values with optional comments.
Args:
rline (str): Line with peaks read from the MSP.
Returns:
Tuple[List[float], List[float], str]: mz, intensity and peak comments obtained from the line.
"""
comment, rline = get_peak_comment(rline)
mz, intensities = get_peak_values(rline)
return mz, intensities, comment
[docs]def get_peak_values(peak: str) -> Tuple[List[float], List[float]]:
""" Get the m/z and intensity value from the line containing the peak information. """
tokens = re.findall(r'(\d+(?:\.\d+)?(?:e[-+]?\d+)?)', peak)
if len(tokens) % 2 != 0:
raise RuntimeError("Wrong peak format detected!")
tokens = list(map(float, tokens))
mz = tokens[0::2]
intensities = tokens[1::2]
return mz, intensities
def _is_golm_peak_format(rline: str) -> bool:
"""This function detects whether a line is a line containing peaks in the GOLM MSP format.
The GOLM MSP format encodes peaks as mz:intensity - this resembles a metadata line, but actually contains peaks.
It is therefore necessary to explicitly check this corner case when determining whether a line is peaks or metadata.
Args:
rline (str): Line to check whether it contains peaks from GOLM
Returns:
bool: Whether the line is a line with peaks from GOLM or not.
"""
return re.match(r"(\d+:{1}\d+)", rline) is not None