Source code for matchms.filtering.metadata_processing.derive_adduct_from_name

import logging
import re
from typing import List, Optional
from matchms.filtering.filter_utils.interpret_unknown_adduct import \
    get_multiplier_and_mass_from_adduct
from matchms.Spectrum import Spectrum
from ..filter_utils.load_known_adducts import load_known_adducts
from .clean_adduct import _clean_adduct


logger = logging.getLogger("matchms")


def derive_adduct_from_name(spectrum_in: Spectrum,
                            remove_adduct_from_name: bool = True) -> Optional[Spectrum]:
    """Find adduct in compound name and add to metadata (if not present yet).

    The compound name is split on single spaces and every part that
    ``_looks_like_adduct`` accepts is treated as an adduct candidate.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. It is not modified; all changes are made on a clone.
    remove_adduct_from_name:
        Remove found adducts from compound name if set to True. Default is True.

    Returns
    -------
    None if ``spectrum_in`` is None, otherwise a clone of the input spectrum.
    In the clone, "compound_name" has the adduct-like parts removed (when
    requested) and "adduct" is set when the existing value does not already
    look like an adduct and a best candidate could be selected.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    compound_name = spectrum.get("compound_name", None)
    if compound_name is None:
        if spectrum.get("name", None) not in [None, ""]:
            # The two literals are concatenated, so the separator must be
            # part of the first one.
            logger.warning("Found 'name' but not 'compound_name' in metadata. "
                           "Apply 'add_compound_name' filter first.")
        return spectrum

    # Detect adduct in compound name.
    # Some adducts occur more than once, so every matching part is collected.
    name_split = compound_name.split(" ")
    parts_that_look_like_adduct = [name_part for name_part in name_split
                                   if _looks_like_adduct(name_part)]
    if not parts_that_look_like_adduct:
        # Nothing adduct-like in the name: neither name nor adduct changes.
        return spectrum

    if remove_adduct_from_name:
        name_adduct_removed = " ".join(x for x in name_split
                                       if x not in parts_that_look_like_adduct)
        # Drop separators left dangling at either end after the removal.
        name_adduct_removed = name_adduct_removed.strip("; ")
        spectrum.set("compound_name", name_adduct_removed)
        logger.info("Removed adduct %s from compound name.", parts_that_look_like_adduct)

    # Only fill in the adduct when the current value does not look like one.
    if not _looks_like_adduct(spectrum.get("adduct")):
        best_adduct = _select_best_adduct(parts_that_look_like_adduct)
        if best_adduct:
            # Add found adduct to metadata (if not present yet)
            spectrum.set("adduct", best_adduct)
            logger.info("Added adduct %s from the compound name to metadata.", spectrum.get('adduct'))

    return spectrum


def _select_best_adduct(list_of_adducts: List[str]) -> Optional[str]:
    """Selects an adduct that can actually be interpreted (complete with charge and known elements)"""
    unique_cleaned_adducts = list({_clean_adduct(adduct) for adduct in list_of_adducts})
    if len(unique_cleaned_adducts) == 1:
        return unique_cleaned_adducts[0]

    completely_correct_adduct = []
    for adduct in unique_cleaned_adducts:
        multiplier, correction_mass = get_multiplier_and_mass_from_adduct(adduct)
        # check if both multiplier and correction mass are not None
        if multiplier and correction_mass:
            completely_correct_adduct.append(adduct)
    if len(completely_correct_adduct) == 0:
        return None
    if len(completely_correct_adduct) == 1:
        return completely_correct_adduct[0]
    logger.warning("Two potential adducts were found in the compound name that are both valid adducts. "
                   "The first adduct is used. The adducts found are: %s", completely_correct_adduct)
    return completely_correct_adduct[0]


def _looks_like_adduct(adduct):
    """Return True if input string has expected format of an adduct."""
    if not isinstance(adduct, str):
        return False
    # Clean adduct
    adduct = _clean_adduct(adduct)
    # Load lists of default known adducts
    known_adducts = load_known_adducts()
    if adduct in list(known_adducts["adduct"]):
        return True

    # Expect format like: "[2M-H]" or "[2M+Na]+"
    regexp1 = r"^\[(([0-4]M)|(M[0-9])|(M))((Br)|(Br81)|(Cl)|(Cl37)|(S)){0,}[+-][A-Z0-9\+\-\(\)aglire]{1,}[\]0-4+-]{1,4}"
    return re.search(regexp1, adduct) is not None
Source code for matchms.filtering.metadata_processing.derive_adduct_from_name

Navigation

Related Topics