Source code for matchms.filtering.metadata_processing.derive_adduct_from_name
import logging
import re
from typing import List, Optional
from matchms.filtering.filter_utils.interpret_unknown_adduct import \
get_multiplier_and_mass_from_adduct
from matchms.Spectrum import Spectrum
from ..filter_utils.load_known_adducts import load_known_adducts
from .clean_adduct import _clean_adduct
logger = logging.getLogger("matchms")
[docs]def derive_adduct_from_name(spectrum_in: Spectrum,
remove_adduct_from_name: bool = True) -> Optional[Spectrum]:
"""Find adduct in compound name and add to metadata (if not present yet).
Method to interpret the given compound name to find the adduct.
Parameters
----------
spectrum_in:
Input spectrum.
remove_adduct_from_name:
Remove found adducts from compound name if set to True. Default is True.
"""
if spectrum_in is None:
return None
spectrum = spectrum_in.clone()
compound_name = spectrum.get("compound_name", None)
if compound_name is None:
if spectrum.get("name", None) not in [None, ""]:
logger.warning("Found 'name' but not 'compound_name' in metadata"
"Apply 'add_compound_name' filter first.")
return spectrum
# Detect adduct in compound name
parts_that_look_like_adduct = []
name_split = compound_name.split(" ")
for name_part in name_split:
if _looks_like_adduct(name_part):
# Some adducts occur more than once. So they are all removed.
parts_that_look_like_adduct.append(name_part)
if remove_adduct_from_name and len(parts_that_look_like_adduct) > 0:
name_adduct_removed = " ".join([x for x in name_split if x not in parts_that_look_like_adduct])
name_adduct_removed = name_adduct_removed.strip("; ")
spectrum.set("compound_name", name_adduct_removed)
logger.info("Removed adduct %s from compound name.", parts_that_look_like_adduct)
if len(parts_that_look_like_adduct) > 0 and not _looks_like_adduct(spectrum.get("adduct")):
best_adduct = _select_best_adduct(parts_that_look_like_adduct)
if best_adduct:
# Add found adduct to metadata (if not present yet)
spectrum.set("adduct", best_adduct)
logger.info("Added adduct %s from the compound name to metadata.", spectrum.get('adduct'))
return spectrum
def _select_best_adduct(list_of_adducts: List[str]) -> Optional[str]:
"""Selects an adduct that can actually be interpreted (complete with charge and known elements)"""
unique_cleaned_adducts = list({_clean_adduct(adduct) for adduct in list_of_adducts})
if len(unique_cleaned_adducts) == 1:
return unique_cleaned_adducts[0]
completely_correct_adduct = []
for adduct in unique_cleaned_adducts:
multiplier, correction_mass = get_multiplier_and_mass_from_adduct(adduct)
# check if both multiplier and correction mass are not None
if multiplier and correction_mass:
completely_correct_adduct.append(adduct)
if len(completely_correct_adduct) == 0:
return None
if len(completely_correct_adduct) == 1:
return completely_correct_adduct[0]
logger.warning("Two potential adducts were found in the compound name that are both valid adducts. "
"The first adduct is used. The adducts found are: %s", completely_correct_adduct)
return completely_correct_adduct[0]
def _looks_like_adduct(adduct):
"""Return True if input string has expected format of an adduct."""
if not isinstance(adduct, str):
return False
# Clean adduct
adduct = _clean_adduct(adduct)
# Load lists of default known adducts
known_adducts = load_known_adducts()
if adduct in list(known_adducts["adduct"]):
return True
# Expect format like: "[2M-H]" or "[2M+Na]+"
regexp1 = r"^\[(([0-4]M)|(M[0-9])|(M))((Br)|(Br81)|(Cl)|(Cl37)|(S)){0,}[+-][A-Z0-9\+\-\(\)aglire]{1,}[\]0-4+-]{1,4}"
return re.search(regexp1, adduct) is not None