Source code for matchms.filtering.metadata_processing.clean_adduct

import logging
import re
from matchms.filtering.filter_utils.interpret_unknown_adduct import \
    get_charge_of_adduct
from matchms.filtering.filter_utils.load_known_adducts import \
    load_known_adduct_conversions


logger = logging.getLogger("matchms")


def clean_adduct(spectrum_in):
    """Bring the ``adduct`` metadata entry of a spectrum into a uniform style.

    Adduct strings of type 'M+H+' are rewritten to '[M+H]+'. The charge
    entry is cross-checked against the cleaned adduct: a mismatch is logged
    as a warning, and a missing charge is filled in from the adduct when one
    can be derived.

    Parameters
    ----------
    spectrum_in
        Matchms Spectrum object.

    Returns
    -------
    ``None`` if ``spectrum_in`` is ``None``; ``spectrum_in`` itself if it has
    no adduct entry; otherwise a clone of ``spectrum_in`` carrying the
    updated metadata.
    """
    if spectrum_in is None:
        return None

    original_adduct = spectrum_in.get("adduct")
    if original_adduct is None:
        # Nothing to clean: hand back the input untouched (no clone).
        return spectrum_in

    spectrum = spectrum_in.clone()
    cleaned_adduct = _clean_adduct(original_adduct, spectrum.get('charge'))

    if not spectrum.get("charge"):
        # No usable charge stored yet: try to take it from the adduct.
        derived_charge = get_charge_of_adduct(cleaned_adduct)
        if derived_charge:
            logger.info("Unknown charge was derived from adduct: %s, now charge is %s",
                        cleaned_adduct, derived_charge)
            spectrum.set("charge", derived_charge)
    elif spectrum.get("charge") != get_charge_of_adduct(cleaned_adduct):
        # A charge is stored but disagrees with the adduct: only report it.
        logger.warning("The charge in the adduct: %s and the given charge: %s do not match",
                       original_adduct, spectrum.get("charge"))

    if original_adduct != cleaned_adduct:
        spectrum.set("adduct", cleaned_adduct)
        logger.info("The adduct %s was set to %s", original_adduct, cleaned_adduct)
    return spectrum
def _clean_adduct(adduct: str, charge=None) -> str:
    """Clean adduct and make it consistent in style.

    Will transform adduct strings of type 'M+H+' to '[M+H]+'.

    Parameters
    ----------
    adduct
        Input adduct string to be cleaned/edited. Values that are not
        strings are returned unchanged.
    charge
        Optional integer charge. It is only appended when the bracketed
        adduct ends with ']' (i.e. carries no charge of its own).

    Returns
    -------
    The cleaned adduct, passed through the known-adduct conversion table.
    """
    if not isinstance(adduct, str):
        return adduct

    adduct = adduct.strip().replace("\n", "").replace("*", "")
    # Normalise doubled signs to the "<count><sign>" notation.
    adduct = adduct.replace("++", "2+").replace("--", "2-")
    if not adduct.startswith("["):
        adduct = _add_missing_brackets_to_adduct(adduct)
    if adduct.endswith("]"):
        # The adduct has no charge suffix; fall back to the given charge.
        charge_suffix = _convert_int_charge_to_str(charge)
        if charge_suffix:
            adduct += charge_suffix
    return _convert_known_adduct(adduct)


# Trailing charge such as "+", "-", "2+" or "3-".
_TRAILING_CHARGE_PATTERN = re.compile(r"[1-3]{0,1}[+-]{1,2}$")

# Molecule parts ending in a digit that must not be read as a charge count.
_NUMBER_TERMINATED_PARTS = ("CH2", "CH3", "NH3", "NH4", "O2")


def _extract_trailing_charge(adduct):
    """Return the charge suffix at the end of ``adduct``, or None if absent."""
    # Remove parts that can confuse the charge extraction, because they end
    # with a number and the ] is missing (e.g. the "3" in "NH3+").
    for mol_part in _NUMBER_TERMINATED_PARTS:
        if mol_part in adduct:
            adduct = adduct.split(mol_part)[-1]
    match = _TRAILING_CHARGE_PATTERN.search(adduct)
    if match is None:
        return None
    return match.group(0)


def _add_missing_brackets_to_adduct(adduct):
    """Adds missing brackets to an adduct and moves the charge outside.

    Parameters
    ----------
    adduct
        Adduct string, with or without square brackets.

    Returns
    -------
    The adduct with an opening bracket at the start and a closing bracket
    placed before the trailing charge (or at the end if there is no charge).
    An adduct whose bracket is already closed in front of the charge is not
    given a second closing bracket.
    """
    if not adduct.startswith("["):
        adduct = "[" + adduct
    if adduct.endswith("]"):
        return adduct

    adduct_charge = _extract_trailing_charge(adduct)
    if adduct_charge is None:
        return adduct + "]"

    core = adduct[:-len(adduct_charge)]
    # The bracket is already closed right before the charge (e.g. "[M+H]+"
    # or "M+H]+" after prefixing): adding another "]" would give "[M+H]]+".
    if core.endswith("]") and core.count("[") == core.count("]"):
        return adduct
    return core + "]" + adduct_charge


def _convert_int_charge_to_str(charge):
    """Converts integer to str format of charge.

    e.g.:
        1 -> +
        -1 -> -
        2 -> 2+
        -2 -> 2-

    Returns None when ``charge`` is None, zero, or not an int (the latter
    also logs a warning).
    """
    if charge is None:
        return None
    if not isinstance(charge, int):
        logger.warning("Charge is not given as int. Apply 'make_charge_int' filter first.")
        return None
    if charge == 0:
        return None

    sign = "-" if charge < 0 else "+"
    if charge in (-1, 1):
        # A single charge is written as the bare sign.
        return sign
    return str(abs(charge)) + sign


def _convert_known_adduct(adduct):
    """Convert adduct if conversion rule is known, otherwise return it as is."""
    adduct_conversions = load_known_adduct_conversions()
    if adduct in adduct_conversions:
        return adduct_conversions[adduct]
    return adduct