# Source code for matchms.filtering.metadata_processing.clean_compound_name

import logging
import re
from matchms.typing import SpectrumType


logger = logging.getLogger("matchms")


def clean_compound_name(spectrum_in: SpectrumType) -> SpectrumType:
    """Clean compound name.

    A list of frequently seen name additions that do not belong to the compound
    name will be removed.

    Parameters
    ----------
    spectrum_in
        Input spectrum. ``None`` is passed through and returned as ``None``.

    Returns
    -------
    A clone of ``spectrum_in``. If the cleaned name differs from the stored
    "compound_name", the clone's "compound_name" entry is overwritten with the
    cleaned name; otherwise the clone is returned unmodified.

    Raises
    ------
    AssertionError
        If no "compound_name" is set but a non-empty "name" entry is present.
    """
    if spectrum_in is None:
        return None

    # Work on a clone so the input spectrum is never modified.
    spectrum = spectrum_in.clone()

    name = spectrum.get("compound_name", None)
    if name is None:
        # Raised explicitly (instead of using an ``assert`` statement) so the
        # check is not stripped when Python runs with -O. The exception type
        # stays AssertionError for callers that already catch it.
        if spectrum.get("name", None) not in [None, ""]:
            raise AssertionError(
                "Found 'name' but not 'compound_name' in metadata. "
                "Apply 'add_compound_name' filter first."
            )
        return spectrum

    # Clean compound name
    name_cleaned = _remove_parts_by_regular_expression(name)
    name_cleaned = _remove_known_non_compound_parts(name_cleaned)
    name_cleaned = _remove_misplaced_mass(name_cleaned)

    # Only touch the metadata (and log) when the cleaning changed something.
    if name_cleaned != name:
        spectrum.set("compound_name", name_cleaned)
        logger.info("Added cleaned compound name: %s", name_cleaned)

    return spectrum
def _remove_parts_by_regular_expression(name: str): """Clean name string by removing known parts that don't belong there.""" name = name.strip() # remove type NCGC00180417-03_C31H40O16_ name = re.split(r"[A-Z]{3,6}[0-9]{8,12}-[0-9]{2,5}_[A-Z,0-9]{4,15}_", name)[-1] # remove type NCGC00160232-01! or MLS001142816-01! name = re.split(r"[A-Z]{3,6}[0-9]{8,12}-[0-9]{2,3}\!", name)[-1] # remove type Massbank:EA008813 option1|option2|option3 name = re.split(r"((Massbank:)|(MassbankEU:))[A-Z]{2}[0-9]{5,6}.*\|", name)[-1] # remove type Massbank:EA008813 or MassbankEU:EA008813 name = re.split(r"((Massbank:)|(MassbankEU:))[A-Z]{2}[0-9]{5,6}", name)[-1] # remove type HMDB:HMDB00943-1336 name = re.split(r"HMDB:HMDB[0-9]{4,7}-[0-9]{1,7}", name)[-1] # remove type MoNA:662599 name = re.split(r"MoNA:[0-9]{5,10}", name)[-1] # ReSpect:PS013405 option1|option2|option3... name = re.split(r"ReSpect:[A-Z]{2,3}[0-9]{6}.*\|", name)[-1] # ReSpect:PS013405 option1 name = re.split(r"[A-Z]{2,3}[0-9]{6}( )", name)[-1] # remove type 0072_2-Mercaptobenzothiaz name = re.split(r"^[0-9]{4}_", name)[-1] # remove type nameofcompound_CID20_170920 or Spiraeoside_HCD30_170919 name = re.split(r"_((HCD)|(CID))[0-9]{2}_[0-9]{5,6}$", name)[0] # Removes the collision energy from the compound name. Also allows for occurances of - 40.0 eV Unknown name = re.split(r"(?: - )?[0-9]+(?:\.[0-9]+)? ?[eE][Vv](?: Unknown)?$", name)[0] return name def _remove_known_non_compound_parts(name: str): """Remove known non compound-name strings from name.""" parts_remove = ["Spectral Match to", "from NIST14", "Massbank:"] for part in parts_remove: name = name.replace(part, "") return name.strip("; ") def _remove_misplaced_mass(name: str): """Remove occasionally occurring parent mass addition to name.""" regex_mass = r"^[0-9]{2,4}\.[0-9]$" end_part = name.split(" ")[-1] if re.search(regex_mass, end_part) is not None: return name.replace(end_part, "").strip() return name