Source code for matchms.filtering.SpeciesString
import re
class SpeciesString:
    """
    Classify and clean a chemical structure string (InChI, InChIKey or SMILES).

    On construction the raw input is inspected to guess which kind of structure
    string it holds (see :meth:`guess_target`) and is then cleaned according to
    that guess (see :meth:`clean`).

    Attributes
    ----------
    dirty : str
        Raw input exactly as passed to the constructor.
    target : str
        The guessed structure type: 'inchi', 'inchikey', 'smiles', or None if
        no valid type was identified.
    cleaned : str
        The cleaned structure string, or "" if nothing valid was found.
    """

    # Detection only needs the start of an InChI: optional "InChI=" prefix,
    # version "1" (optionally "S"), a formula layer and the start of a "/c" or
    # "/h" layer.
    # NOTE(review): this character class also admits "," and " ", which the
    # extraction pattern below does not; such inputs are classified as 'inchi'
    # but then clean to "". Kept as-is to preserve existing classification.
    _INCHI_DETECT = re.compile(
        r"(InChI=1|1)(S\/|\/)[0-9, A-Z, a-z,\.]{2,}\/([ch])[0-9]")

    # Extraction grabs everything from the version number up to the end of the
    # string; the "InChI=" prefix is re-added by clean_as_inchi().
    _INCHI_EXTRACT = re.compile(
        r"(1S\/|1\/)[0-9A-Za-z\.]{2,}\/([ch])[0-9].*$")

    # 14 + 10 + 1 upper-case letters separated by hyphens. Shared by detection
    # and cleaning so the two can never disagree.
    _INCHIKEY = re.compile(r"[A-Z]{14}-[A-Z]{10}-[A-Z]")

    # Whole-string match: one character that is not "J", followed by at least
    # three characters from the allowed set (so at least four characters in
    # total). Inside a character class "|" is a literal, so "|Si|Se|Br|..."
    # does not express alternation; it merely adds the characters
    # "|", "i", "e", "r", "M", "g", "a", "l", "A" to the allowed set.
    _SMILES = re.compile(
        r"^([^J][0-9BCOHNSOPIFKcons@+\-\[\]\(\)\\\/%=#$,.~&!|Si|Se|Br|Mg|Na|Cl|Al]{3,})$")

    def __init__(self, dirty: str):
        """
        Construct a new SpeciesString and immediately classify and clean it.

        Parameters
        ----------
        dirty : str
            The raw input string representing a chemical structure. A value
            that is not a string (for example None from missing metadata) is
            treated as "no structure recognized" instead of raising.
        """
        self.dirty = dirty
        self.target = None
        self.cleaned = None
        self.guess_target()
        self.clean()

    def __str__(self) -> str:
        """Return '(<target>): <cleaned>', or '' when nothing valid was found."""
        if self.cleaned == "":
            return ""
        return f"({self.target}): {self.cleaned}"

    def _search(self, pattern):
        """Search ``self.dirty`` with a compiled pattern; None for non-string input."""
        # Guarding here keeps every public method safe when called directly,
        # not only via the constructor.
        if not isinstance(self.dirty, str):
            return None
        return pattern.search(self.dirty)

    def clean(self) -> "SpeciesString":
        """
        Clean the input string based on its determined structure type.

        Sets ``self.cleaned`` to "" when no type was identified, otherwise
        delegates to the matching ``clean_as_*`` method. An unrecognized
        ``target`` value leaves ``self.cleaned`` untouched.

        Returns
        -------
        SpeciesString
            This instance, to allow call chaining.
        """
        if self.target is None:
            self.cleaned = ""
        elif self.target == "inchi":
            self.clean_as_inchi()
        elif self.target == "inchikey":
            self.clean_as_inchikey()
        elif self.target == "smiles":
            self.clean_as_smiles()
        return self

    def clean_as_inchi(self) -> None:
        """Search for valid inchi and harmonize it (store "" if none is found)."""
        found = self._search(self._INCHI_EXTRACT)
        if found is None:
            self.cleaned = ""
        else:
            # Normalize the prefix and drop any double quotes caught by ".*".
            self.cleaned = "InChI=" + found[0].replace('"', "")

    def clean_as_inchikey(self) -> None:
        """Search for valid inchikey and harmonize it (store "" if none is found)."""
        found = self._search(self._INCHIKEY)
        self.cleaned = "" if found is None else found[0]

    def clean_as_smiles(self) -> None:
        """Search for valid smiles and harmonize it (store "" if none is found)."""
        found = self._search(self._SMILES)
        self.cleaned = "" if found is None else found[0]

    def guess_target(self) -> "SpeciesString":
        """
        Determine the intended structure type of the input string.

        The checks run in a fixed order (InChIKey, then InChI, then SMILES)
        and the first match wins; ``self.target`` becomes None if none match.

        Returns
        -------
        SpeciesString
            This instance, to allow call chaining.
        """
        if self.looks_like_an_inchikey():
            self.target = "inchikey"
        elif self.looks_like_an_inchi():
            self.target = "inchi"
        elif self.looks_like_a_smiles():
            self.target = "smiles"
        else:
            self.target = None
        return self

    def looks_like_an_inchi(self) -> bool:
        """Return True if the string contains the first piece of an InChI."""
        return self._search(self._INCHI_DETECT) is not None

    def looks_like_an_inchikey(self) -> bool:
        """Return True if the string contains something formatted as an inchikey."""
        return self._search(self._INCHIKEY) is not None

    def looks_like_a_smiles(self) -> bool:
        """Return True if the string is made of allowed characters for smiles."""
        return self._search(self._SMILES) is not None