Source code for matchms.Fingerprints

import json
import logging
from typing import Optional
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem.rdchem import Mol
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator, GetRDKitFPGenerator
from matchms.filtering.filter_utils.smile_inchi_inchikey_conversions import (
    is_valid_inchi,
    is_valid_inchikey,
    is_valid_smiles,
)
from matchms.typing import SpectrumType
from .utils import to_camel_case


# Logger registered under the name "matchms".
logger = logging.getLogger("matchms")


def _morgan_factory(radius):
    """Return a factory that builds a Morgan generator with the given fixed radius."""
    return lambda args: GetMorganGenerator(**args, radius=radius)


# Maps each supported algorithm name to a callable that takes a dict of
# generator keyword arguments and returns the corresponding RDKit generator.
# The key order is kept stable because it is echoed in error messages.
FP_ALGORITHMS = {
    "daylight": lambda args: GetRDKitFPGenerator(**args),
    **{f"morgan{radius}": _morgan_factory(radius) for radius in (1, 2, 3)},
}


class Fingerprints:
    """
    Computes and stores an inchikey-fingerprint mapping for a list of spectra.

    For example

    .. testcode::

        from matchms import Fingerprints
        from matchms import Spectrum
        import numpy as np

        spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]),
                              intensities=np.array([0.7, 0.2, 0.1]),
                              metadata={"inchikey": "OTMSDBZUPAUEDD-UHFFFAOYSA-N",
                                        "smiles": "CC"})
        spectrum_2 = Spectrum(mz=np.array([100, 150, 200.]),
                              intensities=np.array([0.7, 0.2, 0.1]),
                              metadata={"inchikey": "UGFAIRIUMAVXCW-UHFFFAOYSA-N",
                                        "smiles": "[C-]#[O+]"})
        spectra = [spectrum_1, spectrum_2]

        fpgen = Fingerprints()
        fpgen.compute_fingerprints(spectra)

        print(fpgen.fingerprint_count)
        print(type(fpgen.get_fingerprint_by_inchikey('OTMSDBZUPAUEDD-UHFFFAOYSA-N')))

    Should output

    .. testoutput::

        2
        <class 'numpy.ndarray'>

    Attributes
    ----------
    config:
        The configuration for the fingerprints e.g., used algorithm, nbits, ...
    fingerprints:
        The computed fingerprints (inchikey -> fingerprint). Use after compute_fingerprints().
    fingerprint_count:
        The number of fingerprints computed.
    to_dataframe:
        A DataFrame indexed by inchikey with a single "fingerprint" column.
    """

    def __init__(
        self,
        fingerprint_algorithm: str = "daylight",
        fingerprint_method: str = "bit",
        nbits: int = 2048,
        ignore_stereochemistry: bool = False,
        **kwargs,
    ):
        """
        Parameters
        ----------
        fingerprint_algorithm
            The fingerprint algorithm to use. Available options: daylight, morgan1, morgan2, morgan3.
        fingerprint_method
            The fingerprint method to use. Available options: bit, sparse_bit, count, sparse_count.
        nbits
            The number of bits or fingerprint size. Defaults to 2048.
        ignore_stereochemistry
            Determines which inchikey version will be used. If set to true the first 14 chars
            of the inchikey are used.
        **kwargs
            Additional keyword arguments forwarded to the fingerprint helper functions.
        """
        self.inchikey_fingerprint_mapping = {}
        self.fingerprint_algorithm = fingerprint_algorithm
        self.fingerprint_method = fingerprint_method
        self.nbits = nbits
        self.ignore_stereochemistry = ignore_stereochemistry
        self.kwargs = kwargs

    def __str__(self):
        # Stored fingerprints are numpy arrays, which json cannot serialize;
        # convert them to plain lists so str() works once fingerprints exist.
        serializable_mapping = {
            inchikey: np.asarray(fingerprint).tolist()
            for inchikey, fingerprint in self.inchikey_fingerprint_mapping.items()
        }
        return json.dumps({"config: ": self.config, "inchikey_fingerprint_mapping": serializable_mapping})

    @property
    def config(self) -> dict:
        """The settings this instance was created with."""
        return {
            "fingerprint_algorithm": self.fingerprint_algorithm,
            "fingerprint_method": self.fingerprint_method,
            "nbits": self.nbits,
            # NOTE: the key is misspelled ("ingore"); it is kept unchanged so
            # existing consumers of this dict keep working.
            "ingore_stereochemistry": self.ignore_stereochemistry,
            "additional_keyword_arguments": self.kwargs,
        }

    @property
    def fingerprints(self):
        """The inchikey -> fingerprint mapping (the stored dict itself, not a copy)."""
        return self.inchikey_fingerprint_mapping

    @property
    def fingerprint_count(self) -> int:
        """Number of stored fingerprints."""
        return len(self.inchikey_fingerprint_mapping)

    @property
    def to_dataframe(self) -> pd.DataFrame:
        """DataFrame indexed by inchikey with one "fingerprint" column."""
        return pd.DataFrame(
            data={"fingerprint": list(self.inchikey_fingerprint_mapping.values())},
            index=list(self.inchikey_fingerprint_mapping.keys()),
        )

    def get_fingerprint_by_inchikey(self, inchikey: str) -> Optional[np.ndarray]:
        """
        Get fingerprint by inchikey.

        Parameters
        ----------
        inchikey
            Inchikey of a spectrum. If ignore_stereochemistry is set, a full
            inchikey is also matched by its first 14 characters.

        Returns
        -------
        Optional[np.ndarray]
            The corresponding fingerprint, or None if no fingerprint is stored
            for the inchikey (or the inchikey is missing).

        Raises
        ------
        ValueError
            If a 14 character inchikey that is not in the mapping is given
            while ignore_stereochemistry is False.
        """
        mapping = self.inchikey_fingerprint_mapping
        has_key = isinstance(inchikey, str)

        if has_key:
            if inchikey in mapping:
                return mapping[inchikey]
            if self.ignore_stereochemistry:
                # compute_fingerprints() stores the 14 character form in this
                # mode, so a full inchikey must be shortened before the lookup.
                short_inchikey = inchikey[:14]
                if short_inchikey in mapping:
                    return mapping[short_inchikey]
            elif len(inchikey) == 14:
                raise ValueError("Expected full 27 character InChIKey (or ignore_stereochemistry set to True)")

        # A missing inchikey (e.g. None from a spectrum without that metadata
        # field) is reported like an invalid one instead of raising TypeError.
        if not has_key or not is_valid_inchikey(inchikey):
            logger.warning("The provided inchikey is not valid or may be the short form.")

        logger.warning("Fingerprint is not present for given Spectrum/InchiKey. Use compute_fingerprint() first.")
        return None

    def get_fingerprint_by_spectrum(self, spectrum: SpectrumType) -> Optional[np.ndarray]:
        """
        Get fingerprint by spectrum.

        Parameters
        ----------
        spectrum
            Spectrum with a inchikey.

        Returns
        -------
        Optional[np.ndarray]
            The corresponding fingerprint, or None if none is stored.
        """
        return self.get_fingerprint_by_inchikey(spectrum.get("inchikey"))

    def compute_fingerprint(self, spectrum: SpectrumType) -> Optional[np.ndarray]:
        """
        Computes a single fingerprint for a given spectrum.

        The smiles is tried first; the inchi is only used if no fingerprint
        could be derived from the smiles. The result is returned but not
        stored in the inchikey-fingerprint mapping.

        Parameters
        ----------
        spectrum
            A spectrum for which a fingerprint is to be calculated.

        Returns
        -------
        Optional[np.ndarray]
            The computed fingerprint, or None if it could not be derived.
        """
        fingerprint = None

        smiles = spectrum.get("smiles")
        if smiles:
            fingerprint = _derive_fingerprint_from_smiles(
                smiles, self.fingerprint_algorithm, self.fingerprint_method, self.nbits, **self.kwargs
            )

        inchi = spectrum.get("inchi")
        if fingerprint is None and inchi:
            fingerprint = _derive_fingerprint_from_inchi(
                inchi, self.fingerprint_algorithm, self.fingerprint_method, self.nbits, **self.kwargs
            )

        return fingerprint

    def compute_fingerprints(self, spectra: list[SpectrumType]):
        """
        Computes fingerprints for a list of spectra.

        This will first create a dict with unique spectra (first spectrum per
        inchikey) and then computes fingerprints for all mols. Only fingerprints
        that are numpy arrays with at least one non-zero entry are added to the
        mapping. Query specific fingerprints by using get_fingerprint_by_spectrum()
        or get_fingerprint_by_inchikey().

        Parameters
        ----------
        spectra
            List of Spectrum
        """
        # Collect the first spectrum seen for every inchikey.
        unique_spectra = {}
        for spectrum in spectra:
            try:
                # Validation may shorten the spectrum's inchikey when
                # stereochemistry is ignored, so read the key afterwards.
                _validate_metadata(spectrum, self.ignore_stereochemistry)
            except ValueError:
                logger.warning("%s doesn't have a inchikey. Skipping.", spectrum)
                continue
            unique_spectra.setdefault(spectrum.get("inchikey"), spectrum)

        # Get mols of unique spectra from smiles/inchi
        mols = [_get_mol(spectrum) for spectrum in unique_spectra.values()]

        # Get fingerprints of all mols
        fingerprints = _mols_to_fingerprints(
            mols, self.fingerprint_algorithm, self.fingerprint_method, self.nbits, **self.kwargs
        )

        # Map inchikey - fingerprint, dropping empty (all-zero) fingerprints.
        for inchikey, fp in zip(unique_spectra.keys(), fingerprints, strict=True):
            if isinstance(fp, np.ndarray) and fp.sum() > 0:
                self.inchikey_fingerprint_mapping[inchikey] = fp
def _get_mol(spectrum: SpectrumType) -> Optional[Mol]:
    """
    Get the molecule either from smiles or inchi.

    The smiles is tried first. If it is missing or cannot be parsed, the inchi
    is used instead (the same order compute_fingerprint() follows).

    Parameters
    ----------
    spectrum
        Spectrum to get the mol from.

    Returns
    -------
    Optional[Mol]
        RDKit Mol object or None if smiles and inchi are missing or generation failed.
    """
    smiles = spectrum.get("smiles")
    if smiles:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return mol

    inchi = spectrum.get("inchi")
    if inchi:
        return Chem.MolFromInchi(inchi)

    return None


def _validate_metadata(spectrum: SpectrumType, ignore_stereochemistry: bool) -> SpectrumType:
    """
    Validates metadata for a given spectrum.

    Checks for a valid inchikey or, if stereochemistry is ignored, for an
    inchikey of at least 14 chars (longer inchikeys are shortened to 14 chars
    in the spectrum's metadata). Checks that inchi or smiles is valid.

    Parameters
    ----------
    spectrum
        Spectrum to validate. Its inchikey is modified in place when
        ignore_stereochemistry is true and the inchikey is longer than 14 chars.
    ignore_stereochemistry
        If true, a inchikey should contain 14 chars.

    Returns
    -------
    SpectrumType
        The validated spectrum (the same object that was passed in).

    Raises
    ------
    ValueError
        If the inchikey is missing or invalid, or if neither inchi nor smiles is valid.
    """
    inchikey = spectrum.get("inchikey")

    # A missing inchikey must surface as ValueError (not TypeError from len())
    # so callers that skip invalid spectra can handle it.
    if not isinstance(inchikey, str):
        raise ValueError("Inchikey is missing or invalid.")

    if ignore_stereochemistry:
        if len(inchikey) > 14:
            spectrum.set("inchikey", inchikey[:14])
        elif len(inchikey) < 14:
            raise ValueError("Inchikey is missing or invalid.")
    elif not is_valid_inchikey(inchikey):
        raise ValueError("Inchikey is missing or invalid.")

    if not is_valid_inchi(spectrum.get("inchi")) and not is_valid_smiles(spectrum.get("smiles")):
        raise ValueError("Inchi or smiles is missing or invalid.")

    return spectrum


def _derive_fingerprint_from_smiles(
    smiles: str, fingerprint_algorithm: str, fingerprint_method: str, nbits: int, **kwargs
) -> Optional[np.ndarray]:
    """Calculate molecule fingerprint based on given smiles (using rdkit).

    Requires conda package *rdkit* to be installed.

    Parameters
    ----------
    smiles
        Input smiles to derive fingerprint from.
    fingerprint_algorithm
        Determine algorithm for deriving molecular fingerprints. Supported choices
        are 'daylight', 'morgan1', 'morgan2', 'morgan3'.
    fingerprint_method
        Determine type of fingerprint. Supported choices are 'bit', 'sparse_bit',
        'count', 'sparse_count'.
    nbits
        Dimension or number of bits of generated fingerprint.
    **kwargs
        Additional keyword arguments passed on to the fingerprint generator.

    Returns
    -------
    fingerprint
        Molecular fingerprint, or None if the smiles could not be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return _mol_to_fingerprint(mol, fingerprint_algorithm, fingerprint_method, nbits, **kwargs)


def _derive_fingerprint_from_inchi(
    inchi: str, fingerprint_algorithm: str, fingerprint_method: str, nbits: int, **kwargs
) -> Optional[np.ndarray]:
    """Calculate molecule fingerprint based on given inchi (using rdkit).

    Requires conda package *rdkit* to be installed.

    Parameters
    ----------
    inchi
        Input InChI to derive fingerprint from.
    fingerprint_algorithm
        Determine algorithm for deriving molecular fingerprints. Supported choices
        are 'daylight', 'morgan1', 'morgan2', 'morgan3'.
    fingerprint_method
        Determine type of fingerprint. Supported choices are 'bit', 'sparse_bit',
        'count', 'sparse_count'.
    nbits
        Dimension or number of bits of generated fingerprint.
    **kwargs
        Additional keyword arguments passed on to the fingerprint generator.

    Returns
    -------
    fingerprint
        Molecular fingerprint, or None if the inchi could not be parsed.
    """
    mol = Chem.MolFromInchi(inchi)
    if mol is None:
        return None
    return _mol_to_fingerprint(mol, fingerprint_algorithm, fingerprint_method, nbits, **kwargs)


def _resolve_fingerprint_function(
    fingerprint_algorithm: str, fingerprint_type: str, nbits: int, method_names: dict, generator_kwargs: dict
):
    """Validate the requested algorithm/type and return the bound generator method to call."""
    if fingerprint_algorithm not in FP_ALGORITHMS:
        raise ValueError(f"Unkown fingerprint algorithm given. Available algorithms: {list(FP_ALGORITHMS.keys())}.")
    if fingerprint_type not in method_names:
        raise ValueError(f"Unkown fingerprint type given. Available types: {list(method_names.keys())}.")

    # Keyword names are converted with to_camel_case before being handed to
    # the generator factory; fpSize carries the requested fingerprint size.
    args = {"fpSize": nbits, **{to_camel_case(k): v for k, v in generator_kwargs.items()}}
    generator = FP_ALGORITHMS[fingerprint_algorithm](args)
    return getattr(generator, method_names[fingerprint_type])


def _mol_to_fingerprint(
    mol: Mol, fingerprint_algorithm: str, fingerprint_type: str, nbits: int, **kwargs
) -> Optional[np.ndarray]:
    """Convert rdkit mol (molecule) to molecular fingerprint.

    Requires conda package *rdkit* to be installed.

    Parameters
    ----------
    mol
        Input rdkit molecule.
    fingerprint_algorithm
        Determine algorithm for deriving molecular fingerprints. Supported algorithms
        are 'daylight', 'morgan1', 'morgan2', 'morgan3'.
    fingerprint_type
        Determine type for deriving molecular fingerprints. Supported types are
        'bit', 'sparse_bit', 'count', 'sparse_count'.
    nbits
        Dimension or number of bits of generated fingerprint.
    **kwargs
        Keyword arguments to pass additional parameters to FingerprintGenerator.
        The keywords should match the corresponding RDKit implementation
        (e.g., min_path/max_path for RDKitFPGenerator).
        See https://rdkit.org/docs/source/rdkit.Chem.rdFingerprintGenerator.html.

    Returns
    -------
    fingerprint
        Molecular fingerprint, or None if the generator result is falsy.

    Raises
    ------
    ValueError
        If the algorithm or the fingerprint type is not supported.
    """
    types = {
        "bit": "GetFingerprint",
        "sparse_bit": "GetSparseFingerprint",
        "count": "GetCountFingerprint",
        "sparse_count": "GetSparseCountFingerprint",
    }
    fingerprint_func = _resolve_fingerprint_function(fingerprint_algorithm, fingerprint_type, nbits, types, kwargs)
    fingerprint = fingerprint_func(mol)
    return np.array(fingerprint) if fingerprint else None


def _mols_to_fingerprints(
    mols: list[Mol], fingerprint_algorithm: str, fingerprint_type: str, nbits: int, **kwargs
) -> np.ndarray:
    """
    Computes fingerprints for a list of molecules.

    Parameters
    ----------
    mols
        List of rdkit molecules.
    fingerprint_algorithm
        Specifies algorithm for deriving fingerprints. Supported algorithms are
        'daylight', 'morgan1', 'morgan2', 'morgan3'.
    fingerprint_type
        Determine type for deriving molecular fingerprints. Supported types are
        'bit', 'sparse_bit', 'count', 'sparse_count'.
    nbits
        Dimension or number of bits of generated fingerprint.
    **kwargs
        Keyword arguments to pass additional parameters to FingerprintGenerator.
        The keywords should match the corresponding RDKit implementation
        (e.g., min_path/max_path for RDKitFPGenerator).
        See https://rdkit.org/docs/source/rdkit.Chem.rdFingerprintGenerator.html.

    Returns
    -------
    np.ndarray
        An int8 array of shape (len(mols), nbits) holding one fingerprint per
        molecule. If the fingerprint for a molecule cannot be calculated, the
        corresponding row stays all zeros.

    Raises
    ------
    ValueError
        If the algorithm or the fingerprint type is not supported.
    RuntimeError
        If the generator returns a different number of fingerprints than molecules.
    """
    types = {
        "bit": "GetFingerprints",
        "sparse_bit": "GetSparseFingerprints",
        "count": "GetCountFingerprints",
        "sparse_count": "GetSparseCountFingerprints",
    }
    fingerprint_func = _resolve_fingerprint_function(fingerprint_algorithm, fingerprint_type, nbits, types, kwargs)

    # NOTE(review): the int8 rows presumably assume bit fingerprints of size
    # nbits; confirm that count and sparse variants fit this buffer.
    out = np.zeros((len(mols), nbits), dtype=np.int8)
    if len(mols) == 0:
        # Nothing to compute; skip the generator call entirely.
        return out

    fingerprints = fingerprint_func(mols, numThreads=-1)
    # Explicit check instead of assert so it is not stripped under ``python -O``.
    if len(fingerprints) != len(mols):
        raise RuntimeError(f"Expected {len(mols)} fingerprints but the generator returned {len(fingerprints)}.")

    for i, fp in enumerate(fingerprints):
        if fp is not None:
            DataStructs.ConvertToNumpyArray(fp, out[i])
    return out