import json
import logging
from dataclasses import dataclass
import numpy as np
import pandas as pd
import scipy.sparse as sp
from chemap import FingerprintConfig, compute_fingerprints
from rdkit import Chem
from matchms.filtering.filter_utils.smile_inchi_inchikey_conversions import (
is_valid_inchi,
is_valid_inchikey,
is_valid_smiles,
)
from matchms.typing import SpectrumType
# Module-level logger; it is looked up under the fixed name "matchms" rather than ``__name__``.
logger = logging.getLogger("matchms")
@dataclass(frozen=True)
class _FingerprintRecord:
"""Internal record linking one unique compound to structure metadata."""
inchikey: str
smiles: str | None
inchi: str | None
[docs]
class Fingerprints:
"""Compute and store an InChIKey-to-fingerprint mapping for a collection of spectra.
This class is a container for molecular fingerprints keyed by InChIKey.
Fingerprints are computed for unique compounds only and stored either as a
dense NumPy array or as a SciPy CSR sparse matrix.
Compared to the older implementation, this refactor is designed for larger
scale use cases and delegates fingerprint computation to :mod:`chemap`.
Example
-------
.. testcode::
import numpy as np
from rdkit.Chem import rdFingerprintGenerator
from matchms import Fingerprints, Spectrum
spectrum_1 = Spectrum(
mz=np.array([100, 150, 200.]),
intensities=np.array([0.7, 0.2, 0.1]),
metadata={
"inchikey": "OTMSDBZUPAUEDD-UHFFFAOYSA-N",
"smiles": "CC",
"precursor_mz": 150.0,
},
)
spectrum_2 = Spectrum(
mz=np.array([100, 150, 200.]),
intensities=np.array([0.7, 0.2, 0.1]),
metadata={
"inchikey": "UGFAIRIUMAVXCW-UHFFFAOYSA-N",
"smiles": "[C-]#[O+]",
"precursor_mz": 150.0,
},
)
spectra = [spectrum_1, spectrum_2]
generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=256)
fpgen = Fingerprints(
fingerprint_generator=generator,
count=False,
folded=True,
return_csr=False,
)
fpgen.compute_fingerprints(spectra)
print(fpgen.fingerprint_count)
print(type(fpgen.get_fingerprint_by_inchikey("OTMSDBZUPAUEDD-UHFFFAOYSA-N")))
Should output
.. testoutput::
2
<class 'numpy.ndarray'>
Attributes
----------
fingerprints
The computed fingerprints as either a NumPy array or SciPy CSR matrix.
inchikeys
Ordered list of unique InChIKeys corresponding to fingerprint rows.
fingerprint_count
Number of unique fingerprints currently stored.
config
Dictionary with configuration used for fingerprint computation.
to_dataframe
DataFrame containing InChIKeys and fingerprints.
"""
[docs]
def __init__(
self,
fingerprint_generator,
*,
ignore_stereochemistry: bool = False,
count: bool = False,
folded: bool = True,
return_csr: bool = False,
invalid_policy: str = "raise",
**config_kwargs,
):
"""
Parameters
----------
fingerprint_generator
A chemap-compatible fingerprint generator, for example an RDKit
fingerprint generator or a scikit-fingerprints object.
ignore_stereochemistry
If True, the first 14 characters of the InChIKey are used.
count
Whether count fingerprints should be computed.
folded
Whether fingerprints should be folded.
return_csr
If True, fingerprints are stored as a SciPy CSR matrix.
Otherwise they are stored as a dense NumPy array.
invalid_policy
Policy passed to chemap for invalid molecular inputs.
**config_kwargs
Additional keyword arguments passed into ``FingerprintConfig``.
"""
self.fingerprint_generator = fingerprint_generator
self.ignore_stereochemistry = ignore_stereochemistry
self.count = count
self.folded = folded
self.return_csr = return_csr
self.invalid_policy = invalid_policy
self.config_kwargs = config_kwargs
self._inchikeys: list[str] = []
self._row_by_inchikey: dict[str, int] = {}
self._records: list[_FingerprintRecord] = []
self._fingerprints: np.ndarray | sp.csr_matrix | None = None
def __str__(self):
return json.dumps(
{
"config": self.config,
"inchikeys": self._inchikeys,
"fingerprint_count": self.fingerprint_count,
"is_sparse": self.is_sparse,
}
)
@property
def config(self) -> dict:
"""Return configuration used for fingerprint computation."""
return {
"ignore_stereochemistry": self.ignore_stereochemistry,
"count": self.count,
"folded": self.folded,
"return_csr": self.return_csr,
"invalid_policy": self.invalid_policy,
"additional_keyword_arguments": self.config_kwargs,
}
@property
def fingerprints(self) -> np.ndarray | sp.csr_matrix | None:
"""Return the stored fingerprint matrix."""
return self._fingerprints
@property
def inchikeys(self) -> list[str]:
"""Return ordered list of stored InChIKeys."""
return list(self._inchikeys)
@property
def is_sparse(self) -> bool:
"""Return True if fingerprints are stored as CSR sparse matrix."""
return self._fingerprints is not None and sp.issparse(self._fingerprints)
@property
def fingerprint_count(self) -> int:
"""Return the number of stored fingerprints."""
return len(self._inchikeys)
@property
def to_dataframe(self) -> pd.DataFrame:
"""Return fingerprints as a pandas DataFrame indexed by InChIKey."""
if self._fingerprints is None:
return pd.DataFrame(index=[])
fingerprints = list(self._fingerprints)
return pd.DataFrame(
data={"fingerprint": fingerprints},
index=self._inchikeys,
)
[docs]
def get_fingerprint_by_inchikey(self, inchikey: str):
"""Get fingerprint by InChIKey.
Parameters
----------
inchikey
InChIKey of a compound.
Returns
-------
Optional[np.ndarray | scipy.sparse.csr_matrix]
The corresponding fingerprint row, or None if not present.
"""
if inchikey is None:
logger.warning("No InChIKey provided.")
return None
normalized = self._normalize_inchikey(inchikey, validate=False)
if normalized in self._row_by_inchikey:
row_idx = self._row_by_inchikey[normalized]
return self._get_row(row_idx)
if len(inchikey) == 14 and not self.ignore_stereochemistry:
raise ValueError(
"Expected full 27 character InChIKey (or set ignore_stereochemistry=True)."
)
if not is_valid_inchikey(inchikey):
logger.warning("The provided InChIKey is not valid or may be the short form.")
logger.warning("Fingerprint is not present for given Spectrum/InChIKey. Use compute_fingerprints() first.")
return None
[docs]
def get_fingerprint_by_spectrum(self, spectrum: SpectrumType):
"""Get fingerprint by spectrum.
Parameters
----------
spectrum
Spectrum with an InChIKey.
Returns
-------
Optional[np.ndarray | scipy.sparse.csr_matrix]
The corresponding fingerprint row, or None if not present.
"""
inchikey = spectrum.get("inchikey")
return self.get_fingerprint_by_inchikey(inchikey)
[docs]
def compute_fingerprint(self, spectrum: SpectrumType):
"""Compute one fingerprint for a given spectrum.
This does not add the fingerprint to the internal storage. It only computes
and returns the fingerprint.
Parameters
----------
spectrum
A spectrum for which a fingerprint is to be calculated.
Returns
-------
Optional[np.ndarray | scipy.sparse.csr_matrix]
Fingerprint row, or None if fingerprint could not be computed.
"""
record = self._record_from_spectrum(spectrum)
if record is None:
return None
smiles = self._select_structure_string(record)
fps = self._compute_from_smiles([smiles])
if fps.shape[0] == 0:
return None
return self._extract_row(fps, 0)
[docs]
def compute_fingerprints(self, spectra: list[SpectrumType]):
"""Compute fingerprints for a list of spectra.
Fingerprints are computed only for unique compounds, keyed by InChIKey.
Existing stored fingerprints are replaced.
Parameters
----------
spectra
List of spectra.
"""
unique_records: dict[str, _FingerprintRecord] = {}
for spectrum in spectra:
record = self._record_from_spectrum(spectrum)
if record is None:
logger.warning("%s doesn't have valid fingerprint metadata. Skipping.", spectrum)
continue
if record.inchikey not in unique_records:
unique_records[record.inchikey] = record
self._records = list(unique_records.values())
self._inchikeys = [record.inchikey for record in self._records]
self._row_by_inchikey = {inchikey: i for i, inchikey in enumerate(self._inchikeys)}
if len(self._records) == 0:
self._fingerprints = None
return
smiles = [self._select_structure_string(record) for record in self._records]
self._fingerprints = self._compute_from_smiles(smiles)
# -------------------------------------------------------------------------
# Internal helpers
# -------------------------------------------------------------------------
def _get_row(self, row_idx: int):
"""Return one fingerprint row from internal storage."""
assert self._fingerprints is not None, "Fingerprints have not been computed yet."
if sp.issparse(self._fingerprints):
return self._fingerprints.getrow(row_idx)
return self._fingerprints[row_idx]
def _extract_row(self, matrix, row_idx: int):
"""Extract one row from a dense or sparse fingerprint result."""
if sp.issparse(matrix):
return matrix.getrow(row_idx)
return matrix[row_idx]
def _compute_from_smiles(self, smiles: list[str]):
"""Compute fingerprints from SMILES using chemap."""
config = FingerprintConfig(
count=self.count,
folded=self.folded,
return_csr=self.return_csr,
invalid_policy=self.invalid_policy,
**self.config_kwargs,
)
return compute_fingerprints(smiles, self.fingerprint_generator, config=config)
def _record_from_spectrum(self, spectrum: SpectrumType) -> _FingerprintRecord | None:
"""Build validated internal fingerprint record from a spectrum."""
inchikey = spectrum.get("inchikey")
smiles = spectrum.get("smiles")
inchi = spectrum.get("inchi")
if inchikey is None:
return None
try:
normalized_inchikey = self._normalize_inchikey(inchikey, validate=True)
except ValueError:
return None
if smiles and is_valid_smiles(smiles):
canonical_smiles = self._smiles_from_smiles(smiles)
if canonical_smiles is not None:
return _FingerprintRecord(
inchikey=normalized_inchikey,
smiles=canonical_smiles,
inchi=None,
)
if inchi and is_valid_inchi(inchi):
canonical_smiles = self._smiles_from_inchi(inchi)
if canonical_smiles is not None:
return _FingerprintRecord(
inchikey=normalized_inchikey,
smiles=canonical_smiles,
inchi=inchi,
)
return None
def _normalize_inchikey(self, inchikey: str, validate: bool = True) -> str:
"""Normalize InChIKey depending on stereochemistry setting."""
if inchikey is None:
raise ValueError("InChIKey is missing or invalid.")
if self.ignore_stereochemistry:
if len(inchikey) < 14:
raise ValueError("InChIKey is missing or invalid.")
return inchikey[:14]
if validate and not is_valid_inchikey(inchikey):
raise ValueError("InChIKey is missing or invalid.")
return inchikey
@staticmethod
def _smiles_from_smiles(smiles: str) -> str | None:
"""Convert valid SMILES to canonical SMILES."""
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return None
return Chem.MolToSmiles(mol)
@staticmethod
def _smiles_from_inchi(inchi: str) -> str | None:
"""Convert valid InChI to canonical SMILES."""
mol = Chem.MolFromInchi(inchi)
if mol is None:
return None
return Chem.MolToSmiles(mol)
@staticmethod
def _select_structure_string(record: _FingerprintRecord) -> str:
"""Return structure string used as chemap input."""
assert record.smiles is not None, "Expected canonical SMILES to be present."
return record.smiles