Source code for matchms.MetadataCollection

import logging
import re
import numpy as np
import pandas as pd
from matchms.filtering.filter_utils.metadata_conversions import (
    NO_METADATA_UPDATE,
    is_missing_metadata_value,
)
from .utils import load_known_key_conversions


# Package-wide logger; used below to warn when harmonized columns collide.
logger = logging.getLogger("matchms")
# Regex pattern -> replacement pairs applied (via ``re.sub``) to lowercased
# column names: whitespace becomes "_" and the punctuation "!?.,;:" is removed.
_key_regex_replacements = {r"\s": "_", r"[!?.,;:]": ""}
# Lookup of known column names to their replacement names, loaded once at
# import time; consulted after the regex replacements have been applied.
_key_replacements = load_known_key_conversions()


def _needs_object_dtype(target_column: pd.Series, values: pd.Series) -> bool:
    """Return True if target column should be object before assigning values."""
    if pd.api.types.is_object_dtype(values.dtype):
        return True

    if pd.api.types.is_string_dtype(values.dtype):
        return True

    if pd.api.types.is_bool_dtype(target_column.dtype):
        return not pd.api.types.is_bool_dtype(values.dtype)

    if pd.api.types.is_numeric_dtype(target_column.dtype):
        return not pd.api.types.is_numeric_dtype(values.dtype)

    return True


def _to_python_metadata_value(value):
    """Return ``value`` as a plain Python object suitable for JSON export.

    Values reported as missing by ``is_missing_metadata_value`` become
    ``None``. NumPy scalars (``np.generic`` instances) are unwrapped with
    ``.item()``. Anything else is returned unchanged.
    """
    if is_missing_metadata_value(value):
        return None

    return value.item() if isinstance(value, np.generic) else value


def harmonize_metadata_column_name(column_name: str) -> str:
    """Return the matchms-style metadata column name.

    The name is lowercased, every pattern in ``_key_regex_replacements`` is
    substituted in turn, and the result is finally swapped for its entry in
    ``_key_replacements`` when one exists.
    """
    harmonized = column_name.lower()

    for pattern, substitute in _key_regex_replacements.items():
        harmonized = re.sub(pattern, substitute, harmonized)

    # The known-key lookup happens on the already cleaned-up name.
    return _key_replacements[harmonized] if harmonized in _key_replacements else harmonized
def harmonize_metadata_collection_columns(metadata: pd.DataFrame) -> pd.DataFrame:
    """Return DataFrame with harmonized metadata column names.

    If multiple columns map to the same harmonized column name, values are
    combined row-wise. Existing non-null values are kept, and missing values
    are filled from duplicate columns. Columns are processed in their original
    order, so the left-most column wins on conflicts.

    Parameters
    ----------
    metadata
        Table whose column labels should be harmonized. It is not modified.

    Returns
    -------
    pd.DataFrame
        New table sharing the index of ``metadata`` with harmonized column
        names.
    """
    harmonized = pd.DataFrame(index=metadata.index)

    # Walk the columns by position instead of by label: with duplicated labels
    # a label lookup returns a DataFrame rather than a Series, and a
    # label-keyed mapping would silently drop all but one of the duplicates.
    for position, old_column in enumerate(metadata.columns):
        new_column = harmonize_metadata_column_name(str(old_column))
        values = metadata.iloc[:, position]

        if new_column not in harmonized.columns:
            harmonized[new_column] = values
            continue

        existing = harmonized[new_column]
        # Rows where both columns hold a value and those values disagree.
        conflict_mask = existing.notna() & values.notna() & (existing != values)
        if conflict_mask.any():
            logger.warning(
                "Metadata column '%s' maps to existing column '%s' with "
                "different non-null values. Keeping existing values and filling "
                "missing values only.",
                old_column,
                new_column,
            )

        # Keep what is already there; only fill the gaps from the new column.
        harmonized[new_column] = existing.combine_first(values)

    return harmonized
class MetadataCollection(pd.DataFrame):
    """Metadata proxy class.

    A ``pandas.DataFrame`` subclass that additionally carries a reference to
    the object passed as ``collection``. It is used to filter directly on
    metadata while keeping that collection synchronized: the methods below
    call the collection's ``sort`` and ``_clear_cache`` and write its
    ``_metadata`` attribute.
    """

    # Attribute names pandas should carry over to objects derived from this one.
    _metadata = ["_collection"]

    def __init__(self, data, collection=None, *args, **kwargs):
        """Create the table from ``data`` and remember ``collection``.

        Parameters
        ----------
        data
            Anything accepted as first argument by ``pandas.DataFrame``.
        collection
            Object this metadata table belongs to, or ``None`` when unbound.
        *args, **kwargs
            Forwarded to ``pandas.DataFrame``.
        """
        super().__init__(data, *args, **kwargs)
        # Set the attribute while bypassing DataFrame.__setattr__
        # (presumably to keep pandas from treating it as column access).
        object.__setattr__(self, "_collection", collection)

    @property
    def _constructor(self):
        """Factory used by pandas to build results of the same type.

        The returned callable creates a ``MetadataCollection`` bound to the
        same collection as this instance.
        """

        def _c(*args, **kwargs):
            return MetadataCollection(*args, collection=self._collection, **kwargs)

        return _c

    def sort_values(self, by, inplace=False, **kwargs):
        """Sort by ``by``, delegating to the bound collection when present.

        Parameters
        ----------
        by
            Sort key(s).
        inplace
            If True, sort in place and return ``None``.
        **kwargs
            Forwarded to the collection's ``sort`` method, or to
            ``pandas.DataFrame.sort_values`` when no collection is bound.

        Returns
        -------
        MetadataCollection or None
            With a bound collection: the ``metadata`` attribute of the sorted
            collection. Without one: the sorted table. ``None`` if
            ``inplace=True``.
        """
        if self._collection is None:
            # Nothing to keep in sync: behave like a regular DataFrame instead
            # of failing with an AttributeError on ``None``.
            return super().sort_values(by=by, inplace=inplace, **kwargs)

        result = self._collection.sort(by=by, inplace=inplace, **kwargs)
        return None if inplace else result.metadata

    def harmonize_columns(self, inplace: bool = False):
        """Harmonize metadata columns to matchms key style.

        Parameters
        ----------
        inplace
            If True, replace the columns of this object (and the metadata of
            the bound collection, if any) and return ``None``. If False,
            return a new ``MetadataCollection``.

        Returns
        -------
        MetadataCollection or None
            Harmonized table if ``inplace=False``. Otherwise ``None``.
        """
        harmonized = harmonize_metadata_collection_columns(self)

        if inplace:
            # Swap out all columns while keeping this object (and its index).
            self.drop(columns=list(self.columns), inplace=True)
            for column in harmonized.columns:
                self[column] = harmonized[column].values

            if self._collection is not None:
                self._collection._metadata = pd.DataFrame(self).reset_index(drop=True)
                self._collection._clear_cache(["metadata_hashes", "spectra_hashes"])
            return None

        return MetadataCollection(harmonized, collection=self._collection)

    def apply_to_rows(
        self,
        func,
        *args,
        row_mask=None,
        inplace: bool = False,
        drop_missing_updates: bool = True,
        **kwargs,
    ):
        """Apply a metadata function to selected rows and merge the result back.

        The function receives a plain pandas DataFrame copy containing either
        all metadata rows or the rows selected by ``row_mask``. It must return
        a DataFrame with metadata updates (or ``None`` for "no updates").

        The returned update DataFrame may contain fewer rows and fewer columns
        than the input subset. Its index must be a subset of the selected input
        rows and must not contain duplicates. How missing values in the update
        DataFrame are handled is controlled by ``drop_missing_updates``.

        This method only updates metadata. It does not modify fragments.

        Parameters
        ----------
        func
            Function that receives a ``DataFrame`` subset as first argument and
            returns a ``DataFrame``/``MetadataCollection`` or ``None``.
        *args
            Positional arguments passed to ``func``.
        row_mask
            Optional boolean mask selecting rows. Must have one entry per
            metadata row. If ``None``, all rows are passed to ``func``.
        inplace
            If True, write the result back and return ``None``: when a
            collection is bound, its metadata is replaced (this object itself
            is left untouched); otherwise the columns of this object are
            replaced. If False, return a new ``MetadataCollection``.
        drop_missing_updates
            If True, missing values in the DataFrame returned by ``func`` are
            treated as "no update" and do not overwrite existing metadata
            values. If False, missing values are treated as explicit updates
            and will overwrite existing metadata values; only entries that are
            the ``NO_METADATA_UPDATE`` sentinel are skipped.
        **kwargs
            Keyword arguments passed to ``func``.

        Returns
        -------
        MetadataCollection or None
            Updated metadata table if ``inplace=False``. Otherwise ``None``.

        Raises
        ------
        ValueError
            If ``row_mask`` does not have one entry per row, or if ``func``
            returns duplicate row updates or updates for rows that were not
            selected.
        """
        # Work on a plain DataFrame copy so ``func`` cannot alter this object.
        target = pd.DataFrame(self).copy()

        if row_mask is None:
            row_indices = target.index
        else:
            if isinstance(row_mask, pd.Series):
                # Use the mask positionally; ignore the Series' own index.
                row_mask = row_mask.values
            row_mask = np.asarray(row_mask, dtype=bool)
            if row_mask.shape[0] != len(self):
                raise ValueError(
                    f"Shape of row mask ({row_mask.shape[0]}) does not fit "
                    f"metadata table ({len(self)})."
                )
            row_indices = target.index[row_mask]

        if len(row_indices) == 0:
            return self._finalize_apply_to_rows(target, inplace=inplace)

        subset = target.loc[row_indices].copy()
        updates = func(subset, *args, **kwargs)

        if updates is None:
            return self._finalize_apply_to_rows(target, inplace=inplace)

        updates = pd.DataFrame(updates)
        self._validate_metadata_updates(
            selected_index=subset.index,
            updates=updates,
            func=func,
        )
        target = self._merge_metadata_updates(
            target,
            updates,
            drop_missing_updates=drop_missing_updates,
        )
        return self._finalize_apply_to_rows(target, inplace=inplace)

    @staticmethod
    def row_to_dict(row: pd.Series) -> dict:
        """Convert a metadata row to a plain Python metadata dict.

        Pandas missing values such as ``NaN`` and ``pd.NA`` are converted to
        ``None``. NumPy scalar values such as ``np.int64`` and ``np.float64``
        are converted to native Python scalars so reconstructed Spectrum
        objects can be exported to JSON.
        """
        return {
            key: _to_python_metadata_value(value)
            for key, value in row.items()
        }

    def _validate_metadata_updates(
        self,
        selected_index: pd.Index,
        updates: pd.DataFrame,
        func,
    ) -> None:
        """Validate that metadata updates only refer to selected rows.

        Parameters
        ----------
        selected_index
            Index labels of the rows that were handed to ``func``.
        updates
            DataFrame returned by ``func``.
        func
            The applied function; only used to name it in error messages.

        Raises
        ------
        ValueError
            If ``updates`` has duplicate index labels or labels that are not
            part of ``selected_index``.
        """
        if updates.empty:
            return

        if updates.index.has_duplicates:
            raise ValueError(
                f"Function {getattr(func, '__name__', repr(func))} returned "
                "duplicate metadata row updates."
            )

        if not updates.index.isin(selected_index).all():
            raise ValueError(
                f"Function {getattr(func, '__name__', repr(func))} returned "
                "metadata updates for rows outside the selected input rows."
            )

    def _merge_metadata_updates(
        self,
        target: pd.DataFrame,
        updates: pd.DataFrame,
        *,
        drop_missing_updates: bool = True,
    ) -> pd.DataFrame:
        """Merge sparse metadata updates into target.

        ``target`` is modified in place and also returned.

        Parameters
        ----------
        target
            Full metadata table to update.
        updates
            Sparse updates; rows are matched to ``target`` by index label.
        drop_missing_updates
            If True, missing values in ``updates`` are skipped. If False, only
            entries that are ``NO_METADATA_UPDATE`` are skipped and missing
            values are written as ``None``.

        Returns
        -------
        pd.DataFrame
            The updated ``target``.
        """
        if updates.empty:
            return target

        for column in updates.columns:
            if column not in target.columns:
                # New metadata key: start from an all-missing object column.
                target[column] = pd.Series(index=target.index, dtype="object")

            values = updates[column]

            if drop_missing_updates:
                update_mask = values.notna()
            else:
                update_mask = values.map(lambda value: value is not NO_METADATA_UPDATE)

            values_to_assign = values.loc[update_mask]

            if values_to_assign.empty:
                continue

            if not drop_missing_updates and values_to_assign.isna().any():
                # Explicit "missing" updates are stored as None, which needs
                # an object column on both sides of the assignment.
                target[column] = target[column].astype("object")
                values_to_assign = values_to_assign.astype("object")
                values_to_assign = values_to_assign.where(values_to_assign.notna(), None)
            elif _needs_object_dtype(target[column], values_to_assign):
                target[column] = target[column].astype("object")

            target.loc[values_to_assign.index, column] = values_to_assign

        return target

    def _finalize_apply_to_rows(
        self,
        target: pd.DataFrame,
        *,
        inplace: bool,
    ):
        """Write back or return metadata after apply_to_rows.

        Parameters
        ----------
        target
            Updated metadata table. Its index is replaced by a fresh
            ``0..n-1`` index before it is used.
        inplace
            If True, store the result and return ``None``: in the bound
            collection when there is one, otherwise in this object. If False,
            return the result as a new ``MetadataCollection``.

        Returns
        -------
        MetadataCollection or None
        """
        target = target.reset_index(drop=True)

        if inplace:
            if self._collection is not None:
                # Only the collection is updated here; this object keeps its
                # previous content.
                self._collection._metadata = target
                self._collection._clear_cache(["metadata_hashes", "spectra_hashes"])
            else:
                self.drop(columns=list(self.columns), inplace=True)
                for column in target.columns:
                    self[column] = target[column].values
            return None

        return MetadataCollection(target, collection=self._collection)