Source code for biolexica.api

"""API for assembling biomedial lexica."""

from __future__ import annotations

import logging
import typing as t
from collections import Counter
from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, TypeAlias

import ssslm
from curies import Reference
from pydantic import BaseModel, Field
from ssslm import LiteralMapping

if TYPE_CHECKING:
    import semra

__all__ = [
    "PREDEFINED",
    "Configuration",
    "Input",
    "Processor",
    "assemble_grounder",
    "assemble_terms",
    "get_literal_mappings",
    "load_grounder",
    "summarize_terms",
]

logger = logging.getLogger(__name__)

HERE = Path(__file__).parent.resolve()
LEXICA = HERE.parent.parent.joinpath("lexica")

#: A processor available as a literal mapping input
Processor: TypeAlias = Literal["pyobo", "bioontologies", "ssslm", "gilda"]


class Input(BaseModel):  # type:ignore
    """An input towards lexicon assembly."""

    # Selects how ``assemble_terms`` reads this input; must be one of the
    # ``Processor`` literals ("pyobo", "bioontologies", "ssslm", "gilda").
    processor: Processor
    # For "pyobo"/"bioontologies" this is handed to ``get_literal_mappings`` as
    # the prefix; for "ssslm"/"gilda" it is handed to the matching ssslm reader.
    source: str
    # Optional CURIE (or list of CURIEs) forwarded as ``ancestors`` to
    # ``get_literal_mappings``; only read for "pyobo"/"bioontologies" inputs.
    ancestors: None | str | list[str] = None
    # Extra keyword arguments forwarded to ``get_literal_mappings``; only read
    # for "pyobo"/"bioontologies" inputs. ``None`` is treated as no extras.
    kwargs: dict[str, Any] | None = None
class Configuration(BaseModel):
    """A configuration for construction of a lexicon."""

    # Inputs are read one after another, in list order, by ``assemble_terms``.
    inputs: list[Input]
    # References dropped from the assembled terms after remapping has been applied.
    excludes: list[Reference] | None = Field(
        default=None,
        description="A list of CURIEs to exclude after processing is complete",
    )
    # When set, ``assemble_terms`` calls ``get_mappings`` on it to obtain
    # mappings used for remapping the collected terms.
    # NOTE(review): ``semra`` is imported only under TYPE_CHECKING in this
    # module, so this annotation may not be resolvable at runtime without the
    # model being rebuilt elsewhere -- confirm.
    mapping_configuration: semra.Configuration | None = None
#: Keys that ``load_grounder`` recognizes as predefined lexica
PREDEFINED: TypeAlias = Literal["cell", "anatomy", "phenotype", "obo"]
#: URL template for a predefined lexicon file; ``{key}`` is filled with one of
#: the ``PREDEFINED`` keys when no local ``lexica`` directory is available
URL_FMT = "https://github.com/biopragmatics/biolexica/raw/main/lexica/{key}/{key}.ssslm.tsv.gz"
def load_grounder(grounder: ssslm.GrounderHint) -> ssslm.Grounder:
    """Build a grounder from a hint, resolving predefined lexicon keys first.

    :param grounder: Either one of the ``PREDEFINED`` keys, or any other value
        accepted by :func:`ssslm.make_grounder`, which is passed through
        untouched.
    :returns: The grounder produced by :func:`ssslm.make_grounder`.
    """
    names_predefined_lexicon = isinstance(grounder, str) and grounder in t.get_args(PREDEFINED)
    if not names_predefined_lexicon:
        return ssslm.make_grounder(grounder)

    if LEXICA.is_dir():
        # If biolexica is installed in editable mode, the lexica directory
        # sits outside the package root, so the predefined index can be
        # loaded straight from disk
        location = LEXICA.joinpath(grounder, f"{grounder}.ssslm.tsv.gz").as_posix()
    else:
        # Otherwise fall back to the hosted copy of the same file
        location = URL_FMT.format(key=grounder)
    return ssslm.make_grounder(location)
def assemble_grounder(
    configuration: Configuration,
    mappings: list[semra.Mapping] | None = None,
    *,
    extra_terms: list[LiteralMapping] | None = None,
    include_biosynonyms: bool = True,
) -> ssslm.Grounder:
    """Assemble literal mappings and wrap them in a grounder.

    All arguments are forwarded unchanged to :func:`assemble_terms`; the
    resulting list is handed to :func:`ssslm.make_grounder`.

    :param configuration: The lexicon configuration to assemble.
    :param mappings: Optional additional semantic mappings used for remapping.
    :param extra_terms: Optional literal mappings appended to the inputs' terms.
    :param include_biosynonyms: Whether to also include the biosynonyms terms.
    :returns: A grounder built from the assembled literal mappings.
    """
    return ssslm.make_grounder(
        assemble_terms(
            configuration,
            mappings,
            extra_terms=extra_terms,
            include_biosynonyms=include_biosynonyms,
        )
    )
def assemble_terms(  # noqa:C901
    configuration: Configuration,
    mappings: list[semra.Mapping] | None = None,
    *,
    extra_terms: list[LiteralMapping] | None = None,
    include_biosynonyms: bool = True,
    raw_path: Path | None = None,
    processed_path: Path | None = None,
    gilda_path: Path | None = None,
    summary_path: Path | None = None,
) -> list[LiteralMapping]:
    """Assemble terms from multiple resources.

    The steps are, in order: read every configured input, append
    ``extra_terms`` and (optionally) the biosynonyms terms, optionally write
    the raw terms, remap the terms through any available semantic mappings,
    drop excluded references, and optionally write the processed terms, a
    Gilda export, and a JSON summary.

    :param configuration: Supplies the inputs, the optional mapping
        configuration, and the optional list of references to exclude.
    :param mappings: Additional semantic mappings, used after the ones
        produced by ``configuration.mapping_configuration`` (if any).
    :param extra_terms: Literal mappings appended after all inputs are read.
    :param include_biosynonyms: If true, append the result of
        ``biosynonyms.get_positive_synonyms()``.
    :param raw_path: If given, the terms are written here before remapping
        and exclusion.
    :param processed_path: If given, the terms are written here after
        remapping and exclusion.
    :param gilda_path: If given, the processed terms are also written here
        via ``ssslm.write_gilda_terms``.
    :param summary_path: If given, a JSON dump of :func:`summarize_terms` is
        written here.
    :returns: The processed list of literal mappings.
    :raises ValueError: If an input names a processor that is not handled.
    """
    terms: list[LiteralMapping] = []
    for inp in configuration.inputs:
        if inp.processor in {"pyobo", "bioontologies"}:
            terms.extend(
                get_literal_mappings(
                    inp.source,
                    ancestors=inp.ancestors,
                    processor=inp.processor,
                    **(inp.kwargs or {}),
                )
            )
        elif inp.processor == "ssslm":
            terms.extend(ssslm.read_literal_mappings(inp.source))
        elif inp.processor == "gilda":
            terms.extend(ssslm.read_gilda_terms(inp.source))
        else:
            raise ValueError(f"Unknown processor {inp.processor}")

    if extra_terms:
        terms.extend(extra_terms)

    if include_biosynonyms:
        # Imported lazily so the package is only needed when requested
        import biosynonyms

        terms.extend(biosynonyms.get_positive_synonyms())

    if raw_path is not None:
        logger.info("Writing %d raw literal mappings to %s", len(terms), raw_path)
        ssslm.write_literal_mappings(terms, raw_path)

    _mappings: list[semra.Mapping] = []
    if configuration.mapping_configuration is not None:
        from semra.pipeline import AssembleReturnType

        _mappings.extend(
            configuration.mapping_configuration.get_mappings(
                return_type=AssembleReturnType.priority
            )
        )
    if mappings is not None:
        _mappings.extend(mappings)

    # ``_mappings`` is always a list, so test for emptiness rather than
    # ``None``: with nothing to remap there is no reason to import semra
    # (which this module only imports for type checking) or touch the terms.
    if _mappings:
        from semra.api import assert_projection

        assert_projection(_mappings)
        terms = ssslm.remap_literal_mappings(
            literal_mappings=terms,
            mappings=[(mapping.subject, mapping.object) for mapping in _mappings],
        )

    if configuration.excludes:
        _excludes_set = set(configuration.excludes)
        terms = [term for term in terms if term.reference not in _excludes_set]

    if processed_path is not None:
        logger.info("Writing %d processed literal mappings to %s", len(terms), processed_path)
        ssslm.write_literal_mappings(terms, processed_path)

    if gilda_path is not None:
        ssslm.write_gilda_terms(terms, gilda_path)

    if summary_path is not None:
        summary = summarize_terms(terms)
        summary_path.write_text(summary.model_dump_json(indent=2))

    return terms
def get_literal_mappings(
    prefix: str,
    *,
    ancestors: None | str | Sequence[str] = None,
    processor: Processor,
    **kwargs: Any,
) -> list[ssslm.LiteralMapping]:
    """Get the literal mappings for a prefix using the chosen processor.

    :param prefix: The resource prefix handed to the processor.
    :param ancestors: An optional CURIE, or sequence of CURIEs. When given,
        the processor's ``get_literal_mappings_subset`` is used instead of
        ``get_literal_mappings``.
    :param processor: Either ``"pyobo"`` or ``"bioontologies"``.
    :param kwargs: Forwarded to the processor. For ``"pyobo"``, ``strict``
        defaults to ``False`` unless given explicitly.
    :returns: The literal mappings produced by the processor.
    :raises ValueError: If ``processor`` is not one of the two handled values.
    """
    ancestor_refs: list[Reference] | None
    if ancestors is None:
        ancestor_refs = None
    else:
        # A lone CURIE string is treated as a one-element collection
        curies = [ancestors] if isinstance(ancestors, str) else ancestors
        ancestor_refs = [Reference.from_curie(curie) for curie in curies]

    if processor == "pyobo":
        import pyobo

        kwargs.setdefault("strict", False)
        if ancestor_refs is not None:
            return pyobo.get_literal_mappings_subset(prefix, ancestors=ancestor_refs, **kwargs)
        return pyobo.get_literal_mappings(prefix, **kwargs)

    if processor == "bioontologies":
        import bioontologies

        if ancestor_refs is not None:
            found = bioontologies.get_literal_mappings_subset(
                prefix, ancestors=ancestor_refs, **kwargs
            )
        else:
            found = bioontologies.get_literal_mappings(prefix, **kwargs)
        return list(found)

    raise ValueError(f"Unknown processor: {processor}")
class Summary(BaseModel):
    """A model for summaries."""

    # Number of literal mappings that were summarized
    count: int
    # How many times each provenance reference prefix occurs across all mappings
    provenance_counter: dict[str, int]
    # How many mappings carry each type CURIE; mappings without a type are not counted
    type_counter: dict[str, int]
def summarize_terms(literal_mappings: list[LiteralMapping]) -> BaseModel:
    """Count literal mappings overall, by provenance prefix, and by type.

    :param literal_mappings: The literal mappings to summarize.
    :returns: A ``Summary`` holding the total count, the number of occurrences
        of each provenance prefix, and the number of mappings per type CURIE
        (mappings whose type is ``None`` are skipped).
    """
    prefix_counts: Counter[str] = Counter(
        reference.prefix
        for literal_mapping in literal_mappings
        for reference in literal_mapping.provenance
    )
    type_counts: Counter[str] = Counter(
        literal_mapping.type.curie
        for literal_mapping in literal_mappings
        if literal_mapping.type is not None
    )
    return Summary(
        count=len(literal_mappings),
        provenance_counter=dict(prefix_counts),
        type_counter=dict(type_counts),
    )