Extensible canonical internal processing refactoring

This commit is contained in:
2026-05-04 11:06:11 +02:00
parent 4a16ccf1e1
commit d977f9e67c
20 changed files with 1815 additions and 16 deletions

View File

@@ -5,8 +5,15 @@ from markitect_tool.query.engine import (
QueryMatch,
extract_document,
extract_document_jsonpath,
extract_document_with_engine,
query_document,
query_document_jsonpath,
query_document_with_engine,
)
from markitect_tool.query.registry import (
QueryEngine,
QueryEngineRegistry,
default_query_engine_registry,
)
__all__ = [
@@ -14,6 +21,11 @@ __all__ = [
"QueryMatch",
"extract_document",
"extract_document_jsonpath",
"extract_document_with_engine",
"query_document",
"query_document_jsonpath",
"query_document_with_engine",
"QueryEngine",
"QueryEngineRegistry",
"default_query_engine_registry",
]

View File

@@ -44,6 +44,29 @@ class _Selector:
def query_document(document: Document, selector: str) -> list[QueryMatch]:
    """Run a compact Markitect selector against a parsed document.

    Convenience wrapper that forwards to ``query_document_with_engine``
    with the engine fixed to ``"selector"``.
    """
    matches = query_document_with_engine(document, selector, engine="selector")
    return matches
def query_document_with_engine(
    document: Document,
    selector: str,
    *,
    engine: str = "selector",
) -> list[QueryMatch]:
    """Dispatch a query to the registered engine named by ``engine``.

    The engine is looked up in the default query engine registry and its
    ``query`` callable is invoked with ``document`` and ``selector``.

    Raises:
        InvalidQueryError: if resolving the engine raises ``ValueError``
            (the registry does so for an unknown engine id); the original
            error is chained as the cause.
    """
    # Imported at call time rather than at module level; the registry module
    # itself imports names from this module.
    from markitect_tool.query.registry import default_query_engine_registry

    try:
        resolved = default_query_engine_registry().get(engine)
    except ValueError as exc:
        raise InvalidQueryError(str(exc)) from exc
    else:
        # Errors raised by the engine itself are deliberately not translated.
        return resolved.query(document, selector)
def _query_document_selector(document: Document, selector: str) -> list[QueryMatch]:
"""Query a parsed document with the built-in selector engine."""
parsed = _parse_selector(selector)
if parsed.target in {"document", "$", "."}:
return [QueryMatch(kind="document", path="$", value=document.to_dict())]
@@ -67,6 +90,12 @@ def query_document_jsonpath(document: Document, expression: str) -> list[QueryMa
remains dependency-light. Install ``markitect-tool[query]`` to enable it.
"""
return query_document_with_engine(document, expression, engine="jsonpath")
def _query_document_jsonpath(document: Document, expression: str) -> list[QueryMatch]:
"""Implementation for the registered optional JSONPath engine."""
try:
from jsonpath_ng.ext import parse as parse_jsonpath
except ImportError as exc: # pragma: no cover - branch depends on env deps
@@ -110,14 +139,29 @@ def extract_document(document: Document, selector: str) -> list[str]:
return extracted
def extract_document_with_engine(
    document: Document,
    selector: str,
    *,
    engine: str = "selector",
) -> list[str]:
    """Collect the textual form of every match produced by a query engine.

    For each match, in order of preference:

    * ``match.text`` when it is not ``None``;
    * ``match.value`` when it is already a string;
    * ``str(match.value)`` when the value is an int, float or bool.

    Matches that fit none of these cases contribute nothing to the result.
    """
    texts: list[str] = []
    for hit in query_document_with_engine(document, selector, engine=engine):
        if hit.text is not None:
            texts.append(hit.text)
            continue
        payload = hit.value
        if isinstance(payload, str):
            texts.append(payload)
        elif isinstance(payload, (int, float, bool)):
            texts.append(str(payload))
    return texts
def extract_document_jsonpath(document: Document, expression: str) -> list[str]:
    """Extract textual JSONPath matches from a parsed document.

    Delegates to ``extract_document_with_engine`` with ``engine="jsonpath"``,
    so the result contains each match's ``text`` when present and otherwise
    the match's ``value`` when it is a string or an int/float/bool scalar
    (stringified).

    The previous inline loop only collected ``match.text`` and left the
    delegating ``return`` unreachable; keeping a single delegation makes this
    function agree with ``query_document_jsonpath``, which routes through the
    engine registry in the same way.
    """
    return extract_document_with_engine(document, expression, engine="jsonpath")
def _parse_selector(selector: str) -> _Selector:

View File

@@ -0,0 +1,105 @@
"""Query engine registry adapters."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable
from markitect_tool.core import Document
from markitect_tool.extension import (
ExtensionDescriptor,
ExtensionRegistry,
OptionalDependency,
ProcessingCapability,
)
from markitect_tool.query.engine import QueryMatch
QueryCallable = Callable[[Document, str], list[QueryMatch]]
@dataclass(frozen=True)
class QueryEngine:
    """Immutable pairing of an extension descriptor with its query callable.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Metadata describing the engine; the registry uses ``descriptor.id``
    # as the lookup key.
    descriptor: ExtensionDescriptor
    # Callable invoked as ``query(document, selector)`` to produce matches.
    query: QueryCallable
class QueryEngineRegistry:
    """Lookup table of query engines, keyed by ``engine.descriptor.id``."""

    def __init__(self, engines: list[QueryEngine] | None = None) -> None:
        """Create a registry, registering ``engines`` in the order given.

        Raises:
            ValueError: if two of the supplied engines share an id.
        """
        self._engines: dict[str, QueryEngine] = {}
        if engines:
            for entry in engines:
                self.register(entry)

    def register(self, engine: QueryEngine) -> None:
        """Add ``engine`` under its descriptor id.

        Raises:
            ValueError: if an engine with the same id is already registered.
        """
        key = engine.descriptor.id
        if key in self._engines:
            raise ValueError(f"Duplicate query engine `{key}`")
        self._engines[key] = engine

    def get(self, engine_id: str) -> QueryEngine:
        """Return the engine registered under ``engine_id``.

        Raises:
            ValueError: if no such engine exists; the underlying ``KeyError``
                is chained as the cause.
        """
        try:
            found = self._engines[engine_id]
        except KeyError as missing:
            raise ValueError(f"Unknown query engine `{engine_id}`") from missing
        return found

    def list(self) -> list[QueryEngine]:
        """Return all registered engines, sorted by engine id."""
        ordered_ids = sorted(self._engines)
        return [self._engines[engine_id] for engine_id in ordered_ids]

    def extension_registry(self) -> ExtensionRegistry:
        """Build an ``ExtensionRegistry`` from the engines' descriptors.

        Descriptors are supplied in the same id-sorted order as ``list()``.
        """
        descriptors = (entry.descriptor for entry in self.list())
        return ExtensionRegistry(descriptors)
def default_query_engine_registry() -> QueryEngineRegistry:
    """Build a fresh registry holding the built-in query engines.

    Two engines are registered: ``selector`` (the compact Markitect selector
    engine) and ``jsonpath`` (which declares ``jsonpath-ng`` as a required
    optional dependency, installable through the ``query`` extra). A new
    registry object is created on every call.
    """
    # The engine implementations are private to the engine module and are
    # imported at call time.
    from markitect_tool.query.engine import (
        _query_document_jsonpath,
        _query_document_selector,
    )

    selector_descriptor = ExtensionDescriptor(
        id="selector",
        kind="query-engine",
        summary="Compact Markitect selector engine.",
        capabilities=[ProcessingCapability(id="ast", kind="read")],
        input_contract="Document + selector",
        output_contract="QueryMatch[]",
        diagnostics_namespace="query",
        provenance_prefix="query.selector",
        cli={"commands": ["mkt query", "mkt extract", "mkt cache query"]},
        docs=["docs/query-extraction.md"],
    )
    selector_engine = QueryEngine(
        descriptor=selector_descriptor,
        query=_query_document_selector,
    )

    jsonpath_dependency = OptionalDependency(
        name="jsonpath_ng",
        package="jsonpath-ng",
        extra="query",
        required=True,
        purpose="Evaluate JSONPath expressions.",
    )
    jsonpath_descriptor = ExtensionDescriptor(
        id="jsonpath",
        kind="query-engine",
        summary="Optional JSONPath engine over Document.to_dict().",
        capabilities=[ProcessingCapability(id="ast", kind="read")],
        optional_dependencies=[jsonpath_dependency],
        input_contract="Document + JSONPath expression",
        output_contract="QueryMatch[]",
        diagnostics_namespace="query.jsonpath",
        provenance_prefix="query.jsonpath",
        cli={
            "commands": [
                "mkt query --engine jsonpath",
                "mkt extract --engine jsonpath",
            ]
        },
        docs=["docs/query-extraction.md", "docs/local-index-backend.md"],
    )
    jsonpath_engine = QueryEngine(
        descriptor=jsonpath_descriptor,
        query=_query_document_jsonpath,
    )

    return QueryEngineRegistry([selector_engine, jsonpath_engine])