Extensible canonical internal processing refactoring

This commit is contained in:
2026-05-04 11:06:11 +02:00
parent 4a16ccf1e1
commit d977f9e67c
20 changed files with 1815 additions and 16 deletions

View File

@@ -0,0 +1,50 @@
from markitect_tool.extension import builtin_extension_registry
def test_builtin_extension_registry_lists_query_processors_and_backend():
    """Check which extension ids the builtin registry lists."""
    present = [entry.id for entry in builtin_extension_registry().list()]

    # The selector engine is expected under its bare id, not the prefixed form.
    assert "query.selector" not in present

    expected_ids = (
        "selector",
        "jsonpath",
        "processor.identity",
        "processor.uppercase",
        "processor.include",
        "backend.local-sqlite",
    )
    for expected in expected_ids:
        assert expected in present
def test_builtin_processor_descriptors_capture_safety_and_provenance():
    """Check kind, safety flags and provenance prefixes of two builtin processors."""
    extensions = builtin_extension_registry()
    include_descriptor = extensions.get("processor.include")
    uppercase_descriptor = extensions.get("processor.uppercase")

    # The include processor declares that it reads files.
    assert include_descriptor.kind == "processor"
    assert include_descriptor.safety["reads_files"] is True
    assert include_descriptor.provenance_prefix == "processor.include"

    # The uppercase processor declares no safety flags at all.
    assert uppercase_descriptor.safety == {}
    assert uppercase_descriptor.provenance_prefix == "processor.uppercase"
def test_builtin_local_sqlite_descriptor_exposes_backend_capabilities():
    """Check the capabilities and CLI commands of the local SQLite backend descriptor."""
    backend = builtin_extension_registry().get("backend.local-sqlite")
    assert backend.kind == "backend"

    # The descriptor may advertise more capabilities; these must all be present.
    required = {"snapshots", "ast", "json", "fts", "sql", "provenance"}
    advertised = {capability.id for capability in backend.capabilities}
    assert advertised >= required

    expected_commands = [
        "mkt cache init",
        "mkt cache index",
        "mkt cache query",
        "mkt search",
    ]
    assert backend.cli["commands"] == expected_commands

View File

@@ -0,0 +1,37 @@
from markitect_tool.cli.extensions import collect_cli_command_specs, command_specs_from_extension
from markitect_tool.extension import ExtensionDescriptor, builtin_extension_registry
def test_command_specs_from_extension_handles_string_and_list_forms():
    """`cli["commands"]` may be a single string or a list of strings."""
    single = ExtensionDescriptor(id="one", kind="test", cli={"commands": "mkt one"})
    several = ExtensionDescriptor(
        id="many",
        kind="test",
        cli={"commands": ["mkt a", "mkt b"]},
    )

    single_commands = [spec.command for spec in command_specs_from_extension(single)]
    assert single_commands == ["mkt one"]

    several_commands = [spec.command for spec in command_specs_from_extension(several)]
    assert several_commands == ["mkt a", "mkt b"]
def test_collect_cli_command_specs_from_builtin_registry():
    """Specs collected from the builtin registry pair extension ids with commands."""
    collected = collect_cli_command_specs(builtin_extension_registry())
    pairs = {(spec.extension_id, spec.command) for spec in collected}

    expected_pairs = (
        ("selector", "mkt query"),
        ("processor.uppercase", "mkt process"),
        ("backend.local-sqlite", "mkt cache index"),
        ("backend.local-sqlite", "mkt search"),
    )
    for pair in expected_pairs:
        assert pair in pairs
def test_cli_command_spec_serializes_without_empty_fields():
    """Serialize the first command spec and check command, extension id and group.

    NOTE(review): the test name mentions empty fields, but no assertion here
    checks that empty fields are absent from the serialized dict.
    """
    descriptor = ExtensionDescriptor(
        id="query.selector",
        kind="query-engine",
        summary="Selector engine",
        cli={"commands": ["mkt query"], "group": "query"},
    )
    specs = command_specs_from_extension(descriptor)
    serialized = specs[0].to_dict()

    assert serialized["command"] == "mkt query"
    assert serialized["extension_id"] == "query.selector"
    assert serialized["metadata"]["group"] == "query"

View File

@@ -0,0 +1,176 @@
from pathlib import Path
import builtins
from click.testing import CliRunner
import pytest
from markitect_tool.backend import (
LocalSnapshotStore,
capability_check,
load_backend_manifest,
load_backend_registry,
local_index_path_for,
)
from markitect_tool.cli import main
from markitect_tool.core import parse_markdown
from markitect_tool.processor import ProcessorContext, run_fenced_processors
from markitect_tool.query import (
default_query_engine_registry,
InvalidQueryError,
extract_document,
query_document,
query_document_jsonpath,
)
from markitect_tool.reference import ReferenceContext, resolve_reference
# Markdown fixture used by the characterization tests in this module: a front
# matter block (document_type, status), a top-level title, and two second-level
# sections ("Context" and "Decision").
CHARACTERIZATION_DOC = """---
document_type: adr
status: accepted
---
# Decision Record
## Context
Authors need stable infrastructure seams.
## Decision
Use explicit registries and processing envelopes.
"""
def test_query_selector_and_extraction_characterization():
    """Pin selector query and extraction results for the fixture document."""
    parsed = parse_markdown(CHARACTERIZATION_DOC)
    engines = default_query_engine_registry()
    matches = query_document(parsed, "sections[heading=Decision]")
    status_values = extract_document(parsed, "frontmatter.status")

    assert engines.get("selector").descriptor.kind == "query-engine"

    # Exactly one section carries the "Decision" heading.
    assert len(matches) == 1
    decision = matches[0]
    assert decision.kind == "section"
    assert decision.path == "$.sections[2]"
    assert decision.text.startswith("## Decision")

    assert status_values == ["accepted"]
def test_jsonpath_missing_dependency_diagnostic_characterization(monkeypatch):
    """A blocked `jsonpath_ng` import surfaces as InvalidQueryError."""
    parsed = parse_markdown(CHARACTERIZATION_DOC)
    original_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        # Fail only imports of jsonpath_ng; everything else is delegated
        # unchanged to the real import machinery.
        if not name.startswith("jsonpath_ng"):
            return original_import(name, *args, **kwargs)
        raise ImportError("blocked")

    monkeypatch.setattr(builtins, "__import__", blocking_import)

    with pytest.raises(InvalidQueryError, match="optional `jsonpath-ng`"):
        query_document_jsonpath(parsed, "$.headings[*].text")
def test_processor_registry_result_provenance_characterization():
    """Run the uppercase processor on one fenced block and pin block, result and provenance."""
    # One fenced block addressed to the "uppercase" processor with id "shout".
    fenced_source = "```mkt-uppercase {#shout}\nhello\n```\n"

    outcome = run_fenced_processors(fenced_source, context=ProcessorContext())

    assert outcome.valid
    first_block = outcome.blocks[0]
    assert first_block.processor == "uppercase"
    assert first_block.unit_id == "shout"
    first_result = outcome.results[0]
    assert first_result.content == "HELLO\n"
    assert first_result.provenance[0].operation == "processor.uppercase"
def test_unknown_processor_diagnostic_characterization():
    """A fence naming an unregistered processor yields an error diagnostic."""
    fenced_source = "```mkt-missing {#x}\ncontent\n```\n"

    outcome = run_fenced_processors(fenced_source, context=ProcessorContext())

    assert not outcome.valid
    reported = outcome.results[0].diagnostics[0].to_dict()
    assert reported["severity"] == "error"
    assert reported["code"] == "processor.unknown"
    assert "Unknown processor" in reported["message"]
def test_backend_manifest_registry_characterization():
    """Load the example backend manifest directly and through a registry.

    NOTE(review): both paths are relative, so this test presumably has to run
    from the repository root -- confirm how the loaders resolve them.
    """
    manifest = load_backend_manifest("examples/backends/local-sqlite-backend.md")
    backends = load_backend_registry(["examples/backends"])
    required_capabilities = ["snapshots", "fts", "provenance"]
    compatibility = capability_check(manifest, required_capabilities)

    assert manifest.id == "local-sqlite-cache"
    registered = backends.get("local-sqlite-cache")
    assert registered.storage["engine"] == "sqlite"
    assert compatibility.compatible
def test_local_index_snapshot_query_search_characterization(tmp_path: Path):
    """Index one document into the local snapshot store, then read it back."""
    doc_path = tmp_path / "doc.md"
    doc_path.write_text(CHARACTERIZATION_DOC, encoding="utf-8")

    index_path = local_index_path_for(tmp_path)
    snapshot_store = LocalSnapshotStore(index_path)
    build_report = snapshot_store.build([tmp_path], root=tmp_path)
    first_state = snapshot_store.load_state()[0]
    stored_document = snapshot_store.get_document("doc.md")
    hits = snapshot_store.search("registries")

    assert build_report.parsed == ["doc.md"]

    assert first_state.path == "doc.md"
    assert first_state.snapshot_id.startswith("snapshot:")

    assert stored_document["headings"][0]["text"] == "Decision Record"

    top_hit = hits[0]
    assert top_hit.path == "doc.md"
    assert top_hit.unit_kind in {"section", "block"}
def test_reference_resolution_characterization(tmp_path: Path):
    """Resolve `target.md#decision` from a sibling file and pin the first unit."""
    referring = tmp_path / "context.md"
    referenced = tmp_path / "target.md"
    referring.write_text("# Context\n", encoding="utf-8")
    referenced.write_text("# Target\n\n## Decision\n\nChosen text.\n", encoding="utf-8")

    reference_context = ReferenceContext(root=tmp_path, current_path=referring)
    resolved = resolve_reference("target.md#decision", context=reference_context)

    assert resolved.target_path == str(referenced.resolve())
    first_unit = resolved.units[0]
    assert first_unit.kind == "section"
    assert first_unit.unit_id == "decision"
    assert "Chosen text" in first_unit.text
def test_cli_output_envelopes_characterization(tmp_path: Path):
    """Pin the CLI output of `query`, `cache index` and `cache query`.

    Each invocation is asserted right after it runs, and every exit-code
    assertion carries the captured CLI output as its failure message, so a
    failing step reports what the command printed instead of a bare exit code.
    """
    source = tmp_path / "doc.md"
    source.write_text(CHARACTERIZATION_DOC, encoding="utf-8")
    root = str(tmp_path)
    runner = CliRunner()

    # Direct query against the file on disk.
    query = runner.invoke(
        main,
        ["query", str(source), "sections[heading=Decision]", "--format", "json"],
    )
    assert query.exit_code == 0, query.output
    assert '"engine": "selector"' in query.output
    assert '"count": 1' in query.output

    # Build the cache index; the cache query below runs after this step.
    index = runner.invoke(main, ["cache", "index", root, "--root", root])
    assert index.exit_code == 0, index.output
    assert "parsed: 1" in index.output

    cache_query = runner.invoke(
        main,
        [
            "cache",
            "query",
            "frontmatter.status",
            "--root",
            root,
            "--format",
            "json",
        ],
    )
    assert cache_query.exit_code == 0, cache_query.output
    assert '"source_path": "doc.md"' in cache_query.output

View File

@@ -0,0 +1,98 @@
from markitect_tool.extension import (
ExtensionDescriptor,
ExtensionExecutor,
ExtensionLifecycle,
ExtensionRegistry,
ExtensionRegistryError,
OptionalDependency,
ProcessingRequest,
ProcessingResult,
)
def test_extension_executor_runs_callbacks_in_order():
    """Lifecycle callbacks are recorded in the order before, run, success, after."""
    events: list[str] = []

    def runner(request: ProcessingRequest) -> ProcessingResult:
        events.append(f"run:{request.operation}")
        return ProcessingResult(output={"ok": True})

    def make_runner():
        return runner

    # Callback parameter names are kept as-is in case they are passed by keyword.
    def record_before(descriptor, request):
        events.append(f"before:{descriptor.id}")

    def record_success(descriptor, request, result):
        events.append(f"success:{result.output['ok']}")

    def record_after(descriptor, request, result):
        events.append("after")

    lifecycle = ExtensionLifecycle()
    lifecycle.on_before(record_before)
    lifecycle.on_success(record_success)
    lifecycle.on_after(record_after)

    fake_descriptor = ExtensionDescriptor(id="fake.runner", kind="test", factory=make_runner)
    registry = ExtensionRegistry([fake_descriptor])
    executor = ExtensionExecutor(registry, lifecycle=lifecycle)
    request = ProcessingRequest(operation="fake.run", input={})

    result = executor.execute("fake.runner", request)

    assert result.valid
    assert result.trace[-1].event == "extension.executed"
    assert events == ["before:fake.runner", "run:fake.run", "success:True", "after"]
def test_extension_executor_routes_failure_callbacks():
    """An error result from the runner is delivered to the failure callback."""
    events: list[str] = []

    def runner(request: ProcessingRequest) -> ProcessingResult:
        return ProcessingResult.from_error(code="fake.error", message="Nope")

    def make_runner():
        return runner

    def record_failure(descriptor, request, result):
        # Record the code of the first diagnostic on the failing result.
        events.append(result.diagnostics[0].code)

    lifecycle = ExtensionLifecycle()
    lifecycle.on_failure(record_failure)

    fake_descriptor = ExtensionDescriptor(id="fake.runner", kind="test", factory=make_runner)
    executor = ExtensionExecutor(ExtensionRegistry([fake_descriptor]), lifecycle=lifecycle)
    request = ProcessingRequest(operation="fake.run", input={})

    result = executor.execute("fake.runner", request)

    assert not result.valid
    assert events == ["fake.error"]
def test_extension_executor_blocks_missing_required_dependency():
    """A missing required dependency yields an `extension.missing_dependency` diagnostic."""

    def make_runner():
        def run(request):
            return ProcessingResult(output=[])

        return run

    # A module name chosen so that it cannot be importable.
    unavailable = OptionalDependency(name="definitely_missing_markitect_dep", required=True)
    jsonpath_descriptor = ExtensionDescriptor(
        id="query.jsonpath",
        kind="query-engine",
        factory=make_runner,
        optional_dependencies=[unavailable],
    )
    executor = ExtensionExecutor(ExtensionRegistry([jsonpath_descriptor]))
    request = ProcessingRequest(operation="query.jsonpath", input={})

    result = executor.execute("query.jsonpath", request)

    assert not result.valid
    first_diagnostic = result.diagnostics[0]
    assert first_diagnostic.code == "extension.missing_dependency"
    assert "definitely_missing_markitect_dep" in first_diagnostic.details["missing"]
def test_extension_executor_rejects_non_result_return():
    """A runner that returns a plain dict makes the executor raise ExtensionRegistryError."""

    def make_runner():
        def run(request):
            return {}

        return run

    bad_descriptor = ExtensionDescriptor(id="bad.runner", kind="test", factory=make_runner)
    executor = ExtensionExecutor(ExtensionRegistry([bad_descriptor]))
    request = ProcessingRequest(operation="bad.run", input={})

    # Keep the exception under a separate name: the `as` target is unbound
    # once the except clause ends.
    caught = None
    try:
        executor.execute("bad.runner", request)
    except ExtensionRegistryError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected ExtensionRegistryError")
    assert "expected ProcessingResult" in str(caught)

View File

@@ -0,0 +1,75 @@
from pathlib import Path
from markitect_tool.extension import (
ProcessingCapability,
ProcessingContext,
ProcessingProvenance,
ProcessingRequest,
ProcessingResult,
ProcessingTrace,
)
def test_processing_request_serializes_context_and_cache_key():
    """Check request serialization and that the cache key is equal across contexts."""

    def make_request(context):
        # Everything except the context is identical between the two requests.
        return ProcessingRequest(
            operation="query.selector",
            input={"selector": "sections[heading=Decision]"},
            context=context,
            options={"format": "json"},
            capabilities=[ProcessingCapability(id="ast", description="Read parsed AST")],
        )

    request = make_request(ProcessingContext(root=Path("/workspace"), caller="cli"))
    serialized = request.to_dict()

    assert serialized["operation"] == "query.selector"
    serialized_context = serialized["context"]
    assert serialized_context["root"] == "/workspace"
    assert serialized_context["caller"] == "cli"
    assert serialized["capabilities"][0]["id"] == "ast"

    assert request.cache_key.startswith("processing:")
    # A request that differs only in its context must produce the same key.
    assert request.cache_key == make_request(ProcessingContext(root=Path("/other"))).cache_key
def test_processing_result_validity_provenance_and_trace():
    """A result with output, provenance and trace is valid and serializes all three."""
    provenance_entry = ProcessingProvenance(
        operation="query.selector",
        source_path="doc.md",
        content_hash="sha256:abc",
        dependencies=["doc.md"],
    )
    trace_entry = ProcessingTrace(event="query.start", metadata={"engine": "selector"})
    result = ProcessingResult(
        output={"count": 1},
        provenance=[provenance_entry],
        trace=[trace_entry],
    )

    serialized = result.to_dict()

    assert result.valid
    assert serialized["valid"] is True
    assert serialized["output"]["count"] == 1
    assert serialized["provenance"][0]["operation"] == "query.selector"
    assert serialized["trace"][0]["metadata"]["engine"] == "selector"
def test_processing_result_from_error_normalizes_diagnostics():
    """`ProcessingResult.from_error` yields an invalid result with one error diagnostic."""
    error_fields = {
        "code": "extension.missing_dependency",
        "message": "Install optional dependency.",
        "source_path": "doc.md",
        "line": 3,
        "details": {"dependency": "jsonpath-ng"},
    }
    result = ProcessingResult.from_error(**error_fields)

    serialized = result.to_dict()

    assert not result.valid
    first_diagnostic = serialized["diagnostics"][0]
    assert first_diagnostic["severity"] == "error"
    assert first_diagnostic["source"]["path"] == "doc.md"
    assert first_diagnostic["details"]["dependency"] == "jsonpath-ng"

View File

@@ -0,0 +1,112 @@
import pytest
from markitect_tool.extension import (
ExtensionDescriptor,
ExtensionRegistry,
ExtensionRegistryError,
OptionalDependency,
ProcessingCapability,
)
def test_extension_descriptor_serializes_contract_metadata():
    """`to_dict` exposes the descriptor's id, kind, capabilities and CLI metadata."""
    read_ast = ProcessingCapability(id="ast", kind="read")
    selector = ExtensionDescriptor(
        id="query.selector",
        kind="query-engine",
        summary="Small selector query engine.",
        capabilities=[read_ast],
        input_contract="Document + selector",
        output_contract="QueryMatch[]",
        diagnostics_namespace="query",
        provenance_prefix="query.selector",
        cli={"command": "mkt query"},
        docs=["docs/query-extraction.md"],
    )

    serialized = selector.to_dict()

    assert serialized["id"] == "query.selector"
    assert serialized["kind"] == "query-engine"
    assert serialized["capabilities"][0]["id"] == "ast"
    assert serialized["cli"]["command"] == "mkt query"
def test_extension_registry_lists_by_kind_and_capability():
    """Check full listing order, filtering by kind, and lookup by capability."""
    selector = ExtensionDescriptor(
        id="query.selector",
        kind="query-engine",
        capabilities=[ProcessingCapability(id="ast")],
    )
    local_backend = ExtensionDescriptor(
        id="backend.local-sqlite",
        kind="backend",
        capabilities=[ProcessingCapability(id="snapshots"), ProcessingCapability(id="fts")],
    )
    registry = ExtensionRegistry([local_backend, selector])

    all_ids = [entry.id for entry in registry.list()]
    assert all_ids == ["backend.local-sqlite", "query.selector"]

    query_engine_ids = [entry.id for entry in registry.list(kind="query-engine")]
    assert query_engine_ids == ["query.selector"]

    fts_ids = [entry.id for entry in registry.require_capability("fts")]
    assert fts_ids == ["backend.local-sqlite"]
def test_extension_registry_rejects_duplicate_ids():
    """Registering an already-registered descriptor raises ExtensionRegistryError."""
    duplicate = ExtensionDescriptor(id="query.selector", kind="query-engine")
    populated = ExtensionRegistry([duplicate])

    with pytest.raises(ExtensionRegistryError, match="Duplicate extension id"):
        populated.register(duplicate)
def test_extension_registry_checks_optional_dependencies():
    """Required and optional dependencies are reported separately when missing."""
    required_dependency = OptionalDependency(
        name="jsonpath_ng",
        package="jsonpath-ng",
        extra="query",
        required=True,
    )
    optional_dependency = OptionalDependency(name="tabulate")
    jsonpath_descriptor = ExtensionDescriptor(
        id="query.jsonpath",
        kind="query-engine",
        optional_dependencies=[required_dependency, optional_dependency],
    )
    registry = ExtensionRegistry([jsonpath_descriptor])

    # First with no modules available, then with both modules available.
    nothing_installed = registry.check_dependencies("query.jsonpath", available_modules=set())
    everything_installed = registry.check_dependencies(
        "query.jsonpath",
        available_modules={"jsonpath_ng", "tabulate"},
    )

    assert not nothing_installed.compatible
    assert nothing_installed.missing == ["jsonpath_ng"]
    assert nothing_installed.optional_missing == ["tabulate"]
    assert everything_installed.compatible
def test_extension_descriptor_instantiates_factory():
    """`instantiate` returns whatever the descriptor's factory produces."""

    def build_extension():
        return {"ready": True}

    with_factory = ExtensionDescriptor(
        id="fake.extension",
        kind="test",
        factory=build_extension,
    )

    produced = with_factory.instantiate()

    assert produced == {"ready": True}
def test_extension_descriptor_requires_factory_to_instantiate():
    """Instantiating a descriptor built without a factory raises ExtensionRegistryError."""
    without_factory = ExtensionDescriptor(id="fake.extension", kind="test")

    with pytest.raises(ExtensionRegistryError, match="has no factory"):
        without_factory.instantiate()

View File

@@ -0,0 +1,28 @@
from markitect_tool.core import parse_markdown
from markitect_tool.query import (
default_query_engine_registry,
extract_document_with_engine,
query_document_with_engine,
)
def test_default_query_engine_registry_exposes_builtin_descriptors():
    """The default query engine registry exposes the jsonpath and selector engines."""
    engines = default_query_engine_registry()
    serialized_extensions = engines.extension_registry().to_dict()["extensions"]

    listed_ids = [engine.descriptor.id for engine in engines.list()]
    assert listed_ids == ["jsonpath", "selector"]

    serialized_ids = {entry["id"] for entry in serialized_extensions}
    assert serialized_ids == {"selector", "jsonpath"}

    selector_descriptor = engines.get("selector").descriptor
    assert selector_descriptor.cli["commands"][0] == "mkt query"

    jsonpath_descriptor = engines.get("jsonpath").descriptor
    assert jsonpath_descriptor.optional_dependencies[0].name == "jsonpath_ng"
def test_query_document_with_engine_uses_selector_registry():
    """Query and extract the "Decision" section with the engine named "selector"."""
    selector = "sections[heading=Decision]"
    parsed = parse_markdown("# Doc\n\n## Decision\n\nChosen.\n")

    matches = query_document_with_engine(parsed, selector, engine="selector")
    extracted = extract_document_with_engine(parsed, selector, engine="selector")

    first_match = matches[0]
    assert first_match.kind == "section"
    assert first_match.path == "$.sections[1]"
    assert extracted == ["## Decision\n\nChosen."]