From d977f9e67c679d32a30b8b9facc0e3d85775b875 Mon Sep 17 00:00:00 2001 From: tegwick Date: Mon, 4 May 2026 11:06:11 +0200 Subject: [PATCH] Extensible canonical internal processing refactoring --- docs/extension-authoring.md | 178 ++++++++++++++++ docs/internal-extension-framework.md | 149 ++++++++++++++ docs/workplan-planning-map.md | 2 +- src/markitect_tool/cli/extensions.py | 64 ++++++ src/markitect_tool/extension/__init__.py | 56 +++++ src/markitect_tool/extension/builtins.py | 92 +++++++++ src/markitect_tool/extension/execution.py | 98 +++++++++ src/markitect_tool/extension/processing.py | 184 +++++++++++++++++ src/markitect_tool/extension/registry.py | 193 ++++++++++++++++++ src/markitect_tool/query/__init__.py | 12 ++ src/markitect_tool/query/engine.py | 54 ++++- src/markitect_tool/query/registry.py | 105 ++++++++++ tests/test_builtin_extension_catalog.py | 50 +++++ tests/test_cli_extension_specs.py | 37 ++++ tests/test_extension_characterization.py | 176 ++++++++++++++++ tests/test_extension_execution.py | 98 +++++++++ tests/test_extension_processing_model.py | 75 +++++++ tests/test_extension_registry.py | 112 ++++++++++ tests/test_query_engine_registry.py | 28 +++ ...TT-WP-0013-internal-extension-framework.md | 68 +++++- 20 files changed, 1815 insertions(+), 16 deletions(-) create mode 100644 docs/extension-authoring.md create mode 100644 docs/internal-extension-framework.md create mode 100644 src/markitect_tool/cli/extensions.py create mode 100644 src/markitect_tool/extension/__init__.py create mode 100644 src/markitect_tool/extension/builtins.py create mode 100644 src/markitect_tool/extension/execution.py create mode 100644 src/markitect_tool/extension/processing.py create mode 100644 src/markitect_tool/extension/registry.py create mode 100644 src/markitect_tool/query/registry.py create mode 100644 tests/test_builtin_extension_catalog.py create mode 100644 tests/test_cli_extension_specs.py create mode 100644 tests/test_extension_characterization.py create 
mode 100644 tests/test_extension_execution.py create mode 100644 tests/test_extension_processing_model.py create mode 100644 tests/test_extension_registry.py create mode 100644 tests/test_query_engine_registry.py diff --git a/docs/extension-authoring.md b/docs/extension-authoring.md new file mode 100644 index 0000000..9e36739 --- /dev/null +++ b/docs/extension-authoring.md @@ -0,0 +1,178 @@ +# Internal Extension Authoring + +## Purpose + +This guide describes how to add a new internal Markitect extension without +turning central modules into the main integration surface. + +Use this for internal query engines, processors, backend/index stores, +reference providers, validators, template/generation adapters, CLI command +groups, render/export adapters, and future document functions. + +## Recommended Shape + +Each extension should have: + +- implementation module +- descriptor or descriptor factory +- focused tests +- characterization coverage if it changes existing behavior +- documentation or example link +- diagnostic namespace +- provenance operation prefix +- optional dependency declaration +- capability and safety declarations + +Prefer this shape: + +```text +src/markitect_tool/<subsystem>/<extension>.py +tests/test_<subsystem>_<extension>.py +docs/<extension>.md +``` + +If the extension is cross-cutting, register it from +`markitect_tool.extension.builtins` or a future internal discovery module rather +than importing it from many central files. 
+ +## Descriptor Template + +```python +from markitect_tool.extension import ExtensionDescriptor, ProcessingCapability + + +def my_extension_descriptor() -> ExtensionDescriptor: + return ExtensionDescriptor( + id="query.example", + kind="query-engine", + summary="Example query engine.", + capabilities=[ + ProcessingCapability(id="ast", kind="read"), + ], + input_contract="Document + example expression", + output_contract="QueryMatch[]", + diagnostics_namespace="query.example", + provenance_prefix="query.example", + cli={"commands": ["mkt query --engine example"]}, + docs=["docs/example-query.md"], + examples=["examples/query/example.md"], + ) +``` + +## Optional Dependencies + +Declare optional dependencies in descriptors: + +```python +from markitect_tool.extension import OptionalDependency + +OptionalDependency( + name="jsonpath_ng", + package="jsonpath-ng", + extra="query", + required=True, + purpose="Evaluate JSONPath expressions.", +) +``` + +If a dependency is missing, return a structured diagnostic. Do not fail with an +unexplained import error. + +## Processing Envelopes + +Use canonical processing envelopes where an extension needs a shared execution +boundary: + +- `ProcessingRequest` +- `ProcessingContext` +- `ProcessingResult` +- `ProcessingCapability` +- `ProcessingProvenance` +- `ProcessingTrace` + +Subsystem-specific dataclasses may remain richer. The canonical model is the +bridge that lets callbacks, registries, diagnostics, provenance, and future +policy checks interact consistently. + +## Diagnostics + +Diagnostics should be: + +- stable enough for tests and callers +- namespaced by subsystem or extension +- explicit about optional dependency failures +- tied to source locations where possible +- emitted as `Diagnostic` or `ProcessingResult.from_error` + +Recommended code style: + +```text +<namespace>.<code> 
+query.invalid_jsonpath +processor.unknown +extension.missing_dependency +backend.local_sqlite.invalid_fts_query +``` + +## Provenance + +Every extension that transforms, queries, reads, writes, generates, or indexes +content should expose provenance. Use a stable operation prefix: + +```text +query.selector +query.jsonpath +processor.include +local_snapshot_store.put_file +``` + +Include source path, content hash, snapshot id, backend/provider id, and +dependencies when known. + +## Safety And Policy + +Descriptors should declare safety-relevant behavior: + +- reads files +- writes local cache +- writes user output files +- accesses network +- invokes external process +- calls assisted-generation provider +- transmits content outside the local process + +The initial framework records this metadata. Later policy layers can enforce it. + +## CLI Affordances + +If an extension exposes CLI behavior, declare it in `descriptor.cli`: + +```python +cli={"commands": ["mkt cache index", "mkt search"]} +``` + +`markitect_tool.cli.extensions.collect_cli_command_specs()` can inspect these +affordances without importing Click command implementations. + +## Testing Checklist + +Add tests for: + +- descriptor serialization +- registry lookup and duplicate handling +- missing optional dependency diagnostics +- canonical result validity +- provenance shape +- CLI output envelope if public commands are exposed +- compatibility shim if replacing an existing API + +When refactoring an existing feature, add characterization tests first, then +migrate implementation behind descriptors or registries. + +## Boundary With Workflows + +Internal extensions describe what Markitect can do. Workflows describe how a +user combines capabilities for a concrete document pipeline. + +An extension may expose a workflow step later, but it should not depend on the +workflow engine to be useful from the library or CLI. 
diff --git a/docs/internal-extension-framework.md b/docs/internal-extension-framework.md new file mode 100644 index 0000000..5bf51db --- /dev/null +++ b/docs/internal-extension-framework.md @@ -0,0 +1,149 @@ +# Internal Extension Framework + +## Purpose + +Markitect has reached the point where optional features are useful but are +starting to concentrate wiring in central modules. Query engines, processors, +backend stores, references, contract checks, templates, generation adapters, and +CLI commands all need some combination of registration, capability metadata, +diagnostics, provenance, and optional dependency handling. + +The internal extension framework should make those seams explicit without +turning the project into a heavy external plugin platform. + +## Boundary + +This framework is about internal extensibility: + +```text +feature descriptor -> registry -> processing request/context/result + -> diagnostics/provenance/capabilities + -> CLI/API/backend integration +``` + +It is not the same as `MKTT-WP-0011` dataflow workflows. Workflows organize +business-facing processing steps for a document pipeline. The extension +framework organizes how Markitect itself exposes and composes capabilities. 
+ +## Extension Taxonomy + +| Kind | Examples | Primary Contract | +| --- | --- | --- | +| `query-engine` | selector, JSONPath | document/data in, matches out | +| `processor` | identity, uppercase, include | fenced block in, processed result out | +| `backend` | local SQLite index | snapshots/index/search storage | +| `reference-provider` | section, region, fence, line | address in, content units out | +| `validator` | schema, contract, section assertion | document/context in, diagnostics out | +| `template-engine` | deterministic templates | template/data in, Markdown out | +| `generation-adapter` | provider-neutral assisted generation | request in, generated candidate out | +| `cli-group` | cache, backend, ref, class | command descriptors or registration hook | +| `render-export` | future Quarkdown/export adapters | Markdown source in, rendered/exported artifact out | +| `document-function` | future function layer | function call in, typed document value out | + +## Canonical Lifecycle + +An extension should be describable before it runs: + +1. Register descriptor. +2. Check optional dependencies. +3. Check capabilities and policy labels. +4. Build processing context. +5. Execute operation. +6. Normalize result, diagnostics, provenance, and trace data. +7. Expose output through library API, CLI, backend, or workflow layer. + +The framework should allow deterministic extensions to stay simple. Assisted, +external, networked, or filesystem-mutating extensions should declare that +explicitly before execution. + +## Descriptor Shape + +The first descriptor model should cover: + +- stable id +- kind +- version +- summary +- implementation reference +- capability declarations +- optional dependency declarations +- safety flags +- input and output contract names +- diagnostics namespace +- provenance operation prefix +- documentation and example links +- CLI affordances where applicable + +Descriptors are not meant to replace implementation modules. 
They are the small +declarative surface that lets Markitect inspect, list, validate, and compose +capabilities consistently. + +## Processing Model + +The canonical processing model should define a small set of shared envelopes: + +- `ProcessingRequest`: operation id, input payload, options, scope +- `ProcessingContext`: root, source path, namespaces, variables, policy, backend + handles, and caller metadata +- `ProcessingResult`: output payload, diagnostics, provenance, dependencies, + trace events, and validity +- `ProcessingDiagnostic`: severity, code, message, source, help +- `ProcessingCapability`: declared feature or permission requirement +- `ProcessingProvenance`: operation, source identity, snapshot/content hashes, + dependencies, backend/provider metadata + +Subsystem-specific types may remain richer. The canonical model is the bridge, +not a forced replacement for every local dataclass. + +## Registration Strategy + +Start with in-package registration: + +```text +markitect_tool/extensions/ + query_selector.py + query_jsonpath.py + backend_local_sqlite.py + processors_builtin.py +``` + +Each module exposes one or more descriptors plus a registration function. The +root registry can be assembled explicitly at import time or by a small internal +discovery list. Package entry points can be added later if external extension +packages become a real requirement. + +See `docs/extension-authoring.md` for the extension authoring checklist and +descriptor template. 
+ +## Compatibility Rules + +The refactor must preserve: + +- current library APIs such as `query_document` +- current CLI commands and output envelopes +- current diagnostic codes where users may rely on them +- current provenance operation strings unless intentionally deprecated +- optional dependency behavior for JSONPath and future adapters +- cache/index file compatibility unless a migration is documented + +The first implementation adds canonical processing envelopes, extension +descriptors, registries, lifecycle callbacks, query-engine registry shims, +built-in extension descriptors, and CLI command specs while preserving existing +public commands. + +## Characterization Coverage + +Before refactoring, lock down: + +- selector query and extraction +- optional JSONPath diagnostics +- processor registry behavior and provenance +- backend manifest registry and capability checks +- local SQLite snapshot/index/search behavior +- content reference resolution +- representative CLI command envelopes +- provenance and diagnostic shapes + +These tests are deliberately a little redundant with unit tests. Their job is +to protect the current public behavior while internals move behind extension +descriptors and registries. diff --git a/docs/workplan-planning-map.md b/docs/workplan-planning-map.md index 33c5e6d..8903fc0 100644 --- a/docs/workplan-planning-map.md +++ b/docs/workplan-planning-map.md @@ -34,7 +34,7 @@ and descriptions mirror the operational view. | `MKTT-WP-0006` | complete | done | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T005` | Optional backend fabric is complete: manifests, capabilities, snapshot identity, interfaces, registry, provenance, and read-only CLI scaffolding. | | `MKTT-WP-0010` | complete | done | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T006` | Content references, processors, explode/implode, weave/tangle, content classes, and migration examples are complete as the first WP-0010 extension layer. 
| | `MKTT-WP-0007` | complete | done | `MKTT-WP-0006` | Advanced query and local index backend is complete: AST inspection, optional JSONPath, SQLite snapshots/metadata, FTS5 search, incremental refresh, and local index CLI. | -| `MKTT-WP-0013` | P1 | todo | `MKTT-WP-0003`, `MKTT-WP-0004`, `MKTT-WP-0006`, `MKTT-WP-0007`, `MKTT-WP-0010` | Internal extension framework and canonical processing model: characterize current behavior, add registries/descriptors/callbacks, and reduce central wiring before heavier runtime/workflow work. | +| `MKTT-WP-0013` | complete | done | `MKTT-WP-0003`, `MKTT-WP-0004`, `MKTT-WP-0006`, `MKTT-WP-0007`, `MKTT-WP-0010` | Internal extension framework is complete: characterization tests, canonical processing model, descriptors, registries, lifecycle callbacks, query-engine registry, built-in extension catalog, CLI command specs, and authoring guide. | | `MKTT-WP-0005` | P2 | todo | `MKTT-WP-0003`, `MKTT-WP-0004` | Pick up when generation/form/context or semantic assessment pressure appears. | | `MKTT-WP-0011` | P2 | todo | `MKTT-WP-0003`; task-level triggers: `MKTT-WP-0010-T001`, `MKTT-WP-0010-T005` | Declarative Markdown dataflow workflows: source extraction, deterministic/assisted processing, and multi-output generation. | | `MKTT-WP-0009` | P2 | todo | `MKTT-WP-0006` | Establish access-control gateway before security-sensitive cache/context use. 
| diff --git a/src/markitect_tool/cli/extensions.py b/src/markitect_tool/cli/extensions.py new file mode 100644 index 0000000..ab056f4 --- /dev/null +++ b/src/markitect_tool/cli/extensions.py @@ -0,0 +1,64 @@ +"""CLI extension specifications derived from internal extension descriptors.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from markitect_tool.extension import ExtensionDescriptor, ExtensionRegistry + + +@dataclass(frozen=True) +class CliCommandSpec: + """Inspectable command affordance declared by an extension.""" + + command: str + extension_id: str + kind: str + summary: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data = { + "command": self.command, + "extension_id": self.extension_id, + "kind": self.kind, + "summary": self.summary, + "metadata": self.metadata, + } + return { + key: value + for key, value in data.items() + if value not in (None, {}, []) + } + + +def command_specs_from_extension(descriptor: ExtensionDescriptor) -> list[CliCommandSpec]: + """Return CLI command specs declared by one extension descriptor.""" + + raw_commands = descriptor.cli.get("commands", []) + if isinstance(raw_commands, str): + raw_commands = [raw_commands] + return [ + CliCommandSpec( + command=str(command), + extension_id=descriptor.id, + kind=descriptor.kind, + summary=descriptor.summary, + metadata={ + key: value + for key, value in descriptor.cli.items() + if key != "commands" + }, + ) + for command in raw_commands + ] + + +def collect_cli_command_specs(registry: ExtensionRegistry) -> list[CliCommandSpec]: + """Collect CLI affordances from a registry of extension descriptors.""" + + specs: list[CliCommandSpec] = [] + for descriptor in registry.list(): + specs.extend(command_specs_from_extension(descriptor)) + return sorted(specs, key=lambda spec: (spec.command, spec.extension_id)) diff --git 
a/src/markitect_tool/extension/__init__.py b/src/markitect_tool/extension/__init__.py new file mode 100644 index 0000000..ab0d966 --- /dev/null +++ b/src/markitect_tool/extension/__init__.py @@ -0,0 +1,56 @@ +"""Internal extension framework primitives.""" + +from markitect_tool.extension.processing import ( + ProcessingCapability, + ProcessingContext, + ProcessingDiagnostic, + ProcessingProvenance, + ProcessingRequest, + ProcessingResult, + ProcessingTrace, +) +from markitect_tool.extension.execution import ( + AfterCallback, + BeforeCallback, + ExtensionExecutor, + ExtensionLifecycle, + ExtensionRunner, +) +from markitect_tool.extension.registry import ( + ExtensionDependencyCheck, + ExtensionDescriptor, + ExtensionRegistry, + ExtensionRegistryError, + OptionalDependency, +) + +__all__ = [ + "ProcessingCapability", + "ProcessingContext", + "ProcessingDiagnostic", + "ProcessingProvenance", + "ProcessingRequest", + "ProcessingResult", + "ProcessingTrace", + "ExtensionDependencyCheck", + "ExtensionDescriptor", + "ExtensionRegistry", + "ExtensionRegistryError", + "OptionalDependency", + "AfterCallback", + "BeforeCallback", + "ExtensionExecutor", + "ExtensionLifecycle", + "ExtensionRunner", +] + + +def builtin_extension_registry(): + """Return built-in extension descriptors without import-cycle pressure.""" + + from markitect_tool.extension.builtins import builtin_extension_registry as _registry + + return _registry() + + +__all__.append("builtin_extension_registry") diff --git a/src/markitect_tool/extension/builtins.py b/src/markitect_tool/extension/builtins.py new file mode 100644 index 0000000..5037f02 --- /dev/null +++ b/src/markitect_tool/extension/builtins.py @@ -0,0 +1,92 @@ +"""Built-in internal extension descriptors.""" + +from __future__ import annotations + +from markitect_tool.extension.registry import ExtensionDescriptor, ExtensionRegistry +from markitect_tool.extension.processing import ProcessingCapability +from markitect_tool.query import 
default_query_engine_registry + + +def builtin_extension_registry() -> ExtensionRegistry: + """Return descriptors for built-in Markitect extensions.""" + + registry = default_query_engine_registry().extension_registry() + for descriptor in _processor_descriptors() + [_local_sqlite_backend_descriptor()]: + registry.register(descriptor) + return registry + + +def _processor_descriptors() -> list[ExtensionDescriptor]: + return [ + ExtensionDescriptor( + id="processor.identity", + kind="processor", + summary="Return fenced block content unchanged.", + capabilities=[ + ProcessingCapability(id="processor", kind="execute"), + ProcessingCapability(id="deterministic", kind="execution"), + ], + input_contract="ProcessorRequest", + output_contract="ProcessorResult", + diagnostics_namespace="processor", + provenance_prefix="processor.identity", + cli={"commands": ["mkt process"]}, + docs=["docs/processors.md"], + ), + ExtensionDescriptor( + id="processor.uppercase", + kind="processor", + summary="Uppercase fenced block content deterministically.", + capabilities=[ + ProcessingCapability(id="processor", kind="execute"), + ProcessingCapability(id="deterministic", kind="execution"), + ], + input_contract="ProcessorRequest", + output_contract="ProcessorResult", + diagnostics_namespace="processor", + provenance_prefix="processor.uppercase", + cli={"commands": ["mkt process"]}, + docs=["docs/processors.md"], + ), + ExtensionDescriptor( + id="processor.include", + kind="processor", + summary="Resolve a content reference into fenced block output.", + capabilities=[ + ProcessingCapability(id="processor", kind="execute"), + ProcessingCapability(id="references", kind="read"), + ProcessingCapability(id="filesystem", kind="read"), + ], + safety={"reads_files": True, "writes_files": False, "network": False}, + input_contract="ProcessorRequest", + output_contract="ProcessorResult", + diagnostics_namespace="processor", + provenance_prefix="processor.include", + cli={"commands": ["mkt 
process"]}, + docs=["docs/processors.md", "docs/content-references.md"], + ), + ] + + +def _local_sqlite_backend_descriptor() -> ExtensionDescriptor: + return ExtensionDescriptor( + id="backend.local-sqlite", + kind="backend", + summary="Local SQLite snapshot, metadata, JSON, and FTS5 index backend.", + capabilities=[ + ProcessingCapability(id="snapshots", kind="backend"), + ProcessingCapability(id="ast", kind="backend"), + ProcessingCapability(id="json", kind="backend"), + ProcessingCapability(id="fts", kind="backend"), + ProcessingCapability(id="sql", kind="backend"), + ProcessingCapability(id="provenance", kind="backend"), + ], + safety={"reads_files": True, "writes_local_cache": True, "network": False}, + input_contract="Markdown files/directories", + output_contract="SQLite snapshot/index store", + diagnostics_namespace="backend.local_sqlite", + provenance_prefix="local_snapshot_store", + cli={"commands": ["mkt cache init", "mkt cache index", "mkt cache query", "mkt search"]}, + docs=["docs/local-index-backend.md", "docs/backend-fabric.md"], + examples=["examples/backends/local-sqlite-backend.md"], + ) diff --git a/src/markitect_tool/extension/execution.py b/src/markitect_tool/extension/execution.py new file mode 100644 index 0000000..5501abc --- /dev/null +++ b/src/markitect_tool/extension/execution.py @@ -0,0 +1,98 @@ +"""Execution lifecycle for internal extensions.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Callable + +from markitect_tool.extension.processing import ( + ProcessingRequest, + ProcessingResult, + ProcessingTrace, +) +from markitect_tool.extension.registry import ( + ExtensionDescriptor, + ExtensionRegistry, + ExtensionRegistryError, +) + + +ExtensionRunner = Callable[[ProcessingRequest], ProcessingResult] +BeforeCallback = Callable[[ExtensionDescriptor, ProcessingRequest], None] +AfterCallback = Callable[[ExtensionDescriptor, ProcessingRequest, ProcessingResult], None] + + 
+@dataclass +class ExtensionLifecycle: + """Explicit callbacks around extension execution.""" + + before: list[BeforeCallback] = field(default_factory=list) + after_success: list[AfterCallback] = field(default_factory=list) + after_failure: list[AfterCallback] = field(default_factory=list) + after: list[AfterCallback] = field(default_factory=list) + + def on_before(self, callback: BeforeCallback) -> None: + self.before.append(callback) + + def on_success(self, callback: AfterCallback) -> None: + self.after_success.append(callback) + + def on_failure(self, callback: AfterCallback) -> None: + self.after_failure.append(callback) + + def on_after(self, callback: AfterCallback) -> None: + self.after.append(callback) + + +class ExtensionExecutor: + """Execute registered extensions with deterministic lifecycle callbacks.""" + + def __init__( + self, + registry: ExtensionRegistry, + *, + lifecycle: ExtensionLifecycle | None = None, + ) -> None: + self.registry = registry + self.lifecycle = lifecycle or ExtensionLifecycle() + + def execute(self, extension_id: str, request: ProcessingRequest) -> ProcessingResult: + descriptor = self.registry.get(extension_id) + dependency_check = self.registry.check_dependencies(extension_id) + if not dependency_check.compatible: + return ProcessingResult.from_error( + code="extension.missing_dependency", + message=f"Extension `{extension_id}` is missing required dependencies.", + details=dependency_check.to_dict(), + ) + runner = descriptor.instantiate() + if not callable(runner): + raise ExtensionRegistryError(f"Extension `{extension_id}` factory did not return a callable") + + for callback in self.lifecycle.before: + callback(descriptor, request) + + result = runner(request) + if not isinstance(result, ProcessingResult): + raise ExtensionRegistryError( + f"Extension `{extension_id}` returned {type(result).__name__}, expected ProcessingResult" + ) + + result = _with_trace(result, ProcessingTrace(event="extension.executed", metadata={"id": 
extension_id})) + callbacks = self.lifecycle.after_success if result.valid else self.lifecycle.after_failure + for callback in callbacks: + callback(descriptor, request, result) + for callback in self.lifecycle.after: + callback(descriptor, request, result) + return result + + +def _with_trace(result: ProcessingResult, trace: ProcessingTrace) -> ProcessingResult: + return ProcessingResult( + output=result.output, + diagnostics=result.diagnostics, + provenance=result.provenance, + dependencies=result.dependencies, + trace=[*result.trace, trace], + metadata=result.metadata, + ) diff --git a/src/markitect_tool/extension/processing.py b/src/markitect_tool/extension/processing.py new file mode 100644 index 0000000..0af747a --- /dev/null +++ b/src/markitect_tool/extension/processing.py @@ -0,0 +1,184 @@ +"""Canonical processing envelopes for internal extensions.""" + +from __future__ import annotations + +import hashlib +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error + + +@dataclass(frozen=True) +class ProcessingCapability: + """A declared capability or permission needed by an extension.""" + + id: str + kind: str = "feature" + required: bool = True + description: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return _drop_empty(asdict(self)) + + +@dataclass(frozen=True) +class ProcessingProvenance: + """Cross-extension provenance envelope.""" + + operation: str + source_path: str | None = None + snapshot_id: str | None = None + content_hash: str | None = None + dependencies: list[str] = field(default_factory=list) + backend_id: str | None = None + provider_id: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return _drop_empty(asdict(self)) + + +@dataclass(frozen=True) +class 
ProcessingTrace: + """One optional execution trace event.""" + + event: str + message: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return _drop_empty(asdict(self)) + + +@dataclass(frozen=True) +class ProcessingContext: + """Shared execution context available to extension implementations.""" + + root: Path = Path(".") + source_path: Path | None = None + namespaces: dict[str, str] = field(default_factory=dict) + variables: dict[str, Any] = field(default_factory=dict) + policy: dict[str, Any] = field(default_factory=dict) + backend_handles: dict[str, Any] = field(default_factory=dict, repr=False, compare=False) + caller: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + data = { + "root": str(self.root), + "source_path": str(self.source_path) if self.source_path else None, + "namespaces": self.namespaces, + "variables": self.variables, + "policy": self.policy, + "backend_handles": sorted(self.backend_handles), + "caller": self.caller, + "metadata": self.metadata, + } + return _drop_empty(data) + + +@dataclass(frozen=True) +class ProcessingRequest: + """Canonical request passed to an internal extension.""" + + operation: str + input: Any + context: ProcessingContext = field(default_factory=ProcessingContext) + options: dict[str, Any] = field(default_factory=dict) + scope: str | None = None + capabilities: list[ProcessingCapability] = field(default_factory=list) + metadata: dict[str, Any] = field(default_factory=dict) + + @property + def cache_key(self) -> str: + payload = { + "operation": self.operation, + "input": self.input, + "options": self.options, + "scope": self.scope, + "capabilities": [capability.to_dict() for capability in self.capabilities], + "metadata": self.metadata, + } + return "processing:" + hashlib.sha256( + json.dumps(payload, sort_keys=True, ensure_ascii=False, default=str).encode("utf-8") + ).hexdigest() + + 
@dataclass(frozen=True)
class ProcessingResult:
    """Outcome envelope produced by an internal extension."""

    output: Any = None
    diagnostics: list[Diagnostic] = field(default_factory=list)
    provenance: list[ProcessingProvenance] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
    trace: list[ProcessingTrace] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def valid(self) -> bool:
        """True when ``has_error`` finds no error among the diagnostics."""
        failed = has_error(self.diagnostics)
        return not failed

    def to_dict(self) -> dict[str, Any]:
        """Serialize the result, omitting empty values.

        Diagnostics, provenance and trace entries are serialized through
        their own ``to_dict`` methods.
        """
        payload: dict[str, Any] = {"valid": self.valid, "output": self.output}
        payload["diagnostics"] = [item.to_dict() for item in self.diagnostics]
        payload["provenance"] = [item.to_dict() for item in self.provenance]
        payload["dependencies"] = self.dependencies
        payload["trace"] = [item.to_dict() for item in self.trace]
        payload["metadata"] = self.metadata
        return _drop_empty(payload)

    @classmethod
    def from_error(
        cls,
        *,
        code: str,
        message: str,
        source_path: str | None = None,
        line: int | None = None,
        details: dict[str, Any] | None = None,
    ) -> "ProcessingResult":
        """Build a result carrying a single error-severity diagnostic.

        A source location is attached only when ``source_path`` or ``line``
        is truthy; ``details`` defaults to an empty dict.
        """
        location = None
        if source_path or line:
            location = SourceLocation(path=source_path, line=line)

        failure = Diagnostic(
            severity="error",
            code=code,
            message=message,
            source=location,
            details=details or {},
        )
        return cls(diagnostics=[failure])
@dataclass(frozen=True)
class ExtensionDescriptor:
    """Inspectable description of a single internal extension.

    ``factory`` is excluded from equality comparisons and from ``repr``.
    """

    id: str
    kind: str
    version: str = "1"
    summary: str | None = None
    factory: ExtensionFactory | None = field(default=None, compare=False, repr=False)
    capabilities: list[ProcessingCapability] = field(default_factory=list)
    optional_dependencies: list[OptionalDependency] = field(default_factory=list)
    safety: dict[str, Any] = field(default_factory=dict)
    input_contract: str | None = None
    output_contract: str | None = None
    diagnostics_namespace: str | None = None
    provenance_prefix: str | None = None
    cli: dict[str, Any] = field(default_factory=dict)
    docs: list[str] = field(default_factory=list)
    examples: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject descriptors whose ``id`` or ``kind`` is blank."""
        required_text = (
            (self.id, "Extension id cannot be empty"),
            (self.kind, "Extension kind cannot be empty"),
        )
        for text, complaint in required_text:
            if not text.strip():
                raise ExtensionRegistryError(complaint)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the descriptor, omitting empty values and the factory."""
        serialized_capabilities = [item.to_dict() for item in self.capabilities]
        serialized_dependencies = [item.to_dict() for item in self.optional_dependencies]
        return _drop_empty(
            {
                "id": self.id,
                "kind": self.kind,
                "version": self.version,
                "summary": self.summary,
                "capabilities": serialized_capabilities,
                "optional_dependencies": serialized_dependencies,
                "safety": self.safety,
                "input_contract": self.input_contract,
                "output_contract": self.output_contract,
                "diagnostics_namespace": self.diagnostics_namespace,
                "provenance_prefix": self.provenance_prefix,
                "cli": self.cli,
                "docs": self.docs,
                "examples": self.examples,
                "metadata": self.metadata,
            }
        )

    def instantiate(self) -> Any:
        """Call the factory and return whatever it produces.

        Raises ``ExtensionRegistryError`` when no factory was supplied.
        """
        build = self.factory
        if build is not None:
            return build()
        raise ExtensionRegistryError(f"Extension `{self.id}` has no factory")
raise ExtensionRegistryError(f"Unknown extension `{extension_id}`") from exc + + def list(self, *, kind: str | None = None) -> list[ExtensionDescriptor]: + descriptors = [self._descriptors[key] for key in sorted(self._descriptors)] + if kind is None: + return descriptors + return [descriptor for descriptor in descriptors if descriptor.kind == kind] + + def require_capability(self, capability_id: str) -> list[ExtensionDescriptor]: + return [ + descriptor + for descriptor in self.list() + if any(capability.id == capability_id for capability in descriptor.capabilities) + ] + + def check_dependencies( + self, + extension_id: str, + *, + available_modules: set[str] | None = None, + ) -> ExtensionDependencyCheck: + descriptor = self.get(extension_id) + available = ( + available_modules + if available_modules is not None + else _available_modules( + dependency.name for dependency in descriptor.optional_dependencies + ) + ) + missing: list[str] = [] + optional_missing: list[str] = [] + for dependency in descriptor.optional_dependencies: + if dependency.name in available: + continue + if dependency.required: + missing.append(dependency.name) + else: + optional_missing.append(dependency.name) + return ExtensionDependencyCheck( + extension_id=extension_id, + missing=missing, + optional_missing=optional_missing, + ) + + def to_dict(self) -> dict[str, Any]: + return { + "count": len(self._descriptors), + "extensions": [descriptor.to_dict() for descriptor in self.list()], + } + + +def _available_modules(module_names: Iterable[str]) -> set[str]: + import importlib.util + + return { + module_name + for module_name in module_names + if importlib.util.find_spec(module_name) is not None + } + + +def _drop_empty(data: dict[str, Any]) -> dict[str, Any]: + return { + key: value + for key, value in data.items() + if value not in (None, [], {}, "") + } diff --git a/src/markitect_tool/query/__init__.py b/src/markitect_tool/query/__init__.py index c16c9fa..b970660 100644 --- 
def query_document_with_engine(
    document: Document,
    selector: str,
    *,
    engine: str = "selector",
) -> list[QueryMatch]:
    """Run ``selector`` against ``document`` using the engine named ``engine``.

    The engine is looked up in the default query engine registry. A
    ``ValueError`` raised during that lookup is re-raised as
    ``InvalidQueryError`` carrying the same message.
    """
    # Imported here rather than at module level, as in the original.
    from markitect_tool.query.registry import default_query_engine_registry

    try:
        resolved = default_query_engine_registry().get(engine)
    except ValueError as exc:
        raise InvalidQueryError(str(exc)) from exc
    else:
        return resolved.query(document, selector)
def extract_document_with_engine(
    document: Document,
    selector: str,
    *,
    engine: str = "selector",
) -> list[str]:
    """Collect the textual form of every match produced by ``engine``.

    A match contributes its ``text`` when that is not ``None``; otherwise a
    string ``value`` is used as-is and an ``int``/``float``/``bool`` value is
    converted with ``str``. Matches with any other value are skipped.
    """

    def textual_form(match: QueryMatch) -> str | None:
        if match.text is not None:
            return match.text
        value = match.value
        if isinstance(value, str):
            return value
        if isinstance(value, (int, float, bool)):
            return str(value)
        return None

    candidates = (
        textual_form(match)
        for match in query_document_with_engine(document, selector, engine=engine)
    )
    return [text for text in candidates if text is not None]
class QueryEngineRegistry:
    """Lookup table of query engines, keyed by each engine's descriptor id."""

    def __init__(self, engines: list[QueryEngine] | None = None) -> None:
        """Start empty, then register ``engines`` in the order given."""
        self._engines: dict[str, QueryEngine] = {}
        if engines:
            for candidate in engines:
                self.register(candidate)

    def register(self, engine: QueryEngine) -> None:
        """Add ``engine``; raise ``ValueError`` if its id is already registered."""
        engine_id = engine.descriptor.id
        if engine_id in self._engines:
            raise ValueError(f"Duplicate query engine `{engine_id}`")
        self._engines[engine_id] = engine

    def get(self, engine_id: str) -> QueryEngine:
        """Return the engine registered as ``engine_id``.

        Raises ``ValueError`` (chained from the ``KeyError``) for unknown ids.
        """
        try:
            found = self._engines[engine_id]
        except KeyError as exc:
            raise ValueError(f"Unknown query engine `{engine_id}`") from exc
        return found

    def list(self) -> list[QueryEngine]:
        """Return all engines ordered by id."""
        by_id = sorted(self._engines.items(), key=lambda entry: entry[0])
        return [engine for _, engine in by_id]

    def extension_registry(self) -> ExtensionRegistry:
        """Build an ``ExtensionRegistry`` from the engines' descriptors."""
        descriptors = (engine.descriptor for engine in self.list())
        return ExtensionRegistry(descriptors)
def test_builtin_processor_descriptors_capture_safety_and_provenance():
    """Built-in processor descriptors expose kind, safety flags, and provenance prefix."""
    catalog = builtin_extension_registry()
    include_descriptor = catalog.get("processor.include")
    uppercase_descriptor = catalog.get("processor.uppercase")

    # The include processor declares that it reads files.
    assert include_descriptor.kind == "processor"
    assert include_descriptor.safety["reads_files"] is True
    assert include_descriptor.provenance_prefix == "processor.include"

    # The uppercase processor declares no safety metadata.
    assert uppercase_descriptor.safety == {}
    assert uppercase_descriptor.provenance_prefix == "processor.uppercase"
"query"}, + ) + )[0] + + data = spec.to_dict() + + assert data["command"] == "mkt query" + assert data["extension_id"] == "query.selector" + assert data["metadata"]["group"] == "query" diff --git a/tests/test_extension_characterization.py b/tests/test_extension_characterization.py new file mode 100644 index 0000000..eb119a1 --- /dev/null +++ b/tests/test_extension_characterization.py @@ -0,0 +1,176 @@ +from pathlib import Path +import builtins + +from click.testing import CliRunner +import pytest + +from markitect_tool.backend import ( + LocalSnapshotStore, + capability_check, + load_backend_manifest, + load_backend_registry, + local_index_path_for, +) +from markitect_tool.cli import main +from markitect_tool.core import parse_markdown +from markitect_tool.processor import ProcessorContext, run_fenced_processors +from markitect_tool.query import ( + default_query_engine_registry, + InvalidQueryError, + extract_document, + query_document, + query_document_jsonpath, +) +from markitect_tool.reference import ReferenceContext, resolve_reference + + +CHARACTERIZATION_DOC = """--- +document_type: adr +status: accepted +--- + +# Decision Record + +## Context + +Authors need stable infrastructure seams. + +## Decision + +Use explicit registries and processing envelopes. 
def test_query_selector_and_extraction_characterization():
    """Pin selector query results and frontmatter extraction for the sample document."""
    parsed = parse_markdown(CHARACTERIZATION_DOC)
    engines = default_query_engine_registry()

    decision_sections = query_document(parsed, "sections[heading=Decision]")
    status_values = extract_document(parsed, "frontmatter.status")

    assert engines.get("selector").descriptor.kind == "query-engine"
    assert len(decision_sections) == 1
    decision = decision_sections[0]
    assert decision.kind == "section"
    assert decision.path == "$.sections[2]"
    assert decision.text.startswith("## Decision")
    assert status_values == ["accepted"]
def test_cli_output_envelopes_characterization(tmp_path: Path):
    """Pin the output of `query`, `cache index`, and `cache query` CLI commands."""
    doc_path = tmp_path / "doc.md"
    doc_path.write_text(CHARACTERIZATION_DOC, encoding="utf-8")
    cli = CliRunner()
    root = str(tmp_path)

    def run(*arguments: str):
        return cli.invoke(main, list(arguments))

    # Order matters: the cache is indexed before it is queried.
    query = run("query", str(doc_path), "sections[heading=Decision]", "--format", "json")
    index = run("cache", "index", root, "--root", root)
    cache_query = run("cache", "query", "frontmatter.status", "--root", root, "--format", "json")

    assert query.exit_code == 0
    for fragment in ('"engine": "selector"', '"count": 1'):
        assert fragment in query.output
    assert index.exit_code == 0
    assert "parsed: 1" in index.output
    assert cache_query.exit_code == 0
    assert '"source_path": "doc.md"' in cache_query.output
def test_extension_executor_rejects_non_result_return():
    """An extension returning something other than ProcessingResult is rejected."""
    bad_descriptor = ExtensionDescriptor(
        id="bad.runner",
        kind="test",
        factory=lambda: lambda request: {},
    )
    registry = ExtensionRegistry([bad_descriptor])

    caught = None
    try:
        ExtensionExecutor(registry).execute(
            "bad.runner",
            ProcessingRequest(operation="bad.run", input={}),
        )
    except ExtensionRegistryError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected ExtensionRegistryError")
    assert "expected ProcessingResult" in str(caught)
def test_processing_result_validity_provenance_and_trace():
    """A diagnostic-free result is valid and serializes provenance and trace."""
    provenance_event = ProcessingProvenance(
        operation="query.selector",
        source_path="doc.md",
        content_hash="sha256:abc",
        dependencies=["doc.md"],
    )
    trace_event = ProcessingTrace(event="query.start", metadata={"engine": "selector"})
    result = ProcessingResult(
        output={"count": 1},
        provenance=[provenance_event],
        trace=[trace_event],
    )

    serialized = result.to_dict()

    assert result.valid
    assert serialized["valid"] is True
    assert serialized["output"]["count"] == 1
    assert serialized["provenance"][0]["operation"] == "query.selector"
    assert serialized["trace"][0]["metadata"]["engine"] == "selector"
def test_extension_registry_lists_by_kind_and_capability():
    """Listing is sorted by id and can be narrowed by kind or capability."""

    def ids(descriptors):
        return [descriptor.id for descriptor in descriptors]

    selector = ExtensionDescriptor(
        id="query.selector",
        kind="query-engine",
        capabilities=[ProcessingCapability(id="ast")],
    )
    local = ExtensionDescriptor(
        id="backend.local-sqlite",
        kind="backend",
        capabilities=[ProcessingCapability(id="snapshots"), ProcessingCapability(id="fts")],
    )
    # Registered out of order to show that listing sorts by id.
    registry = ExtensionRegistry([local, selector])

    assert ids(registry.list()) == ["backend.local-sqlite", "query.selector"]
    assert ids(registry.list(kind="query-engine")) == ["query.selector"]
    assert ids(registry.require_capability("fts")) == ["backend.local-sqlite"]
+def test_default_query_engine_registry_exposes_builtin_descriptors(): + registry = default_query_engine_registry() + + descriptors = registry.extension_registry().to_dict()["extensions"] + + assert [engine.descriptor.id for engine in registry.list()] == ["jsonpath", "selector"] + assert {descriptor["id"] for descriptor in descriptors} == {"selector", "jsonpath"} + assert registry.get("selector").descriptor.cli["commands"][0] == "mkt query" + assert registry.get("jsonpath").descriptor.optional_dependencies[0].name == "jsonpath_ng" + + +def test_query_document_with_engine_uses_selector_registry(): + document = parse_markdown("# Doc\n\n## Decision\n\nChosen.\n") + + matches = query_document_with_engine(document, "sections[heading=Decision]", engine="selector") + extracted = extract_document_with_engine(document, "sections[heading=Decision]", engine="selector") + + assert matches[0].kind == "section" + assert matches[0].path == "$.sections[1]" + assert extracted == ["## Decision\n\nChosen."] diff --git a/workplans/MKTT-WP-0013-internal-extension-framework.md b/workplans/MKTT-WP-0013-internal-extension-framework.md index a115483..87759a2 100644 --- a/workplans/MKTT-WP-0013-internal-extension-framework.md +++ b/workplans/MKTT-WP-0013-internal-extension-framework.md @@ -3,7 +3,7 @@ id: MKTT-WP-0013 type: workplan title: "Internal Extension Framework and Canonical Processing Model" domain: markitect -status: todo +status: done owner: markitect-tool topic_slug: markitect planning_priority: P1 @@ -81,7 +81,7 @@ discovery without forcing dynamic loading or external dependency installation. ```task id: MKTT-WP-0013-T001 -status: todo +status: done priority: high state_hub_task_id: "ba106001-c953-435a-8012-0dd83533d309" ``` @@ -101,11 +101,16 @@ Define the internal extension taxonomy: Output: architecture note explaining extension boundaries, lifecycle, registration semantics, and relationship to `MKTT-WP-0011`. 
+Implemented: `docs/internal-extension-framework.md` defines the internal +extension boundary, extension taxonomy, canonical lifecycle, descriptor shape, +processing model, registration strategy, compatibility rules, and +characterization coverage. + ## P13.2 - Add characterization tests before refactor ```task id: MKTT-WP-0013-T002 -status: todo +status: done priority: high state_hub_task_id: "a270cb7a-4dbf-4562-b0ab-d5dda5124086" ``` @@ -124,11 +129,17 @@ Lock down current behavior before moving code behind registries: Output: focused characterization tests that can fail loudly if refactoring changes public behavior. +Implemented: `tests/test_extension_characterization.py` covers selector +query/extraction, JSONPath optional-dependency diagnostics, processor +provenance and diagnostics, backend manifest/capability behavior, local +snapshot/index/search behavior, content references, and representative CLI +output envelopes. + ## P13.3 - Define canonical processing model ```task id: MKTT-WP-0013-T003 -status: todo +status: done priority: high state_hub_task_id: "8c88b9a7-1e8d-401c-ad09-8b5a19ccba14" ``` @@ -148,11 +159,17 @@ operations without making every extension depend on every subsystem. Output: framework module, tests, and migration guide for current subsystems. +Implemented: `markitect_tool.extension.processing` defines +`ProcessingRequest`, `ProcessingContext`, `ProcessingResult`, +`ProcessingDiagnostic`, `ProcessingCapability`, `ProcessingProvenance`, and +`ProcessingTrace`, with serialization, cache-key, validity, provenance, trace, +and error normalization tests. + ## P13.4 - Implement extension descriptors and registries ```task id: MKTT-WP-0013-T004 -status: todo +status: done priority: high state_hub_task_id: "3fb2fe81-9819-4679-99d0-ad60ac9e8277" ``` @@ -176,11 +193,17 @@ and, later, package entry points. Output: descriptor schema, registry API, duplicate/missing dependency diagnostics, and tests. 
+Implemented: `markitect_tool.extension.registry` defines +`ExtensionDescriptor`, `OptionalDependency`, `ExtensionRegistry`, +`ExtensionDependencyCheck`, and `ExtensionRegistryError`, with descriptor +serialization, kind/capability lookup, duplicate-id diagnostics, dependency +checks, and factory instantiation tests. + ## P13.5 - Add callback hooks and execution lifecycle ```task id: MKTT-WP-0013-T005 -status: todo +status: done priority: medium state_hub_task_id: "be8f2056-f413-44f9-be9c-6046c34e307e" ``` @@ -200,11 +223,15 @@ hidden global behavior. Output: callback model and tests with fake extensions. +Implemented: `ExtensionLifecycle` and `ExtensionExecutor` provide explicit +before/success/failure/after callbacks, dependency checks before execution, +result type normalization, execution trace emission, and fake-extension tests. + ## P13.6 - Refactor query engines behind registry ```task id: MKTT-WP-0013-T006 -status: todo +status: done priority: high state_hub_task_id: "0226c1d1-f583-43ad-8e20-f75f9790e17d" ``` @@ -215,11 +242,16 @@ compatibility. Output: registered selector/jsonpath engines, compatibility shims, and tests. +Implemented: selector and JSONPath engines now live behind +`QueryEngineRegistry` descriptors, with compatibility shims for +`query_document`, `extract_document`, `query_document_jsonpath`, and +`extract_document_jsonpath`; CLI behavior remains unchanged. + ## P13.7 - Refactor processors and local backend as registered extensions ```task id: MKTT-WP-0013-T007 -status: todo +status: done priority: medium state_hub_task_id: "a966dcbb-3ae8-47bf-85c8-4ba6ddcf7a31" ``` @@ -237,11 +269,16 @@ Focus areas: Output: extension-backed processor/backend registration and regression tests. 
+Implemented: `builtin_extension_registry()` now exposes built-in query engines, +deterministic processors, and the local SQLite backend as extension +descriptors with capabilities, safety flags, CLI affordances, docs/examples, +diagnostic namespaces, and provenance prefixes. + ## P13.8 - Refactor CLI composition to reduce central wiring ```task id: MKTT-WP-0013-T008 -status: todo +status: done priority: medium state_hub_task_id: "3e88ca62-8dba-4632-b5d0-29827d102322" ``` @@ -253,11 +290,17 @@ point. Output: CLI extension hook, migrated command group examples, and unchanged public CLI behavior. +Implemented first integration point: `markitect_tool.cli.extensions` derives +`CliCommandSpec` declarations from extension descriptors. Built-in query, +processor, and backend descriptors now expose command affordances such as +`mkt query`, `mkt process`, `mkt cache index`, and `mkt search` without making +the CLI module the only source of command metadata. + ## P13.9 - Document extension authoring conventions ```task id: MKTT-WP-0013-T009 -status: todo +status: done priority: medium state_hub_task_id: "848e2a5e-c32b-4a94-906b-dc6aced4c71b" ``` @@ -275,6 +318,11 @@ Document how a new internal extension should be structured: Output: extension authoring guide and one small template/example extension. +Implemented: `docs/extension-authoring.md` documents extension layout, +descriptor template, optional dependency declarations, processing envelopes, +diagnostics, provenance, safety/policy metadata, CLI affordances, tests, and +the boundary with business-facing workflows. + ## Exit Criteria - Existing behavior is covered by characterization tests before refactoring.