From b1577d90dbcf99650bbaa9acf94aa21e37ca6b7f Mon Sep 17 00:00:00 2001 From: tegwick Date: Mon, 4 May 2026 03:25:26 +0200 Subject: [PATCH] Refresh planning layer for backend fabric --- docs/backend-fabric.md | 32 ++ docs/workplan-planning-map.md | 2 +- examples/backend-state/snapshot-state.yaml | 11 + src/markitect_tool/__init__.py | 12 + src/markitect_tool/backend/__init__.py | 14 + src/markitect_tool/backend/engine.py | 3 +- src/markitect_tool/backend/planning.py | 425 ++++++++++++++++++ src/markitect_tool/cli/main.py | 92 ++++ tests/test_backend_refresh_planning.py | 163 +++++++ ...-advanced-query-and-local-index-backend.md | 45 ++ 10 files changed, 797 insertions(+), 2 deletions(-) create mode 100644 examples/backend-state/snapshot-state.yaml create mode 100644 src/markitect_tool/backend/planning.py create mode 100644 tests/test_backend_refresh_planning.py diff --git a/docs/backend-fabric.md b/docs/backend-fabric.md index 7aa76f8..71e118e 100644 --- a/docs/backend-fabric.md +++ b/docs/backend-fabric.md @@ -75,6 +75,37 @@ The resulting `snapshot_id` is a stable hash over those identity fields. This lets future AST, JSONPath, FTS, SQL, vector, policy, and context-package backends invalidate derived data without guessing what changed. +## Refresh Planning + +Before WP-0007 writes a local SQLite index, the backend fabric provides a +read-only refresh planner. The planner compares current Markdown files with a +portable snapshot-state inventory and reports: + +- unchanged files +- files that need hashing +- files that need parsing +- files that need indexing +- files that only need metadata updates +- deleted sources +- dependency-invalidated dependents + +The planner uses a cheap-first strategy: + +1. Compare path, size, mtime, parser version, parse options hash, and contract + hash. +2. If cheap metadata is unchanged, skip hashing, parsing, and indexing. +3. If metadata changed, either mark the file for hash/parse/index or, with + `--verify-hashes`, hash only those changed candidates to avoid parsing when + content is unchanged. +4. Use dependency edges to invalidate direct and transitive dependents. + +This gives WP-0007 a performance contract before the storage engine exists. + +```bash +mkt backend refresh-plan docs --state examples/backend-state/snapshot-state.yaml +mkt backend refresh-plan docs --state .markitect/cache/snapshots.yaml --verify-hashes +``` + ## Provenance Envelope The shared backend provenance envelope records: @@ -113,6 +144,7 @@ Read-only inspection commands: mkt backend list --path examples/backends mkt backend inspect local-sqlite-cache --path examples/backends --require snapshots --require provenance mkt backend snapshot-id docs/content-references.md +mkt backend refresh-plan docs --state examples/backend-state/snapshot-state.yaml ``` The existing `mkt cache status` remains the lightweight file-manifest change diff --git a/docs/workplan-planning-map.md b/docs/workplan-planning-map.md index 805bc3c..ef13358 100644 --- a/docs/workplan-planning-map.md +++ b/docs/workplan-planning-map.md @@ -33,7 +33,7 @@ and descriptions mirror the operational view. | `MKTT-WP-0003` | complete | done | `MKTT-WP-0001`, `MKTT-WP-0002`, `MKTT-WP-0004` | Core toolkit implementation is complete. | | `MKTT-WP-0006` | complete | done | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T005` | Optional backend fabric is complete: manifests, capabilities, snapshot identity, interfaces, registry, provenance, and read-only CLI scaffolding. | | `MKTT-WP-0010` | complete | done | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T006` | Content references, processors, explode/implode, weave/tangle, content classes, and migration examples are complete as the first WP-0010 extension layer. | -| `MKTT-WP-0007` | P2 | todo | `MKTT-WP-0006` | First practical cache backend use case: AST/JSONPath/SQLite/FTS. | +| `MKTT-WP-0007` | P2 | todo | `MKTT-WP-0006` | First practical cache backend use case: AST/JSONPath/SQLite/FTS. Preliminary refresh planning is in place as the performance contract. | | `MKTT-WP-0005` | P2 | todo | `MKTT-WP-0003`, `MKTT-WP-0004` | Pick up when generation/form/context or semantic assessment pressure appears. | | `MKTT-WP-0011` | P2 | todo | `MKTT-WP-0003`; task-level triggers: `MKTT-WP-0010-T001`, `MKTT-WP-0010-T005` | Declarative Markdown dataflow workflows: source extraction, deterministic/assisted processing, and multi-output generation. | | `MKTT-WP-0009` | P2 | todo | `MKTT-WP-0006` | Establish access-control gateway before security-sensitive cache/context use. | diff --git a/examples/backend-state/snapshot-state.yaml b/examples/backend-state/snapshot-state.yaml new file mode 100644 index 0000000..76e4e73 --- /dev/null +++ b/examples/backend-state/snapshot-state.yaml @@ -0,0 +1,11 @@ +snapshots: + - path: docs/content-references.md + size: 0 + mtime_ns: 0 + content_hash: sha256:example + snapshot_id: snapshot:example + indexed: true + dependencies: + - source_id: snapshot:example + target: examples/references/standard/clauses.md + kind: reference diff --git a/src/markitect_tool/__init__.py b/src/markitect_tool/__init__.py index 5afffc2..04c23cb 100644 --- a/src/markitect_tool/__init__.py +++ b/src/markitect_tool/__init__.py @@ -43,15 +43,21 @@ from markitect_tool.backend import ( ContextPackageRegistry, DependencyEdge, DocumentSnapshot, + EMPTY_PARSE_OPTIONS_HASH, IndexBackend, ProcessorResultStore, ProvenanceEnvelope, QueryAdapter, + SnapshotPlanEntry, + SnapshotRefreshPlan, SnapshotBackend, SnapshotIdentity, + SnapshotState, capability_check, load_backend_manifest, load_backend_registry, + load_snapshot_state_file, + plan_snapshot_refresh, snapshot_identity_for_file, ) from markitect_tool.content_class import ( @@ -194,15 +200,21 @@ __all__ = [ "ContextPackageRegistry", "DependencyEdge", "DocumentSnapshot", + "EMPTY_PARSE_OPTIONS_HASH", "IndexBackend", "ProcessorResultStore", "ProvenanceEnvelope", "QueryAdapter", + "SnapshotPlanEntry", + "SnapshotRefreshPlan", "SnapshotBackend", "SnapshotIdentity", + "SnapshotState", "capability_check", "load_backend_manifest", "load_backend_registry", + "load_snapshot_state_file", + "plan_snapshot_refresh", "snapshot_identity_for_file", "ClassCompositionResult", "ContentClass", diff --git a/src/markitect_tool/backend/__init__.py b/src/markitect_tool/backend/__init__.py index ae1cd1c..8dae0e6 100644 --- a/src/markitect_tool/backend/__init__.py +++ b/src/markitect_tool/backend/__init__.py @@ -9,6 +9,7 @@ from markitect_tool.backend.engine import ( BackendRegistryError, DependencyEdge, DocumentSnapshot, + EMPTY_PARSE_OPTIONS_HASH, ProvenanceEnvelope, SnapshotIdentity, capability_check, @@ -16,6 +17,13 @@ from markitect_tool.backend.engine import ( load_backend_registry, snapshot_identity_for_file, ) +from markitect_tool.backend.planning import ( + SnapshotPlanEntry, + SnapshotRefreshPlan, + SnapshotState, + load_snapshot_state_file, + plan_snapshot_refresh, +) from markitect_tool.backend.interfaces import ( AccessPolicyGateway, ContextPackageRegistry, @@ -34,12 +42,18 @@ __all__ = [ "BackendRegistryError", "DependencyEdge", "DocumentSnapshot", + "EMPTY_PARSE_OPTIONS_HASH", "ProvenanceEnvelope", "SnapshotIdentity", "capability_check", "load_backend_manifest", "load_backend_registry", "snapshot_identity_for_file", + "SnapshotPlanEntry", + "SnapshotRefreshPlan", + "SnapshotState", + "load_snapshot_state_file", + "plan_snapshot_refresh", "AccessPolicyGateway", "ContextPackageRegistry", "IndexBackend", diff --git a/src/markitect_tool/backend/engine.py b/src/markitect_tool/backend/engine.py index dc142dc..cb96436 100644 --- a/src/markitect_tool/backend/engine.py +++ b/src/markitect_tool/backend/engine.py @@ -32,6 +32,7 @@ BACKEND_CAPABILITIES = { DEFAULT_BACKEND_PATHS = (".markitect/backends", ".markitect/backend.yaml") PARSER_ID = "markdown-it-py/commonmark" PARSER_VERSION = "markitect-tool:1" +EMPTY_PARSE_OPTIONS_HASH = "sha256:44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" class BackendRegistryError(ValueError): @@ -103,7 +104,7 @@ class SnapshotIdentity: content_hash: str parser: str = PARSER_ID parser_version: str = PARSER_VERSION - parse_options_hash: str = "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + parse_options_hash: str = EMPTY_PARSE_OPTIONS_HASH contract_hash: str | None = None @property diff --git a/src/markitect_tool/backend/planning.py b/src/markitect_tool/backend/planning.py new file mode 100644 index 0000000..a71d9c7 --- /dev/null +++ b/src/markitect_tool/backend/planning.py @@ -0,0 +1,425 @@ +"""Refresh planning for optional snapshot and index backends.""" + +from __future__ import annotations + +import hashlib +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +import yaml + +from markitect_tool.backend.engine import ( + DependencyEdge, + EMPTY_PARSE_OPTIONS_HASH, + PARSER_ID, + PARSER_VERSION, +) +from markitect_tool.cache import scan_markdown_files + + +@dataclass(frozen=True) +class SnapshotState: + """Previously known source state from a snapshot/index backend.""" + + path: str + size: int + mtime_ns: int + content_hash: str + snapshot_id: str + parser: str = PARSER_ID + parser_version: str = PARSER_VERSION + parse_options_hash: str = EMPTY_PARSE_OPTIONS_HASH + contract_hash: str | None = None + indexed: bool = True + dependencies: list[DependencyEdge] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + data = asdict(self) + data["dependencies"] = [edge.to_dict() for edge in self.dependencies] + return {key: value for key, value in data.items() if value is not None} + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SnapshotState": + return cls( + path=str(data["path"]), + size=int(data["size"]), + mtime_ns=int(data["mtime_ns"]), + content_hash=str(data["content_hash"]), + snapshot_id=str(data["snapshot_id"]), + parser=str(data.get("parser", PARSER_ID)), + parser_version=str(data.get("parser_version", PARSER_VERSION)), + parse_options_hash=str( + data.get( + "parse_options_hash", + EMPTY_PARSE_OPTIONS_HASH, + ) + ), + contract_hash=str(data["contract_hash"]) if data.get("contract_hash") is not None else None, + indexed=bool(data.get("indexed", True)), + dependencies=[ + _dependency_edge_from_dict(edge) + for edge in data.get("dependencies", []) + if isinstance(edge, dict) + ], + ) + + +@dataclass(frozen=True) +class SnapshotPlanEntry: + """One source-path decision in a refresh plan.""" + + path: str + actions: list[str] + reason: str + size: int | None = None + mtime_ns: int | None = None + previous_snapshot_id: str | None = None + content_hash: str | None = None + invalidated_by: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return {key: value for key, value in asdict(self).items() if value not in (None, [], {})} + + +@dataclass(frozen=True) +class SnapshotRefreshPlan: + """A cheap-first plan for refreshing snapshots and derived indexes.""" + + root: str + parser: str + parser_version: str + parse_options_hash: str + contract_hash: str | None + verify_hashes: bool + entries: list[SnapshotPlanEntry] + + @property + def unchanged(self) -> list[str]: + return _paths_without_actions(self.entries) + + @property + def needs_hash(self) -> list[str]: + return _paths_with_action(self.entries, "hash") + + @property + def needs_parse(self) -> list[str]: + return _paths_with_action(self.entries, "parse") + + @property + def needs_index(self) -> list[str]: + return _paths_with_action(self.entries, "index") + + @property + def needs_metadata_update(self) -> list[str]: + return _paths_with_action(self.entries, "metadata") + + @property + def deleted(self) -> list[str]: + return _paths_with_action(self.entries, "delete") + + @property + def invalidated(self) -> list[str]: + return sorted(entry.path for entry in self.entries if "invalidate" in entry.actions) + + @property + def dirty(self) -> bool: + return any(entry.actions for entry in self.entries) + + def to_dict(self) -> dict[str, Any]: + return { + "dirty": self.dirty, + "root": self.root, + "parser": self.parser, + "parser_version": self.parser_version, + "parse_options_hash": self.parse_options_hash, + "contract_hash": self.contract_hash, + "verify_hashes": self.verify_hashes, + "counts": { + "unchanged": len(self.unchanged), + "needs_hash": len(self.needs_hash), + "needs_parse": len(self.needs_parse), + "needs_index": len(self.needs_index), + "needs_metadata_update": len(self.needs_metadata_update), + "deleted": len(self.deleted), + "invalidated": len(self.invalidated), + }, + "unchanged": self.unchanged, + "needs_hash": self.needs_hash, + "needs_parse": self.needs_parse, + "needs_index": self.needs_index, + "needs_metadata_update": self.needs_metadata_update, + "deleted": self.deleted, + "invalidated": self.invalidated, + "entries": [entry.to_dict() for entry in self.entries], + } + + +def plan_snapshot_refresh( + paths: list[str | Path], + *, + previous: list[SnapshotState] | dict[str, SnapshotState] | None = None, + root: str | Path = ".", + recursive: bool = True, + parse_options: dict[str, Any] | None = None, + contract_hash: str | None = None, + verify_hashes: bool = False, +) -> SnapshotRefreshPlan: + """Plan snapshot/index refresh work using cheap metadata before hashing. + + When ``verify_hashes`` is false, files with changed size/mtime are marked + for hash, parse, and index. When true, the planner hashes only those + metadata-changed files so it can avoid parsing when content is unchanged. + """ + + root_path = Path(root).resolve() + previous_by_path = _previous_by_path(previous) + parse_options_hash = _hash_mapping(parse_options or {}) + current_files = { + _relative(path, root_path): path + for path in scan_markdown_files(paths, recursive=recursive) + } + entries: list[SnapshotPlanEntry] = [] + changed_or_deleted: set[str] = set() + + for relative_path, file_path in sorted(current_files.items()): + stat = file_path.stat() + known = previous_by_path.get(relative_path) + if known is None: + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=["hash", "parse", "index"], + reason="new_file", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + ) + ) + changed_or_deleted.add(relative_path) + continue + + identity_changed = ( + known.parser != PARSER_ID + or known.parser_version != PARSER_VERSION + or known.parse_options_hash != parse_options_hash + or known.contract_hash != contract_hash + ) + if identity_changed: + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=["hash", "parse", "index"], + reason="snapshot_identity_parameters_changed", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + previous_snapshot_id=known.snapshot_id, + ) + ) + changed_or_deleted.add(relative_path) + continue + + metadata_same = known.size == stat.st_size and known.mtime_ns == stat.st_mtime_ns + if metadata_same: + actions = [] if known.indexed else ["index"] + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=actions, + reason="unchanged" if not actions else "snapshot_not_indexed", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + previous_snapshot_id=known.snapshot_id, + content_hash=known.content_hash, + ) + ) + continue + + if not verify_hashes: + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=["hash", "parse", "index"], + reason="file_metadata_changed", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + previous_snapshot_id=known.snapshot_id, + ) + ) + changed_or_deleted.add(relative_path) + continue + + current_hash = _hash_file(file_path) + if current_hash == known.content_hash: + actions = ["hash", "metadata"] if known.indexed else ["hash", "metadata", "index"] + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=actions, + reason="file_metadata_changed_content_same", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + previous_snapshot_id=known.snapshot_id, + content_hash=current_hash, + ) + ) + else: + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=["hash", "parse", "index"], + reason="content_hash_changed", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + previous_snapshot_id=known.snapshot_id, + content_hash=current_hash, + ) + ) + changed_or_deleted.add(relative_path) + + for relative_path, known in sorted(previous_by_path.items()): + if relative_path in current_files: + continue + entries.append( + SnapshotPlanEntry( + path=relative_path, + actions=["delete"], + reason="source_missing", + previous_snapshot_id=known.snapshot_id, + content_hash=known.content_hash, + ) + ) + changed_or_deleted.add(relative_path) + + invalidated = _transitive_dependents(changed_or_deleted, previous_by_path) + if invalidated: + entries = _apply_invalidations(entries, invalidated, changed_or_deleted) + + return SnapshotRefreshPlan( + root=str(root_path), + parser=PARSER_ID, + parser_version=PARSER_VERSION, + parse_options_hash=parse_options_hash, + contract_hash=contract_hash, + verify_hashes=verify_hashes, + entries=sorted(entries, key=lambda entry: entry.path), + ) + + +def load_snapshot_state_file(path: str | Path) -> list[SnapshotState]: + """Load a portable snapshot-state fixture from JSON or YAML.""" + + state_path = Path(path) + data = yaml.safe_load(state_path.read_text(encoding="utf-8")) or {} + raw_snapshots = data.get("snapshots", data.get("states", data)) + if isinstance(raw_snapshots, dict): + raw_snapshots = list(raw_snapshots.values()) + if not isinstance(raw_snapshots, list): + raise ValueError("Snapshot state file must contain a `snapshots` list") + return [ + SnapshotState.from_dict(item) + for item in raw_snapshots + if isinstance(item, dict) + ] + + +def _previous_by_path( + previous: list[SnapshotState] | dict[str, SnapshotState] | None, +) -> dict[str, SnapshotState]: + if previous is None: + return {} + if isinstance(previous, dict): + return dict(previous) + return {state.path: state for state in previous} + + +def _dependency_edge_from_dict(data: dict[str, Any]) -> DependencyEdge: + return DependencyEdge( + source_id=str(data["source_id"]), + target=str(data["target"]), + kind=str(data["kind"]), + target_snapshot_id=str(data["target_snapshot_id"]) if data.get("target_snapshot_id") else None, + metadata=dict(data.get("metadata") or {}), + ) + + +def _transitive_dependents( + changed_paths: set[str], + previous_by_path: dict[str, SnapshotState], +) -> dict[str, list[str]]: + reverse: dict[str, set[str]] = {} + for state in previous_by_path.values(): + for edge in state.dependencies: + reverse.setdefault(edge.target, set()).add(state.path) + if edge.target_snapshot_id: + reverse.setdefault(edge.target_snapshot_id, set()).add(state.path) + + invalidates: dict[str, list[str]] = {} + queue = list(changed_paths) + visited = set(changed_paths) + while queue: + changed = queue.pop(0) + dependents = sorted(reverse.get(changed, set())) + if dependents: + invalidates[changed] = dependents + for dependent in dependents: + if dependent in visited: + continue + visited.add(dependent) + queue.append(dependent) + return invalidates + + +def _apply_invalidations( + entries: list[SnapshotPlanEntry], + invalidates: dict[str, list[str]], + changed_or_deleted: set[str], +) -> list[SnapshotPlanEntry]: + dependents_by_path: dict[str, list[str]] = {} + for changed_path, dependents in invalidates.items(): + for dependent in dependents: + dependents_by_path.setdefault(dependent, []).append(changed_path) + + existing = {entry.path: entry for entry in entries} + for dependent, causes in dependents_by_path.items(): + if dependent in changed_or_deleted: + continue + entry = existing.get(dependent) + actions = sorted(set((entry.actions if entry else []) + ["invalidate"])) + reason = "dependency_changed" if entry is None or entry.reason == "unchanged" else entry.reason + existing[dependent] = SnapshotPlanEntry( + path=dependent, + actions=actions, + reason=reason, + size=entry.size if entry else None, + mtime_ns=entry.mtime_ns if entry else None, + previous_snapshot_id=entry.previous_snapshot_id if entry else None, + content_hash=entry.content_hash if entry else None, + invalidated_by=sorted(set(causes)), + ) + return list(existing.values()) + + +def _paths_with_action(entries: list[SnapshotPlanEntry], action: str) -> list[str]: + return sorted(entry.path for entry in entries if action in entry.actions) + + +def _paths_without_actions(entries: list[SnapshotPlanEntry]) -> list[str]: + return sorted(entry.path for entry in entries if not entry.actions) + + +def _relative(path: Path, root: Path) -> str: + resolved = path.resolve() + try: + return resolved.relative_to(root).as_posix() + except ValueError: + return resolved.as_posix() + + +def _hash_file(path: Path) -> str: + return "sha256:" + hashlib.sha256(path.read_bytes()).hexdigest() + + +def _hash_mapping(mapping: dict[str, Any]) -> str: + payload = json.dumps(mapping, sort_keys=True, ensure_ascii=False) + return "sha256:" + hashlib.sha256(payload.encode("utf-8")).hexdigest() diff --git a/src/markitect_tool/cli/main.py b/src/markitect_tool/cli/main.py index 6762892..f0ec269 100644 --- a/src/markitect_tool/cli/main.py +++ b/src/markitect_tool/cli/main.py @@ -19,6 +19,8 @@ from markitect_tool.cache import ( from markitect_tool.backend import ( BackendRegistryError, load_backend_registry, + load_snapshot_state_file, + plan_snapshot_refresh, snapshot_identity_for_file, ) from markitect_tool.content_class import ( @@ -581,6 +583,71 @@ def backend_snapshot_id( _emit_snapshot_identity(data, output_format) +@backend.command("refresh-plan") +@click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)) +@click.option( + "--root", + type=click.Path(exists=True, file_okay=False, path_type=Path), + default=Path("."), + show_default=True, + help="Root used for relative source paths.", +) +@click.option( + "--state", + "state_file", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="YAML/JSON snapshot state file from a previous backend run.", +) +@click.option("--no-recursive", is_flag=True, help="Do not recurse into directories.") +@click.option( + "--verify-hashes", + is_flag=True, + help="Hash metadata-changed files to avoid unnecessary parse/index work.", +) +@click.option( + "--parse-option", + "parse_options", + multiple=True, + metavar="KEY=VALUE", + help="Parse option included in the identity comparison.", +) +@click.option("--contract-hash", help="Optional contract hash included in identity comparison.") +@click.option( + "--format", + "output_format", + type=click.Choice(["json", "yaml", "text"], case_sensitive=False), + default="text", + show_default=True, +) +def backend_refresh_plan( + paths: tuple[Path, ...], + root: Path, + state_file: Path | None, + no_recursive: bool, + verify_hashes: bool, + parse_options: tuple[str, ...], + contract_hash: str | None, + output_format: str, +) -> None: + """Plan cheap-first snapshot and index refresh work.""" + + try: + previous = load_snapshot_state_file(state_file) if state_file else [] + plan = plan_snapshot_refresh( + list(paths), + previous=previous, + root=root, + recursive=not no_recursive, + parse_options=_parse_key_value_options(parse_options), + contract_hash=contract_hash, + verify_hashes=verify_hashes, + ) + except (ValueError, TypeError) as exc: + raise click.ClickException(str(exc)) from exc + _emit_refresh_plan(plan.to_dict(), output_format) + raise click.exceptions.Exit(1 if plan.dirty else 0) + + @main.group("class") def class_group() -> None: """Resolve deterministic content classes.""" @@ -1238,6 +1305,31 @@ def _emit_snapshot_identity(data: dict, output_format: str) -> None: click.echo(f"parser: {data['parser']} {data['parser_version']}") +def _emit_refresh_plan(data: dict, output_format: str) -> None: + if output_format == "json": + click.echo(json.dumps(data, indent=2, ensure_ascii=False)) + elif output_format == "yaml": + click.echo(yaml.safe_dump(data, sort_keys=False)) + else: + click.echo("dirty" if data["dirty"] else "clean") + counts = data["counts"] + for key in [ + "unchanged", + "needs_hash", + "needs_parse", + "needs_index", + "needs_metadata_update", + "deleted", + "invalidated", + ]: + click.echo(f"{key}: {counts[key]}") + for entry in data["entries"]: + actions = ",".join(entry.get("actions", [])) or "none" + click.echo(f"- {entry['path']}: {actions} ({entry['reason']})") + if entry.get("invalidated_by"): + click.echo(f" invalidated_by: {', '.join(entry['invalidated_by'])}") + + def _emit_content_class_result(data: dict, output_format: str) -> None: if output_format == "json": click.echo(json.dumps(data, indent=2, ensure_ascii=False)) diff --git a/tests/test_backend_refresh_planning.py b/tests/test_backend_refresh_planning.py new file mode 100644 index 0000000..65a7ab2 --- /dev/null +++ b/tests/test_backend_refresh_planning.py @@ -0,0 +1,163 @@ +import os +from pathlib import Path + +from click.testing import CliRunner + +from markitect_tool.backend import ( + DependencyEdge, + SnapshotState, + load_snapshot_state_file, + plan_snapshot_refresh, +) +from markitect_tool.cli import main + + +def test_refresh_plan_marks_all_files_new_without_previous_state(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + + plan = plan_snapshot_refresh([tmp_path], root=tmp_path) + + assert plan.dirty + assert plan.needs_hash == ["doc.md"] + assert plan.needs_parse == ["doc.md"] + assert plan.needs_index == ["doc.md"] + + +def test_refresh_plan_uses_cheap_metadata_for_unchanged_file(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + stat = source.stat() + previous = SnapshotState( + path="doc.md", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + content_hash="sha256:known", + snapshot_id="snapshot:known", + ) + + plan = plan_snapshot_refresh([tmp_path], previous=[previous], root=tmp_path) + + assert not plan.dirty + assert plan.unchanged == ["doc.md"] + assert plan.needs_hash == [] + + +def test_refresh_plan_can_hash_metadata_changed_file_and_skip_parse_if_content_same(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + stat = source.stat() + content_hash = _hash_file(source) + previous = SnapshotState( + path="doc.md", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + content_hash=content_hash, + snapshot_id="snapshot:known", + ) + os.utime(source, ns=(stat.st_atime_ns + 1_000_000_000, stat.st_mtime_ns + 1_000_000_000)) + + plan = plan_snapshot_refresh( + [tmp_path], + previous=[previous], + root=tmp_path, + verify_hashes=True, + ) + + assert plan.needs_hash == ["doc.md"] + assert plan.needs_metadata_update == ["doc.md"] + assert plan.needs_parse == [] + assert plan.needs_index == [] + + +def test_refresh_plan_invalidates_transitive_dependents(tmp_path: Path): + source = tmp_path / "source.md" + dependent = tmp_path / "dependent.md" + transitive = tmp_path / "transitive.md" + source.write_text("# Source changed\n", encoding="utf-8") + dependent.write_text("# Dependent\n", encoding="utf-8") + transitive.write_text("# Transitive\n", encoding="utf-8") + + source_stat = source.stat() + dependent_stat = dependent.stat() + transitive_stat = transitive.stat() + previous = [ + SnapshotState( + path="source.md", + size=1, + mtime_ns=1, + content_hash="sha256:old", + snapshot_id="snapshot:source", + ), + SnapshotState( + path="dependent.md", + size=dependent_stat.st_size, + mtime_ns=dependent_stat.st_mtime_ns, + content_hash=_hash_file(dependent), + snapshot_id="snapshot:dependent", + dependencies=[ + DependencyEdge(source_id="snapshot:dependent", target="source.md", kind="reference") + ], + ), + SnapshotState( + path="transitive.md", + size=transitive_stat.st_size, + mtime_ns=transitive_stat.st_mtime_ns, + content_hash=_hash_file(transitive), + snapshot_id="snapshot:transitive", + dependencies=[ + DependencyEdge(source_id="snapshot:transitive", target="dependent.md", kind="reference") + ], + ), + ] + + plan = plan_snapshot_refresh([tmp_path], previous=previous, root=tmp_path) + + assert plan.needs_parse == ["source.md"] + assert plan.invalidated == ["dependent.md", "transitive.md"] + entries = {entry.path: entry for entry in plan.entries} + assert entries["dependent.md"].invalidated_by == ["source.md"] + assert entries["transitive.md"].invalidated_by == ["dependent.md"] + assert source_stat.st_size != 1 + + +def test_snapshot_state_file_and_cli_refresh_plan(tmp_path: Path): + source = tmp_path / "doc.md" + state_file = tmp_path / "state.yaml" + source.write_text("# Doc\n", encoding="utf-8") + stat = source.stat() + state_file.write_text( + f"""snapshots: + - path: doc.md + size: {stat.st_size} + mtime_ns: {stat.st_mtime_ns} + content_hash: {_hash_file(source)} + snapshot_id: snapshot:known +""", + encoding="utf-8", + ) + + states = load_snapshot_state_file(state_file) + result = CliRunner().invoke( + main, + [ + "backend", + "refresh-plan", + str(tmp_path), + "--root", + str(tmp_path), + "--state", + str(state_file), + ], + ) + + assert states[0].path == "doc.md" + assert result.exit_code == 0 + assert "clean" in result.output + assert "unchanged: 1" in result.output + + +def _hash_file(path: Path) -> str: + import hashlib + + return "sha256:" + hashlib.sha256(path.read_bytes()).hexdigest() diff --git a/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md b/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md index 9bb7e78..dddaea0 100644 --- a/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md +++ b/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md @@ -28,6 +28,25 @@ This backend should later be able to index `MKTT-WP-0010` references, named regions, chunks, and processor provenance without changing its basic storage contract. +## Preliminary Refinement - Snapshot Refresh Planning + +Implemented before starting the SQLite/index tasks: `SnapshotState`, +`SnapshotPlanEntry`, `SnapshotRefreshPlan`, `plan_snapshot_refresh`, +`load_snapshot_state_file`, and CLI `mkt backend refresh-plan`. + +This is the performance contract for WP-0007: + +- compare cheap metadata before hashing +- hash only likely-changed files when `--verify-hashes` is requested +- parse only files whose identity/content requires a new snapshot +- index only new, changed, unindexed, or dependency-invalidated entries +- carry direct and transitive dependency invalidation forward from + `DependencyEdge` +- keep refresh planning inspectable through JSON/YAML/text output + +The future SQLite store should persist enough state to feed this planner +directly and should report actual refresh work against the same categories. + ## P7.1 - Implement local snapshot store ```task @@ -40,6 +59,14 @@ state_hub_task_id: "8894a9a4-586c-457b-b4e6-add8276ff5f2" Persist parsed document snapshots and source metadata in a local cache directory. +Implementation hints: + +- Persist `SnapshotState` fields in the snapshot/source tables. +- Store path, size, mtime, content hash, parser id/version, parse options hash, + contract hash, snapshot id, indexed flag, and dependency edges. +- Keep large document/token JSON lazy-loadable so refresh planning does not + pull whole AST payloads into memory. + ## P7.2 - Add AST introspection commands ```task @@ -86,6 +113,14 @@ and metrics in SQLite. Keep schema extension points for reference edges, named regions, chunks, and processor outputs. +Implementation hints: + +- Use narrow metadata tables for hot refresh decisions. +- Store document/token JSON separately from searchable section/block rows. +- Add indexes on path, content hash, snapshot id, parser version, and unit ids. +- Preserve source spans and content-unit ids from WP-0010 reference/literate + layers. + ## P7.5 - Add FTS5 section/block search ```task @@ -111,6 +146,16 @@ Refresh only changed files based on content hash and parser version. Include dependency invalidation hooks for future transclusion/reference graphs. +Implementation hints: + +- Drive incremental refresh from `SnapshotRefreshPlan`. +- The first pass should use cheap metadata; only hash metadata-changed files. +- With `--verify-hashes`, skip parse/index when content is unchanged and only + update metadata. +- Use reverse dependency edges for direct and transitive invalidation. +- Report planned vs actual counts for hash, parse, index, metadata update, + delete, and invalidation work. + ## P7.7 - Add local index CLI ```task