From 8203f50fd5f266b0d809d63aac15b2f0ebc8bd9c Mon Sep 17 00:00:00 2001 From: tegwick Date: Mon, 4 May 2026 01:35:32 +0200 Subject: [PATCH] Lightweight caching and incremental processing --- docs/cache-incremental.md | 73 ++++++ docs/workplan-planning-map.md | 2 +- src/markitect_tool/__init__.py | 22 ++ src/markitect_tool/cache/__init__.py | 27 ++ src/markitect_tool/cache/engine.py | 230 ++++++++++++++++++ src/markitect_tool/cli/main.py | 141 +++++++++++ tests/test_cache_incremental.py | 110 +++++++++ ...KTT-WP-0003-core-toolkit-implementation.md | 10 +- 8 files changed, 612 insertions(+), 3 deletions(-) create mode 100644 docs/cache-incremental.md create mode 100644 src/markitect_tool/cache/__init__.py create mode 100644 src/markitect_tool/cache/engine.py create mode 100644 tests/test_cache_incremental.py diff --git a/docs/cache-incremental.md b/docs/cache-incremental.md new file mode 100644 index 0000000..314853e --- /dev/null +++ b/docs/cache-incremental.md @@ -0,0 +1,73 @@ +# Lightweight Cache and Incremental Processing + +`markitect-tool` includes a small file-backed cache manifest for the core CLI. +It records Markdown file fingerprints and parse summaries so workflows can +detect which inputs changed before rerunning expensive operations. + +This is intentionally not the future SQLite/AST/query backend. That richer +backend is tracked in `MKTT-WP-0006` and `MKTT-WP-0007`. + +## Commands + +Fingerprint one file: + +```bash +mkt cache fingerprint docs/example.md +``` + +Build or refresh a manifest: + +```bash +mkt cache build docs/ --root . +``` + +The default manifest path is: + +```text +.markitect/cache/manifest.json +``` + +Check status against the manifest: + +```bash +mkt cache status docs/ --root . +``` + +Exit behavior: + +- `0`: clean +- `1`: new, changed, or deleted Markdown files detected + +## Manifest Contents + +Each entry records: + +- relative path +- SHA-256 content hash +- size +- modification time +- parser identity +- heading, section, and block counts + +The content hash is the authoritative change signal. Modification time and size +are recorded for diagnostics and future fast-path checks. + +## Design Boundary + +This layer provides: + +- repeatable fingerprints +- changed/new/deleted detection +- simple parse summaries +- a transparent JSON file that can be inspected or removed + +It does not provide: + +- AST persistence +- JSONPath querying over cached documents +- SQLite/FTS indexes +- vector search +- policy-aware retrieval +- distributed cache synchronization + +Those belong to later backend workplans. diff --git a/docs/workplan-planning-map.md b/docs/workplan-planning-map.md index 818293a..ee5e57d 100644 --- a/docs/workplan-planning-map.md +++ b/docs/workplan-planning-map.md @@ -30,7 +30,7 @@ and descriptions mirror the operational view. | `MKTT-WP-0001` | complete | done | none | Repository foundation is complete. | | `MKTT-WP-0002` | complete | done | `MKTT-WP-0001` | Legacy scope extraction is complete. | | `MKTT-WP-0004` | complete | done | `MKTT-WP-0001`, `MKTT-WP-0002` | Contract framework is complete and informs later validation/generation work. | -| `MKTT-WP-0003` | P0 | active | `MKTT-WP-0001`, `MKTT-WP-0002`, `MKTT-WP-0004` | Mainline implementation. P3.6 is complete; P3.7 caching remains. | +| `MKTT-WP-0003` | complete | done | `MKTT-WP-0001`, `MKTT-WP-0002`, `MKTT-WP-0004` | Core toolkit implementation is complete. | | `MKTT-WP-0006` | P1 | todo | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T005` | Ready after transform/composition shape is clear; should account for future reference/provenance needs. | | `MKTT-WP-0010` | P1 | todo | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T006` | Trigger is satisfied; keep as the richer content-reference, processor, explode/implode, and weave/tangle track. | | `MKTT-WP-0007` | P2 | todo | `MKTT-WP-0006` | First practical cache backend use case: AST/JSONPath/SQLite/FTS. | diff --git a/src/markitect_tool/__init__.py b/src/markitect_tool/__init__.py index 32d797c..4717cbf 100644 --- a/src/markitect_tool/__init__.py +++ b/src/markitect_tool/__init__.py @@ -20,6 +20,18 @@ from markitect_tool.contract import ( validate_contract, validate_contract_file, ) +from markitect_tool.cache import ( + CacheEntry, + CacheManifest, + CacheStatus, + build_cache, + cache_path_for, + detect_changes, + fingerprint_file, + load_cache, + save_cache, + scan_markdown_files, +) from markitect_tool.diagnostics import Diagnostic, SourceLocation from markitect_tool.generation import ( GeneratedDocument, @@ -87,6 +99,16 @@ __all__ = [ "load_contract_file", "validate_contract", "validate_contract_file", + "CacheEntry", + "CacheManifest", + "CacheStatus", + "build_cache", + "cache_path_for", + "detect_changes", + "fingerprint_file", + "load_cache", + "save_cache", + "scan_markdown_files", "Diagnostic", "SourceLocation", "GeneratedDocument", diff --git a/src/markitect_tool/cache/__init__.py b/src/markitect_tool/cache/__init__.py new file mode 100644 index 0000000..8030e0a --- /dev/null +++ b/src/markitect_tool/cache/__init__.py @@ -0,0 +1,27 @@ +"""Lightweight file-backed cache and change detection.""" + +from markitect_tool.cache.engine import ( + CacheEntry, + CacheManifest, + CacheStatus, + build_cache, + cache_path_for, + detect_changes, + fingerprint_file, + load_cache, + save_cache, + scan_markdown_files, +) + +__all__ = [ + "CacheEntry", + "CacheManifest", + "CacheStatus", + "build_cache", + "cache_path_for", + "detect_changes", + "fingerprint_file", + "load_cache", + "save_cache", + "scan_markdown_files", +] diff --git a/src/markitect_tool/cache/engine.py b/src/markitect_tool/cache/engine.py new file mode 100644 index 0000000..3365170 --- /dev/null +++ b/src/markitect_tool/cache/engine.py @@ -0,0 +1,230 @@ +"""Lightweight cache manifest for Markdown fingerprints and parse summaries.""" + +from __future__ import annotations + +import hashlib +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +from markitect_tool.core import parse_markdown_file + + +CACHE_SCHEMA_VERSION = "1.0" +DEFAULT_CACHE_PATH = ".markitect/cache/manifest.json" + + +@dataclass(frozen=True) +class CacheEntry: + """One cached source-file fingerprint and parse summary.""" + + path: str + content_hash: str + size: int + mtime_ns: int + parser: str = "markdown-it-py/commonmark" + headings: int = 0 + sections: int = 0 + blocks: int = 0 + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "CacheEntry": + return cls( + path=str(data["path"]), + content_hash=str(data["content_hash"]), + size=int(data["size"]), + mtime_ns=int(data["mtime_ns"]), + parser=str(data.get("parser", "markdown-it-py/commonmark")), + headings=int(data.get("headings", 0)), + sections=int(data.get("sections", 0)), + blocks=int(data.get("blocks", 0)), + ) + + +@dataclass(frozen=True) +class CacheManifest: + """A file-backed cache manifest.""" + + schema_version: str = CACHE_SCHEMA_VERSION + root: str = "." + entries: dict[str, CacheEntry] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "schema_version": self.schema_version, + "root": self.root, + "entries": { + path: entry.to_dict() + for path, entry in sorted(self.entries.items()) + }, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "CacheManifest": + entries = { + path: CacheEntry.from_dict(entry) + for path, entry in (data.get("entries") or {}).items() + if isinstance(entry, dict) + } + return cls( + schema_version=str(data.get("schema_version", CACHE_SCHEMA_VERSION)), + root=str(data.get("root", ".")), + entries=entries, + ) + + +@dataclass(frozen=True) +class CacheStatus: + """Change detection result against a cache manifest.""" + + new: list[str] = field(default_factory=list) + changed: list[str] = field(default_factory=list) + unchanged: list[str] = field(default_factory=list) + deleted: list[str] = field(default_factory=list) + + @property + def dirty(self) -> bool: + return bool(self.new or self.changed or self.deleted) + + def to_dict(self) -> dict[str, Any]: + return { + "dirty": self.dirty, + "new": self.new, + "changed": self.changed, + "unchanged": self.unchanged, + "deleted": self.deleted, + "counts": { + "new": len(self.new), + "changed": len(self.changed), + "unchanged": len(self.unchanged), + "deleted": len(self.deleted), + }, + } + + +def cache_path_for(root: str | Path, cache_path: str | Path | None = None) -> Path: + """Return the manifest path for a root and optional cache path.""" + + path = Path(cache_path or DEFAULT_CACHE_PATH) + if path.is_absolute(): + return path + return Path(root) / path + + +def load_cache(path: str | Path) -> CacheManifest: + """Load a cache manifest. Missing manifests produce an empty manifest.""" + + manifest_path = Path(path) + if not manifest_path.exists(): + return CacheManifest(root=str(manifest_path.parent.parent.parent if manifest_path.name == "manifest.json" else ".")) + data = json.loads(manifest_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError("Cache manifest must be a JSON object") + return CacheManifest.from_dict(data) + + +def save_cache(manifest: CacheManifest, path: str | Path) -> None: + """Write a cache manifest.""" + + manifest_path = Path(path) + manifest_path.parent.mkdir(parents=True, exist_ok=True) + manifest_path.write_text( + json.dumps(manifest.to_dict(), indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +def fingerprint_file(path: str | Path, *, root: str | Path | None = None) -> CacheEntry: + """Fingerprint one Markdown file and record a small parse summary.""" + + file_path = Path(path) + stat = file_path.stat() + digest = hashlib.sha256(file_path.read_bytes()).hexdigest() + document = parse_markdown_file(file_path) + relative_path = _relative(file_path, Path(root).resolve() if root else file_path.parent.resolve()) + return CacheEntry( + path=relative_path, + content_hash=f"sha256:{digest}", + size=stat.st_size, + mtime_ns=stat.st_mtime_ns, + headings=len(document.headings), + sections=len(document.sections), + blocks=len(document.blocks), + ) + + +def build_cache( + paths: list[str | Path], + *, + root: str | Path = ".", + recursive: bool = True, +) -> CacheManifest: + """Build a cache manifest for Markdown files under paths.""" + + root_path = Path(root).resolve() + entries: dict[str, CacheEntry] = {} + for source in scan_markdown_files(paths, recursive=recursive): + entry = fingerprint_file(source, root=root_path) + entries[entry.path] = entry + return CacheManifest(root=str(root_path), entries=entries) + + +def detect_changes( + manifest: CacheManifest, + paths: list[str | Path], + *, + root: str | Path = ".", + recursive: bool = True, +) -> CacheStatus: + """Compare current Markdown files against a cache manifest.""" + + current = build_cache(paths, root=root, recursive=recursive) + current_paths = set(current.entries) + cached_paths = set(manifest.entries) + + new = sorted(current_paths - cached_paths) + deleted = sorted(cached_paths - current_paths) + changed: list[str] = [] + unchanged: list[str] = [] + + for path in sorted(current_paths & cached_paths): + if current.entries[path].content_hash == manifest.entries[path].content_hash: + unchanged.append(path) + else: + changed.append(path) + + return CacheStatus(new=new, changed=changed, unchanged=unchanged, deleted=deleted) + + +def scan_markdown_files( + paths: list[str | Path], + *, + recursive: bool = True, +) -> list[Path]: + """Return Markdown files for a set of files or directories.""" + + files: list[Path] = [] + for raw_path in paths: + path = Path(raw_path) + if path.is_file() and _is_markdown(path): + files.append(path) + elif path.is_dir(): + pattern = "**/*" if recursive else "*" + files.extend(candidate for candidate in path.glob(pattern) if candidate.is_file() and _is_markdown(candidate)) + return sorted(set(files)) + + +def _is_markdown(path: Path) -> bool: + return path.suffix.lower() in {".md", ".markdown"} + + +def _relative(path: Path, root: Path) -> str: + resolved = path.resolve() + try: + return resolved.relative_to(root).as_posix() + except ValueError: + return resolved.as_posix() diff --git a/src/markitect_tool/cli/main.py b/src/markitect_tool/cli/main.py index c18d696..12c5c91 100644 --- a/src/markitect_tool/cli/main.py +++ b/src/markitect_tool/cli/main.py @@ -8,6 +8,14 @@ from pathlib import Path import click import yaml +from markitect_tool.cache import ( + build_cache, + cache_path_for, + detect_changes, + fingerprint_file, + load_cache, + save_cache, +) from markitect_tool.core import parse_markdown_file from markitect_tool.contract import ( ContractLoaderError, @@ -288,6 +296,118 @@ def include( _emit_markdown_result(result.to_dict(), output_format, output) +@main.group() +def cache() -> None: + """Fingerprint Markdown files and detect changed inputs.""" + + +@cache.command("fingerprint") +@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path)) +@click.option( + "--root", + type=click.Path(exists=True, file_okay=False, path_type=Path), + default=Path("."), + show_default=True, + help="Root used for relative cache paths.", +) +@click.option( + "--format", + "output_format", + type=click.Choice(["json", "yaml", "text"], case_sensitive=False), + default="json", + show_default=True, +) +def cache_fingerprint(file: Path, root: Path, output_format: str) -> None: + """Fingerprint one Markdown file.""" + + entry = fingerprint_file(file, root=root) + _emit_cache_data(entry.to_dict(), output_format) + + +@cache.command("build") +@click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)) +@click.option( + "--root", + type=click.Path(exists=True, file_okay=False, path_type=Path), + default=Path("."), + show_default=True, + help="Root used for relative cache paths.", +) +@click.option( + "--cache-path", + type=click.Path(dir_okay=False, path_type=Path), + help="Cache manifest path. Defaults to .markitect/cache/manifest.json under root.", +) +@click.option("--no-recursive", is_flag=True, help="Do not recurse into directories.") +@click.option("--dry-run", is_flag=True, help="Report manifest without writing it.") +@click.option( + "--format", + "output_format", + type=click.Choice(["json", "yaml", "text"], case_sensitive=False), + default="text", + show_default=True, +) +def cache_build( + paths: tuple[Path, ...], + root: Path, + cache_path: Path | None, + no_recursive: bool, + dry_run: bool, + output_format: str, +) -> None: + """Build or refresh a lightweight Markdown cache manifest.""" + + manifest = build_cache(list(paths), root=root, recursive=not no_recursive) + manifest_path = cache_path_for(root, cache_path) + if not dry_run: + save_cache(manifest, manifest_path) + data = manifest.to_dict() | { + "cache_path": str(manifest_path), + "written": not dry_run, + "count": len(manifest.entries), + } + _emit_cache_data(data, output_format) + + +@cache.command("status") +@click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)) +@click.option( + "--root", + type=click.Path(exists=True, file_okay=False, path_type=Path), + default=Path("."), + show_default=True, + help="Root used for relative cache paths.", +) +@click.option( + "--cache-path", + type=click.Path(dir_okay=False, path_type=Path), + help="Cache manifest path. Defaults to .markitect/cache/manifest.json under root.", +) +@click.option("--no-recursive", is_flag=True, help="Do not recurse into directories.") +@click.option( + "--format", + "output_format", + type=click.Choice(["json", "yaml", "text"], case_sensitive=False), + default="text", + show_default=True, +) +def cache_status( + paths: tuple[Path, ...], + root: Path, + cache_path: Path | None, + no_recursive: bool, + output_format: str, +) -> None: + """Report changed, new, unchanged, and deleted Markdown files.""" + + manifest_path = cache_path_for(root, cache_path) + manifest = load_cache(manifest_path) + status = detect_changes(manifest, list(paths), root=root, recursive=not no_recursive) + data = status.to_dict() | {"cache_path": str(manifest_path)} + _emit_cache_data(data, output_format) + raise click.exceptions.Exit(1 if status.dirty else 0) + + @main.group() def template() -> None: """Render and inspect deterministic Markdown templates.""" @@ -647,6 +767,27 @@ def _emit_markdown_result(data: dict, output_format: str, output: Path | None) - click.echo(markdown, nl=False) +def _emit_cache_data(data: dict, output_format: str) -> None: + if output_format == "json": + click.echo(json.dumps(data, indent=2, ensure_ascii=False)) + elif output_format == "yaml": + click.echo(yaml.safe_dump(data, sort_keys=False)) + else: + if "dirty" in data: + click.echo("dirty" if data["dirty"] else "clean") + for key in ["new", "changed", "deleted", "unchanged"]: + values = data.get(key, []) + if values: + click.echo(f"{key}: {len(values)}") + for value in values: + click.echo(f"- {value}") + else: + click.echo(f"cache_path: {data.get('cache_path', '')}") + click.echo(f"count: {data.get('count', len(data.get('entries', [])))}") + if data.get("written") is not None: + click.echo(f"written: {data['written']}") + + def _emit_jsonish(data: dict, output_format: str) -> None: if output_format == "yaml": click.echo(yaml.safe_dump(data, sort_keys=False)) diff --git a/tests/test_cache_incremental.py b/tests/test_cache_incremental.py new file mode 100644 index 0000000..0b7a7a2 --- /dev/null +++ b/tests/test_cache_incremental.py @@ -0,0 +1,110 @@ +from pathlib import Path + +from click.testing import CliRunner + +from markitect_tool.cache import ( + build_cache, + cache_path_for, + detect_changes, + fingerprint_file, + load_cache, + save_cache, + scan_markdown_files, +) +from markitect_tool.cli import main + + +def test_fingerprint_file_records_hash_and_parse_summary(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Title\n\nBody.\n", encoding="utf-8") + + entry = fingerprint_file(source, root=tmp_path) + + assert entry.path == "doc.md" + assert entry.content_hash.startswith("sha256:") + assert entry.headings == 1 + assert entry.sections == 1 + assert entry.blocks == 2 + + +def test_build_cache_and_detect_changes(tmp_path: Path): + one = tmp_path / "one.md" + two = tmp_path / "two.md" + one.write_text("# One\n", encoding="utf-8") + two.write_text("# Two\n", encoding="utf-8") + + manifest = build_cache([tmp_path], root=tmp_path) + assert sorted(manifest.entries) == ["one.md", "two.md"] + + status = detect_changes(manifest, [tmp_path], root=tmp_path) + assert not status.dirty + assert status.unchanged == ["one.md", "two.md"] + + one.write_text("# One\n\nChanged.\n", encoding="utf-8") + two.unlink() + three = tmp_path / "three.md" + three.write_text("# Three\n", encoding="utf-8") + + status = detect_changes(manifest, [tmp_path], root=tmp_path) + assert status.dirty + assert status.changed == ["one.md"] + assert status.deleted == ["two.md"] + assert status.new == ["three.md"] + + +def test_save_and_load_cache_manifest(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + manifest_path = cache_path_for(tmp_path) + manifest = build_cache([source], root=tmp_path) + + save_cache(manifest, manifest_path) + loaded = load_cache(manifest_path) + + assert loaded.entries["doc.md"].content_hash == manifest.entries["doc.md"].content_hash + + +def test_scan_markdown_files_skips_non_markdown(tmp_path: Path): + (tmp_path / "doc.md").write_text("# Doc\n", encoding="utf-8") + (tmp_path / "notes.txt").write_text("Nope\n", encoding="utf-8") + nested = tmp_path / "nested" + nested.mkdir() + (nested / "other.markdown").write_text("# Other\n", encoding="utf-8") + + assert [path.name for path in scan_markdown_files([tmp_path])] == ["doc.md", "other.markdown"] + assert [path.name for path in scan_markdown_files([tmp_path], recursive=False)] == ["doc.md"] + + +def test_mkt_cache_build_and_status(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + runner = CliRunner() + + build = runner.invoke(main, ["cache", "build", str(tmp_path), "--root", str(tmp_path)]) + + assert build.exit_code == 0 + assert "written: True" in build.output + + clean = runner.invoke(main, ["cache", "status", str(tmp_path), "--root", str(tmp_path)]) + assert clean.exit_code == 0 + assert "clean" in clean.output + + source.write_text("# Doc\n\nChanged.\n", encoding="utf-8") + dirty = runner.invoke(main, ["cache", "status", str(tmp_path), "--root", str(tmp_path)]) + assert dirty.exit_code == 1 + assert "dirty" in dirty.output + assert "changed: 1" in dirty.output + + +def test_mkt_cache_fingerprint_outputs_json(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + + result = CliRunner().invoke( + main, + ["cache", "fingerprint", str(source), "--root", str(tmp_path)], + ) + + assert result.exit_code == 0 + assert '"path": "doc.md"' in result.output + assert '"content_hash": "sha256:' in result.output diff --git a/workplans/MKTT-WP-0003-core-toolkit-implementation.md b/workplans/MKTT-WP-0003-core-toolkit-implementation.md index 12728f7..b251570 100644 --- a/workplans/MKTT-WP-0003-core-toolkit-implementation.md +++ b/workplans/MKTT-WP-0003-core-toolkit-implementation.md @@ -3,7 +3,7 @@ id: MKTT-WP-0003 type: workplan title: "Core Markdown Toolkit Implementation" domain: markitect -status: active +status: done owner: markitect-tool topic_slug: markitect planning_priority: P0 @@ -130,13 +130,19 @@ generation hook protocols, docs, examples, tests, `mkt template inspect`, ```task id: MKTT-WP-0003-T007 -status: todo +status: done priority: low state_hub_task_id: "236f90e3-1d79-473f-8c57-bcbbde9ece02" ``` Implement FR-070 and FR-071 after the parse/schema contracts are stable. +Initial implementation complete for lightweight file-backed cache manifests, +SHA-256 content fingerprints, parse summaries, new/changed/deleted detection, +CLI access via `mkt cache fingerprint`, `mkt cache build`, and +`mkt cache status`, docs, and tests. Rich AST/query/index caching remains +tracked in `MKTT-WP-0006` and `MKTT-WP-0007`. + ## Known Technical Debt ```task