From 36ff4cedab4560316b1deed1604f2e7d813c5977 Mon Sep 17 00:00:00 2001 From: tegwick Date: Mon, 4 May 2026 08:56:41 +0200 Subject: [PATCH] SQLite-backed local snapshot store --- docs/local-index-backend.md | 87 +++ docs/workplan-planning-map.md | 2 +- src/markitect_tool/backend/__init__.py | 12 + src/markitect_tool/backend/local_store.py | 510 ++++++++++++++++++ src/markitect_tool/cli/main.py | 202 +++++++ tests/test_local_snapshot_store.py | 89 +++ ...-advanced-query-and-local-index-backend.md | 29 +- 7 files changed, 926 insertions(+), 5 deletions(-) create mode 100644 docs/local-index-backend.md create mode 100644 src/markitect_tool/backend/local_store.py create mode 100644 tests/test_local_snapshot_store.py diff --git a/docs/local-index-backend.md b/docs/local-index-backend.md new file mode 100644 index 0000000..a9d9a54 --- /dev/null +++ b/docs/local-index-backend.md @@ -0,0 +1,87 @@ +# Local Index Backend + +`markitect-tool` now includes a local SQLite snapshot/index backend as the +first practical implementation of the optional backend fabric. + +## Purpose + +The local index is optimized for repeatable Markdown infrastructure work: + +- persist parsed document snapshots +- keep cheap source metadata for incremental refresh planning +- store document JSON for later AST/JSONPath use +- index frontmatter, headings, sections, blocks, and metrics +- preserve extension points for dependency edges, references, named regions, + chunks, processor outputs, FTS, and policy-aware access + +The backend is optional. Single-file commands such as `mkt parse`, `mkt query`, +and `mkt ast` do not require it. + +## Commands + +Initialize the SQLite store: + +```text +mkt cache init --root . +``` + +Build or refresh the local index: + +```text +mkt cache index docs workplans --root . 
+``` + +Inspect a parsed AST without using the cache: + +```text +mkt ast show docs/backend-fabric.md --format tree +mkt ast stats docs/backend-fabric.md +``` + +By default, the index is written to: + +```text +.markitect/cache/index.sqlite3 +``` + +Use `--index-path` to override it. + +## Refresh Behavior + +`mkt cache index` uses the same cheap-first refresh planning model as +`mkt backend refresh-plan`: + +1. Compare path, size, mtime, parser identity, parse options, and contract hash. +2. Hash only files whose metadata changed. +3. Skip parse/index when metadata changed but content hash stayed the same. +4. Parse and index new or changed files. +5. Delete rows for removed source files. + +The command reports planned work and actual work separately in JSON/YAML output. + +## Stored Data + +The first schema stores: + +- `sources`: path, absolute path, size, mtime, content hash, snapshot id, + parser identity, parse option hash, contract hash, document JSON, + frontmatter JSON, metrics JSON, provenance JSON, and indexed flag +- `headings`: heading level, text, and source line +- `sections`: heading metadata, section text, and source span +- `blocks`: block type, text, source span, and heading level +- `dependencies`: reserved dependency edge table for references, + transclusion, literate chunks, and future invalidation graphs + +This is enough to recover the useful markitect-main idea of keeping parsed +structure available for faster and richer query backends, while keeping the +normal CLI usable without a cache. 
+ +## Future Work + +`MKTT-WP-0007` still needs: + +- JSONPath query adapter over stored or live document JSON +- FTS5 search over section/block rows +- cache-backed query commands +- richer dependency extraction from references, transclusion, and literate + chunks diff --git a/docs/workplan-planning-map.md b/docs/workplan-planning-map.md index 758b628..4816c10 100644 --- a/docs/workplan-planning-map.md +++ b/docs/workplan-planning-map.md @@ -33,7 +33,7 @@ and descriptions mirror the operational view. | `MKTT-WP-0003` | complete | done | `MKTT-WP-0001`, `MKTT-WP-0002`, `MKTT-WP-0004` | Core toolkit implementation is complete. | | `MKTT-WP-0006` | complete | done | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T005` | Optional backend fabric is complete: manifests, capabilities, snapshot identity, interfaces, registry, provenance, and read-only CLI scaffolding. | | `MKTT-WP-0010` | complete | done | `MKTT-WP-0004`; task-level trigger: `MKTT-WP-0003-T006` | Content references, processors, explode/implode, weave/tangle, content classes, and migration examples are complete as the first WP-0010 extension layer. | -| `MKTT-WP-0007` | P2 | todo | `MKTT-WP-0006` | First practical cache backend use case: AST/JSONPath/SQLite/FTS. Preliminary refresh planning is in place as the performance contract. | +| `MKTT-WP-0007` | P2 | todo | `MKTT-WP-0006` | First practical cache backend use case: AST/JSONPath/SQLite/FTS. SQLite snapshots, AST inspection, metadata indexing, and incremental refresh are in place; JSONPath, FTS, and cache-backed query remain. | | `MKTT-WP-0005` | P2 | todo | `MKTT-WP-0003`, `MKTT-WP-0004` | Pick up when generation/form/context or semantic assessment pressure appears. | | `MKTT-WP-0011` | P2 | todo | `MKTT-WP-0003`; task-level triggers: `MKTT-WP-0010-T001`, `MKTT-WP-0010-T005` | Declarative Markdown dataflow workflows: source extraction, deterministic/assisted processing, and multi-output generation. 
| | `MKTT-WP-0009` | P2 | todo | `MKTT-WP-0006` | Establish access-control gateway before security-sensitive cache/context use. | diff --git a/src/markitect_tool/backend/__init__.py b/src/markitect_tool/backend/__init__.py index 8dae0e6..1f49995 100644 --- a/src/markitect_tool/backend/__init__.py +++ b/src/markitect_tool/backend/__init__.py @@ -32,6 +32,13 @@ from markitect_tool.backend.interfaces import ( QueryAdapter, SnapshotBackend, ) +from markitect_tool.backend.local_store import ( + DEFAULT_LOCAL_INDEX_PATH, + LOCAL_INDEX_SCHEMA_VERSION, + LocalIndexBuildResult, + LocalSnapshotStore, + local_index_path_for, +) __all__ = [ "BACKEND_CAPABILITIES", @@ -60,4 +67,9 @@ __all__ = [ "ProcessorResultStore", "QueryAdapter", "SnapshotBackend", + "DEFAULT_LOCAL_INDEX_PATH", + "LOCAL_INDEX_SCHEMA_VERSION", + "LocalIndexBuildResult", + "LocalSnapshotStore", + "local_index_path_for", ] diff --git a/src/markitect_tool/backend/local_store.py b/src/markitect_tool/backend/local_store.py new file mode 100644 index 0000000..c13c240 --- /dev/null +++ b/src/markitect_tool/backend/local_store.py @@ -0,0 +1,510 @@ +"""Local SQLite snapshot and metadata store.""" + +from __future__ import annotations + +import json +import sqlite3 +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from markitect_tool.backend.engine import ( + EMPTY_PARSE_OPTIONS_HASH, + PARSER_ID, + PARSER_VERSION, + DependencyEdge, + ProvenanceEnvelope, + snapshot_identity_for_file, +) +from markitect_tool.backend.planning import SnapshotState, plan_snapshot_refresh +from markitect_tool.cache import scan_markdown_files +from markitect_tool.contract import collect_metrics +from markitect_tool.core import parse_markdown_file + + +DEFAULT_LOCAL_INDEX_PATH = ".markitect/cache/index.sqlite3" +LOCAL_INDEX_SCHEMA_VERSION = "1" + + +@dataclass(frozen=True) +class LocalIndexBuildResult: + """Summary of a local index build or 
class LocalSnapshotStore:
    """SQLite-backed local snapshot store for parsed Markdown documents.

    Each public method opens its own short-lived connection through
    ``_session()``, which commits on success, rolls back on error, and always
    closes the connection afterwards.
    """

    def __init__(self, path: str | Path = DEFAULT_LOCAL_INDEX_PATH) -> None:
        """Remember the index location; nothing is created until first use."""

        self.path = Path(path)

    def initialize(self) -> None:
        """Create the index schema if missing and record the schema version."""

        self.path.parent.mkdir(parents=True, exist_ok=True)
        with self._session() as conn:
            _create_schema(conn)
            conn.execute(
                """
                insert into meta(key, value) values('schema_version', ?)
                on conflict(key) do update set value = excluded.value
                """,
                (LOCAL_INDEX_SCHEMA_VERSION,),
            )

    def load_state(self) -> list[SnapshotState]:
        """Load cheap refresh-planning state without loading document JSON.

        Returns an empty list when the index file does not exist yet.
        """

        if not self.path.exists():
            return []
        with self._session() as conn:
            _create_schema(conn)
            rows = conn.execute(
                """
                select path, size, mtime_ns, content_hash, snapshot_id, parser,
                       parser_version, parse_options_hash, contract_hash, indexed
                from sources
                order by path
                """
            ).fetchall()
            dependencies = _load_dependencies(conn)
        # Rows were fully fetched above, so they are used after the connection closes.
        return [
            SnapshotState(
                path=row["path"],
                size=row["size"],
                mtime_ns=row["mtime_ns"],
                content_hash=row["content_hash"],
                snapshot_id=row["snapshot_id"],
                parser=row["parser"],
                parser_version=row["parser_version"],
                parse_options_hash=row["parse_options_hash"],
                contract_hash=row["contract_hash"],
                indexed=bool(row["indexed"]),
                dependencies=dependencies.get(row["path"], []),
            )
            for row in rows
        ]

    def put_file(
        self,
        path: str | Path,
        *,
        root: str | Path = ".",
        parse_options: dict[str, Any] | None = None,
        contract_hash: str | None = None,
    ) -> SnapshotState:
        """Parse and persist one Markdown file.

        The source row is upserted by its path relative to ``root`` and the
        heading/section/block rows for that path are replaced in the same
        transaction. Returns the ``SnapshotState`` that was stored.
        """

        self.initialize()
        file_path = Path(path)
        root_path = Path(root).resolve()
        relative_path = _relative(file_path, root_path)
        identity = snapshot_identity_for_file(
            file_path,
            parse_options=parse_options,
            contract_hash=contract_hash,
        )
        document = parse_markdown_file(file_path)
        metrics = collect_metrics(document).to_dict()
        # Serialise the document once and reuse it for both the JSON payload
        # and the derived unit rows.
        document_data = document.to_dict()
        stat = file_path.stat()
        now = datetime.now(timezone.utc).isoformat()
        provenance = ProvenanceEnvelope(
            operation="local_snapshot_store.put_file",
            snapshot_id=identity.snapshot_id,
            source_path=relative_path,
            content_hash=identity.content_hash,
            backend_id="local-sqlite",
        )
        with self._session() as conn:
            _create_schema(conn)
            conn.execute(
                """
                insert into sources(
                    path, abs_path, size, mtime_ns, content_hash, snapshot_id,
                    parser, parser_version, parse_options_hash, contract_hash,
                    indexed, document_json, frontmatter_json, metrics_json,
                    provenance_json, updated_at
                ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, ?, ?, ?, ?, ?)
                on conflict(path) do update set
                    abs_path = excluded.abs_path,
                    size = excluded.size,
                    mtime_ns = excluded.mtime_ns,
                    content_hash = excluded.content_hash,
                    snapshot_id = excluded.snapshot_id,
                    parser = excluded.parser,
                    parser_version = excluded.parser_version,
                    parse_options_hash = excluded.parse_options_hash,
                    contract_hash = excluded.contract_hash,
                    indexed = excluded.indexed,
                    document_json = excluded.document_json,
                    frontmatter_json = excluded.frontmatter_json,
                    metrics_json = excluded.metrics_json,
                    provenance_json = excluded.provenance_json,
                    updated_at = excluded.updated_at
                """,
                (
                    relative_path,
                    str(file_path.resolve()),
                    stat.st_size,
                    stat.st_mtime_ns,
                    identity.content_hash,
                    identity.snapshot_id,
                    identity.parser,
                    identity.parser_version,
                    identity.parse_options_hash,
                    identity.contract_hash,
                    _json(document_data),
                    _json(document.frontmatter),
                    _json(metrics),
                    _json(provenance.to_dict()),
                    now,
                ),
            )
            _replace_document_units(conn, relative_path, identity.snapshot_id, document_data)
        return SnapshotState(
            path=relative_path,
            size=stat.st_size,
            mtime_ns=stat.st_mtime_ns,
            content_hash=identity.content_hash,
            snapshot_id=identity.snapshot_id,
            parser=identity.parser,
            parser_version=identity.parser_version,
            parse_options_hash=identity.parse_options_hash,
            contract_hash=identity.contract_hash,
            indexed=True,
        )

    def update_metadata(self, path: str, *, root: str | Path = ".") -> None:
        """Update file size and mtime when content hash is unchanged."""

        file_path = Path(root) / path
        stat = file_path.stat()
        with self._session() as conn:
            _create_schema(conn)
            conn.execute(
                "update sources set size = ?, mtime_ns = ?, updated_at = ? where path = ?",
                (stat.st_size, stat.st_mtime_ns, datetime.now(timezone.utc).isoformat(), path),
            )

    def delete_path(self, path: str) -> None:
        """Delete one indexed source and derived rows.

        Does nothing when the index file does not exist.
        """

        if not self.path.exists():
            return
        with self._session() as conn:
            _create_schema(conn)
            conn.execute("delete from blocks where path = ?", (path,))
            conn.execute("delete from sections where path = ?", (path,))
            conn.execute("delete from headings where path = ?", (path,))
            conn.execute("delete from dependencies where path = ?", (path,))
            conn.execute("delete from sources where path = ?", (path,))

    def get_document(self, path: str) -> dict[str, Any]:
        """Return stored document JSON for a relative source path.

        Raises:
            KeyError: if the index file does not exist or holds no row for ``path``.
        """

        # Same guard as load_state/delete_path: a lookup must not create an
        # empty database file as a side effect.
        if not self.path.exists():
            raise KeyError(f"No indexed document `{path}`")
        with self._session() as conn:
            _create_schema(conn)
            row = conn.execute(
                "select document_json from sources where path = ?",
                (path,),
            ).fetchone()
        if row is None:
            raise KeyError(f"No indexed document `{path}`")
        return json.loads(row["document_json"])

    def build(
        self,
        paths: list[str | Path],
        *,
        root: str | Path = ".",
        recursive: bool = True,
        parse_options: dict[str, Any] | None = None,
        contract_hash: str | None = None,
        verify_hashes: bool = True,
    ) -> LocalIndexBuildResult:
        """Incrementally build or refresh the local index.

        A refresh plan is computed against the stored state and then applied
        entry by entry: "delete" removes the path, "parse"/"index" re-parses and
        stores the file, and "metadata" only refreshes size and mtime. The
        result carries both the plan and the work actually performed.
        """

        self.initialize()
        root_path = Path(root).resolve()
        plan = plan_snapshot_refresh(
            paths,
            previous=self.load_state(),
            root=root_path,
            recursive=recursive,
            parse_options=parse_options,
            contract_hash=contract_hash,
            verify_hashes=verify_hashes,
        )
        # Map the plan's relative paths back to the files found on disk.
        current_files = {
            _relative(path, root_path): path
            for path in scan_markdown_files(paths, recursive=recursive)
        }
        parsed: list[str] = []
        indexed: list[str] = []
        metadata_updated: list[str] = []
        deleted: list[str] = []
        for entry in plan.entries:
            if "delete" in entry.actions:
                self.delete_path(entry.path)
                deleted.append(entry.path)
                continue
            if "parse" in entry.actions or "index" in entry.actions:
                file_path = current_files.get(entry.path)
                if file_path is None:
                    # Planned but not found by the scan; nothing to store.
                    continue
                self.put_file(
                    file_path,
                    root=root_path,
                    parse_options=parse_options,
                    contract_hash=contract_hash,
                )
                if "parse" in entry.actions:
                    parsed.append(entry.path)
                if "index" in entry.actions:
                    indexed.append(entry.path)
                continue
            if "metadata" in entry.actions:
                self.update_metadata(entry.path, root=root_path)
                metadata_updated.append(entry.path)
        return LocalIndexBuildResult(
            index_path=str(self.path),
            root=str(root_path),
            paths=[str(path) for path in paths],
            planned=plan.to_dict(),
            parsed=parsed,
            indexed=indexed,
            metadata_updated=metadata_updated,
            deleted=deleted,
        )

    def _connect(self) -> sqlite3.Connection:
        """Open a connection with row access by name and foreign keys enabled."""

        conn = sqlite3.connect(self.path)
        conn.row_factory = sqlite3.Row
        conn.execute("pragma foreign_keys = on")
        return conn

    def _session(self) -> Any:
        """Return a context manager yielding a connection that is always closed.

        ``with conn:`` on a sqlite3 connection only commits or rolls back; it
        does not close the connection. This wrapper keeps that transaction
        behaviour and adds the missing ``close()``.
        """

        # Imported locally so the module's import block stays untouched.
        from contextlib import contextmanager

        @contextmanager
        def _scope() -> Any:
            conn = self._connect()
            try:
                with conn:
                    yield conn
            finally:
                conn.close()

        return _scope()
def _replace_document_units(
    conn: sqlite3.Connection,
    path: str,
    snapshot_id: str,
    document: dict[str, Any],
) -> None:
    """Swap the heading, section, and block rows stored for one source path.

    Existing rows for ``path`` are removed first, then one row per entry of
    the document's "headings", "sections", and "blocks" lists is inserted,
    numbered by its position in the list. The caller owns the transaction.
    """

    conn.execute("delete from blocks where path = ?", (path,))
    conn.execute("delete from sections where path = ?", (path,))
    conn.execute("delete from headings where path = ?", (path,))

    heading_rows = [
        (
            snapshot_id,
            path,
            position,
            int(item["level"]),
            str(item["text"]),
            int(item["line"]),
        )
        for position, item in enumerate(document.get("headings", []))
    ]
    conn.executemany(
        """
        insert into headings(snapshot_id, path, idx, level, text, line)
        values (?, ?, ?, ?, ?, ?)
        """,
        heading_rows,
    )

    section_rows = []
    for position, section in enumerate(document.get("sections", [])):
        title = section["heading"]
        members = section.get("blocks", [])
        # Section text is the member block texts joined by blank lines; its
        # span runs from the first known start line to the last known end line.
        body = "\n\n".join(str(member.get("text", "")) for member in members)
        first_line = _first_present(member.get("line_start") for member in members)
        last_line = _last_present(member.get("line_end") for member in members)
        section_rows.append(
            (
                snapshot_id,
                path,
                position,
                str(title["text"]),
                int(title["level"]),
                int(title["line"]),
                body,
                first_line,
                last_line,
            )
        )
    conn.executemany(
        """
        insert into sections(
            snapshot_id, path, idx, heading_text, heading_level, line,
            text, line_start, line_end
        ) values (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        section_rows,
    )

    block_rows = [
        (
            snapshot_id,
            path,
            position,
            str(item["type"]),
            str(item.get("text", "")),
            item.get("line_start"),
            item.get("line_end"),
            item.get("heading_level"),
        )
        for position, item in enumerate(document.get("blocks", []))
    ]
    conn.executemany(
        """
        insert into blocks(
            snapshot_id, path, idx, type, text, line_start, line_end, heading_level
        ) values (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        block_rows,
    )
@cache.command("init")
@click.option(
    "--root",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    default=Path("."),
    show_default=True,
    help="Root used for the default local index path.",
)
@click.option(
    "--index-path",
    type=click.Path(dir_okay=False, path_type=Path),
    help="SQLite index path. Defaults to .markitect/cache/index.sqlite3 under root.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def cache_init(root: Path, index_path: Path | None, output_format: str) -> None:
    """Initialize the local SQLite snapshot/index store."""

    # Imported here so the file-level import block stays untouched. Reporting
    # the constant instead of a literal "1" keeps the output in step with the
    # version that initialize() writes to the meta table.
    from markitect_tool.backend import LOCAL_INDEX_SCHEMA_VERSION

    resolved_index = local_index_path_for(root, index_path)
    store = LocalSnapshotStore(resolved_index)
    store.initialize()
    data = {
        "index_path": str(resolved_index),
        "schema_version": LOCAL_INDEX_SCHEMA_VERSION,
        # Number of source rows already stored; 0 for a freshly created index.
        "sources": len(store.load_state()),
    }
    _emit_local_index_data(data, output_format)
Defaults to .markitect/cache/index.sqlite3 under root.", +) +@click.option("--no-recursive", is_flag=True, help="Do not recurse into directories.") +@click.option( + "--no-verify-hashes", + is_flag=True, + help="Do not hash metadata-changed files before parsing.", +) +@click.option( + "--parse-option", + "parse_options", + multiple=True, + metavar="KEY=VALUE", + help="Parse option included in the snapshot identity hash.", +) +@click.option("--contract-hash", help="Optional contract hash included in snapshot identity.") +@click.option( + "--format", + "output_format", + type=click.Choice(["json", "yaml", "text"], case_sensitive=False), + default="text", + show_default=True, +) +def cache_index( + paths: tuple[Path, ...], + root: Path, + index_path: Path | None, + no_recursive: bool, + no_verify_hashes: bool, + parse_options: tuple[str, ...], + contract_hash: str | None, + output_format: str, +) -> None: + """Build or refresh the local SQLite snapshot/index store.""" + + try: + store = LocalSnapshotStore(local_index_path_for(root, index_path)) + result = store.build( + list(paths), + root=root, + recursive=not no_recursive, + parse_options=_parse_key_value_options(parse_options), + contract_hash=contract_hash, + verify_hashes=not no_verify_hashes, + ) + except ValueError as exc: + raise click.ClickException(str(exc)) from exc + _emit_local_index_data(result.to_dict(), output_format) + + @main.group() def template() -> None: """Render and inspect deterministic Markdown templates.""" @@ -1213,6 +1356,42 @@ def _emit_cache_data(data: dict, output_format: str) -> None: click.echo(f"written: {data['written']}") +def _emit_ast_stats(data: dict, output_format: str) -> None: + if output_format == "json": + click.echo(json.dumps(data, indent=2, ensure_ascii=False)) + elif output_format == "yaml": + click.echo(yaml.safe_dump(data, sort_keys=False)) + else: + click.echo(f"document_path: {data['document_path']}") + for key, value in data["counts"].items(): + click.echo(f"{key}: 
def _ast_stats(document: dict, document_path: str) -> dict:
    """Summarize the shape of a parsed document dictionary.

    Args:
        document: Parsed document as a plain dict. The "frontmatter",
            "headings", "sections", "blocks", and "tokens" entries are read;
            a missing entry or an explicit ``None`` counts as empty.
        document_path: Path label echoed back under "document_path".

    Returns:
        A dict with "document_path", "source_path", per-kind "counts",
        "max_heading_depth" (0 when there are no headings), and "token_types",
        a token-type histogram sorted by type name.
    """

    # Imported locally so the file-level import block stays untouched.
    from collections import Counter

    # `or` also covers keys that are present but set to None, which
    # dict.get's default alone would let through to len() and iteration.
    frontmatter = document.get("frontmatter") or {}
    headings = document.get("headings") or []
    sections = document.get("sections") or []
    blocks = document.get("blocks") or []
    tokens = document.get("tokens") or []

    token_types = Counter(str(token.get("type", "unknown")) for token in tokens)
    return {
        "document_path": document_path,
        "source_path": document.get("source_path"),
        "counts": {
            "frontmatter_keys": len(frontmatter),
            "headings": len(headings),
            "sections": len(sections),
            "blocks": len(blocks),
            "tokens": len(tokens),
        },
        "max_heading_depth": max(
            (int(heading.get("level", 0)) for heading in headings),
            default=0,
        ),
        "token_types": dict(sorted(token_types.items())),
    }
source.write_text("# Doc\n", encoding="utf-8") + store = LocalSnapshotStore(local_index_path_for(tmp_path)) + store.build([tmp_path], root=tmp_path) + + source.unlink() + result = store.build([tmp_path], root=tmp_path) + + assert result.deleted == ["doc.md"] + assert store.load_state() == [] + + +def test_mkt_ast_show_and_stats(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n\nBody.\n", encoding="utf-8") + runner = CliRunner() + + shown = runner.invoke(main, ["ast", "show", str(source), "--format", "tree"]) + stats = runner.invoke(main, ["ast", "stats", str(source)]) + + assert shown.exit_code == 0 + assert "# Doc" in shown.output + assert stats.exit_code == 0 + assert "headings: 1" in stats.output + assert "paragraph_open" in stats.output + + +def test_mkt_cache_init_and_index(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Doc\n", encoding="utf-8") + runner = CliRunner() + + initialized = runner.invoke(main, ["cache", "init", "--root", str(tmp_path)]) + indexed = runner.invoke(main, ["cache", "index", str(tmp_path), "--root", str(tmp_path)]) + clean = runner.invoke(main, ["cache", "index", str(tmp_path), "--root", str(tmp_path)]) + + assert initialized.exit_code == 0 + assert "schema_version: 1" in initialized.output + assert indexed.exit_code == 0 + assert "parsed: 1" in indexed.output + assert clean.exit_code == 0 + assert "clean" in clean.output diff --git a/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md b/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md index dddaea0..1d644e8 100644 --- a/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md +++ b/workplans/MKTT-WP-0007-advanced-query-and-local-index-backend.md @@ -51,7 +51,7 @@ directly and should report actual refresh work against the same categories. 
```task id: MKTT-WP-0007-T001 -status: todo +status: done priority: high state_hub_task_id: "8894a9a4-586c-457b-b4e6-add8276ff5f2" ``` @@ -59,6 +59,10 @@ state_hub_task_id: "8894a9a4-586c-457b-b4e6-add8276ff5f2" Persist parsed document snapshots and source metadata in a local cache directory. +Implemented: `LocalSnapshotStore`, SQLite schema initialization, source-state +loading, parsed document JSON persistence, provenance envelope storage, and +relative path handling. See `docs/local-index-backend.md`. + Implementation hints: - Persist `SnapshotState` fields in the snapshot/source tables. @@ -71,7 +75,7 @@ Implementation hints: ```task id: MKTT-WP-0007-T002 -status: todo +status: done priority: high state_hub_task_id: "fb9eaa9d-5c20-49a9-a7a6-acae28ac5e20" ``` @@ -86,6 +90,9 @@ mkt ast stats Use the current parsed document and token model. Do not require cache presence for single-file use. +Implemented: `mkt ast show <file>` and `mkt ast stats <file>` with JSON, YAML, +tree/text output modes. + ## P7.3 - Add optional JSONPath query adapter ```task @@ -102,7 +109,7 @@ shared query result envelope. ```task id: MKTT-WP-0007-T004 -status: todo +status: done priority: medium state_hub_task_id: "479f11a3-4ab4-451b-991c-7f143f2bffea" ``` @@ -121,6 +128,11 @@ Implementation hints: - Preserve source spans and content-unit ids from WP-0010 reference/literate layers. +Implemented: source, heading, section, block, dependency, and metadata tables; +document/frontmatter/metrics/provenance JSON payloads; hot-path indexes on +path, content hash, snapshot id, parser identity, unit path, and dependency +target. + ## P7.5 - Add FTS5 section/block search ```task @@ -137,7 +149,7 @@ relevance ranking. ```task id: MKTT-WP-0007-T006 -status: todo +status: done priority: medium state_hub_task_id: "7d9472e6-0716-435b-866c-d2c66ad786cf" ``` @@ -156,6 +168,11 @@ Implementation hints: - Report planned vs actual counts for hash, parse, index, metadata update, delete, and invalidation work. 
+Implemented first pass: `LocalSnapshotStore.build()` drives refresh from +`SnapshotRefreshPlan`, hashes metadata-changed files by default, skips +unchanged content, updates metadata-only rows, refreshes changed snapshots, and +deletes removed files. + ## P7.7 - Add local index CLI ```task @@ -174,6 +191,10 @@ mkt cache query mkt search ``` +Partial implementation: `mkt cache init` initializes the local SQLite store and +`mkt cache index <paths>` builds or refreshes it. Cache-backed query and FTS +search remain part of this task. + ## Exit Criteria - Legacy AST/JSONPath value is recovered as an optional backend.