SQLite-backed local snapshot store

This commit is contained in:
2026-05-04 08:56:41 +02:00
parent 0d1ad21a9f
commit 36ff4cedab
7 changed files with 926 additions and 5 deletions

View File

@@ -32,6 +32,13 @@ from markitect_tool.backend.interfaces import (
QueryAdapter,
SnapshotBackend,
)
from markitect_tool.backend.local_store import (
DEFAULT_LOCAL_INDEX_PATH,
LOCAL_INDEX_SCHEMA_VERSION,
LocalIndexBuildResult,
LocalSnapshotStore,
local_index_path_for,
)
__all__ = [
"BACKEND_CAPABILITIES",
@@ -60,4 +67,9 @@ __all__ = [
"ProcessorResultStore",
"QueryAdapter",
"SnapshotBackend",
"DEFAULT_LOCAL_INDEX_PATH",
"LOCAL_INDEX_SCHEMA_VERSION",
"LocalIndexBuildResult",
"LocalSnapshotStore",
"local_index_path_for",
]

View File

@@ -0,0 +1,510 @@
"""Local SQLite snapshot and metadata store."""
from __future__ import annotations
import json
import sqlite3
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from markitect_tool.backend.engine import (
EMPTY_PARSE_OPTIONS_HASH,
PARSER_ID,
PARSER_VERSION,
DependencyEdge,
ProvenanceEnvelope,
snapshot_identity_for_file,
)
from markitect_tool.backend.planning import SnapshotState, plan_snapshot_refresh
from markitect_tool.cache import scan_markdown_files
from markitect_tool.contract import collect_metrics
from markitect_tool.core import parse_markdown_file
# Relative location of the SQLite index file. `LocalSnapshotStore` uses it as
# its default path and `local_index_path_for()` joins it onto a root directory.
DEFAULT_LOCAL_INDEX_PATH = ".markitect/cache/index.sqlite3"
# Value stored under the `schema_version` key of the `meta` table by
# `LocalSnapshotStore.initialize()`.
LOCAL_INDEX_SCHEMA_VERSION = "1"
@dataclass(frozen=True)
class LocalIndexBuildResult:
    """Outcome report for a single local index build or refresh run.

    The four trailing lists name the relative paths that were parsed,
    indexed, metadata-refreshed, or deleted during the run.
    """

    index_path: str
    root: str
    paths: list[str]
    planned: dict[str, Any]
    parsed: list[str] = field(default_factory=list)
    indexed: list[str] = field(default_factory=list)
    metadata_updated: list[str] = field(default_factory=list)
    deleted: list[str] = field(default_factory=list)

    @property
    def dirty(self) -> bool:
        """Return True when at least one change list is non-empty."""
        change_lists = (self.parsed, self.indexed, self.metadata_updated, self.deleted)
        return any(change_lists)

    def to_dict(self) -> dict[str, Any]:
        """Return the dataclass fields plus derived `dirty` and `counts` entries."""
        payload = asdict(self)
        payload["dirty"] = self.dirty
        # Key order of `counts` is fixed: parsed, indexed, metadata_updated, deleted.
        payload["counts"] = {
            name: len(getattr(self, name))
            for name in ("parsed", "indexed", "metadata_updated", "deleted")
        }
        return payload
class _ClosingConnection(sqlite3.Connection):
    """SQLite connection whose ``with`` block also closes the handle.

    The stock ``sqlite3.Connection`` context manager only commits or rolls
    back; it leaves the connection open. The store opens one short-lived
    connection per operation, so closing on exit prevents handle leaks.
    """

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            # Commit on success / roll back on error, as the base class does.
            return super().__exit__(exc_type, exc_value, traceback)
        finally:
            self.close()


class LocalSnapshotStore:
    """SQLite-backed local snapshot store for parsed Markdown documents.

    Each public method opens its own connection, runs inside a single
    transaction, and closes the connection when done.
    """

    def __init__(self, path: str | Path = DEFAULT_LOCAL_INDEX_PATH) -> None:
        """Remember the index file location; nothing is created on disk yet."""
        self.path = Path(path)

    def initialize(self) -> None:
        """Create the index file, tables and indexes if missing.

        Also records ``LOCAL_INDEX_SCHEMA_VERSION`` under the
        ``schema_version`` key of the ``meta`` table.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with self._connect() as conn:
            _create_schema(conn)
            conn.execute(
                """
                insert into meta(key, value) values('schema_version', ?)
                on conflict(key) do update set value = excluded.value
                """,
                (LOCAL_INDEX_SCHEMA_VERSION,),
            )

    def load_state(self) -> list[SnapshotState]:
        """Load cheap refresh-planning state without loading document JSON.

        Returns an empty list when the index file does not exist. Rows are
        ordered by path and carry the dependency edges stored for that path.
        """
        if not self.path.exists():
            return []
        with self._connect() as conn:
            _create_schema(conn)
            rows = conn.execute(
                """
                select path, size, mtime_ns, content_hash, snapshot_id, parser,
                       parser_version, parse_options_hash, contract_hash, indexed
                from sources
                order by path
                """
            ).fetchall()
            dependencies = _load_dependencies(conn)
        # `rows` and `dependencies` are fully materialised, so they remain
        # usable after the connection has been closed.
        return [
            SnapshotState(
                path=row["path"],
                size=row["size"],
                mtime_ns=row["mtime_ns"],
                content_hash=row["content_hash"],
                snapshot_id=row["snapshot_id"],
                parser=row["parser"],
                parser_version=row["parser_version"],
                parse_options_hash=row["parse_options_hash"],
                contract_hash=row["contract_hash"],
                indexed=bool(row["indexed"]),
                dependencies=dependencies.get(row["path"], []),
            )
            for row in rows
        ]

    def put_file(
        self,
        path: str | Path,
        *,
        root: str | Path = ".",
        parse_options: dict[str, Any] | None = None,
        contract_hash: str | None = None,
    ) -> SnapshotState:
        """Parse and persist one Markdown file.

        The source row is upserted by its root-relative path, and the derived
        heading/section/block rows for that path are replaced in the same
        transaction. Returns the stored state with ``indexed=True``.
        """
        self.initialize()
        file_path = Path(path)
        root_path = Path(root).resolve()
        relative_path = _relative(file_path, root_path)
        identity = snapshot_identity_for_file(
            file_path,
            parse_options=parse_options,
            contract_hash=contract_hash,
        )
        document = parse_markdown_file(file_path)
        # Serialise once; the same dict feeds both the JSON column and the
        # derived unit tables.
        document_data = document.to_dict()
        metrics = collect_metrics(document).to_dict()
        stat = file_path.stat()
        now = datetime.now(timezone.utc).isoformat()
        provenance = ProvenanceEnvelope(
            operation="local_snapshot_store.put_file",
            snapshot_id=identity.snapshot_id,
            source_path=relative_path,
            content_hash=identity.content_hash,
            backend_id="local-sqlite",
        )
        with self._connect() as conn:
            _create_schema(conn)
            conn.execute(
                """
                insert into sources(
                    path, abs_path, size, mtime_ns, content_hash, snapshot_id,
                    parser, parser_version, parse_options_hash, contract_hash,
                    indexed, document_json, frontmatter_json, metrics_json,
                    provenance_json, updated_at
                ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, ?, ?, ?, ?, ?)
                on conflict(path) do update set
                    abs_path = excluded.abs_path,
                    size = excluded.size,
                    mtime_ns = excluded.mtime_ns,
                    content_hash = excluded.content_hash,
                    snapshot_id = excluded.snapshot_id,
                    parser = excluded.parser,
                    parser_version = excluded.parser_version,
                    parse_options_hash = excluded.parse_options_hash,
                    contract_hash = excluded.contract_hash,
                    indexed = excluded.indexed,
                    document_json = excluded.document_json,
                    frontmatter_json = excluded.frontmatter_json,
                    metrics_json = excluded.metrics_json,
                    provenance_json = excluded.provenance_json,
                    updated_at = excluded.updated_at
                """,
                (
                    relative_path,
                    str(file_path.resolve()),
                    stat.st_size,
                    stat.st_mtime_ns,
                    identity.content_hash,
                    identity.snapshot_id,
                    identity.parser,
                    identity.parser_version,
                    identity.parse_options_hash,
                    identity.contract_hash,
                    _json(document_data),
                    _json(document.frontmatter),
                    _json(metrics),
                    _json(provenance.to_dict()),
                    now,
                ),
            )
            _replace_document_units(conn, relative_path, identity.snapshot_id, document_data)
        return SnapshotState(
            path=relative_path,
            size=stat.st_size,
            mtime_ns=stat.st_mtime_ns,
            content_hash=identity.content_hash,
            snapshot_id=identity.snapshot_id,
            parser=identity.parser,
            parser_version=identity.parser_version,
            parse_options_hash=identity.parse_options_hash,
            contract_hash=identity.contract_hash,
            indexed=True,
        )

    def update_metadata(self, path: str, *, root: str | Path = ".") -> None:
        """Refresh the stored size, mtime and timestamp for ``path``.

        ``path`` is joined onto ``root`` to stat the file. The stored content
        hash and document JSON are left untouched.
        """
        file_path = Path(root) / path
        stat = file_path.stat()
        with self._connect() as conn:
            _create_schema(conn)
            conn.execute(
                "update sources set size = ?, mtime_ns = ?, updated_at = ? where path = ?",
                (stat.st_size, stat.st_mtime_ns, datetime.now(timezone.utc).isoformat(), path),
            )

    def delete_path(self, path: str) -> None:
        """Delete one indexed source and its derived rows.

        Does nothing when the index file does not exist.
        """
        if not self.path.exists():
            return
        with self._connect() as conn:
            _create_schema(conn)
            conn.execute("delete from blocks where path = ?", (path,))
            conn.execute("delete from sections where path = ?", (path,))
            conn.execute("delete from headings where path = ?", (path,))
            conn.execute("delete from dependencies where path = ?", (path,))
            conn.execute("delete from sources where path = ?", (path,))

    def get_document(self, path: str) -> dict[str, Any]:
        """Return stored document JSON for a relative source path.

        Raises:
            KeyError: when no row exists for ``path``, including when the
                index file itself has not been created yet.
        """
        if not self.path.exists():
            # Match load_state()/delete_path(): a read must never create an
            # empty database file as a side effect.
            raise KeyError(f"No indexed document `{path}`")
        with self._connect() as conn:
            _create_schema(conn)
            row = conn.execute(
                "select document_json from sources where path = ?",
                (path,),
            ).fetchone()
        if row is None:
            raise KeyError(f"No indexed document `{path}`")
        return json.loads(row["document_json"])

    def build(
        self,
        paths: list[str | Path],
        *,
        root: str | Path = ".",
        recursive: bool = True,
        parse_options: dict[str, Any] | None = None,
        contract_hash: str | None = None,
        verify_hashes: bool = True,
    ) -> LocalIndexBuildResult:
        """Incrementally build or refresh the local index.

        A refresh plan is computed against the currently stored state, then
        each plan entry is applied by its actions: ``delete`` removes the
        path, ``parse``/``index`` re-parse and store the file, ``metadata``
        only refreshes size and mtime. Returns a summary of what was done.
        """
        self.initialize()
        root_path = Path(root).resolve()
        plan = plan_snapshot_refresh(
            paths,
            previous=self.load_state(),
            root=root_path,
            recursive=recursive,
            parse_options=parse_options,
            contract_hash=contract_hash,
            verify_hashes=verify_hashes,
        )
        # Map root-relative plan paths back to the files found on disk.
        current_files = {
            _relative(path, root_path): path
            for path in scan_markdown_files(paths, recursive=recursive)
        }
        parsed: list[str] = []
        indexed: list[str] = []
        metadata_updated: list[str] = []
        deleted: list[str] = []
        for entry in plan.entries:
            if "delete" in entry.actions:
                self.delete_path(entry.path)
                deleted.append(entry.path)
                continue
            if "parse" in entry.actions or "index" in entry.actions:
                file_path = current_files.get(entry.path)
                if file_path is None:
                    # Planned path was not found by the scan; skip it.
                    continue
                self.put_file(
                    file_path,
                    root=root_path,
                    parse_options=parse_options,
                    contract_hash=contract_hash,
                )
                if "parse" in entry.actions:
                    parsed.append(entry.path)
                if "index" in entry.actions:
                    indexed.append(entry.path)
                continue
            if "metadata" in entry.actions:
                self.update_metadata(entry.path, root=root_path)
                metadata_updated.append(entry.path)
        return LocalIndexBuildResult(
            index_path=str(self.path),
            root=str(root_path),
            paths=[str(path) for path in paths],
            planned=plan.to_dict(),
            parsed=parsed,
            indexed=indexed,
            metadata_updated=metadata_updated,
            deleted=deleted,
        )

    def _connect(self) -> sqlite3.Connection:
        """Open a connection that commits/rolls back and closes on ``with`` exit."""
        conn = sqlite3.connect(self.path, factory=_ClosingConnection)
        conn.row_factory = sqlite3.Row
        conn.execute("pragma foreign_keys = on")
        return conn
def local_index_path_for(root: str | Path, index_path: str | Path | None = None) -> Path:
"""Return the local SQLite index path for a root and optional override."""
path = Path(index_path or DEFAULT_LOCAL_INDEX_PATH)
if path.is_absolute():
return path
return Path(root) / path
def _create_schema(conn: sqlite3.Connection) -> None:
conn.executescript(
"""
create table if not exists meta(
key text primary key,
value text not null
);
create table if not exists sources(
path text primary key,
abs_path text not null,
size integer not null,
mtime_ns integer not null,
content_hash text not null,
snapshot_id text not null unique,
parser text not null,
parser_version text not null,
parse_options_hash text not null,
contract_hash text,
indexed integer not null default 1,
document_json text not null,
frontmatter_json text not null,
metrics_json text not null,
provenance_json text not null,
updated_at text not null
);
create table if not exists headings(
snapshot_id text not null,
path text not null,
idx integer not null,
level integer not null,
text text not null,
line integer not null,
primary key(snapshot_id, idx)
);
create table if not exists sections(
snapshot_id text not null,
path text not null,
idx integer not null,
heading_text text not null,
heading_level integer not null,
line integer not null,
text text not null,
line_start integer,
line_end integer,
primary key(snapshot_id, idx)
);
create table if not exists blocks(
snapshot_id text not null,
path text not null,
idx integer not null,
type text not null,
text text not null,
line_start integer,
line_end integer,
heading_level integer,
primary key(snapshot_id, idx)
);
create table if not exists dependencies(
path text not null,
source_id text not null,
target text not null,
kind text not null,
target_snapshot_id text,
metadata_json text not null default '{}'
);
create index if not exists idx_sources_content_hash on sources(content_hash);
create index if not exists idx_sources_snapshot_id on sources(snapshot_id);
create index if not exists idx_sources_parser on sources(parser, parser_version);
create index if not exists idx_headings_path on headings(path);
create index if not exists idx_sections_path on sections(path);
create index if not exists idx_blocks_path on blocks(path);
create index if not exists idx_dependencies_target on dependencies(target);
"""
)
def _replace_document_units(
conn: sqlite3.Connection,
path: str,
snapshot_id: str,
document: dict[str, Any],
) -> None:
conn.execute("delete from blocks where path = ?", (path,))
conn.execute("delete from sections where path = ?", (path,))
conn.execute("delete from headings where path = ?", (path,))
for idx, heading in enumerate(document.get("headings", [])):
conn.execute(
"""
insert into headings(snapshot_id, path, idx, level, text, line)
values (?, ?, ?, ?, ?, ?)
""",
(
snapshot_id,
path,
idx,
int(heading["level"]),
str(heading["text"]),
int(heading["line"]),
),
)
for idx, section in enumerate(document.get("sections", [])):
heading = section["heading"]
text = "\n\n".join(str(block.get("text", "")) for block in section.get("blocks", []))
line_start = _first_present(block.get("line_start") for block in section.get("blocks", []))
line_end = _last_present(block.get("line_end") for block in section.get("blocks", []))
conn.execute(
"""
insert into sections(
snapshot_id, path, idx, heading_text, heading_level, line,
text, line_start, line_end
) values (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
snapshot_id,
path,
idx,
str(heading["text"]),
int(heading["level"]),
int(heading["line"]),
text,
line_start,
line_end,
),
)
for idx, block in enumerate(document.get("blocks", [])):
conn.execute(
"""
insert into blocks(
snapshot_id, path, idx, type, text, line_start, line_end, heading_level
) values (?, ?, ?, ?, ?, ?, ?, ?)
""",
(
snapshot_id,
path,
idx,
str(block["type"]),
str(block.get("text", "")),
block.get("line_start"),
block.get("line_end"),
block.get("heading_level"),
),
)
def _load_dependencies(conn: sqlite3.Connection) -> dict[str, list[DependencyEdge]]:
    """Read every dependency row and group the edges by source path.

    Rows are read in (path, source_id, target) order, so each per-path list
    keeps that order. Empty ``metadata_json`` is treated as an empty object.
    """
    cursor = conn.execute(
        """
        select path, source_id, target, kind, target_snapshot_id, metadata_json
        from dependencies
        order by path, source_id, target
        """
    )
    grouped: dict[str, list[DependencyEdge]] = {}
    for record in cursor.fetchall():
        owner = record["path"]
        if owner not in grouped:
            grouped[owner] = []
        edge = DependencyEdge(
            source_id=record["source_id"],
            target=record["target"],
            kind=record["kind"],
            target_snapshot_id=record["target_snapshot_id"],
            metadata=json.loads(record["metadata_json"] or "{}"),
        )
        grouped[owner].append(edge)
    return grouped
def _relative(path: Path, root: Path) -> str:
resolved = path.resolve()
try:
return resolved.relative_to(root).as_posix()
except ValueError:
return resolved.as_posix()
def _json(data: Any) -> str:
return json.dumps(data, sort_keys=True, ensure_ascii=False)
def _first_present(values: Any) -> int | None:
for value in values:
if value is not None:
return int(value)
return None
def _last_present(values: Any) -> int | None:
found: int | None = None
for value in values:
if value is not None:
found = int(value)
return found

View File

@@ -18,8 +18,10 @@ from markitect_tool.cache import (
)
from markitect_tool.backend import (
BackendRegistryError,
LocalSnapshotStore,
load_backend_registry,
load_snapshot_state_file,
local_index_path_for,
plan_snapshot_refresh,
snapshot_identity_for_file,
)
@@ -95,6 +97,51 @@ def parse(file: Path, output_format: str) -> None:
click.echo(json.dumps(data, indent=2, ensure_ascii=False))
# Command group registered on `main`; the `show` and `stats` subcommands
# below attach to it via `@ast.command(...)`.
# NOTE(review): the docstring is presumably what click prints as the group's
# help text, so treat it as user-facing output.
@main.group()
def ast() -> None:
    """Inspect parsed Markdown ASTs and parser summaries."""
@ast.command("show")
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "tree"], case_sensitive=False),
    default="json",
    show_default=True,
)
def ast_show(file: Path, output_format: str) -> None:
    """Show a parsed Markdown AST without requiring a cache."""
    # The file is parsed directly on every call; no index is consulted.
    parsed = parse_markdown_file(file)
    payload = parsed.to_dict()

    if output_format == "tree":
        # One line per heading, prefixed with as many '#' as its level.
        for node in parsed.headings:
            click.echo(f"{'#' * node.level} {node.text}")
        return

    if output_format == "yaml":
        rendered = yaml.safe_dump(payload, sort_keys=False)
    else:
        rendered = json.dumps(payload, indent=2, ensure_ascii=False)
    click.echo(rendered)
@ast.command("stats")
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def ast_stats(file: Path, output_format: str) -> None:
    """Summarize parsed Markdown AST shape and token distribution."""
    # Parse, reduce the document dict to summary statistics, then print them.
    parsed = parse_markdown_file(file).to_dict()
    summary = _ast_stats(parsed, str(file))
    _emit_ast_stats(summary, output_format)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
@@ -726,6 +773,40 @@ def cache() -> None:
"""Fingerprint Markdown files and detect changed inputs."""
@cache.command("init")
@click.option(
    "--root",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    default=Path("."),
    show_default=True,
    help="Root used for the default local index path.",
)
@click.option(
    "--index-path",
    type=click.Path(dir_okay=False, path_type=Path),
    help="SQLite index path. Defaults to .markitect/cache/index.sqlite3 under root.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def cache_init(root: Path, index_path: Path | None, output_format: str) -> None:
    """Initialize the local SQLite snapshot/index store."""
    # Imported here so the reported version always matches what
    # `LocalSnapshotStore.initialize()` writes, instead of a duplicated "1".
    from markitect_tool.backend import LOCAL_INDEX_SCHEMA_VERSION

    resolved_index = local_index_path_for(root, index_path)
    store = LocalSnapshotStore(resolved_index)
    store.initialize()
    data = {
        "index_path": str(resolved_index),
        "schema_version": LOCAL_INDEX_SCHEMA_VERSION,
        # Number of sources already in the index (0 for a fresh one).
        "sources": len(store.load_state()),
    }
    _emit_local_index_data(data, output_format)
@cache.command("fingerprint")
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
@@ -833,6 +914,68 @@ def cache_status(
raise click.exceptions.Exit(1 if status.dirty else 0)
@cache.command("index")
@click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path))
@click.option(
    "--root",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    default=Path("."),
    show_default=True,
    help="Root used for relative index paths.",
)
@click.option(
    "--index-path",
    type=click.Path(dir_okay=False, path_type=Path),
    help="SQLite index path. Defaults to .markitect/cache/index.sqlite3 under root.",
)
@click.option("--no-recursive", is_flag=True, help="Do not recurse into directories.")
@click.option(
    "--no-verify-hashes",
    is_flag=True,
    help="Do not hash metadata-changed files before parsing.",
)
@click.option(
    "--parse-option",
    "parse_options",
    multiple=True,
    metavar="KEY=VALUE",
    help="Parse option included in the snapshot identity hash.",
)
@click.option("--contract-hash", help="Optional contract hash included in snapshot identity.")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def cache_index(
    paths: tuple[Path, ...],
    root: Path,
    index_path: Path | None,
    no_recursive: bool,
    no_verify_hashes: bool,
    parse_options: tuple[str, ...],
    contract_hash: str | None,
    output_format: str,
) -> None:
    """Build or refresh the local SQLite snapshot/index store."""
    try:
        target = local_index_path_for(root, index_path)
        index_store = LocalSnapshotStore(target)
        # Option parsing stays inside the try so a ValueError it raises is
        # reported as a CLI error like one from the build itself.
        summary = index_store.build(
            [*paths],
            root=root,
            recursive=not no_recursive,
            parse_options=_parse_key_value_options(parse_options),
            contract_hash=contract_hash,
            verify_hashes=not no_verify_hashes,
        )
    except ValueError as exc:
        raise click.ClickException(str(exc)) from exc
    else:
        _emit_local_index_data(summary.to_dict(), output_format)
# Command group registered on the top-level `main` command.
# NOTE(review): the docstring is presumably shown by click as the group's
# help text, so treat it as user-facing output.
@main.group()
def template() -> None:
    """Render and inspect deterministic Markdown templates."""
@@ -1213,6 +1356,42 @@ def _emit_cache_data(data: dict, output_format: str) -> None:
click.echo(f"written: {data['written']}")
def _emit_ast_stats(data: dict, output_format: str) -> None:
    """Print AST statistics as JSON, YAML, or a plain-text listing."""
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    # Plain text: path, one line per count, max depth, then the token histogram.
    click.echo(f"document_path: {data['document_path']}")
    for label, amount in data["counts"].items():
        click.echo(f"{label}: {amount}")
    click.echo(f"max_heading_depth: {data['max_heading_depth']}")
    histogram = data["token_types"]
    if histogram:
        click.echo("token_types:")
        for name, occurrences in histogram.items():
            click.echo(f"- {name}: {occurrences}")
def _emit_local_index_data(data: dict, output_format: str) -> None:
    """Print local index data as JSON, YAML, or a plain-text listing."""
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    click.echo(f"index_path: {data['index_path']}")
    # Optional fields are printed only when the payload carries them.
    schema_version = data.get("schema_version")
    if schema_version:
        click.echo(f"schema_version: {schema_version}")
    source_count = data.get("sources")
    if source_count is not None:
        click.echo(f"sources: {source_count}")
    dirty_flag = data.get("dirty")
    if dirty_flag is not None:
        click.echo("dirty" if dirty_flag else "clean")
    # Each change list prints its size followed by one bullet per path;
    # a missing list is reported as empty.
    for section in ("parsed", "indexed", "metadata_updated", "deleted"):
        members = data.get(section, [])
        click.echo(f"{section}: {len(members)}")
        for member in members:
            click.echo(f"- {member}")
def _emit_reference_result(data: dict, output_format: str) -> None:
if output_format == "json":
click.echo(json.dumps(data, indent=2, ensure_ascii=False))
@@ -1404,6 +1583,29 @@ def _set_path(mapping: dict[str, object], path: list[str], value: object) -> Non
current[path[-1]] = value
def _ast_stats(document: dict, document_path: str) -> dict:
token_types: dict[str, int] = {}
for token in document.get("tokens", []):
token_type = str(token.get("type", "unknown"))
token_types[token_type] = token_types.get(token_type, 0) + 1
headings = document.get("headings", [])
return {
"document_path": document_path,
"source_path": document.get("source_path"),
"counts": {
"frontmatter_keys": len(document.get("frontmatter", {})),
"headings": len(headings),
"sections": len(document.get("sections", [])),
"blocks": len(document.get("blocks", [])),
"tokens": len(document.get("tokens", [])),
},
"max_heading_depth": max(
[int(heading.get("level", 0)) for heading in headings] or [0]
),
"token_types": dict(sorted(token_types.items())),
}
def _load_template_data(data_file: Path | None) -> dict[str, object]:
if data_file is None:
return {}