From 36bfa33fb9888de6890068ae8640549a00dcf7c2 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 17 May 2026 11:30:49 +0200 Subject: [PATCH] IB-WP-0014: archive integration with artifact-store (T01+T02) Reframe IB-WP-0014 from "in-repo S3/git backend adapters" to "durable archive surface via artifact-store". The live infospace stays in a local working folder; finalized snapshots are bundled into content-addressed artifact-store packages. - New module infospace_bench.archive: archive_infospace(), list_archives(), ArchiveRecord. Self-bootstraps a SQLite + local-FS registry under output/archives/.store/ when no Registry is passed in. - New output/archives/index.yaml records each archive event (package id, manifest digest, retention class, included paths, file count, note). - artifactstore added as a path dep; Python floor bumped to 3.12 to match. - Makefile for venv-based dev setup; stack-and-commands.md updated. - tests/test_archive.py covers index write, list, recursive-capture guard, caller-supplied include, and empty-include error. Full suite 65 passed. Remaining tasks (T03 list CLI, T04 restore, T05 docs) tracked in the workplan. Co-Authored-By: Claude Opus 4.7 --- .claude/rules/stack-and-commands.md | 9 + Makefile | 29 ++ pyproject.toml | 3 +- src/infospace_bench/__init__.py | 4 + src/infospace_bench/archive.py | 325 ++++++++++++++++++ tests/test_archive.py | 101 ++++++ ...B-WP-0014-infospace-backend-abstraction.md | 242 ++++++++----- 7 files changed, 628 insertions(+), 85 deletions(-) create mode 100644 Makefile create mode 100644 src/infospace_bench/archive.py create mode 100644 tests/test_archive.py diff --git a/.claude/rules/stack-and-commands.md b/.claude/rules/stack-and-commands.md index a7096eb..7a75863 100644 --- a/.claude/rules/stack-and-commands.md +++ b/.claude/rules/stack-and-commands.md @@ -3,6 +3,15 @@ The implementation stack is not established yet. Until it is, prefer documentation and small scaffold changes over choosing frameworks prematurely. 
+The Python package depends on path deps (`markitect-tool`, `artifactstore`) +that bring heavy runtime dependencies. Use the Makefile to provision a +local venv before running tests: + +```bash +make install # creates ./.venv with all path deps +make test # full pytest suite (must run via .venv/bin/python) +``` + Useful commands: ```bash diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..091b20a --- /dev/null +++ b/Makefile @@ -0,0 +1,29 @@ +PYTHON ?= .venv/bin/python +UV ?= uv + +.PHONY: help venv install test test-archive clean + +help: + @echo "infospace-bench dev targets" + @echo + @echo " make venv create ./.venv via uv" + @echo " make install install path deps (markitect-tool, artifactstore) into .venv" + @echo " make test run the full pytest suite" + @echo " make test-archive run only the artifact-store archive integration tests" + @echo " make clean remove ./.venv and pytest caches" + +venv: + $(UV) venv + +install: venv + $(UV) pip install -e . + $(UV) pip install pytest pytest-asyncio + +test: + $(PYTHON) -m pytest -q + +test-archive: + $(PYTHON) -m pytest tests/test_archive.py -q + +clean: + rm -rf .venv .pytest_cache diff --git a/pyproject.toml b/pyproject.toml index 5f4c049..415e819 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,10 +2,11 @@ name = "infospace-bench" version = "0.1.0" description = "Application-layer workspace for concrete structured knowledge spaces." 
-requires-python = ">=3.11" +requires-python = ">=3.12" dependencies = [ "PyYAML>=6", "markitect-tool @ file:///home/worsch/markitect-tool", + "artifactstore @ file:///home/worsch/artifact-store", ] [project.scripts] diff --git a/src/infospace_bench/__init__.py b/src/infospace_bench/__init__.py index c5e1069..dfaf84d 100644 --- a/src/infospace_bench/__init__.py +++ b/src/infospace_bench/__init__.py @@ -1,3 +1,4 @@ +from .archive import ArchiveRecord, archive_infospace, list_archives from .errors import InfospaceError from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue, ScoreEntry from .engine import ( @@ -40,6 +41,7 @@ from .semantics import EntityRecord, RelationRecord, list_entities, list_relatio from .workflow import load_workflows, plan_workflow, run_workflow __all__ = [ + "ArchiveRecord", "DisciplineBinding", "EntityEvaluation", "EvaluationSnapshot", @@ -60,11 +62,13 @@ __all__ = [ "ViabilityThreshold", "add_artifact", "append_to_history", + "archive_infospace", "create_infospace", "engine_capability_contract", "find_snapshot", "get_history", "get_latest_snapshot", + "list_archives", "list_entities", "list_relations", "load_infospace", diff --git a/src/infospace_bench/archive.py b/src/infospace_bench/archive.py new file mode 100644 index 0000000..9fb57ea --- /dev/null +++ b/src/infospace_bench/archive.py @@ -0,0 +1,325 @@ +"""Archive integration with artifact-store (IB-WP-0014). + +The live infospace stays in a local working folder. This module bundles a +finalized snapshot of the infospace (or a curated slice) into a content- +addressed artifact-store package, finalizes it, and records the returned +package id and manifest digest under ``output/archives/index.yaml``. + +Storage backend choice (local FS in artifact-store v0.1, S3 in WP-0004) is +delegated to artifact-store - it is not re-implemented here. 
+""" + +from __future__ import annotations + +import asyncio +import fnmatch +import mimetypes +import os +from collections.abc import AsyncIterator, Iterable +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import yaml +from sqlalchemy import insert, select +from sqlalchemy.ext.asyncio import create_async_engine + +from artifactstore.dataplane import InProcessDataPlane +from artifactstore.db.schema import metadata as artifactstore_metadata +from artifactstore.db.schema import retention_classes as retention_classes_table +from artifactstore.db.seed import RETENTION_CLASS_SEEDS +from artifactstore.events import RegistryViewWriter +from artifactstore.registry import Registry +from artifactstore.storage import LocalBackend + +from .errors import InfospaceError +from .lifecycle import load_infospace + +ARCHIVE_INDEX_PATH = "output/archives/index.yaml" +ARCHIVE_STORE_DIR = "output/archives/.store" +ARCHIVE_DB_NAME = "registry.sqlite" +ARCHIVE_BACKEND_DIR = "storage" + +DEFAULT_INCLUDE: tuple[str, ...] 
= ( + "infospace.yaml", + "artifacts", + "workflows", + "output", + "reports", + "exports", +) +DEFAULT_RETENTION_CLASS = "release-evidence" +PRODUCER = "infospace-bench" +DEFAULT_ACTOR = "infospace-bench" + + +@dataclass(frozen=True) +class ArchiveRecord: + """One row of ``output/archives/index.yaml``.""" + + package_id: str + manifest_digest: str + retention_class: str + created_at: str + included_paths: list[str] + file_count: int + note: str = "" + producer: str = PRODUCER + subject: str = "" + store_root: str | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + out: dict[str, Any] = { + "package_id": self.package_id, + "manifest_digest": self.manifest_digest, + "retention_class": self.retention_class, + "created_at": self.created_at, + "included_paths": list(self.included_paths), + "file_count": self.file_count, + "note": self.note, + "producer": self.producer, + "subject": self.subject, + } + if self.store_root is not None: + out["store_root"] = self.store_root + if self.metadata: + out["metadata"] = dict(self.metadata) + return out + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ArchiveRecord": + return cls( + package_id=str(data["package_id"]), + manifest_digest=str(data["manifest_digest"]), + retention_class=str(data["retention_class"]), + created_at=str(data["created_at"]), + included_paths=list(data.get("included_paths", [])), + file_count=int(data.get("file_count", 0)), + note=str(data.get("note", "")), + producer=str(data.get("producer", PRODUCER)), + subject=str(data.get("subject", "")), + store_root=( + str(data["store_root"]) if data.get("store_root") is not None else None + ), + metadata=dict(data.get("metadata", {})), + ) + + +def archive_infospace( + root: str | Path, + *, + retention_class: str = DEFAULT_RETENTION_CLASS, + include: Iterable[str] | None = None, + exclude: Iterable[str] | None = None, + note: str = "", + registry: Registry | None = None, + store_root: str 
| Path | None = None, + actor: str = DEFAULT_ACTOR, +) -> ArchiveRecord: + """Bundle the infospace at ``root`` into an artifact-store package. + + Returns the new :class:`ArchiveRecord` and appends it to the infospace's + ``output/archives/index.yaml``. When ``registry`` is None, a self-contained + SQLite + local-FS registry is built under ``store_root`` (default: + ``/output/archives/.store/``). + """ + + include_tuple = tuple(include) if include else DEFAULT_INCLUDE + exclude_tuple = tuple(exclude or ()) + return asyncio.run( + _archive_infospace_async( + Path(root), + retention_class=retention_class, + include=include_tuple, + exclude=exclude_tuple, + note=note, + registry=registry, + store_root=Path(store_root) if store_root else None, + actor=actor, + ) + ) + + +def list_archives(root: str | Path) -> list[ArchiveRecord]: + """Return the recorded archive entries for an infospace.""" + path = Path(root) / ARCHIVE_INDEX_PATH + if not path.exists(): + return [] + raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + if not isinstance(raw, dict): + raise InfospaceError( + "invalid_archive_index", + f"Archive index must be a mapping: {path}", + {"path": str(path)}, + ) + items = raw.get("archives", []) + if not isinstance(items, list): + raise InfospaceError( + "invalid_archive_index", + f"Archive index 'archives' must be a list: {path}", + {"path": str(path)}, + ) + return [ArchiveRecord.from_dict(item) for item in items] + + +async def _archive_infospace_async( + root: Path, + *, + retention_class: str, + include: tuple[str, ...], + exclude: tuple[str, ...], + note: str, + registry: Registry | None, + store_root: Path | None, + actor: str, +) -> ArchiveRecord: + infospace = load_infospace(root) + subject = infospace.config.slug + auto_exclude = (ARCHIVE_STORE_DIR, ARCHIVE_INDEX_PATH) + effective_exclude = exclude + auto_exclude + + files = _collect_files(root, include=include, exclude=effective_exclude) + if not files: + raise InfospaceError( + 
"empty_archive", + "No files matched the include set for archiving", + {"root": str(root), "include": list(include)}, + ) + + owned_registry = registry is None + effective_store_root: Path | None = None + if owned_registry: + effective_store_root = store_root or (root / ARCHIVE_STORE_DIR) + registry = await _build_local_registry(effective_store_root) + + try: + assert registry is not None + package_id = await registry.create_package( + name=f"infospace {subject}", + producer=PRODUCER, + subject=subject, + retention_class=retention_class, + actor=actor, + metadata={ + "infospace_slug": subject, + "infospace_name": infospace.config.name, + "topic_domain": infospace.config.topic.domain, + "included_paths": list(include), + "note": note, + }, + ) + for relative_path, abs_path in files: + media = ( + mimetypes.guess_type(abs_path.name)[0] or "application/octet-stream" + ) + await registry.ingest_file( + package_id, + relative_path=relative_path, + media_type=media, + stream=_file_stream(abs_path), + actor=actor, + ) + manifest_addr = await registry.finalize_package(package_id, actor=actor) + pkg = await registry.get_package(package_id) + finally: + if owned_registry and registry is not None: + await registry.dispose() + + finalized_at = pkg.finalized_at or datetime.now(timezone.utc) + record = ArchiveRecord( + package_id=str(package_id), + manifest_digest=str(manifest_addr), + retention_class=retention_class, + created_at=finalized_at.isoformat(), + included_paths=list(include), + file_count=len(files), + note=note, + producer=PRODUCER, + subject=subject, + store_root=str(effective_store_root) if effective_store_root else None, + ) + _append_index(root, record) + return record + + +def _collect_files( + root: Path, + *, + include: tuple[str, ...], + exclude: tuple[str, ...], +) -> list[tuple[str, Path]]: + seen: dict[str, Path] = {} + for pattern in include: + target = root / pattern + if target.is_file(): + rel = pattern.replace(os.sep, "/") + if not _is_excluded(rel, 
exclude): + seen.setdefault(rel, target) + elif target.is_dir(): + for path in target.rglob("*"): + if not path.is_file(): + continue + rel = str(path.relative_to(root)).replace(os.sep, "/") + if _is_excluded(rel, exclude): + continue + seen.setdefault(rel, path) + return sorted(seen.items()) + + +def _is_excluded(rel_path: str, exclude: tuple[str, ...]) -> bool: + for pattern in exclude: + cleaned = pattern.rstrip("/") + if rel_path == cleaned or rel_path.startswith(cleaned + "/"): + return True + if fnmatch.fnmatch(rel_path, pattern): + return True + return False + + +async def _file_stream( + path: Path, + chunk_size: int = 1024 * 1024, +) -> AsyncIterator[bytes]: + with path.open("rb") as fh: + while True: + chunk = fh.read(chunk_size) + if not chunk: + break + yield chunk + + +def _append_index(root: Path, record: ArchiveRecord) -> None: + path = root / ARCHIVE_INDEX_PATH + path.parent.mkdir(parents=True, exist_ok=True) + existing: list[dict[str, Any]] = [] + if path.exists(): + raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + if isinstance(raw, dict): + items = raw.get("archives", []) + if isinstance(items, list): + existing = list(items) + existing.append(record.to_dict()) + path.write_text( + yaml.safe_dump({"archives": existing}, sort_keys=False), + encoding="utf-8", + ) + + +async def _build_local_registry(store_root: Path) -> Registry: + store_root.mkdir(parents=True, exist_ok=True) + db_path = store_root / ARCHIVE_DB_NAME + backend_root = store_root / ARCHIVE_BACKEND_DIR + engine = create_async_engine(f"sqlite+aiosqlite:///{db_path}", future=True) + async with engine.begin() as conn: + await conn.run_sync(artifactstore_metadata.create_all) + seeded = ( + await conn.execute(select(retention_classes_table).limit(1)) + ).first() + if seeded is None: + for seed in RETENTION_CLASS_SEEDS: + await conn.execute(insert(retention_classes_table).values(**seed)) + backend = LocalBackend(backend_root, backend_id="local") + dataplane = 
InProcessDataPlane(backend) + return Registry(engine, dataplane, RegistryViewWriter()) diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 0000000..441fd8c --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + +from infospace_bench import ( + ArchiveRecord, + InfospaceError, + add_artifact, + archive_infospace, + create_infospace, + list_archives, +) +from infospace_bench.archive import ( + ARCHIVE_INDEX_PATH, + ARCHIVE_STORE_DIR, + DEFAULT_RETENTION_CLASS, + PRODUCER, +) + + +def _seed_infospace(workspace: Path, slug: str = "demo") -> Path: + create_infospace(workspace, slug, name="Demo", topic_domain="Test") + root = workspace / "infospaces" / slug + source = workspace / "source.md" + source.write_text("# source\n", encoding="utf-8") + add_artifact(root, source, kind="source", title="Source One") + (root / "reports" / "summary.md").write_text("# summary\n", encoding="utf-8") + return root + + +def test_archive_infospace_writes_index_and_finalizes_package(tmp_path: Path) -> None: + root = _seed_infospace(tmp_path) + + record = archive_infospace(root, note="first archive") + + assert isinstance(record, ArchiveRecord) + assert record.package_id + assert record.manifest_digest.startswith("blake3:") + assert record.retention_class == DEFAULT_RETENTION_CLASS + assert record.producer == PRODUCER + assert record.subject == "demo" + assert record.note == "first archive" + assert record.file_count >= 4 # infospace.yaml, index.yaml, source.md, summary.md + + index_path = root / ARCHIVE_INDEX_PATH + assert index_path.is_file() + data = yaml.safe_load(index_path.read_text(encoding="utf-8")) + assert isinstance(data, dict) + assert len(data["archives"]) == 1 + assert data["archives"][0]["package_id"] == record.package_id + + store_root = root / ARCHIVE_STORE_DIR + assert (store_root / "registry.sqlite").is_file() + assert (store_root / 
"storage").is_dir() + + +def test_list_archives_returns_recorded_entries(tmp_path: Path) -> None: + root = _seed_infospace(tmp_path) + + assert list_archives(root) == [] + first = archive_infospace(root, note="alpha") + second = archive_infospace(root, note="beta") + + archives = list_archives(root) + assert [a.package_id for a in archives] == [first.package_id, second.package_id] + assert [a.note for a in archives] == ["alpha", "beta"] + + +def test_archive_excludes_store_dir_to_avoid_recursive_capture(tmp_path: Path) -> None: + root = _seed_infospace(tmp_path) + + first = archive_infospace(root) + second = archive_infospace(root) + + # The store dir grows on the first call; the second call must not pick up + # any of its bytes (otherwise file_count would balloon). + assert second.file_count == first.file_count + second_record = list_archives(root)[1] + assert all( + not path.startswith(ARCHIVE_STORE_DIR) + for path in second_record.included_paths + ) + + +def test_archive_respects_caller_supplied_include_set(tmp_path: Path) -> None: + root = _seed_infospace(tmp_path) + + record = archive_infospace(root, include=["infospace.yaml"]) + assert record.included_paths == ["infospace.yaml"] + assert record.file_count == 1 + + +def test_archive_rejects_empty_include(tmp_path: Path) -> None: + root = _seed_infospace(tmp_path) + + with pytest.raises(InfospaceError) as excinfo: + archive_infospace(root, include=["does-not-exist"]) + assert excinfo.value.code == "empty_archive" diff --git a/workplans/IB-WP-0014-infospace-backend-abstraction.md b/workplans/IB-WP-0014-infospace-backend-abstraction.md index 189d93b..460348a 100644 --- a/workplans/IB-WP-0014-infospace-backend-abstraction.md +++ b/workplans/IB-WP-0014-infospace-backend-abstraction.md @@ -1,77 +1,124 @@ --- id: IB-WP-0014 type: workplan -title: "Infospace Backend Abstraction" +title: "Infospace Archive Integration With artifact-store" domain: markitect repo: infospace-bench -status: todo +status: in_progress owner: 
markitect topic_slug: markitect created: "2026-05-14" -updated: "2026-05-14" +updated: "2026-05-17" state_hub_workstream_slug: "ib-wp-0014-infospace-backend-abstraction" state_hub_workstream_id: "c2d23ee7-6b2b-4db0-b660-a9e295c94956" --- -# IB-WP-0014 - Infospace Backend Abstraction +# IB-WP-0014 - Infospace Archive Integration With artifact-store ## Goal -Allow an infospace to live behind a selectable backend instead of assuming only -a local filesystem directory. - -Target backends: - -- local folder -- remote or mounted folder -- S3-compatible bucket/prefix -- git repository - -This is a new successor capability, not legacy parity. It should be designed so -generation, validation, evaluation, and inspection logic do not care where the -infospace is physically stored. +Let a finalized infospace state (or a curated slice of it) be preserved as a +durable, content-addressed package through `artifact-store`, while the live +infospace continues to live in a local working folder. ## Intent -The current repo is intentionally file-backed. That should remain the default. -The improvement is to formalize the storage boundary so the same lifecycle and -workflow APIs can operate on other backing stores through explicit adapters. +The original framing of this workplan asked for a pluggable storage backend +(local, remote folder, S3, git) *inside* `infospace-bench`. Looking at +`/home/worsch/artifact-store`, that is exactly the boundary the artifact-store +service is being built for: an immutable, content-addressed registry with +retention policy, holds, audit, manifests, and a pluggable storage adapter +SPI (local FS in v0.1, S3-compatible/Ceph RGW in WP-0004). -The design should keep `infospace-bench` as an application workspace, not a -durable storage engine. Credentials, remote locking, rich audit, and runtime -orchestration should be delegated or integrated carefully rather than invented -inside core application logic. 
+Re-inventing a second backend abstraction in `infospace-bench` would duplicate +that surface and tangle durable-storage concerns with the live infospace +working directory (which is read-write-read-write across many sessions and is +not a fit for content-addressed immutability). + +This workplan therefore replaces "selectable backend" with "durable archive +surface": + +- The working infospace continues to live in a local folder. That stays the + only *working* storage form. +- A new `archive` capability bundles the infospace (or selected subdirs) into + an `artifact-store` package, finalizes it, and records the returned package + id and manifest digest inside the infospace. +- A `restore` capability re-materializes a previously archived state into a + target directory. +- Multi-backend storage (S3-compatible, Ceph RGW) is delegated to the + configured artifact-store deployment, not implemented here. ## Non-Goals -- Replace the existing local folder behavior. -- Require S3 or git dependencies for ordinary local use. -- Store secrets in `infospace.yaml`. -- Build a general database, sync server, or object storage service inside this - repo. -- Solve multi-writer conflict resolution beyond clear detection and reporting - in the first pass. +- Replace the local working folder for live infospace operations. +- Re-implement S3, git, or any other storage backend inside `infospace-bench`. +- Make the live infospace content-addressed or immutable. +- Provide multi-writer concurrency control beyond what artifact-store offers. +- Ship a remote service. Integration is library-only via the `artifactstore` + Python package (path dep), wired in-process. + +## Development setup + +`artifactstore` brings runtime deps (SQLAlchemy, FastAPI, cbor2, blake3, +pydantic-settings, structlog) that are not on the system Python. 
Use the +repo Makefile to provision a local venv: + +```bash +make install # creates .venv and installs path deps + pytest +make test # full suite +make test-archive # only the archive integration tests +``` + +The `.venv/` directory is gitignored. + +## Architecture + +```text ++----------------------------+ +-------------------------+ +| live infospace (folder) | | artifact-store | +| - infospace.yaml | ==> | (library, in-process) | +| - artifacts/... |archive | - registry | +| - output/metrics/... | | - manifest | +| - reports/... | <== | - retention policy | +| - exports/... | restore| - storage backends | +| - output/archives/index | +-------------------------+ ++----------------------------+ +``` + +- The infospace remains the working source of truth for the live state. +- artifact-store owns durable storage, content hashing, manifest, retention, + audit, and backend selection. +- A new `output/archives/index.yaml` inside the infospace records every + archive event (package id, manifest digest, retention class, included + paths, note). ## Tasks -### T01 - Backend contract and URI model +### T01 - Archive contract and infospace metadata ```task id: IB-WP-0014-T01 -status: todo +status: in_progress priority: high state_hub_task_id: "75b7df31-066a-47ac-bb94-a4ae908569fd" ``` -- Define a backend-neutral infospace location model -- Support local paths without changing current user flows -- Define URI examples for local, mounted folder, S3-compatible, and git-backed - infospaces -- Define backend capabilities: read, write, list, exists, atomic write, - digest, version, sync, lock, and credentials-required -- Document where credentials and remote configuration are allowed to live +- Add `artifactstore` as a path dependency on `/home/worsch/artifact-store`. 
+- Define an in-repo Python contract `infospace_bench.archive`: + - `archive_infospace(root, *, retention_class, include, note, registry=None) -> ArchiveRecord` + - `restore_archive(package_id, *, target, registry=None) -> RestoredArchive` + - `list_archives(root) -> list[ArchiveRecord]` +- Map default `retention_class` to `release-evidence`; allow any class the + registry exposes via `list_retention_classes()`. +- Default `include` set: `infospace.yaml`, `artifacts/`, `workflows/`, + `output/`, `reports/`, `exports/`. Allow caller-supplied include patterns. +- Document credentials policy: never write secrets into `infospace.yaml` or + archive metadata; backend secrets stay with the artifact-store deployment. +- Define `output/archives/index.yaml` schema: list of records with + `package_id`, `manifest_digest`, `retention_class`, `created_at`, + `included_paths`, `file_count`, `note`, `producer`, `subject`. -### T02 - Local and remote folder backend baseline +### T02 - Archive command and library implementation ```task id: IB-WP-0014-T02 @@ -80,16 +127,28 @@ priority: high state_hub_task_id: "2e33d98a-0cd0-4608-b7a1-76c5a7bb26ca" ``` -- Refactor lifecycle reads and writes behind a backend adapter while preserving - current `Path`-based behavior -- Keep local folders as the default backend -- Treat mounted or remote folders as folder backends when the OS exposes them - as paths -- Add tests proving current pilots and CLI commands still work unchanged -- Add tests for backend errors such as missing files, write failures, and - unsafe paths +- Implement `archive_infospace` against `artifactstore.registry.Registry`: + - Create a package with producer=`infospace-bench`, + subject=`<slug>`, retention class as requested. + - Walk the include set; stream each file via `registry.ingest_file`. + - Finalize the package and capture the manifest digest. + - Append the new record to `output/archives/index.yaml`. 
+- Wire `infospace-bench archive <root>` in the CLI with flags + `--retention-class`, `--include`, `--note`, `--store-root`. +- Provide a `_build_default_registry(store_root)` helper that calls + `artifactstore.app.build_registry()` with overridden settings so the + default behavior is a self-contained store under + `<root>/output/archives/.store/` (SQLite + local FS). Honor + `ARTIFACTSTORE_*` env vars when set so operators can point at a shared + artifact-store deployment. +- Tests: + - Archiving a small infospace returns a stable record and writes index. + - Re-archiving the same content reuses content-addressed bytes (verifies + artifact-store dedup at the storage layer). + - Excluded paths are not ingested. + - Default-include path produces a non-empty package. -### T03 - S3 object-store backend adapter +### T03 - Archive index and list command ```task id: IB-WP-0014-T03 @@ -98,31 +157,36 @@ priority: high state_hub_task_id: "e2ee9497-0a6c-419f-a045-fb994bf73b05" ``` -- Design an optional S3-compatible backend adapter -- Use a fake in-memory or local test double for default tests -- Keep real credentials and network calls out of the default test suite -- Define object key layout for manifests, artifacts, reports, exports, and run - records -- Decide how digests, optimistic concurrency, and partial writes are reported +- Implement `list_archives(root)` reading `output/archives/index.yaml`. +- Wire `infospace-bench archive-list <root>` to print the records as a table + and optionally `--json`. +- Surface retention state when a registry is available: query + `get_retention_state(package_id)` and annotate each record with current + expiry and hold status. +- Tests for empty index, single-record index, and registry-augmented listing. 
+-### T04 - Git repository backend adapter +### T04 - Restore command ```task id: IB-WP-0014-T04 status: todo -priority: high +priority: medium state_hub_task_id: "e2938c5b-e6c2-468a-b782-b39962e5a81b" ``` -- Support opening or initializing an infospace backed by a git repository -- Prove behavior against local test repositories before any remote network - workflow -- Define when commits are created, when they are only suggested, and how dirty - trees are reported -- Keep automatic commits opt-in -- Preserve compatibility with the existing State Hub and workplan workflow +- Implement `restore_archive(package_id, *, target, registry)`: + - Fetch the finalized manifest via `registry.get_manifest_bytes(..., format="json")`. + - Iterate files, call `registry.get_file(file_id)`, stream bytes to + `<target>/<relative_path>`. + - Refuse to overwrite an existing non-empty target unless `--force` is set. +- Wire `infospace-bench restore <package_id> --target <dir>` in the CLI. +- Tests: + - Round-trip: archive an infospace, restore into a new directory, diff is + empty (modulo `output/archives/index.yaml` which is local). + - Restore refuses to overwrite a non-empty target. + - Restore by manifest digest also works (lookup via `list_packages`). -### T05 - Backend CLI docs and migration path +### T05 - Docs and operator notes ```task id: IB-WP-0014-T05 @@ -131,26 +195,36 @@ priority: medium state_hub_task_id: "20d75d49-f62a-4236-a895-698cd2fae45a" ``` -- Expose backend selection in CLI/API docs -- Add examples for local, mounted folder, S3-compatible, and git-backed - infospaces -- Document backend capabilities and limitations -- Add a migration guide for moving a local infospace to another backend -- Update acceptance docs so backend support is distinct from Wealth/VSM - generation parity +- Add `docs/archive-integration.md` covering: + - When to archive vs keep editing locally. + - How to point at a shared artifact-store deployment via `ARTIFACTSTORE_*` + env vars. + - Retention class selection guidance. 
+ - Restore workflow. +- Cross-link from `SCOPE.md` and the relevant CLI help output. +- Note the explicit non-goal: infospace-bench does not implement S3 or git + backends; those live in artifact-store. ## Acceptance -- Existing local-folder behavior remains backward compatible -- Lifecycle, validation, inspection, workflow, metrics, history, and graph - commands can operate through the backend contract -- Default tests remain deterministic and do not require network credentials -- Backend-specific capabilities and failure modes are visible to callers -- S3 and git support are optional and clearly documented -- Storage backend concerns stay separate from generation workflow semantics +- `infospace-bench archive <root>` produces a finalized artifact-store package + and writes a new entry to `output/archives/index.yaml`. +- `infospace-bench archive-list <root>` lists recorded archives, with optional + retention annotations when the registry is reachable. +- `infospace-bench restore <package_id> --target <dir>` round-trips the + archived state byte-for-byte through artifact-store. +- The default-included file set covers the live infospace contract + (`infospace.yaml`, `artifacts/`, `workflows/`, `output/`, `reports/`, + `exports/`). +- Default tests do not require network access or external credentials; + archive and restore round-trip against the local-FS backend. +- No S3, git, or remote-folder code exists inside `infospace-bench` after + this workplan. -## Relationship To IB-WP-0013 +## Relationship To Other Workplans -`IB-WP-0013` should prove generation parity on the default local backend first. -This workplan then makes the same infospace operations portable across storage -backends. +- `IB-WP-0013` proves generation parity on the local working folder. This + workplan adds durable preservation of those generated outputs. +- `artifact-store` WP-0004 will bring S3-compatible storage; once that + lands, pointing infospace-bench archives at S3 is purely an + artifact-store configuration change.