IB-WP-0014: archive-list, restore, retention annotation, docs (T03-T05)

Round out IB-WP-0014 with the remaining archive operations and docs.

- restore_archive() and `infospace-bench restore <pkg> --target <dir>` round-trip
  a finalized package's bytes back to disk. Refuses to overwrite a non-empty
  target unless --force. --from <infospace-root> resolves the store location.
- archive-list CLI with --with-retention flag; annotate_retention() opens the
  per-infospace registry and joins each record with its current retention
  state (effective class, expires, holds, eligibility).
- docs/archive-integration.md covers when to archive, the include set,
  retention classes, storage layout, credentials policy, and the explicit
  non-goal that S3/git backends live in artifact-store.
- SCOPE.md cross-links the new doc.
- Workplan flipped to status: done. Full pytest suite: 72 passed.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 11:46:23 +02:00
parent e343443d77
commit ddefd69f71
8 changed files with 636 additions and 2 deletions

View File

@@ -13,6 +13,7 @@ from __future__ import annotations
import asyncio
import fnmatch
import json
import mimetypes
import os
from collections.abc import AsyncIterator, Iterable
@@ -20,6 +21,7 @@ from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from uuid import UUID
import yaml
from sqlalchemy import insert, select
@@ -142,6 +144,77 @@ def archive_infospace(
)
@dataclass(frozen=True)
class RestoredArchive:
    """Immutable summary returned by :func:`restore_archive`."""

    # Package identifier exactly as the caller passed it to the restore call.
    package_id: str
    # Manifest digest string; the restore routine formats it as "blake3:<hex>".
    manifest_digest: str
    # Target directory, stored as a string.
    target: str
    # Number of files written during the restore.
    file_count: int
    # Manifest-relative paths of the written files, in write order.
    restored_paths: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain ``dict``.

        ``restored_paths`` is copied, so mutating the returned list does not
        affect this instance.
        """
        return dict(
            package_id=self.package_id,
            manifest_digest=self.manifest_digest,
            target=self.target,
            file_count=self.file_count,
            restored_paths=[*self.restored_paths],
        )
def restore_archive(
    package_id: str,
    *,
    target: str | Path,
    store_root: str | Path | None = None,
    source_infospace: str | Path | None = None,
    registry: Registry | None = None,
    force: bool = False,
) -> RestoredArchive:
    """Synchronously restore an archived package's files into ``target``.

    This is a blocking wrapper that drives the async restore routine with
    :func:`asyncio.run`; it therefore cannot be called from a thread that
    already has a running event loop.

    Args:
        package_id: Package identifier; the async routine parses it as a UUID.
        target: Directory to restore into.
        store_root: Path of the artifact store to open. Falsy values
            (``None`` or ``""``) are treated as "not supplied".
        source_infospace: Infospace root whose archive store should be used.
            Falsy values are treated as "not supplied".
        registry: An already-open registry. When given, the async routine
            uses it instead of opening a store from the two path arguments.
        force: Passed through; when false, a non-empty ``target`` is refused.

    Returns:
        A :class:`RestoredArchive` describing what was written.
    """
    # Convert in the same order as the keyword arguments are declared.
    target_path = Path(target)
    store_path = Path(store_root) if store_root else None
    infospace_path = Path(source_infospace) if source_infospace else None

    pending = _restore_archive_async(
        package_id=package_id,
        target=target_path,
        store_root=store_path,
        source_infospace=infospace_path,
        registry=registry,
        force=force,
    )
    return asyncio.run(pending)
def annotate_retention(
    archives: Iterable[ArchiveRecord],
    *,
    store_root: str | Path | None = None,
    source_infospace: str | Path | None = None,
    registry: Registry | None = None,
) -> list[dict[str, Any]]:
    """Synchronously join archive records with their retention state.

    This is a blocking wrapper that drives the async annotation routine with
    :func:`asyncio.run`; it therefore cannot be called from a thread that
    already has a running event loop.

    Args:
        archives: Records to annotate. The iterable is materialized into a
            tuple before the event loop starts.
        store_root: Path of the artifact store to open. Falsy values
            (``None`` or ``""``) are treated as "not supplied".
        source_infospace: Infospace root whose archive store should be used.
            Falsy values are treated as "not supplied".
        registry: An already-open registry to query instead of opening one.

    Returns:
        One ``{"archive": ..., "retention": ...}`` dict per input record, in
        input order.
    """
    # Snapshot the records first, then normalize the optional locations.
    records = tuple(archives)
    store_path = Path(store_root) if store_root else None
    infospace_path = Path(source_infospace) if source_infospace else None

    pending = _annotate_retention_async(
        records,
        store_root=store_path,
        source_infospace=infospace_path,
        registry=registry,
    )
    return asyncio.run(pending)
def list_archives(root: str | Path) -> list[ArchiveRecord]:
"""Return the recorded archive entries for an infospace."""
path = Path(root) / ARCHIVE_INDEX_PATH
@@ -323,3 +396,151 @@ async def _build_local_registry(store_root: Path) -> Registry:
backend = LocalBackend(backend_root, backend_id="local")
dataplane = InProcessDataPlane(backend)
return Registry(engine, dataplane, RegistryViewWriter())
def _resolve_store_root(
*,
store_root: Path | None,
source_infospace: Path | None,
) -> Path | None:
if store_root is not None and source_infospace is not None:
raise InfospaceError(
"ambiguous_archive_store",
"Pass at most one of store_root or source_infospace",
{},
)
if store_root is not None:
return store_root
if source_infospace is not None:
return source_infospace / ARCHIVE_STORE_DIR
return None
async def _restore_archive_async(
    *,
    package_id: str,
    target: Path,
    store_root: Path | None,
    source_infospace: Path | None,
    registry: Registry | None,
    force: bool,
) -> RestoredArchive:
    """Write every file listed in a finalized package's manifest under ``target``.

    When ``registry`` is ``None`` a local registry is opened from
    ``store_root`` / ``source_infospace`` and disposed before returning;
    a caller-supplied registry is used as-is and left open.

    Args:
        package_id: Package identifier, parsed with :class:`uuid.UUID`.
        target: Destination directory; created (with parents) if missing.
        store_root: Artifact-store directory, used only when ``registry`` is
            ``None``.
        source_infospace: Infospace root used to derive the store directory,
            used only when ``registry`` is ``None``.
        registry: Already-open registry, or ``None`` to open one locally.
        force: When false, a ``target`` that already contains entries is
            refused. When true, files are written regardless and existing
            files at the same paths are truncated and overwritten.

    Returns:
        A :class:`RestoredArchive` listing the manifest-relative paths written.

    Raises:
        InfospaceError: ``missing_archive_store`` (no location given, or the
            store directory does not exist), ``ambiguous_archive_store``
            (raised by the store resolver), ``unfinalized_package`` (package
            has no manifest digest), ``restore_target_not_empty``, or
            ``unsafe_restore_path`` (a manifest path does not resolve to a
            location strictly inside ``target``).
        ValueError: If ``package_id`` or a manifest file id is not a valid
            UUID string.
        KeyError: If a manifest file entry lacks ``relative_path`` or ``id``.
    """
    owned_registry = registry is None
    if owned_registry:
        resolved_store = _resolve_store_root(
            store_root=store_root,
            source_infospace=source_infospace,
        )
        if resolved_store is None:
            raise InfospaceError(
                "missing_archive_store",
                "restore_archive needs registry, store_root, or source_infospace",
                {},
            )
        if not resolved_store.exists():
            raise InfospaceError(
                "missing_archive_store",
                f"Archive store does not exist: {resolved_store}",
                {"store_root": str(resolved_store)},
            )
        registry = await _build_local_registry(resolved_store)
    try:
        assert registry is not None  # narrowing for type checkers only
        pkg_uuid = UUID(package_id)
        pkg = await registry.get_package(pkg_uuid)
        if pkg.manifest_digest_hex is None:
            raise InfospaceError(
                "unfinalized_package",
                f"Package is not finalized: {package_id}",
                {"package_id": package_id, "status": pkg.status},
            )
        manifest_bytes = await registry.get_manifest_bytes(pkg_uuid, format="json")
        manifest = json.loads(manifest_bytes.decode("utf-8"))

        target.mkdir(parents=True, exist_ok=True)
        if not force and any(target.iterdir()):
            raise InfospaceError(
                "restore_target_not_empty",
                f"Refusing to restore into non-empty directory: {target}",
                {"target": str(target)},
            )

        # Loop-invariant: resolve the target once, now that it exists.
        target_resolved = target.resolve()
        restored: list[str] = []
        for entry in manifest.get("files", []):
            rel = str(entry["relative_path"])
            file_id = UUID(str(entry["id"]))
            dest = (target / rel).resolve()
            # The destination must be strictly inside the target. A path that
            # resolves to the target directory itself (e.g. "." or "") is
            # rejected here too, instead of failing later when the directory
            # is opened for writing.
            if target_resolved not in dest.parents:
                raise InfospaceError(
                    "unsafe_restore_path",
                    f"Manifest path escapes target: {rel}",
                    {"target": str(target), "relative_path": rel},
                )
            dest.parent.mkdir(parents=True, exist_ok=True)
            stream = await registry.get_file(file_id)
            with dest.open("wb") as fh:
                async for chunk in stream:
                    fh.write(chunk)
            restored.append(rel)
    finally:
        # Only dispose a registry this call opened itself.
        if owned_registry and registry is not None:
            await registry.dispose()
    return RestoredArchive(
        package_id=package_id,
        manifest_digest=f"blake3:{pkg.manifest_digest_hex}",
        target=str(target),
        file_count=len(restored),
        restored_paths=restored,
    )
async def _annotate_retention_async(
archives: tuple[ArchiveRecord, ...],
*,
store_root: Path | None,
source_infospace: Path | None,
registry: Registry | None,
) -> list[dict[str, Any]]:
if not archives:
return []
owned_registry = registry is None
used_store_root: Path | None = None
if owned_registry:
used_store_root = _resolve_store_root(
store_root=store_root,
source_infospace=source_infospace,
)
if used_store_root is None or not used_store_root.exists():
return [{"archive": rec.to_dict(), "retention": None} for rec in archives]
registry = await _build_local_registry(used_store_root)
try:
assert registry is not None
results: list[dict[str, Any]] = []
for rec in archives:
retention: dict[str, Any] | None
try:
state = await registry.get_retention_state(UUID(rec.package_id))
retention = {
"current_expires_at": (
state.current_expires_at.isoformat()
if state.current_expires_at
else None
),
"effective_class": state.effective_class,
"active_hold_id": (
str(state.active_hold_id) if state.active_hold_id else None
),
"eligible_for_deletion": state.eligible_for_deletion,
}
except Exception as exc:
retention = {"error": f"{type(exc).__name__}: {exc}"}
results.append({"archive": rec.to_dict(), "retention": retention})
return results
finally:
if owned_registry and registry is not None:
await registry.dispose()