Files
infospace-bench/src/infospace_bench/archive.py
tegwick ddefd69f71 IB-WP-0014: archive-list, restore, retention annotation, docs (T03-T05)
Round out IB-WP-0014 with the remaining archive operations and docs.

- restore_archive() and `infospace-bench restore <pkg> --target <dir>` round-trip
  a finalized package's bytes back to disk. Refuses to overwrite a non-empty
  target unless --force. --from <infospace-root> resolves the store location.
- archive-list CLI with --with-retention flag; annotate_retention() opens the
  per-infospace registry and joins each record with its current retention
  state (effective class, expires, holds, eligibility).
- docs/archive-integration.md covers when to archive, the include set,
  retention classes, storage layout, credentials policy, and the explicit
  non-goal that S3/git backends live in artifact-store.
- SCOPE.md cross-links the new doc.
- Workplan flipped to status: done. Full pytest suite: 72 passed.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 11:46:23 +02:00

547 lines
18 KiB
Python

"""Archive integration with artifact-store (IB-WP-0014).
The live infospace stays in a local working folder. This module bundles a
finalized snapshot of the infospace (or a curated slice) into a content-
addressed artifact-store package, finalizes it, and records the returned
package id and manifest digest under ``output/archives/index.yaml``.
Storage backend choice (local FS in artifact-store v0.1, S3 in WP-0004) is
delegated to artifact-store - it is not re-implemented here.
"""
from __future__ import annotations
import asyncio
import fnmatch
import json
import mimetypes
import os
from collections.abc import AsyncIterator, Iterable
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from uuid import UUID
import yaml
from sqlalchemy import insert, select
from sqlalchemy.ext.asyncio import create_async_engine
from artifactstore.dataplane import InProcessDataPlane
from artifactstore.db.schema import metadata as artifactstore_metadata
from artifactstore.db.schema import retention_classes as retention_classes_table
from artifactstore.db.seed import RETENTION_CLASS_SEEDS
from artifactstore.events import RegistryViewWriter
from artifactstore.registry import Registry
from artifactstore.storage import LocalBackend
from .errors import InfospaceError
from .lifecycle import load_infospace
# Location of the YAML archive index, relative to the infospace root.
ARCHIVE_INDEX_PATH = "output/archives/index.yaml"
# Default directory (relative to the infospace root) of the self-contained
# artifact-store used when the caller supplies no registry or store root.
ARCHIVE_STORE_DIR = "output/archives/.store"
# File name of the SQLite registry database inside the store directory.
ARCHIVE_DB_NAME = "registry.sqlite"
# Sub-directory of the store directory handed to the local storage backend.
ARCHIVE_BACKEND_DIR = "storage"
# Files and directories (relative to the infospace root) that are archived
# when the caller gives no explicit include set.
DEFAULT_INCLUDE: tuple[str, ...] = (
    "infospace.yaml",
    "artifacts",
    "workflows",
    "output",
    "reports",
    "exports",
)
# Retention class applied when the caller does not choose one.
DEFAULT_RETENTION_CLASS = "release-evidence"
# Producer name recorded on created packages and on index rows.
PRODUCER = "infospace-bench"
# Actor name passed to registry operations when the caller does not set one.
DEFAULT_ACTOR = "infospace-bench"
@dataclass(frozen=True)
class ArchiveRecord:
    """One row of ``output/archives/index.yaml``.

    The record is what gets persisted after a package has been finalized; it
    round-trips through :meth:`to_dict` / :meth:`from_dict`.
    """

    # Identifier of the artifact-store package, stored as a string.
    package_id: str
    # Manifest digest returned when the package was finalized.
    manifest_digest: str
    # Retention class the package was created with.
    retention_class: str
    # ISO-8601 timestamp string.
    created_at: str
    # Include set that was used to collect the files.
    included_paths: list[str]
    # Number of files ingested into the package.
    file_count: int
    # Free-form note supplied by the caller.
    note: str = ""
    producer: str = PRODUCER
    subject: str = ""
    # Store directory used for the archive, or None when not recorded.
    store_root: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form suitable for YAML serialization.

        ``store_root`` is emitted only when set and ``metadata`` only when
        non-empty, so default rows stay compact.
        """
        out: dict[str, Any] = {
            "package_id": self.package_id,
            "manifest_digest": self.manifest_digest,
            "retention_class": self.retention_class,
            "created_at": self.created_at,
            "included_paths": list(self.included_paths),
            "file_count": self.file_count,
            "note": self.note,
            "producer": self.producer,
            "subject": self.subject,
        }
        if self.store_root is not None:
            out["store_root"] = self.store_root
        if self.metadata:
            out["metadata"] = dict(self.metadata)
        return out

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ArchiveRecord":
        """Build a record from one parsed index entry.

        ``package_id``, ``manifest_digest``, ``retention_class`` and
        ``created_at`` are required (``KeyError`` when absent). Every other
        key is optional; a key that is missing *or* explicitly null falls
        back to its default, so a hand-edited ``note:`` line does not turn
        into the string ``"None"`` and a null list/mapping does not raise.
        """

        def optional(key: str, default: Any) -> Any:
            # Treat an explicit null exactly like an absent key.
            value = data.get(key)
            return default if value is None else value

        store_root = data.get("store_root")
        return cls(
            package_id=str(data["package_id"]),
            manifest_digest=str(data["manifest_digest"]),
            retention_class=str(data["retention_class"]),
            created_at=str(data["created_at"]),
            included_paths=list(optional("included_paths", [])),
            file_count=int(optional("file_count", 0)),
            note=str(optional("note", "")),
            producer=str(optional("producer", PRODUCER)),
            subject=str(optional("subject", "")),
            store_root=None if store_root is None else str(store_root),
            metadata=dict(optional("metadata", {})),
        )
def archive_infospace(
    root: str | Path,
    *,
    retention_class: str = DEFAULT_RETENTION_CLASS,
    include: Iterable[str] | None = None,
    exclude: Iterable[str] | None = None,
    note: str = "",
    registry: Registry | None = None,
    store_root: str | Path | None = None,
    actor: str = DEFAULT_ACTOR,
) -> ArchiveRecord:
    """Bundle the infospace at ``root`` into an artifact-store package.

    Returns the new :class:`ArchiveRecord` and appends it to the infospace's
    ``output/archives/index.yaml``. When ``registry`` is None, a self-contained
    SQLite + local-FS registry is built under ``store_root`` (default:
    ``<root>/output/archives/.store/``).

    ``include`` and ``exclude`` are paths relative to ``root``; a single
    string is accepted as a one-element set. An empty or missing ``include``
    selects ``DEFAULT_INCLUDE``.

    Raises :class:`InfospaceError` (``empty_archive``) when nothing matches
    the include set. This function drives its own event loop through
    ``asyncio.run``.
    """
    # A bare string is itself an iterable of characters; treat it as one path
    # rather than exploding it into single-letter patterns.
    if isinstance(include, str):
        include = (include,)
    if isinstance(exclude, str):
        exclude = (exclude,)
    # Materialize before testing for emptiness: a generator object is always
    # truthy, so testing the raw iterable would skip the default for an
    # empty generator while applying it for an empty list.
    include_tuple = tuple(include) if include is not None else ()
    if not include_tuple:
        include_tuple = DEFAULT_INCLUDE
    exclude_tuple = tuple(exclude) if exclude is not None else ()
    return asyncio.run(
        _archive_infospace_async(
            Path(root),
            retention_class=retention_class,
            include=include_tuple,
            exclude=exclude_tuple,
            note=note,
            registry=registry,
            store_root=Path(store_root) if store_root else None,
            actor=actor,
        )
    )
@dataclass(frozen=True)
class RestoredArchive:
    """Outcome of :func:`restore_archive`: what was restored and where."""

    package_id: str
    manifest_digest: str
    target: str
    file_count: int
    restored_paths: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict; the path list is copied."""
        paths_copy = list(self.restored_paths)
        return dict(
            package_id=self.package_id,
            manifest_digest=self.manifest_digest,
            target=self.target,
            file_count=self.file_count,
            restored_paths=paths_copy,
        )
def restore_archive(
    package_id: str,
    *,
    target: str | Path,
    store_root: str | Path | None = None,
    source_infospace: str | Path | None = None,
    registry: Registry | None = None,
    force: bool = False,
) -> RestoredArchive:
    """Write the files of a finalized archive package back into ``target``.

    The artifact-store is located through ``registry`` when given; otherwise
    through ``store_root`` or ``source_infospace`` (shorthand for
    ``<source_infospace>/output/archives/.store/``). Falsy ``store_root`` /
    ``source_infospace`` values are treated as "not given". A non-empty
    ``target`` is refused unless ``force`` is true.

    Synchronous wrapper: runs the async implementation with ``asyncio.run``.
    """
    target_path = Path(target)
    store_path = Path(store_root) if store_root else None
    infospace_path = Path(source_infospace) if source_infospace else None
    restore_job = _restore_archive_async(
        package_id=package_id,
        target=target_path,
        store_root=store_path,
        source_infospace=infospace_path,
        registry=registry,
        force=force,
    )
    return asyncio.run(restore_job)
def annotate_retention(
    archives: Iterable[ArchiveRecord],
    *,
    store_root: str | Path | None = None,
    source_infospace: str | Path | None = None,
    registry: Registry | None = None,
) -> list[dict[str, Any]]:
    """Join each archive record with its current retention state.

    The result is a list of
    ``{"archive": ArchiveRecord.to_dict(), "retention": {...}|None}`` entries
    in input order. ``retention`` is ``None`` for every record when no
    registry can be opened.

    Synchronous wrapper: runs the async implementation with ``asyncio.run``.
    """
    records = tuple(archives)
    store_path = Path(store_root) if store_root else None
    infospace_path = Path(source_infospace) if source_infospace else None
    annotate_job = _annotate_retention_async(
        records,
        store_root=store_path,
        source_infospace=infospace_path,
        registry=registry,
    )
    return asyncio.run(annotate_job)
def list_archives(root: str | Path) -> list[ArchiveRecord]:
    """Return the recorded archive entries for an infospace.

    Reads ``<root>/output/archives/index.yaml``. A missing index yields an
    empty list.

    Raises :class:`InfospaceError` (``invalid_archive_index``) when the file
    is not a mapping, its ``archives`` value is not a list, or an individual
    entry is not a mapping / cannot be turned into an :class:`ArchiveRecord`.
    """
    path = Path(root) / ARCHIVE_INDEX_PATH
    if not path.exists():
        return []
    raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(raw, dict):
        raise InfospaceError(
            "invalid_archive_index",
            f"Archive index must be a mapping: {path}",
            {"path": str(path)},
        )
    items = raw.get("archives", [])
    if not isinstance(items, list):
        raise InfospaceError(
            "invalid_archive_index",
            f"Archive index 'archives' must be a list: {path}",
            {"path": str(path)},
        )
    records: list[ArchiveRecord] = []
    for position, item in enumerate(items):
        # Report malformed rows the same way as a malformed file, instead of
        # leaking a bare TypeError/KeyError from the record constructor.
        if not isinstance(item, dict):
            raise InfospaceError(
                "invalid_archive_index",
                f"Archive index entry {position} must be a mapping: {path}",
                {"path": str(path), "entry": position},
            )
        try:
            records.append(ArchiveRecord.from_dict(item))
        except (KeyError, TypeError, ValueError) as exc:
            raise InfospaceError(
                "invalid_archive_index",
                f"Archive index entry {position} is malformed: {path}",
                {
                    "path": str(path),
                    "entry": position,
                    "reason": f"{type(exc).__name__}: {exc}",
                },
            ) from exc
    return records
async def _archive_infospace_async(
    root: Path,
    *,
    retention_class: str,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    note: str,
    registry: Registry | None,
    store_root: Path | None,
    actor: str,
) -> ArchiveRecord:
    """Create, fill and finalize one package for the infospace at ``root``.

    Steps: load the infospace, collect the files selected by ``include`` /
    ``exclude``, create a package, ingest every file, finalize, then append
    the resulting :class:`ArchiveRecord` to the archive index.

    When ``registry`` is None a local registry is built under ``store_root``
    (default ``<root>/output/archives/.store``) and disposed before
    returning; a caller-supplied registry is left open.

    Raises :class:`InfospaceError` (``empty_archive``) when no file matches.
    """
    infospace = load_infospace(root)
    subject = infospace.config.slug

    owned_registry = registry is None
    effective_store_root: Path | None = None
    # The default store directory and the index itself never belong in an
    # archive of the infospace.
    auto_exclude: tuple[str, ...] = (ARCHIVE_STORE_DIR, ARCHIVE_INDEX_PATH)
    if owned_registry:
        effective_store_root = store_root or (root / ARCHIVE_STORE_DIR)
        # A custom store placed inside the infospace (e.g. under "output")
        # would otherwise be swept into its own package, including the
        # registry database that is open while files are ingested.
        try:
            store_rel = effective_store_root.resolve().relative_to(root.resolve())
        except ValueError:
            store_rel = None  # store lives outside the infospace
        if store_rel is not None and store_rel.parts:
            auto_exclude += (store_rel.as_posix(),)

    files = _collect_files(root, include=include, exclude=exclude + auto_exclude)
    if not files:
        raise InfospaceError(
            "empty_archive",
            "No files matched the include set for archiving",
            {"root": str(root), "include": list(include)},
        )

    # Build the registry only after we know there is something to archive.
    if owned_registry:
        assert effective_store_root is not None
        registry = await _build_local_registry(effective_store_root)
    try:
        assert registry is not None
        package_id = await registry.create_package(
            name=f"infospace {subject}",
            producer=PRODUCER,
            subject=subject,
            retention_class=retention_class,
            actor=actor,
            metadata={
                "infospace_slug": subject,
                "infospace_name": infospace.config.name,
                "topic_domain": infospace.config.topic.domain,
                "included_paths": list(include),
                "note": note,
            },
        )
        for relative_path, abs_path in files:
            # Unknown extensions fall back to a generic binary media type.
            media = (
                mimetypes.guess_type(abs_path.name)[0] or "application/octet-stream"
            )
            await registry.ingest_file(
                package_id,
                relative_path=relative_path,
                media_type=media,
                stream=_file_stream(abs_path),
                actor=actor,
            )
        manifest_addr = await registry.finalize_package(package_id, actor=actor)
        pkg = await registry.get_package(package_id)
    finally:
        # Only dispose a registry this function created.
        if owned_registry and registry is not None:
            await registry.dispose()

    # Fall back to "now" (UTC) if the package reports no finalization time.
    finalized_at = pkg.finalized_at or datetime.now(timezone.utc)
    record = ArchiveRecord(
        package_id=str(package_id),
        manifest_digest=str(manifest_addr),
        retention_class=retention_class,
        created_at=finalized_at.isoformat(),
        included_paths=list(include),
        file_count=len(files),
        note=note,
        producer=PRODUCER,
        subject=subject,
        store_root=str(effective_store_root) if effective_store_root else None,
    )
    _append_index(root, record)
    return record
def _collect_files(
    root: Path,
    *,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
) -> list[tuple[str, Path]]:
    """Resolve the include set to a sorted list of ``(relative_path, path)``.

    Each include entry is joined onto ``root``. A file contributes itself
    (keyed by the entry text), a directory contributes every file below it
    (keyed by its path relative to ``root``), anything else is ignored.
    Keys use ``/`` separators; the first path seen for a key wins, and
    excluded keys are dropped.
    """
    collected: dict[str, Path] = {}
    for entry in include:
        base = root / entry
        candidates: list[tuple[str, Path]]
        if base.is_file():
            candidates = [(entry.replace(os.sep, "/"), base)]
        elif base.is_dir():
            candidates = [
                (str(found.relative_to(root)).replace(os.sep, "/"), found)
                for found in base.rglob("*")
                if found.is_file()
            ]
        else:
            candidates = []
        for rel_key, location in candidates:
            if rel_key in collected:
                continue
            if not _is_excluded(rel_key, exclude):
                collected[rel_key] = location
    return sorted(collected.items())
def _is_excluded(rel_path: str, exclude: tuple[str, ...]) -> bool:
    """Tell whether ``rel_path`` is covered by any exclude pattern.

    A pattern covers a path when, with trailing slashes removed, it equals
    the path or is a directory prefix of it, or when it matches the path as
    an ``fnmatch`` glob.
    """

    def covers(pattern: str) -> bool:
        prefix = pattern.rstrip("/")
        if rel_path == prefix:
            return True
        if rel_path.startswith(prefix + "/"):
            return True
        return fnmatch.fnmatch(rel_path, pattern)

    return any(covers(pattern) for pattern in exclude)
async def _file_stream(
    path: Path,
    chunk_size: int = 1024 * 1024,
) -> AsyncIterator[bytes]:
    """Yield the bytes of ``path`` in pieces of at most ``chunk_size`` bytes.

    The file is opened in binary mode and closed when the generator ends.
    Reads are ordinary blocking reads; an empty file yields nothing.
    """
    with path.open("rb") as handle:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: handle.read(chunk_size), b""):
            yield piece
def _append_index(root: Path, record: ArchiveRecord) -> None:
    """Append ``record`` to ``<root>/output/archives/index.yaml``.

    Existing rows under the ``archives`` key are kept when the file parses to
    a mapping holding a list; any other content is replaced by a fresh list.
    The new content is written to a sibling temporary file and then moved
    over the index, so an interrupted write cannot leave a truncated index.
    """
    path = root / ARCHIVE_INDEX_PATH
    path.parent.mkdir(parents=True, exist_ok=True)
    existing: list[dict[str, Any]] = []
    if path.exists():
        raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
        if isinstance(raw, dict):
            items = raw.get("archives", [])
            if isinstance(items, list):
                existing = list(items)
    existing.append(record.to_dict())
    payload = yaml.safe_dump({"archives": existing}, sort_keys=False)

    # Same directory as the index so the final rename stays on one filesystem.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temp file behind in the archives directory.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
async def _build_local_registry(store_root: Path) -> Registry:
    """Open (creating if needed) the local registry kept under ``store_root``.

    Ensures the directory exists, creates the artifact-store tables in the
    SQLite database, inserts the retention-class seed rows when the table is
    empty, and returns a :class:`Registry` wired to a local storage backend
    rooted at ``<store_root>/storage``.
    """
    store_root.mkdir(parents=True, exist_ok=True)
    database_file = store_root / ARCHIVE_DB_NAME
    blob_dir = store_root / ARCHIVE_BACKEND_DIR

    engine = create_async_engine(
        f"sqlite+aiosqlite:///{database_file}",
        future=True,
    )
    async with engine.begin() as conn:
        await conn.run_sync(artifactstore_metadata.create_all)
        # Seed only a brand-new database: one existing row means "already done".
        probe = await conn.execute(select(retention_classes_table).limit(1))
        if probe.first() is None:
            for seed_row in RETENTION_CLASS_SEEDS:
                statement = insert(retention_classes_table).values(**seed_row)
                await conn.execute(statement)

    dataplane = InProcessDataPlane(LocalBackend(blob_dir, backend_id="local"))
    return Registry(engine, dataplane, RegistryViewWriter())
def _resolve_store_root(
    *,
    store_root: Path | None,
    source_infospace: Path | None,
) -> Path | None:
    """Pick the store directory from the two mutually exclusive hints.

    Returns ``store_root`` as given, or the default store directory below
    ``source_infospace``, or ``None`` when neither is supplied. Supplying
    both raises :class:`InfospaceError` (``ambiguous_archive_store``).
    """
    has_store = store_root is not None
    has_infospace = source_infospace is not None
    if has_store and has_infospace:
        raise InfospaceError(
            "ambiguous_archive_store",
            "Pass at most one of store_root or source_infospace",
            {},
        )
    if has_store:
        return store_root
    return source_infospace / ARCHIVE_STORE_DIR if has_infospace else None
async def _restore_archive_async(
    *,
    package_id: str,
    target: Path,
    store_root: Path | None,
    source_infospace: Path | None,
    registry: Registry | None,
    force: bool,
) -> RestoredArchive:
    """Write every file listed in a finalized package's manifest to ``target``.

    A caller-supplied ``registry`` is used as-is and left open. Otherwise a
    local registry is opened from ``store_root`` / ``source_infospace`` and
    disposed before returning.

    Raises :class:`InfospaceError` with code:

    * ``missing_archive_store`` - no store was named, or it does not exist;
    * ``unfinalized_package`` - the package has no manifest digest;
    * ``restore_target_not_empty`` - ``target`` has content and ``force`` is
      false;
    * ``unsafe_restore_path`` - a manifest path resolves outside ``target``
      or onto the ``target`` directory itself.

    A malformed ``package_id`` raises ``ValueError`` from ``UUID``.
    """
    owned_registry = registry is None
    if owned_registry:
        resolved_store = _resolve_store_root(
            store_root=store_root,
            source_infospace=source_infospace,
        )
        if resolved_store is None:
            raise InfospaceError(
                "missing_archive_store",
                "restore_archive needs registry, store_root, or source_infospace",
                {},
            )
        if not resolved_store.exists():
            raise InfospaceError(
                "missing_archive_store",
                f"Archive store does not exist: {resolved_store}",
                {"store_root": str(resolved_store)},
            )
        registry = await _build_local_registry(resolved_store)
    try:
        assert registry is not None
        pkg_uuid = UUID(package_id)
        pkg = await registry.get_package(pkg_uuid)
        if pkg.manifest_digest_hex is None:
            raise InfospaceError(
                "unfinalized_package",
                f"Package is not finalized: {package_id}",
                {"package_id": package_id, "status": pkg.status},
            )
        manifest_bytes = await registry.get_manifest_bytes(pkg_uuid, format="json")
        manifest = json.loads(manifest_bytes.decode("utf-8"))

        target.mkdir(parents=True, exist_ok=True)
        if not force and any(target.iterdir()):
            raise InfospaceError(
                "restore_target_not_empty",
                f"Refusing to restore into non-empty directory: {target}",
                {"target": str(target)},
            )

        # Resolve once; the target does not change while files are written.
        target_resolved = target.resolve()
        restored: list[str] = []
        for entry in manifest.get("files", []):
            rel = str(entry["relative_path"])
            file_id = UUID(str(entry["id"]))
            # resolve() collapses ".." segments and follows symlinks, so the
            # containment test below sees the real destination.
            dest = (target / rel).resolve()
            if dest == target_resolved:
                # e.g. "" or ".": names the directory, not a file in it.
                raise InfospaceError(
                    "unsafe_restore_path",
                    f"Manifest path does not name a file inside target: {rel}",
                    {"target": str(target), "relative_path": rel},
                )
            if target_resolved not in dest.parents:
                raise InfospaceError(
                    "unsafe_restore_path",
                    f"Manifest path escapes target: {rel}",
                    {"target": str(target), "relative_path": rel},
                )
            dest.parent.mkdir(parents=True, exist_ok=True)
            stream = await registry.get_file(file_id)
            with dest.open("wb") as fh:
                async for chunk in stream:
                    fh.write(chunk)
            restored.append(rel)
    finally:
        # Only dispose a registry this function created.
        if owned_registry and registry is not None:
            await registry.dispose()
    return RestoredArchive(
        package_id=package_id,
        manifest_digest=f"blake3:{pkg.manifest_digest_hex}",
        target=str(target),
        file_count=len(restored),
        restored_paths=restored,
    )
async def _annotate_retention_async(
    archives: tuple[ArchiveRecord, ...],
    *,
    store_root: Path | None,
    source_infospace: Path | None,
    registry: Registry | None,
) -> list[dict[str, Any]]:
    """Attach the current retention state to each archive record.

    Returns one ``{"archive": ..., "retention": ...}`` dict per record, in
    input order. ``retention`` is ``None`` for all records when no registry
    was passed and no existing store directory can be resolved. A failure
    while looking up a single record is reported as
    ``{"error": "<ExceptionType>: <message>"}`` for that record only.
    """
    if not archives:
        return []

    owned_registry = registry is None
    if owned_registry:
        location = _resolve_store_root(
            store_root=store_root,
            source_infospace=source_infospace,
        )
        if location is None or not location.exists():
            return [{"archive": rec.to_dict(), "retention": None} for rec in archives]
        registry = await _build_local_registry(location)

    try:
        assert registry is not None
        annotated: list[dict[str, Any]] = []
        for rec in archives:
            retention: dict[str, Any] | None
            try:
                state = await registry.get_retention_state(UUID(rec.package_id))
                expires = state.current_expires_at
                expires_text = expires.isoformat() if expires else None
                effective = state.effective_class
                hold = state.active_hold_id
                hold_text = str(hold) if hold else None
                retention = {
                    "current_expires_at": expires_text,
                    "effective_class": effective,
                    "active_hold_id": hold_text,
                    "eligible_for_deletion": state.eligible_for_deletion,
                }
            except Exception as exc:
                # Best effort: one unreadable record must not hide the rest.
                retention = {"error": f"{type(exc).__name__}: {exc}"}
            annotated.append({"archive": rec.to_dict(), "retention": retention})
        return annotated
    finally:
        # Only dispose a registry this function created.
        if owned_registry and registry is not None:
            await registry.dispose()