From 37c28d2298b08f48fe4382975dfa0410b7ee5b68 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 17 May 2026 12:21:19 +0200 Subject: [PATCH] archive: include contracts/, schemas/; report skipped top-level dirs Two of yesterday's archives silently dropped infospace content: the default include set was missing contracts/, so wealth-vsm-generation-pilot (16 files) and wealth-vsm-legacy-slice (12 files) were preserved as 14 and 10 files respectively. Fix the include set and make silent drops visible. - DEFAULT_INCLUDE now: infospace.yaml, artifacts, contracts, schemas, workflows, output, reports, exports - ArchiveRecord gains skipped_top_level: top-level entries present in the live root that are not in the include set (or among the default include names), not excluded, and not auto-hidden (dotfiles, empty dirs, and the names .store and index.yaml). Surfaces in index.yaml only when non-empty. - Re-archived the two affected pilots with correct counts. Prior records remain in each index.yaml as history. Co-Authored-By: Claude Opus 4.7 --- .../output/archives/index.yaml | 19 ++++++ .../output/archives/index.yaml | 19 ++++++ src/infospace_bench/archive.py | 60 +++++++++++++++++++ tests/test_archive.py | 37 ++++++++++++ 4 files changed, 135 insertions(+) diff --git a/infospaces/wealth-vsm-generation-pilot/output/archives/index.yaml b/infospaces/wealth-vsm-generation-pilot/output/archives/index.yaml index 015dd6f..e9b38b9 100644 --- a/infospaces/wealth-vsm-generation-pilot/output/archives/index.yaml +++ b/infospaces/wealth-vsm-generation-pilot/output/archives/index.yaml @@ -15,3 +15,22 @@ archives: producer: infospace-bench subject: wealth-vsm-generation-pilot store_root: infospaces/wealth-vsm-generation-pilot/output/archives/.store +- package_id: a4c7809f-51f3-4fd7-8f15-1be95e8b6171 + manifest_digest: blake3:f5932ba65730a41c0a844f86749854200bf6443f2246afd88a4a0dfbefabde6a + retention_class: release-evidence + created_at: '2026-05-17T10:20:51' + included_paths: + - infospace.yaml + - artifacts + - contracts + - schemas + - 
workflows + - output + - reports + - exports + file_count: 16 + note: 'Re-archive: capture contracts/ that initial archive missed (DEFAULT_INCLUDE + fix)' + producer: infospace-bench + subject: wealth-vsm-generation-pilot + store_root: infospaces/wealth-vsm-generation-pilot/output/archives/.store diff --git a/infospaces/wealth-vsm-legacy-slice/output/archives/index.yaml b/infospaces/wealth-vsm-legacy-slice/output/archives/index.yaml index 3fbdedd..658c532 100644 --- a/infospaces/wealth-vsm-legacy-slice/output/archives/index.yaml +++ b/infospaces/wealth-vsm-legacy-slice/output/archives/index.yaml @@ -15,3 +15,22 @@ archives: producer: infospace-bench subject: wealth-vsm-legacy-slice store_root: infospaces/wealth-vsm-legacy-slice/output/archives/.store +- package_id: ba107ffc-03b8-4c39-a72f-9aec66cf1b45 + manifest_digest: blake3:30b06d0b6fe7d9fed1a094805c07ce7896fff950aece8ec33e4df99da162accb + retention_class: release-evidence + created_at: '2026-05-17T10:20:54' + included_paths: + - infospace.yaml + - artifacts + - contracts + - schemas + - workflows + - output + - reports + - exports + file_count: 12 + note: 'Re-archive: capture contracts/ that initial archive missed (DEFAULT_INCLUDE + fix)' + producer: infospace-bench + subject: wealth-vsm-legacy-slice + store_root: infospaces/wealth-vsm-legacy-slice/output/archives/.store diff --git a/src/infospace_bench/archive.py b/src/infospace_bench/archive.py index c78ead8..ee1da28 100644 --- a/src/infospace_bench/archive.py +++ b/src/infospace_bench/archive.py @@ -46,11 +46,29 @@ ARCHIVE_BACKEND_DIR = "storage" DEFAULT_INCLUDE: tuple[str, ...] = ( "infospace.yaml", "artifacts", + "contracts", + "schemas", "workflows", "output", "reports", "exports", ) +# Names (file or dir) in the default include set; these are never reported, +# even if `include` omits them. 
Anything in the live root that is not +# in this set and not in `exclude` shows up under `skipped_top_level` so silent +# data loss is visible in the archive record. +_KNOWN_TOP_LEVEL_NAMES: frozenset[str] = frozenset( + { + "infospace.yaml", + "artifacts", + "contracts", + "schemas", + "workflows", + "output", + "reports", + "exports", + } +) DEFAULT_RETENTION_CLASS = "release-evidence" PRODUCER = "infospace-bench" DEFAULT_ACTOR = "infospace-bench" @@ -71,6 +89,7 @@ class ArchiveRecord: subject: str = "" store_root: str | None = None metadata: dict[str, Any] = field(default_factory=dict) + skipped_top_level: list[str] = field(default_factory=list) def to_dict(self) -> dict[str, Any]: out: dict[str, Any] = { @@ -88,6 +107,8 @@ class ArchiveRecord: out["store_root"] = self.store_root if self.metadata: out["metadata"] = dict(self.metadata) + if self.skipped_top_level: + out["skipped_top_level"] = list(self.skipped_top_level) return out @classmethod @@ -106,6 +127,7 @@ class ArchiveRecord: str(data["store_root"]) if data.get("store_root") is not None else None ), metadata=dict(data.get("metadata", {})), + skipped_top_level=list(data.get("skipped_top_level", [])), ) @@ -260,6 +282,9 @@ async def _archive_infospace_async( "No files matched the include set for archiving", {"root": str(root), "include": list(include)}, ) + skipped_top_level = _find_skipped_top_level( + root, include=include, exclude=effective_exclude + ) owned_registry = registry is None effective_store_root: Path | None = None @@ -312,6 +337,7 @@ async def _archive_infospace_async( producer=PRODUCER, subject=subject, store_root=str(effective_store_root) if effective_store_root else None, + skipped_top_level=skipped_top_level, ) _append_index(root, record) return record @@ -341,6 +367,40 @@ def _collect_files( return sorted(seen.items()) +def _find_skipped_top_level( + root: Path, + *, + include: tuple[str, ...], + exclude: tuple[str, ...], +) -> list[str]: + """Return non-empty top-level entries that are 
silently dropped. + + A top-level entry is "skipped" when it is neither in the include set, nor + in the known structural set, nor matched by an exclude pattern. The auto- + excluded ``output/archives`` index/store paths do not count as user- + visible drops. + """ + + auto_excluded = {".store", "index.yaml"} + skipped: list[str] = [] + for entry in sorted(root.iterdir()): + name = entry.name + if name in _KNOWN_TOP_LEVEL_NAMES or name in include: + continue + if _is_excluded(name, exclude) or name in auto_excluded: + continue + # Hide hidden files (.git, .DS_Store, ...) by default. + if name.startswith("."): + continue + if entry.is_dir(): + try: + next(entry.iterdir()) + except (StopIteration, PermissionError): + continue + skipped.append(name) + return skipped + + def _is_excluded(rel_path: str, exclude: tuple[str, ...]) -> bool: for pattern in exclude: cleaned = pattern.rstrip("/") diff --git a/tests/test_archive.py b/tests/test_archive.py index 242626d..f9c6076 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -25,6 +25,16 @@ from infospace_bench.archive import ( ) +def _restored_paths_via_round_trip( + record: ArchiveRecord, source: Path, tmp_path: Path +) -> list[str]: + target = tmp_path / f"restore-{record.package_id[:8]}" + result = restore_archive( + record.package_id, target=target, source_infospace=source, + ) + return list(result.restored_paths) + + def _seed_infospace(workspace: Path, slug: str = "demo") -> Path: create_infospace(workspace, slug, name="Demo", topic_domain="Test") root = workspace / "infospaces" / slug @@ -188,6 +198,33 @@ def test_annotate_retention_returns_state_for_each_archive(tmp_path: Path) -> No assert retention["eligible_for_deletion"] is False +def test_archive_default_include_captures_contracts_and_schemas( + tmp_path: Path, +) -> None: + root = _seed_infospace(tmp_path) + (root / "contracts").mkdir() + (root / "contracts" / "entity.contract.md").write_text( + "# contract\n", encoding="utf-8" + ) + (root / 
"schemas").mkdir() + (root / "schemas" / "entity.schema.json").write_text("{}", encoding="utf-8") + + record = archive_infospace(root) + assert "contracts/entity.contract.md" in [ + rel for rel in _restored_paths_via_round_trip(record, root, tmp_path) + ] + + +def test_archive_surfaces_skipped_top_level_dirs(tmp_path: Path) -> None: + root = _seed_infospace(tmp_path) + (root / "experimental").mkdir() + (root / "experimental" / "scratch.md").write_text("scratch", encoding="utf-8") + (root / "empty-dir").mkdir() # empty: not flagged + + record = archive_infospace(root) + assert record.skipped_top_level == ["experimental"] + + def test_annotate_retention_returns_none_when_store_missing(tmp_path: Path) -> None: root = _seed_infospace(tmp_path) archive_infospace(root, store_root=tmp_path / "external-store")