archive: include contracts/, schemas/; report skipped top-level dirs

Two of yesterday's archives silently dropped infospace content: the default
include set was missing contracts/, so wealth-vsm-generation-pilot (16 files)
and wealth-vsm-legacy-slice (12 files) were preserved as 14 and 10 files
respectively. Fix the include set and make silent drops visible.

- DEFAULT_INCLUDE now: infospace.yaml, artifacts, contracts, schemas,
  workflows, output, reports, exports
- ArchiveRecord gains skipped_top_level: top-level entries present in the
  live root that are not in the include set, not excluded, and not auto-
  hidden (dotfiles, empty dirs, and the names .store and index.yaml).
  Surfaces in index.yaml only when non-empty.
- Re-archived the two affected pilots with correct counts. Prior records
  remain in each index.yaml as history.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 12:21:19 +02:00
parent 523db6d341
commit 37c28d2298
4 changed files with 135 additions and 0 deletions

View File

@@ -15,3 +15,22 @@ archives:
producer: infospace-bench
subject: wealth-vsm-generation-pilot
store_root: infospaces/wealth-vsm-generation-pilot/output/archives/.store
- package_id: a4c7809f-51f3-4fd7-8f15-1be95e8b6171
manifest_digest: blake3:f5932ba65730a41c0a844f86749854200bf6443f2246afd88a4a0dfbefabde6a
retention_class: release-evidence
created_at: '2026-05-17T10:20:51'
included_paths:
- infospace.yaml
- artifacts
- contracts
- schemas
- workflows
- output
- reports
- exports
file_count: 16
note: 'Re-archive: capture contracts/ that initial archive missed (DEFAULT_INCLUDE
fix)'
producer: infospace-bench
subject: wealth-vsm-generation-pilot
store_root: infospaces/wealth-vsm-generation-pilot/output/archives/.store

View File

@@ -15,3 +15,22 @@ archives:
producer: infospace-bench
subject: wealth-vsm-legacy-slice
store_root: infospaces/wealth-vsm-legacy-slice/output/archives/.store
- package_id: ba107ffc-03b8-4c39-a72f-9aec66cf1b45
manifest_digest: blake3:30b06d0b6fe7d9fed1a094805c07ce7896fff950aece8ec33e4df99da162accb
retention_class: release-evidence
created_at: '2026-05-17T10:20:54'
included_paths:
- infospace.yaml
- artifacts
- contracts
- schemas
- workflows
- output
- reports
- exports
file_count: 12
note: 'Re-archive: capture contracts/ that initial archive missed (DEFAULT_INCLUDE
fix)'
producer: infospace-bench
subject: wealth-vsm-legacy-slice
store_root: infospaces/wealth-vsm-legacy-slice/output/archives/.store

View File

@@ -46,11 +46,29 @@ ARCHIVE_BACKEND_DIR = "storage"
# Top-level entries (file or directory names) that make up the default
# include set for an archive.
DEFAULT_INCLUDE: tuple[str, ...] = (
    "infospace.yaml",
    "artifacts",
    "contracts",
    "schemas",
    "workflows",
    "output",
    "reports",
    "exports",
)
# Top-level entries the default include set already considers (file or dir),
# plus things we never want to capture. Anything in the live root that is not
# in this set and not in `exclude` shows up under `skipped_top_level` so silent
# data loss is visible in the archive record.
#
# Derived from DEFAULT_INCLUDE instead of repeating the literal so the two
# cannot drift apart. Union extra names in here if some entry should be
# ignored without being archived.
_KNOWN_TOP_LEVEL_NAMES: frozenset[str] = frozenset(DEFAULT_INCLUDE)
DEFAULT_RETENTION_CLASS = "release-evidence"
PRODUCER = "infospace-bench"
DEFAULT_ACTOR = "infospace-bench"
@@ -71,6 +89,7 @@ class ArchiveRecord:
subject: str = ""
store_root: str | None = None
metadata: dict[str, Any] = field(default_factory=dict)
skipped_top_level: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
out: dict[str, Any] = {
@@ -88,6 +107,8 @@ class ArchiveRecord:
out["store_root"] = self.store_root
if self.metadata:
out["metadata"] = dict(self.metadata)
if self.skipped_top_level:
out["skipped_top_level"] = list(self.skipped_top_level)
return out
@classmethod
@@ -106,6 +127,7 @@ class ArchiveRecord:
str(data["store_root"]) if data.get("store_root") is not None else None
),
metadata=dict(data.get("metadata", {})),
skipped_top_level=list(data.get("skipped_top_level", [])),
)
@@ -260,6 +282,9 @@ async def _archive_infospace_async(
"No files matched the include set for archiving",
{"root": str(root), "include": list(include)},
)
skipped_top_level = _find_skipped_top_level(
root, include=include, exclude=effective_exclude
)
owned_registry = registry is None
effective_store_root: Path | None = None
@@ -312,6 +337,7 @@ async def _archive_infospace_async(
producer=PRODUCER,
subject=subject,
store_root=str(effective_store_root) if effective_store_root else None,
skipped_top_level=skipped_top_level,
)
_append_index(root, record)
return record
@@ -341,6 +367,40 @@ def _collect_files(
return sorted(seen.items())
def _find_skipped_top_level(
    root: Path,
    *,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
) -> list[str]:
    """Return non-empty top-level entries that are silently dropped.

    A top-level entry is "skipped" when it is neither in the include set, nor
    in the known structural set, nor matched by an exclude pattern. Entries
    named ``.store`` or ``index.yaml``, hidden dotfiles, and empty (or
    unreadable) directories do not count as user-visible drops.

    Args:
        root: Directory whose immediate children are inspected.
        include: Names the archive captures. A trailing ``/`` is ignored, so
            ``"contracts/"`` and ``"contracts"`` are treated the same.
        exclude: Patterns passed to ``_is_excluded``; matching entries are
            deliberate omissions and are not reported.

    Returns:
        Names of the skipped entries, in the sorted order of
        ``root.iterdir()``.
    """
    auto_excluded = {".store", "index.yaml"}
    # Build the lookup once instead of scanning the tuple per entry. Trailing
    # slashes are stripped (as `_is_excluded` does for its patterns) so a
    # directory included as "name/" is not falsely reported as skipped.
    accounted_for = _KNOWN_TOP_LEVEL_NAMES | {item.rstrip("/") for item in include}
    skipped: list[str] = []
    for entry in sorted(root.iterdir()):
        name = entry.name
        if name in accounted_for:
            continue
        if _is_excluded(name, exclude) or name in auto_excluded:
            continue
        # Hide hidden files (.git, .DS_Store, ...) by default.
        if name.startswith("."):
            continue
        if entry.is_dir():
            try:
                has_children = any(entry.iterdir())
            except PermissionError:
                # Unreadable directories are treated like empty ones.
                continue
            if not has_children:
                continue
        skipped.append(name)
    return skipped
def _is_excluded(rel_path: str, exclude: tuple[str, ...]) -> bool:
for pattern in exclude:
cleaned = pattern.rstrip("/")

View File

@@ -25,6 +25,16 @@ from infospace_bench.archive import (
)
def _restored_paths_via_round_trip(
    record: ArchiveRecord, source: Path, tmp_path: Path
) -> list[str]:
    """Restore ``record`` into a scratch directory and list what came back.

    The restore target is ``tmp_path / "restore-<first 8 chars of package_id>"``;
    ``source`` is handed to ``restore_archive`` as ``source_infospace``.
    """
    short_id = record.package_id[:8]
    restore_dir = tmp_path / f"restore-{short_id}"
    outcome = restore_archive(
        record.package_id,
        target=restore_dir,
        source_infospace=source,
    )
    return [*outcome.restored_paths]
def _seed_infospace(workspace: Path, slug: str = "demo") -> Path:
create_infospace(workspace, slug, name="Demo", topic_domain="Test")
root = workspace / "infospaces" / slug
@@ -188,6 +198,33 @@ def test_annotate_retention_returns_state_for_each_archive(tmp_path: Path) -> No
assert retention["eligible_for_deletion"] is False
def test_archive_default_include_captures_contracts_and_schemas(
    tmp_path: Path,
) -> None:
    """Files under contracts/ and schemas/ survive an archive/restore round trip."""
    root = _seed_infospace(tmp_path)
    (root / "contracts").mkdir()
    (root / "contracts" / "entity.contract.md").write_text(
        "# contract\n", encoding="utf-8"
    )
    (root / "schemas").mkdir()
    (root / "schemas" / "entity.schema.json").write_text("{}", encoding="utf-8")
    record = archive_infospace(root)
    restored = _restored_paths_via_round_trip(record, root, tmp_path)
    assert "contracts/entity.contract.md" in restored
    # The test name promises schemas coverage too; assert it so a regression
    # that drops schemas/ from the include set is caught.
    assert "schemas/entity.schema.json" in restored
def test_archive_surfaces_skipped_top_level_dirs(tmp_path: Path) -> None:
    """A non-empty directory outside the include set is reported; an empty one is not."""
    root = _seed_infospace(tmp_path)
    unlisted_dir = root / "experimental"
    unlisted_dir.mkdir()
    (unlisted_dir / "scratch.md").write_text("scratch", encoding="utf-8")
    # Left empty on purpose: empty directories must not be flagged.
    (root / "empty-dir").mkdir()
    archived = archive_infospace(root)
    assert archived.skipped_top_level == ["experimental"]
def test_annotate_retention_returns_none_when_store_missing(tmp_path: Path) -> None:
root = _seed_infospace(tmp_path)
archive_infospace(root, store_root=tmp_path / "external-store")