diff --git a/pyproject.toml b/pyproject.toml index 79a9ae9..94f3bb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "alembic >= 1.13", "blake3 >= 0.4", "cbor2 >= 5.6", + "jcs >= 0.2", "typer >= 0.12", "structlog >= 24.1", "pydantic >= 2.7", @@ -89,6 +90,10 @@ mypy_path = "src" explicit_package_bases = true namespace_packages = true +[[tool.mypy.overrides]] +module = ["jcs"] +ignore_missing_imports = true + [tool.pytest.ini_options] asyncio_mode = "auto" testpaths = ["tests"] diff --git a/src/artifactstore/manifest/__init__.py b/src/artifactstore/manifest/__init__.py index 9b87e45..36ca0fa 100644 --- a/src/artifactstore/manifest/__init__.py +++ b/src/artifactstore/manifest/__init__.py @@ -1,5 +1,37 @@ -"""Package manifest model and canonical-CBOR codec. +"""Package manifest model, codec, and projections (ADR-0003). -Real implementation lands in ARTIFACT-STORE-WP-0001-T010. See ADR-0003 for -the canonicalisation pin (RFC 8949 §4.2.2). +The canonical wire format is CBOR with deterministic encoding (RFC 8949). +JCS (RFC 8785) is the JSON projection used for display and signing-tool +interop. A package's external identifier is the BLAKE3 digest over its +canonical CBOR bytes (ADR-0001). """ + +from artifactstore.manifest.codec import decode, encode +from artifactstore.manifest.digest import manifest_digest +from artifactstore.manifest.model import ( + MANIFEST_VERSION, + FileEntry, + Manifest, + Package, + Provenance, + RetentionHold, + RetentionSummary, + StorageReceipt, +) +from artifactstore.manifest.projection import cbor_from_jcs, jcs_projection + +__all__ = [ + "MANIFEST_VERSION", + "FileEntry", + "Manifest", + "Package", + "Provenance", + "RetentionHold", + "RetentionSummary", + "StorageReceipt", + "cbor_from_jcs", + "decode", + "encode", + "jcs_projection", + "manifest_digest", +] diff --git a/src/artifactstore/manifest/codec.py b/src/artifactstore/manifest/codec.py new file mode 100644 index 0000000..b16bc6a --- /dev/null +++ b/src/artifactstore/manifest/codec.py @@ -0,0 +1,63 @@ +"""Canonical CBOR codec for the manifest (ADR-0003). + +Canonicalisation uses cbor2's deterministic encoding (RFC 8949): definite- +length encoding, shortest-form integers, sorted map keys. The same input +manifest always produces the same byte sequence, which is the property the +manifest digest (ADR-0001) and signature flows rely on. +""" + +from __future__ import annotations + +from dataclasses import asdict +from typing import Any + +import cbor2 + +from artifactstore.manifest.model import ( + FileEntry, + Manifest, + Package, + Provenance, + RetentionHold, + RetentionSummary, + StorageReceipt, +) + +__all__ = ["decode", "encode"] + + +def encode(manifest: Manifest) -> bytes: + """Serialise a :class:`Manifest` to canonical CBOR bytes.""" + payload = asdict(manifest) + return cbor2.dumps(payload, canonical=True) + + +def decode(data: bytes) -> Manifest: + """Parse canonical CBOR bytes back into a :class:`Manifest`.""" + payload = cbor2.loads(data) + if not isinstance(payload, dict): + raise ValueError("manifest must decode to a CBOR map") + return _from_dict(payload) + + +def _from_dict(payload: dict[str, Any]) -> Manifest: + package = Package(**payload["package"]) + files = [FileEntry(**f) for f in payload["files"]] + receipts = [StorageReceipt(**r) for r in payload["storage_receipts"]] + rs_raw = payload["retention_summary"] + holds = [RetentionHold(**h) for h in rs_raw["active_holds"]] + retention_summary = RetentionSummary( + retention_class=rs_raw["retention_class"], + expires_at=rs_raw["expires_at"], + active_holds=holds, + last_retention_event_sequence=rs_raw["last_retention_event_sequence"], + ) + provenance = Provenance(**payload["provenance"]) + return Manifest( + manifest_version=payload["manifest_version"], + package=package, + files=files, + storage_receipts=receipts, + retention_summary=retention_summary, + provenance=provenance, + ) diff --git a/src/artifactstore/manifest/digest.py b/src/artifactstore/manifest/digest.py new file mode 100644 index 0000000..c9484c3 --- /dev/null +++ b/src/artifactstore/manifest/digest.py @@ -0,0 +1,19 @@ +"""Manifest digest helper (ADR-0001 + ADR-0003).""" + +from __future__ import annotations + +from artifactstore.identity import PRIMARY_ALGORITHM, ContentAddress, digest_bytes +from artifactstore.manifest.codec import encode +from artifactstore.manifest.model import Manifest + +__all__ = ["manifest_digest"] + + +def manifest_digest(manifest: Manifest) -> ContentAddress: + """Return the canonical content address of a manifest. + + Defined as the primary digest (default BLAKE3) over the manifest's + canonical CBOR bytes. This is the package's external identifier. + """ + cbor_bytes = encode(manifest) + return digest_bytes(cbor_bytes, primary=PRIMARY_ALGORITHM).primary.content_address diff --git a/src/artifactstore/manifest/model.py b/src/artifactstore/manifest/model.py new file mode 100644 index 0000000..0e30af2 --- /dev/null +++ b/src/artifactstore/manifest/model.py @@ -0,0 +1,97 @@ +"""Manifest data model (ADR-0003). + +Field types are restricted to CBOR-/JSON-compatible primitives (``str``, +``int``, ``bool``, ``None``, ``list``, ``dict``) so the canonical CBOR +encoding and the JCS JSON projection round-trip losslessly. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +MANIFEST_VERSION = 1 + + +@dataclass(frozen=True, slots=True) +class FileEntry: + """One stored file in a package.""" + + id: str + relative_path: str + media_type: str + size_bytes: int + digest_algorithm: str + digest_primary_hex: str + digest_sha256_hex: str + + +@dataclass(frozen=True, slots=True) +class StorageReceipt: + """A record of where a file's bytes are stored.""" + + file_id: str + backend_id: str + content_address: str + retrieval_tier: str + status: str + + +@dataclass(frozen=True, slots=True) +class RetentionHold: + """An active hold preventing deletion eligibility.""" + + hold_id: str + reason: str + actor: str + applied_at: str + + +@dataclass(frozen=True, slots=True) +class RetentionSummary: + """Retention state summary as of manifest write time.""" + + retention_class: str + expires_at: str | None + active_holds: list[RetentionHold] + last_retention_event_sequence: int | None + + +@dataclass(frozen=True, slots=True) +class Package: + """Package-level metadata.""" + + id: str + name: str + producer: str + subject: str + retention_class: str + status: str + created_at: str + finalized_at: str | None + expires_at: str | None + metadata: dict[str, Any] + metadata_schema_id: str | None + + +@dataclass(frozen=True, slots=True) +class Provenance: + """Provenance fields recorded at ingest time.""" + + source_commits: dict[str, str] + tool_versions: dict[str, str] + environment: dict[str, str] + ingest_actor: str + ingest_timestamps: dict[str, str] + + +@dataclass(frozen=True, slots=True) +class Manifest: + """The complete v1 manifest payload.""" + + manifest_version: int + package: Package + files: list[FileEntry] + storage_receipts: list[StorageReceipt] + retention_summary: RetentionSummary + provenance: Provenance diff --git a/src/artifactstore/manifest/projection.py b/src/artifactstore/manifest/projection.py new file mode 100644 index 0000000..dd28186 --- /dev/null +++ b/src/artifactstore/manifest/projection.py @@ -0,0 +1,35 @@ +"""JCS (RFC 8785) projection of the manifest. + +Used for human display, signing-tool interop, and as the JSON form of the +manifest. Round-trips losslessly with the canonical CBOR form as long as +manifest values use only JSON-compatible primitive types. +""" + +from __future__ import annotations + +import json +from dataclasses import asdict + +import cbor2 +import jcs + +from artifactstore.manifest.model import Manifest + +__all__ = ["cbor_from_jcs", "jcs_projection"] + + +def jcs_projection(manifest: Manifest) -> bytes: + """Serialise a :class:`Manifest` to canonical JSON (RFC 8785).""" + payload = asdict(manifest) + return jcs.canonicalize(payload) # type: ignore[no-any-return] + + +def cbor_from_jcs(jcs_bytes: bytes) -> bytes: + """Decode JCS JSON bytes and re-encode as canonical CBOR. + + Used to validate the JCS↔CBOR round-trip property: ``cbor_from_jcs`` of + a JCS projection must equal the canonical CBOR encoding of the same + manifest (after re-decoding). + """ + payload = json.loads(jcs_bytes) + return cbor2.dumps(payload, canonical=True) diff --git a/tests/unit/test_manifest.py b/tests/unit/test_manifest.py new file mode 100644 index 0000000..23efaf8 --- /dev/null +++ b/tests/unit/test_manifest.py @@ -0,0 +1,167 @@ +"""Tests for :mod:`artifactstore.manifest` (ARTIFACT-STORE-WP-0001-T010).""" + +from __future__ import annotations + +from typing import Any + +import cbor2 +import pytest +from hypothesis import HealthCheck, given, settings +from hypothesis import strategies as st + +from artifactstore.identity import PRIMARY_ALGORITHM, digest_bytes +from artifactstore.manifest import ( + MANIFEST_VERSION, + FileEntry, + Manifest, + Package, + Provenance, + RetentionHold, + RetentionSummary, + StorageReceipt, + cbor_from_jcs, + decode, + encode, + jcs_projection, + manifest_digest, +) + + +def _sample_manifest( + *, + file_count: int = 1, + holds: int = 0, + metadata: dict[str, Any] | None = None, +) -> Manifest: + files = [ + FileEntry( + id=f"file-{i:04d}", + relative_path=f"reports/r-{i}.json", + media_type="application/json", + size_bytes=42 + i, + digest_algorithm=PRIMARY_ALGORITHM, + digest_primary_hex="ab" * 32, + digest_sha256_hex="cd" * 32, + ) + for i in range(file_count) + ] + receipts = [ + StorageReceipt( + file_id=f.id, + backend_id="local", + content_address=f"{f.digest_algorithm}:{f.digest_primary_hex}", + retrieval_tier="hot", + status="recorded", + ) + for f in files + ] + holds_list = [ + RetentionHold( + hold_id=f"hold-{i:04d}", + reason="audit-prep", + actor="ops@example.org", + applied_at="2026-05-15T12:00:00Z", + ) + for i in range(holds) + ] + return Manifest( + manifest_version=MANIFEST_VERSION, + package=Package( + id="pkg-0001", + name="guide-board cmis run", + producer="guide-board", + subject="kontextual-engine", + retention_class="raw-evidence", + status="finalized", + created_at="2026-05-15T10:00:00Z", + finalized_at="2026-05-15T11:00:00Z", + expires_at="2027-05-15T11:00:00Z", + metadata=metadata if metadata is not None else {}, + metadata_schema_id=None, + ), + files=files, + storage_receipts=receipts, + retention_summary=RetentionSummary( + retention_class="raw-evidence", + expires_at="2027-05-15T11:00:00Z", + active_holds=holds_list, + last_retention_event_sequence=None, + ), + provenance=Provenance( + source_commits={"guide-board": "abc1234"}, + tool_versions={"guide-board": "0.1.0"}, + environment={"runner": "ci"}, + ingest_actor="codex", + ingest_timestamps={"received_at": "2026-05-15T10:00:00Z"}, + ), + ) + + +def test_cbor_round_trip_simple() -> None: + m = _sample_manifest(file_count=3, holds=1) + assert decode(encode(m)) == m + + +def test_canonical_cbor_is_byte_stable_across_calls() -> None: + m = _sample_manifest( + file_count=5, + holds=0, + metadata={"zz": "z", "aa": "a", "mm": "m"}, + ) + assert encode(m) == encode(m) + + +def test_jcs_round_trip_via_cbor() -> None: + m = _sample_manifest(file_count=2, holds=2, metadata={"key": "value", "n": 7}) + cbor_a = encode(m) + jcs_bytes = jcs_projection(m) + cbor_b = cbor_from_jcs(jcs_bytes) + m_decoded = decode(cbor_b) + cbor_c = encode(m_decoded) + assert cbor_a == cbor_c + + +def test_manifest_digest_is_blake3_of_canonical_cbor() -> None: + m = _sample_manifest() + ca = manifest_digest(m) + expected = digest_bytes(encode(m)).primary.content_address + assert ca == expected + assert str(ca).startswith("blake3:") + + +def test_decode_rejects_non_map_cbor() -> None: + bad = cbor2.dumps([1, 2, 3], canonical=True) + with pytest.raises(ValueError): + decode(bad) + + +def test_jcs_projection_is_canonical_json() -> None: + m = _sample_manifest(file_count=1, holds=0, metadata={"b": 2, "a": 1}) + jcs_bytes = jcs_projection(m) + # JCS sorts object keys lexicographically; verify metadata keys appear + # in alphabetical order in the serialised output. + text = jcs_bytes.decode("utf-8") + assert text.find('"a":1') < text.find('"b":2') + + +@settings(suppress_health_check=[HealthCheck.too_slow], max_examples=25) +@given( + file_count=st.integers(min_value=0, max_value=5), + hold_count=st.integers(min_value=0, max_value=3), +) +def test_property_cbor_round_trip(file_count: int, hold_count: int) -> None: + m = _sample_manifest(file_count=file_count, holds=hold_count) + assert decode(encode(m)) == m + + +@settings(suppress_health_check=[HealthCheck.too_slow], max_examples=25) +@given( + file_count=st.integers(min_value=0, max_value=3), + hold_count=st.integers(min_value=0, max_value=2), +) +def test_property_jcs_to_cbor_round_trip(file_count: int, hold_count: int) -> None: + m = _sample_manifest(file_count=file_count, holds=hold_count) + cbor_a = encode(m) + cbor_b = cbor_from_jcs(jcs_projection(m)) + cbor_c = encode(decode(cbor_b)) + assert cbor_a == cbor_c diff --git a/uv.lock b/uv.lock index 19615df..8d5348f 100644 --- a/uv.lock +++ b/uv.lock @@ -70,6 +70,7 @@ dependencies = [ { name = "blake3" }, { name = "cbor2" }, { name = "fastapi" }, + { name = "jcs" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "sqlalchemy" }, @@ -108,6 +109,7 @@ requires-dist = [ { name = "fastapi", specifier = ">=0.115" }, { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.27" }, { name = "hypothesis", marker = "extra == 'dev'", specifier = ">=6.100" }, + { name = "jcs", specifier = ">=0.2" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.10" }, { name = "pydantic", specifier = ">=2.7" }, { name = "pydantic-settings", specifier = ">=2.4" }, @@ -505,6 +507,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 }, ] +[[package]] +name = "jcs" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/e5/9d547f0d42ba00f68eec773aa7b2145e0c0335eb632cbaf519f480a429af/jcs-0.2.1.tar.gz", hash = "sha256:9f20360b2f3b0a410d65493b448f96306d80e37fb46283f3f4aa5db2c7c1472b", size = 6886 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/d4/9a99bc15266a842bd14a1913afdb05182888ebab035666c1ce8a64537ca2/jcs-0.2.1-py3-none-any.whl", hash = "sha256:e23a3e1de60f832d33cd811bb9c3b3be79219cdf95f63b88f0972732c3fa8476", size = 7603 }, +] + [[package]] name = "librt" version = "0.11.0" diff --git a/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md b/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md index 64affab..3eca839 100644 --- a/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md +++ b/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md @@ -118,7 +118,7 @@ Acceptance: ```task id: ARTIFACT-STORE-WP-0001-T010 -status: todo +status: done priority: high state_hub_task_id: "8b45a3d9-aa19-4ae8-afe0-687417bf12d0" ```