WP-0001-T010: manifest model, canonical CBOR codec, JCS projection

Adds the manifest layer per ADR-0003. The canonical wire format is CBOR with
deterministic encoding (cbor2 canonical=True: definite-length, shortest-form
integers, sorted map keys); JCS (RFC 8785) is the JSON projection.

src/artifactstore/manifest/:
- model.py: frozen dataclasses for Manifest (manifest_version=1, package,
  files, storage_receipts, retention_summary, provenance) with restricted
  types (str/int/bool/None/list/dict) so CBOR and JCS round-trip losslessly.
- codec.py: encode (Manifest -> canonical CBOR bytes) and decode (CBOR bytes
  -> Manifest) via cbor2.
- projection.py: jcs_projection (Manifest -> RFC 8785 canonical JSON) plus
  cbor_from_jcs for cross-format round-trip verification.
- digest.py: manifest_digest returns the BLAKE3 content address of the
  manifest's canonical CBOR bytes (ADR-0001).
- __init__.py: re-exports the public surface.

tests/unit/test_manifest.py:
- decode(encode(m)) == m round-trip (hypothesis-parameterised).
- JCS↔CBOR round-trip: encode(decode(cbor_from_jcs(jcs_projection(m)))) == encode(m).
- Byte stability of the canonical CBOR encoder across calls.
- manifest_digest matches independent BLAKE3 over encode(m).
- Decode rejects non-map CBOR.
- JCS projection sorts keys lexicographically.

Deps: jcs added to project requirements; mypy override for the jcs package
(no stubs published yet).

Gates: ruff clean, mypy --strict clean on 26 files, 26 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 01:39:42 +02:00
parent c1bfb8b486
commit 9cbb9847ed
9 changed files with 433 additions and 4 deletions

View File

@@ -29,6 +29,7 @@ dependencies = [
"alembic >= 1.13",
"blake3 >= 0.4",
"cbor2 >= 5.6",
"jcs >= 0.2",
"typer >= 0.12",
"structlog >= 24.1",
"pydantic >= 2.7",
@@ -89,6 +90,10 @@ mypy_path = "src"
explicit_package_bases = true
namespace_packages = true
[[tool.mypy.overrides]]
module = ["jcs"]
ignore_missing_imports = true
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]

View File

@@ -1,5 +1,37 @@
"""Package manifest model and canonical-CBOR codec.
"""Package manifest model, codec, and projections (ADR-0003).
Real implementation lands in ARTIFACT-STORE-WP-0001-T010. See ADR-0003 for
the canonicalisation pin (RFC 8949 §4.2.2).
The canonical wire format is CBOR with deterministic encoding (RFC 8949).
JCS (RFC 8785) is the JSON projection used for display and signing-tool
interop. A package's external identifier is the BLAKE3 digest over its
canonical CBOR bytes (ADR-0001).
"""
from artifactstore.manifest.codec import decode, encode
from artifactstore.manifest.digest import manifest_digest
from artifactstore.manifest.model import (
MANIFEST_VERSION,
FileEntry,
Manifest,
Package,
Provenance,
RetentionHold,
RetentionSummary,
StorageReceipt,
)
from artifactstore.manifest.projection import cbor_from_jcs, jcs_projection
# Public surface of the manifest package: the version constant, the model
# dataclasses, the CBOR codec (decode/encode), the JCS projection helpers and
# the digest helper, all re-exported from the submodules imported above.
__all__ = [
"MANIFEST_VERSION",
"FileEntry",
"Manifest",
"Package",
"Provenance",
"RetentionHold",
"RetentionSummary",
"StorageReceipt",
"cbor_from_jcs",
"decode",
"encode",
"jcs_projection",
"manifest_digest",
]

View File

@@ -0,0 +1,63 @@
"""Canonical CBOR codec for the manifest (ADR-0003).
Canonicalisation uses cbor2's deterministic encoding (RFC 8949): definite-
length encoding, shortest-form integers, sorted map keys. The same input
manifest always produces the same byte sequence, which is the property the
manifest digest (ADR-0001) and signature flows rely on.
"""
from __future__ import annotations
from dataclasses import asdict
from typing import Any
import cbor2
from artifactstore.manifest.model import (
FileEntry,
Manifest,
Package,
Provenance,
RetentionHold,
RetentionSummary,
StorageReceipt,
)
# Only the two codec entry points are public; ``_from_dict`` stays private.
__all__ = ["decode", "encode"]
def encode(manifest: Manifest) -> bytes:
    """Return the canonical CBOR encoding of ``manifest``.

    The manifest (including its nested dataclasses) is first flattened to
    plain dicts and lists with :func:`dataclasses.asdict`, then serialised by
    ``cbor2`` with ``canonical=True``.
    """
    return cbor2.dumps(asdict(manifest), canonical=True)
def decode(data: bytes) -> Manifest:
    """Parse CBOR bytes back into a :class:`Manifest`.

    The input is expected to be the output of :func:`encode`, but this
    function does not verify that ``data`` is in canonical form.

    Args:
        data: CBOR-encoded manifest bytes.

    Returns:
        The reconstructed manifest.

    Raises:
        ValueError: if the top-level CBOR item is not a map, or if the map
            does not have the shape of a manifest (a required key is missing,
            a section has the wrong type, or a section carries a field the
            model does not define).

    Errors raised by ``cbor2.loads`` itself propagate unchanged.
    """
    payload = cbor2.loads(data)
    if not isinstance(payload, dict):
        raise ValueError("manifest must decode to a CBOR map")
    try:
        return _from_dict(payload)
    except (KeyError, TypeError) as err:
        # Report structural problems the same way as the non-map case, so a
        # caller handling untrusted bytes only has to deal with ValueError.
        raise ValueError(f"malformed manifest payload: {err!r}") from err
def _from_dict(payload: dict[str, Any]) -> Manifest:
    """Rebuild a :class:`Manifest` from its decoded top-level mapping.

    The ``package``, ``files``, ``storage_receipts``, ``active_holds`` and
    ``provenance`` sections are passed to their dataclass constructors as
    keyword arguments, so an unknown or missing field there raises
    ``TypeError``. A missing section key raises ``KeyError``. The retention
    summary is read key by key, so extra keys in it are ignored.
    """
    pkg = Package(**payload["package"])
    file_entries = [FileEntry(**entry) for entry in payload["files"]]
    storage_receipts = [
        StorageReceipt(**receipt) for receipt in payload["storage_receipts"]
    ]

    retention = payload["retention_summary"]
    active_holds = [RetentionHold(**hold) for hold in retention["active_holds"]]
    summary = RetentionSummary(
        retention_class=retention["retention_class"],
        expires_at=retention["expires_at"],
        active_holds=active_holds,
        last_retention_event_sequence=retention["last_retention_event_sequence"],
    )

    prov = Provenance(**payload["provenance"])

    return Manifest(
        manifest_version=payload["manifest_version"],
        package=pkg,
        files=file_entries,
        storage_receipts=storage_receipts,
        retention_summary=summary,
        provenance=prov,
    )

View File

@@ -0,0 +1,19 @@
"""Manifest digest helper (ADR-0001 + ADR-0003)."""
from __future__ import annotations
from artifactstore.identity import PRIMARY_ALGORITHM, ContentAddress, digest_bytes
from artifactstore.manifest.codec import encode
from artifactstore.manifest.model import Manifest
# The digest helper is the only name exported from this module.
__all__ = ["manifest_digest"]
def manifest_digest(manifest: Manifest) -> ContentAddress:
    """Compute the content address that identifies ``manifest``.

    The manifest is serialised with :func:`encode` (canonical CBOR) and those
    bytes are hashed with ``digest_bytes`` using ``PRIMARY_ALGORITHM`` as the
    primary algorithm. The primary digest's content address is returned; it
    serves as the package's external identifier.
    """
    digests = digest_bytes(encode(manifest), primary=PRIMARY_ALGORITHM)
    return digests.primary.content_address

View File

@@ -0,0 +1,97 @@
"""Manifest data model (ADR-0003).
Field types are restricted to CBOR-/JSON-compatible primitives (``str``,
``int``, ``bool``, ``None``, ``list``, ``dict``) so the canonical CBOR
encoding and the JCS JSON projection round-trip losslessly.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
# Version number of the manifest layout defined in this module; intended as
# the value of ``Manifest.manifest_version`` for manifests built here.
MANIFEST_VERSION = 1
@dataclass(frozen=True, slots=True)
class FileEntry:
"""One stored file in a package."""
id: str
relative_path: str
media_type: str
size_bytes: int
digest_algorithm: str
digest_primary_hex: str
digest_sha256_hex: str
@dataclass(frozen=True, slots=True)
class StorageReceipt:
"""A record of where a file's bytes are stored."""
file_id: str
backend_id: str
content_address: str
retrieval_tier: str
status: str
@dataclass(frozen=True, slots=True)
class RetentionHold:
"""An active hold preventing deletion eligibility."""
hold_id: str
reason: str
actor: str
applied_at: str
@dataclass(frozen=True, slots=True)
class RetentionSummary:
"""Retention state summary as of manifest write time."""
retention_class: str
expires_at: str | None
active_holds: list[RetentionHold]
last_retention_event_sequence: int | None
@dataclass(frozen=True, slots=True)
class Package:
"""Package-level metadata."""
id: str
name: str
producer: str
subject: str
retention_class: str
status: str
created_at: str
finalized_at: str | None
expires_at: str | None
metadata: dict[str, Any]
metadata_schema_id: str | None
@dataclass(frozen=True, slots=True)
class Provenance:
"""Provenance fields recorded at ingest time."""
source_commits: dict[str, str]
tool_versions: dict[str, str]
environment: dict[str, str]
ingest_actor: str
ingest_timestamps: dict[str, str]
@dataclass(frozen=True, slots=True)
class Manifest:
"""The complete v1 manifest payload."""
manifest_version: int
package: Package
files: list[FileEntry]
storage_receipts: list[StorageReceipt]
retention_summary: RetentionSummary
provenance: Provenance

View File

@@ -0,0 +1,35 @@
"""JCS (RFC 8785) projection of the manifest.
Used for human display, signing-tool interop, and as the JSON form of the
manifest. Round-trips losslessly with the canonical CBOR form as long as
manifest values use only JSON-compatible primitive types.
"""
from __future__ import annotations
import json
from dataclasses import asdict
import cbor2
import jcs
from artifactstore.manifest.model import Manifest
# Public API of this module: the CBOR re-encoder and the JCS projection.
__all__ = ["cbor_from_jcs", "jcs_projection"]
def jcs_projection(manifest: Manifest) -> bytes:
    """Return the RFC 8785 canonical JSON form of ``manifest``.

    The manifest is flattened to plain dicts and lists with
    :func:`dataclasses.asdict` and handed to ``jcs.canonicalize``.
    """
    # ``jcs`` is untyped for mypy, so its return value is seen as ``Any``.
    return jcs.canonicalize(asdict(manifest))  # type: ignore[no-any-return]
def cbor_from_jcs(jcs_bytes: bytes) -> bytes:
    """Re-encode a JSON document as canonical CBOR.

    Parses ``jcs_bytes`` with :func:`json.loads` and serialises the resulting
    object with ``cbor2`` using ``canonical=True``. This supports the
    JCS↔CBOR round-trip check: for a manifest ``m``, the bytes produced from
    ``jcs_projection(m)`` are expected to decode to a manifest whose
    canonical CBOR encoding equals that of ``m``.
    """
    document = json.loads(jcs_bytes)
    canonical_cbor = cbor2.dumps(document, canonical=True)
    return canonical_cbor

167
tests/unit/test_manifest.py Normal file
View File

@@ -0,0 +1,167 @@
"""Tests for :mod:`artifactstore.manifest` (ARTIFACT-STORE-WP-0001-T010)."""
from __future__ import annotations
from typing import Any
import cbor2
import pytest
from hypothesis import HealthCheck, given, settings
from hypothesis import strategies as st
from artifactstore.identity import PRIMARY_ALGORITHM, digest_bytes
from artifactstore.manifest import (
MANIFEST_VERSION,
FileEntry,
Manifest,
Package,
Provenance,
RetentionHold,
RetentionSummary,
StorageReceipt,
cbor_from_jcs,
decode,
encode,
jcs_projection,
manifest_digest,
)
def _sample_manifest(
    *,
    file_count: int = 1,
    holds: int = 0,
    metadata: dict[str, Any] | None = None,
) -> Manifest:
    """Build a deterministic manifest fixture.

    Creates ``file_count`` file entries with one storage receipt each and
    ``holds`` retention holds. ``metadata`` becomes the package metadata; an
    empty dict is used when it is ``None``. All other values are fixed.
    """
    files: list[FileEntry] = []
    receipts: list[StorageReceipt] = []
    for index in range(file_count):
        entry = FileEntry(
            id=f"file-{index:04d}",
            relative_path=f"reports/r-{index}.json",
            media_type="application/json",
            size_bytes=42 + index,
            digest_algorithm=PRIMARY_ALGORITHM,
            digest_primary_hex="ab" * 32,
            digest_sha256_hex="cd" * 32,
        )
        files.append(entry)
        # One receipt per file, addressed by "<algorithm>:<primary hex>".
        receipts.append(
            StorageReceipt(
                file_id=entry.id,
                backend_id="local",
                content_address=f"{entry.digest_algorithm}:{entry.digest_primary_hex}",
                retrieval_tier="hot",
                status="recorded",
            )
        )

    active_holds: list[RetentionHold] = []
    for index in range(holds):
        active_holds.append(
            RetentionHold(
                hold_id=f"hold-{index:04d}",
                reason="audit-prep",
                actor="ops@example.org",
                applied_at="2026-05-15T12:00:00Z",
            )
        )

    package = Package(
        id="pkg-0001",
        name="guide-board cmis run",
        producer="guide-board",
        subject="kontextual-engine",
        retention_class="raw-evidence",
        status="finalized",
        created_at="2026-05-15T10:00:00Z",
        finalized_at="2026-05-15T11:00:00Z",
        expires_at="2027-05-15T11:00:00Z",
        metadata={} if metadata is None else metadata,
        metadata_schema_id=None,
    )
    retention_summary = RetentionSummary(
        retention_class="raw-evidence",
        expires_at="2027-05-15T11:00:00Z",
        active_holds=active_holds,
        last_retention_event_sequence=None,
    )
    provenance = Provenance(
        source_commits={"guide-board": "abc1234"},
        tool_versions={"guide-board": "0.1.0"},
        environment={"runner": "ci"},
        ingest_actor="codex",
        ingest_timestamps={"received_at": "2026-05-15T10:00:00Z"},
    )
    return Manifest(
        manifest_version=MANIFEST_VERSION,
        package=package,
        files=files,
        storage_receipts=receipts,
        retention_summary=retention_summary,
        provenance=provenance,
    )
def test_cbor_round_trip_simple() -> None:
    """Decoding an encoded manifest yields an equal manifest."""
    original = _sample_manifest(file_count=3, holds=1)
    restored = decode(encode(original))
    assert restored == original
def test_canonical_cbor_is_byte_stable_across_calls() -> None:
    """Canonical CBOR bytes are stable across calls and key insertion order."""
    m = _sample_manifest(
        file_count=5,
        holds=0,
        metadata={"zz": "z", "aa": "a", "mm": "m"},
    )
    # Repeated encoding of the same object yields identical bytes.
    assert encode(m) == encode(m)

    # Encoding the same object twice cannot reveal a dependence on dict
    # insertion order, so also encode an equal manifest whose metadata keys
    # were inserted in a different order.
    reordered = _sample_manifest(
        file_count=5,
        holds=0,
        metadata={"aa": "a", "mm": "m", "zz": "z"},
    )
    assert reordered == m
    assert encode(reordered) == encode(m)
def test_jcs_round_trip_via_cbor() -> None:
    """A manifest survives CBOR -> JCS -> CBOR -> Manifest -> CBOR unchanged."""
    manifest = _sample_manifest(
        file_count=2,
        holds=2,
        metadata={"key": "value", "n": 7},
    )
    expected = encode(manifest)

    # Project to JCS, convert that JSON back to CBOR, decode and re-encode.
    via_json = cbor_from_jcs(jcs_projection(manifest))
    actual = encode(decode(via_json))

    assert expected == actual
def test_manifest_digest_is_blake3_of_canonical_cbor() -> None:
    """The manifest digest equals the primary digest of the encoded bytes."""
    manifest = _sample_manifest()
    address = manifest_digest(manifest)
    # Computed independently, relying on digest_bytes' default primary algorithm.
    independent = digest_bytes(encode(manifest)).primary.content_address
    assert address == independent
    assert str(address).startswith("blake3:")
def test_decode_rejects_non_map_cbor() -> None:
    """A CBOR array at the top level is rejected with ``ValueError``."""
    not_a_map = cbor2.dumps([1, 2, 3], canonical=True)
    with pytest.raises(ValueError):
        decode(not_a_map)
def test_jcs_projection_is_canonical_json() -> None:
    """Metadata keys appear in sorted order in the JCS projection."""
    m = _sample_manifest(file_count=1, holds=0, metadata={"b": 2, "a": 1})
    text = jcs_projection(m).decode("utf-8")

    a_pos = text.find('"a":1')
    b_pos = text.find('"b":2')
    # ``str.find`` returns -1 when the needle is absent, which would let the
    # ordering comparison below pass without "a" being present at all.
    assert a_pos != -1
    assert b_pos != -1
    # JCS sorts object keys, so "a" must be serialised before "b" even though
    # it was inserted second.
    assert a_pos < b_pos
@settings(suppress_health_check=[HealthCheck.too_slow], max_examples=25)
@given(
    file_count=st.integers(min_value=0, max_value=5),
    hold_count=st.integers(min_value=0, max_value=3),
)
def test_property_cbor_round_trip(file_count: int, hold_count: int) -> None:
    """CBOR round-trip holds for every generated file/hold count."""
    original = _sample_manifest(file_count=file_count, holds=hold_count)
    restored = decode(encode(original))
    assert restored == original
@settings(suppress_health_check=[HealthCheck.too_slow], max_examples=25)
@given(
    file_count=st.integers(min_value=0, max_value=3),
    hold_count=st.integers(min_value=0, max_value=2),
)
def test_property_jcs_to_cbor_round_trip(file_count: int, hold_count: int) -> None:
    """JCS -> CBOR -> Manifest -> CBOR reproduces the direct CBOR encoding."""
    manifest = _sample_manifest(file_count=file_count, holds=hold_count)
    direct = encode(manifest)
    via_json = cbor_from_jcs(jcs_projection(manifest))
    round_tripped = encode(decode(via_json))
    assert direct == round_tripped

11
uv.lock generated
View File

@@ -70,6 +70,7 @@ dependencies = [
{ name = "blake3" },
{ name = "cbor2" },
{ name = "fastapi" },
{ name = "jcs" },
{ name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "sqlalchemy" },
@@ -108,6 +109,7 @@ requires-dist = [
{ name = "fastapi", specifier = ">=0.115" },
{ name = "httpx", marker = "extra == 'dev'", specifier = ">=0.27" },
{ name = "hypothesis", marker = "extra == 'dev'", specifier = ">=6.100" },
{ name = "jcs", specifier = ">=0.2" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.10" },
{ name = "pydantic", specifier = ">=2.7" },
{ name = "pydantic-settings", specifier = ">=2.4" },
@@ -505,6 +507,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 },
]
[[package]]
name = "jcs"
version = "0.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/47/e5/9d547f0d42ba00f68eec773aa7b2145e0c0335eb632cbaf519f480a429af/jcs-0.2.1.tar.gz", hash = "sha256:9f20360b2f3b0a410d65493b448f96306d80e37fb46283f3f4aa5db2c7c1472b", size = 6886 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/d4/9a99bc15266a842bd14a1913afdb05182888ebab035666c1ce8a64537ca2/jcs-0.2.1-py3-none-any.whl", hash = "sha256:e23a3e1de60f832d33cd811bb9c3b3be79219cdf95f63b88f0972732c3fa8476", size = 7603 },
]
[[package]]
name = "librt"
version = "0.11.0"

View File

@@ -118,7 +118,7 @@ Acceptance:
```task
id: ARTIFACT-STORE-WP-0001-T010
status: todo
status: done
priority: high
state_hub_task_id: "8b45a3d9-aa19-4ae8-afe0-687417bf12d0"
```