From c1bfb8b486e10e1be2343bd6aabd14dd172e62f1 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sat, 16 May 2026 01:34:24 +0200 Subject: [PATCH] WP-0001-T009: digest abstraction and content address (ADR-0001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src/artifactstore/identity/__init__.py: - Digest: frozen, hashable dataclass (algorithm + lowercase hex), validated. - ContentAddress: canonical `algorithm:hex` string form with validating parser (to_digest) and emitter (str / from_digest). - DigestPair: dual-digest result (primary + sha256) from a single hashing pass. - Algorithm registry: register_algorithm / get_algorithm / list_algorithms with name validation `[a-z][a-z0-9_-]*`. - digest_bytes (sync) and digest_stream (async) — single-pass dual hashing. - BLAKE3 registered as PRIMARY_ALGORITHM, SHA-256 as INTEROP_ALGORITHM at module import. tests/unit/test_identity.py: - Hypothesis property test asserts digest_bytes matches hashlib.sha256 and blake3.blake3 for random byte sequences up to 4 KiB. - digest_stream invariants: equivalence with digest_bytes under chunked input; defaults to BLAKE3 primary; always computes SHA-256; handles empty input. - Digest / ContentAddress invariants: rejects uppercase hex, empty fields, odd hex length, missing separator; frozen and hashable. Gates: ruff clean, mypy --strict clean on 21 source files, 18 tests pass. Co-Authored-By: Claude Opus 4.7 --- src/artifactstore/identity/__init__.py | 203 +++++++++++++++++- tests/unit/test_identity.py | 125 +++++++++++ ...ARTIFACT-STORE-WP-0001-service-baseline.md | 2 +- 3 files changed, 326 insertions(+), 4 deletions(-) create mode 100644 tests/unit/test_identity.py diff --git a/src/artifactstore/identity/__init__.py b/src/artifactstore/identity/__init__.py index 34b6bb9..20adfe2 100644 --- a/src/artifactstore/identity/__init__.py +++ b/src/artifactstore/identity/__init__.py @@ -1,5 +1,202 @@ -"""Content addresses and digest abstraction. 
+"""Content addresses and digest abstraction (ADR-0001). -Real implementation lands in ARTIFACT-STORE-WP-0001-T009. See ADR-0001 for -the dual-digest contract (BLAKE3 primary, SHA-256 retained for interop). +Implements the dual-digest contract: + +* :class:`Digest` — immutable algorithm + lowercase hex value object. +* :class:`ContentAddress` — canonical string form ``:``. +* :class:`DigestPair` — dual digest produced in a single hashing pass. +* Algorithm registry with ``blake3`` (default primary) and ``sha256`` + (always-computed interop digest) registered at import. +* :func:`digest_stream` — async single-pass dual hash. +* :func:`digest_bytes` — synchronous convenience wrapper. """ + +from __future__ import annotations + +import hashlib +import re +from collections.abc import AsyncIterator, Callable, Iterator +from dataclasses import dataclass +from typing import Protocol, cast + +import blake3 as _blake3 + +__all__ = [ + "INTEROP_ALGORITHM", + "PRIMARY_ALGORITHM", + "ContentAddress", + "Digest", + "DigestPair", + "HasherFactory", + "HasherProto", + "digest_bytes", + "digest_stream", + "get_algorithm", + "list_algorithms", + "register_algorithm", +] + +PRIMARY_ALGORITHM = "blake3" +"""Default primary algorithm for new content (ADR-0001).""" + +INTEROP_ALGORITHM = "sha256" +"""Always-computed secondary digest for OCI / SLSA / cosign interop.""" + +_NAME_RE = re.compile(r"^[a-z][a-z0-9_-]*$") +_HEX_RE = re.compile(r"^[0-9a-f]+$") + + +class HasherProto(Protocol): + """Subset of the hashlib hasher interface used by the registry.""" + + def update(self, data: bytes, /) -> None: ... + def hexdigest(self) -> str: ... + + +HasherFactory = Callable[[], HasherProto] + + +_registry: dict[str, HasherFactory] = {} + + +def register_algorithm(name: str, factory: HasherFactory) -> None: + """Register a streaming hasher factory under ``name``. + + Names must be lowercase and match ``[a-z][a-z0-9_-]*``. 
+ """ + if not _NAME_RE.match(name): + raise ValueError(f"algorithm name must match /^[a-z][a-z0-9_-]*$/: {name!r}") + _registry[name] = factory + + +def get_algorithm(name: str) -> HasherFactory: + """Return the hasher factory registered under ``name``.""" + try: + return _registry[name] + except KeyError as exc: + raise KeyError(f"unknown digest algorithm: {name!r}") from exc + + +def list_algorithms() -> list[str]: + """Return registered algorithm names, sorted.""" + return sorted(_registry) + + +@dataclass(frozen=True, slots=True) +class Digest: + """An immutable, hashable digest value (algorithm + lowercase hex).""" + + algorithm: str + hex: str + + def __post_init__(self) -> None: + if not self.algorithm: + raise ValueError("algorithm must be non-empty") + if not _NAME_RE.match(self.algorithm): + raise ValueError(f"algorithm must match /^[a-z][a-z0-9_-]*$/: {self.algorithm!r}") + if not self.hex: + raise ValueError("hex must be non-empty") + if not _HEX_RE.match(self.hex): + raise ValueError(f"hex must be lowercase hexadecimal: {self.hex!r}") + if len(self.hex) % 2 != 0: + raise ValueError(f"hex length must be even: {self.hex!r}") + + @property + def content_address(self) -> ContentAddress: + """The canonical string form of this digest.""" + return ContentAddress(f"{self.algorithm}:{self.hex}") + + def __str__(self) -> str: + return f"{self.algorithm}:{self.hex}" + + +@dataclass(frozen=True, slots=True) +class ContentAddress: + """Canonical string form of a digest: ``:`` (ADR-0001). + + Storage backends address bytes by content address rather than by logical + path; the registry retains :class:`Digest` for in-memory operations. + """ + + value: str + + def __post_init__(self) -> None: + # to_digest validates the structure (algorithm + lowercase hex). 
+ self.to_digest() + + @classmethod + def from_digest(cls, digest: Digest) -> ContentAddress: + """Build a :class:`ContentAddress` from a :class:`Digest`.""" + return cls(f"{digest.algorithm}:{digest.hex}") + + def to_digest(self) -> Digest: + """Parse this content address into a :class:`Digest`.""" + if ":" not in self.value: + raise ValueError(f"content address missing algorithm separator: {self.value!r}") + algorithm, _, hex_ = self.value.partition(":") + return Digest(algorithm=algorithm, hex=hex_) + + def __str__(self) -> str: + return self.value + + +@dataclass(frozen=True, slots=True) +class DigestPair: + """Dual-digest result of a single hashing pass. + + ``primary`` is the algorithm configured for new content (default BLAKE3); + ``sha256`` is the always-computed interop digest. + """ + + primary: Digest + sha256: Digest + + +def _compute_pair_sync(chunks: Iterator[bytes], *, primary: str) -> DigestPair: + primary_h = get_algorithm(primary)() + sha_h = get_algorithm(INTEROP_ALGORITHM)() + for chunk in chunks: + primary_h.update(chunk) + sha_h.update(chunk) + return DigestPair( + primary=Digest(algorithm=primary, hex=primary_h.hexdigest()), + sha256=Digest(algorithm=INTEROP_ALGORITHM, hex=sha_h.hexdigest()), + ) + + +def digest_bytes(data: bytes, *, primary: str = PRIMARY_ALGORITHM) -> DigestPair: + """Compute the dual digest pair over an in-memory byte buffer.""" + return _compute_pair_sync(iter([data]), primary=primary) + + +async def digest_stream( + stream: AsyncIterator[bytes], + *, + primary: str = PRIMARY_ALGORITHM, +) -> DigestPair: + """Compute the dual digest pair over an async byte stream. + + The stream is consumed once; the primary digest (default BLAKE3) and the + SHA-256 interop digest are both updated with the same chunks. 
+ """ + primary_h = get_algorithm(primary)() + sha_h = get_algorithm(INTEROP_ALGORITHM)() + async for chunk in stream: + primary_h.update(chunk) + sha_h.update(chunk) + return DigestPair( + primary=Digest(algorithm=primary, hex=primary_h.hexdigest()), + sha256=Digest(algorithm=INTEROP_ALGORITHM, hex=sha_h.hexdigest()), + ) + + +def _sha256_factory() -> HasherProto: + return hashlib.sha256() + + +def _blake3_factory() -> HasherProto: + return cast(HasherProto, _blake3.blake3()) + + +register_algorithm(INTEROP_ALGORITHM, _sha256_factory) +register_algorithm(PRIMARY_ALGORITHM, _blake3_factory) diff --git a/tests/unit/test_identity.py b/tests/unit/test_identity.py new file mode 100644 index 0000000..7ade084 --- /dev/null +++ b/tests/unit/test_identity.py @@ -0,0 +1,125 @@ +"""Tests for :mod:`artifactstore.identity` (ARTIFACT-STORE-WP-0001-T009).""" + +from __future__ import annotations + +import hashlib +from collections.abc import AsyncIterator +from dataclasses import FrozenInstanceError + +import blake3 as _blake3 +import pytest +from hypothesis import given +from hypothesis import strategies as st + +from artifactstore.identity import ( + INTEROP_ALGORITHM, + PRIMARY_ALGORITHM, + ContentAddress, + Digest, + digest_bytes, + digest_stream, + get_algorithm, + list_algorithms, + register_algorithm, +) + + +def test_registry_has_blake3_and_sha256() -> None: + algos = list_algorithms() + assert PRIMARY_ALGORITHM in algos + assert INTEROP_ALGORITHM in algos + + +def test_get_algorithm_unknown_raises() -> None: + with pytest.raises(KeyError): + get_algorithm("does-not-exist") + + +def test_register_algorithm_rejects_bad_names() -> None: + with pytest.raises(ValueError): + register_algorithm("UPPER", hashlib.sha256) # uppercase + with pytest.raises(ValueError): + register_algorithm("1bad", hashlib.sha256) # leading digit + + +def test_digest_rejects_uppercase_hex() -> None: + with pytest.raises(ValueError): + Digest(algorithm="sha256", hex="DEADBEEF") + + +def 
test_digest_rejects_empty_fields() -> None: + with pytest.raises(ValueError): + Digest(algorithm="", hex="ab") + with pytest.raises(ValueError): + Digest(algorithm="sha256", hex="") + + +def test_digest_rejects_odd_length_hex() -> None: + with pytest.raises(ValueError): + Digest(algorithm="sha256", hex="abc") + + +def test_digest_is_hashable() -> None: + a = Digest(algorithm="sha256", hex="ab" * 32) + b = Digest(algorithm="sha256", hex="ab" * 32) + assert hash(a) == hash(b) + assert a == b + # usable as set / dict key + assert {a, b} == {a} + + +def test_digest_is_frozen() -> None: + d = Digest(algorithm="sha256", hex="ab" * 32) + with pytest.raises(FrozenInstanceError): + d.algorithm = "blake3" # type: ignore[misc] + + +def test_content_address_round_trips() -> None: + d = Digest(algorithm="sha256", hex="ab" * 32) + ca = d.content_address + assert str(ca) == "sha256:" + "ab" * 32 + assert ca.to_digest() == d + assert ContentAddress.from_digest(d) == ca + + +def test_content_address_rejects_malformed() -> None: + with pytest.raises(ValueError): + ContentAddress("not-a-digest") + with pytest.raises(ValueError): + ContentAddress("sha256:DEADBEEF") # uppercase hex + with pytest.raises(ValueError): + ContentAddress(":abcd") # empty algorithm + + +@given(st.binary(max_size=4096)) +def test_digest_bytes_matches_reference_libs(data: bytes) -> None: + pair = digest_bytes(data) + assert pair.primary.algorithm == PRIMARY_ALGORITHM + assert pair.sha256.algorithm == INTEROP_ALGORITHM + assert pair.sha256.hex == hashlib.sha256(data).hexdigest() + assert pair.primary.hex == _blake3.blake3(data).hexdigest() + + +async def _chunked(data: bytes, chunk: int) -> AsyncIterator[bytes]: + for i in range(0, len(data), chunk): + yield data[i : i + chunk] + + +async def test_digest_stream_matches_digest_bytes() -> None: + data = b"the quick brown fox jumps over the lazy dog" * 100 + pair = await digest_stream(_chunked(data, 17)) + assert pair == digest_bytes(data) + + +async def 
test_digest_stream_defaults_blake3_and_always_computes_sha256() -> None: + pair = await digest_stream(_chunked(b"hello", 2)) + assert pair.primary.algorithm == PRIMARY_ALGORITHM + assert pair.sha256.algorithm == INTEROP_ALGORITHM + assert pair.sha256.hex == hashlib.sha256(b"hello").hexdigest() + assert pair.primary.hex == _blake3.blake3(b"hello").hexdigest() + + +async def test_digest_stream_handles_empty_input() -> None: + pair = await digest_stream(_chunked(b"", 1)) + assert pair.sha256.hex == hashlib.sha256(b"").hexdigest() + assert pair.primary.hex == _blake3.blake3(b"").hexdigest() diff --git a/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md b/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md index 234f230..64affab 100644 --- a/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md +++ b/workplans/ARTIFACT-STORE-WP-0001-service-baseline.md @@ -95,7 +95,7 @@ Acceptance: ```task id: ARTIFACT-STORE-WP-0001-T009 -status: todo +status: done priority: high state_hub_task_id: "4dc465c5-5c14-412d-b8c0-aa84076e4560" ```