WP-0001-T009: digest abstraction and content address (ADR-0001)

src/artifactstore/identity/__init__.py:
- Digest: frozen, hashable dataclass (algorithm + lowercase hex), validated.
- ContentAddress: canonical `<algorithm>:<hex>` string form with validating
  parser (to_digest) and emitter (str / from_digest).
- DigestPair: dual-digest result (primary + sha256) from a single hashing pass.
- Algorithm registry: register_algorithm / get_algorithm / list_algorithms
  with name validation `[a-z][a-z0-9_-]*`.
- digest_bytes (sync) and digest_stream (async) — single-pass dual hashing.
- BLAKE3 registered as PRIMARY_ALGORITHM, SHA-256 as INTEROP_ALGORITHM at
  module import.

tests/unit/test_identity.py:
- Hypothesis property test asserts digest_bytes matches hashlib.sha256 and
  blake3.blake3 for random byte sequences up to 4 KiB.
- digest_stream invariants: equivalence with digest_bytes under chunked input;
  defaults to BLAKE3 primary; always computes SHA-256; handles empty input.
- Digest / ContentAddress invariants: rejects uppercase hex, empty fields,
  odd hex length, missing separator; frozen and hashable.

Gates: ruff clean, mypy --strict clean on 21 source files, 18 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 01:34:24 +02:00
parent 6a136dd814
commit c1bfb8b486
3 changed files with 326 additions and 4 deletions

View File

@@ -1,5 +1,202 @@
"""Content addresses and digest abstraction.
"""Content addresses and digest abstraction (ADR-0001).
Real implementation lands in ARTIFACT-STORE-WP-0001-T009. See ADR-0001 for
the dual-digest contract (BLAKE3 primary, SHA-256 retained for interop).
Implements the dual-digest contract:
* :class:`Digest` — immutable algorithm + lowercase hex value object.
* :class:`ContentAddress` — canonical string form ``<algorithm>:<hex>``.
* :class:`DigestPair` — dual digest produced in a single hashing pass.
* Algorithm registry with ``blake3`` (default primary) and ``sha256``
(always-computed interop digest) registered at import.
* :func:`digest_stream` — async single-pass dual hash.
* :func:`digest_bytes` — synchronous convenience wrapper.
"""
from __future__ import annotations
import hashlib
import re
from collections.abc import AsyncIterator, Callable, Iterator
from dataclasses import dataclass
from typing import Protocol, cast
import blake3 as _blake3
# Public names of this module (the set exported by a star-import).
__all__ = [
    "INTEROP_ALGORITHM",
    "PRIMARY_ALGORITHM",
    "ContentAddress",
    "Digest",
    "DigestPair",
    "HasherFactory",
    "HasherProto",
    "digest_bytes",
    "digest_stream",
    "get_algorithm",
    "list_algorithms",
    "register_algorithm",
]
# Registry name used as the default ``primary`` argument of digest_bytes /
# digest_stream.
PRIMARY_ALGORITHM = "blake3"
"""Default primary algorithm for new content (ADR-0001)."""
# Registry name of the second digest that is computed next to the primary one
# on every hashing pass.
INTEROP_ALGORITHM = "sha256"
"""Always-computed secondary digest for OCI / SLSA / cosign interop."""
# Algorithm names: one lowercase letter, then lowercase letters, digits, "_"
# or "-".
# NOTE: when used with ``.match()``, the trailing "$" also matches just before
# a final newline; use ``.fullmatch()`` where a strict whole-string check is
# required.
_NAME_RE = re.compile(r"^[a-z][a-z0-9_-]*$")
# Digest values: one or more lowercase hexadecimal characters (same "$" caveat
# as above).
_HEX_RE = re.compile(r"^[0-9a-f]+$")
class HasherProto(Protocol):
    """Subset of the hashlib hasher interface used by the registry."""

    # Feed one more chunk of input into the running hash (positional-only).
    def update(self, data: bytes, /) -> None: ...

    # Return the digest of the data fed so far as a string; callers in this
    # module pass it straight into Digest, which requires lowercase hex.
    def hexdigest(self) -> str: ...
# A zero-argument callable that returns a hasher object.
HasherFactory = Callable[[], HasherProto]
# Module-level table mapping algorithm name -> hasher factory; written by
# register_algorithm() and read by get_algorithm() / list_algorithms().
_registry: dict[str, HasherFactory] = {}
def register_algorithm(name: str, factory: HasherFactory) -> None:
    """Register a streaming hasher factory under ``name``.

    Names must be lowercase and match ``[a-z][a-z0-9_-]*`` in full.
    Registering a name that is already present replaces the stored factory.

    Args:
        name: Registry key for the algorithm.
        factory: Zero-argument callable returning a hasher.

    Raises:
        ValueError: If ``name`` does not match the required pattern.
    """
    # fullmatch, not match: a "$" anchor also matches just before a trailing
    # newline, so match() would let a name like "sha256\n" through.
    if _NAME_RE.fullmatch(name) is None:
        raise ValueError(f"algorithm name must match /^[a-z][a-z0-9_-]*$/: {name!r}")
    _registry[name] = factory
def get_algorithm(name: str) -> HasherFactory:
    """Look up the hasher factory registered for ``name``.

    Raises:
        KeyError: If nothing is registered under ``name``; the original
            lookup error is chained as the cause.
    """
    try:
        factory = _registry[name]
    except KeyError as missing:
        raise KeyError(f"unknown digest algorithm: {name!r}") from missing
    return factory
def list_algorithms() -> list[str]:
    """Return the names of all registered algorithms as a new sorted list."""
    names = list(_registry)
    names.sort()
    return names
@dataclass(frozen=True, slots=True)
class Digest:
    """An immutable, hashable digest value (algorithm + lowercase hex).

    Both fields are validated on construction.

    Raises:
        ValueError: If ``algorithm`` is empty or does not match
            ``[a-z][a-z0-9_-]*``, or if ``hex`` is empty, contains anything
            other than lowercase hexadecimal characters, or has odd length.
    """

    # Name of the hash algorithm, e.g. "blake3" or "sha256".
    algorithm: str
    # Lowercase hexadecimal digest value with an even number of characters.
    hex: str

    def __post_init__(self) -> None:
        if not self.algorithm:
            raise ValueError("algorithm must be non-empty")
        # fullmatch, not match: the patterns end in "$", which also matches
        # just before a trailing newline, so match() would accept values such
        # as "sha256\n" or "abc\n" (the latter even has an even length).
        if _NAME_RE.fullmatch(self.algorithm) is None:
            raise ValueError(f"algorithm must match /^[a-z][a-z0-9_-]*$/: {self.algorithm!r}")
        if not self.hex:
            raise ValueError("hex must be non-empty")
        if _HEX_RE.fullmatch(self.hex) is None:
            raise ValueError(f"hex must be lowercase hexadecimal: {self.hex!r}")
        # Two hex characters encode one byte, so a whole-byte digest is even.
        if len(self.hex) % 2 != 0:
            raise ValueError(f"hex length must be even: {self.hex!r}")

    @property
    def content_address(self) -> ContentAddress:
        """The canonical string form of this digest."""
        return ContentAddress(f"{self.algorithm}:{self.hex}")

    def __str__(self) -> str:
        return f"{self.algorithm}:{self.hex}"
@dataclass(frozen=True, slots=True)
class ContentAddress:
    """Canonical ``<algorithm>:<hex>`` string form of a digest (ADR-0001).

    Storage backends address bytes by content address rather than by logical
    path; the registry retains :class:`Digest` for in-memory operations.
    The wrapped string is validated when the instance is created.
    """

    value: str

    def __post_init__(self) -> None:
        # Parsing doubles as validation; the resulting Digest is discarded.
        self.to_digest()

    @classmethod
    def from_digest(cls, digest: Digest) -> ContentAddress:
        """Render ``digest`` as a :class:`ContentAddress`."""
        rendered = f"{digest.algorithm}:{digest.hex}"
        return cls(rendered)

    def to_digest(self) -> Digest:
        """Split this address at the first ``:`` and build a :class:`Digest`.

        Raises:
            ValueError: If the separator is missing, or if :class:`Digest`
                rejects the algorithm or hex part.
        """
        if ":" in self.value:
            name, _sep, hex_part = self.value.partition(":")
            return Digest(algorithm=name, hex=hex_part)
        raise ValueError(f"content address missing algorithm separator: {self.value!r}")

    def __str__(self) -> str:
        return self.value
@dataclass(frozen=True, slots=True)
class DigestPair:
    """Dual-digest result of a single hashing pass.

    ``primary`` is the algorithm configured for new content (default BLAKE3);
    ``sha256`` is the always-computed interop digest.
    """

    # Digest produced by the algorithm passed as ``primary`` to the hashing
    # helpers in this module.
    primary: Digest
    # Digest produced by the INTEROP_ALGORITHM hasher over the same input.
    sha256: Digest
def _compute_pair_sync(chunks: Iterator[bytes], *, primary: str) -> DigestPair:
    """Hash every chunk once with both the primary and the interop hasher.

    Args:
        chunks: Byte chunks to hash; consumed exactly once.
        primary: Registry name of the primary algorithm.

    Raises:
        KeyError: If ``primary`` (or the interop algorithm) is not registered.
    """
    primary_hasher = get_algorithm(primary)()
    interop_hasher = get_algorithm(INTEROP_ALGORITHM)()
    # Each chunk goes to the primary hasher first, then to the interop one.
    sinks = (primary_hasher.update, interop_hasher.update)
    for block in chunks:
        for feed in sinks:
            feed(block)
    primary_digest = Digest(algorithm=primary, hex=primary_hasher.hexdigest())
    interop_digest = Digest(algorithm=INTEROP_ALGORITHM, hex=interop_hasher.hexdigest())
    return DigestPair(primary=primary_digest, sha256=interop_digest)
def digest_bytes(data: bytes, *, primary: str = PRIMARY_ALGORITHM) -> DigestPair:
    """Return the dual digest pair for a buffer that is already in memory.

    The buffer is handed to the synchronous hashing helper as a single chunk.
    """
    single_chunk = iter((data,))
    return _compute_pair_sync(single_chunk, primary=primary)
async def digest_stream(
    stream: AsyncIterator[bytes],
    *,
    primary: str = PRIMARY_ALGORITHM,
) -> DigestPair:
    """Return the dual digest pair for an asynchronous stream of byte chunks.

    The stream is iterated exactly once, and each chunk is fed to both the
    primary hasher (default BLAKE3) and the SHA-256 interop hasher.

    Raises:
        KeyError: If ``primary`` (or the interop algorithm) is not registered.
    """
    primary_hasher = get_algorithm(primary)()
    interop_hasher = get_algorithm(INTEROP_ALGORITHM)()
    # Each chunk goes to the primary hasher first, then to the interop one.
    sinks = (primary_hasher.update, interop_hasher.update)
    async for block in stream:
        for feed in sinks:
            feed(block)
    primary_digest = Digest(algorithm=primary, hex=primary_hasher.hexdigest())
    interop_digest = Digest(algorithm=INTEROP_ALGORITHM, hex=interop_hasher.hexdigest())
    return DigestPair(primary=primary_digest, sha256=interop_digest)
def _sha256_factory() -> HasherProto:
    """Create a new, empty SHA-256 hasher from :mod:`hashlib`."""
    hasher = hashlib.sha256()
    return hasher
def _blake3_factory() -> HasherProto:
    """Create a new BLAKE3 hasher, typed as :class:`HasherProto`."""
    hasher = _blake3.blake3()
    # cast() only informs the type checker; it returns its argument unchanged.
    return cast(HasherProto, hasher)
# Import-time side effect: make the two built-in algorithms available in the
# registry so digest_bytes / digest_stream work without further setup.
register_algorithm(INTEROP_ALGORITHM, _sha256_factory)
register_algorithm(PRIMARY_ALGORITHM, _blake3_factory)