generated from coulomb/repo-seed
WP-0001-T009: digest abstraction and content address (ADR-0001)
src/artifactstore/identity/__init__.py:
- Digest: frozen, hashable dataclass (algorithm + lowercase hex), validated.
- ContentAddress: canonical `<algorithm>:<hex>` string form with validating parser (to_digest) and emitter (str / from_digest).
- DigestPair: dual-digest result (primary + sha256) from a single hashing pass.
- Algorithm registry: register_algorithm / get_algorithm / list_algorithms with name validation `[a-z][a-z0-9_-]*`.
- digest_bytes (sync) and digest_stream (async) — single-pass dual hashing.
- BLAKE3 registered as PRIMARY_ALGORITHM, SHA-256 as INTEROP_ALGORITHM at module import.

tests/unit/test_identity.py:
- Hypothesis property test asserts digest_bytes matches hashlib.sha256 and blake3.blake3 for random byte sequences up to 4 KiB.
- digest_stream invariants: equivalence with digest_bytes under chunked input; defaults to BLAKE3 primary; always computes SHA-256; handles empty input.
- Digest / ContentAddress invariants: rejects uppercase hex, empty fields, odd hex length, missing separator; frozen and hashable.

Gates: ruff clean, mypy --strict clean on 21 source files, 18 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,202 @@
|
||||
"""Content addresses and digest abstraction.
|
||||
"""Content addresses and digest abstraction (ADR-0001).
|
||||
|
||||
Real implementation lands in ARTIFACT-STORE-WP-0001-T009. See ADR-0001 for
|
||||
the dual-digest contract (BLAKE3 primary, SHA-256 retained for interop).
|
||||
Implements the dual-digest contract:
|
||||
|
||||
* :class:`Digest` — immutable algorithm + lowercase hex value object.
|
||||
* :class:`ContentAddress` — canonical string form ``<algorithm>:<hex>``.
|
||||
* :class:`DigestPair` — dual digest produced in a single hashing pass.
|
||||
* Algorithm registry with ``blake3`` (default primary) and ``sha256``
|
||||
(always-computed interop digest) registered at import.
|
||||
* :func:`digest_stream` — async single-pass dual hash.
|
||||
* :func:`digest_bytes` — synchronous convenience wrapper.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from collections.abc import AsyncIterator, Callable, Iterator
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol, cast
|
||||
|
||||
import blake3 as _blake3
|
||||
|
||||
__all__ = [
|
||||
"INTEROP_ALGORITHM",
|
||||
"PRIMARY_ALGORITHM",
|
||||
"ContentAddress",
|
||||
"Digest",
|
||||
"DigestPair",
|
||||
"HasherFactory",
|
||||
"HasherProto",
|
||||
"digest_bytes",
|
||||
"digest_stream",
|
||||
"get_algorithm",
|
||||
"list_algorithms",
|
||||
"register_algorithm",
|
||||
]
|
||||
|
||||
PRIMARY_ALGORITHM = "blake3"
"""Name of the algorithm used as the primary digest by default (ADR-0001)."""

INTEROP_ALGORITHM = "sha256"
"""Name of the secondary digest that is always computed (OCI / SLSA / cosign interop)."""

# Shape of a registrable algorithm name: a lowercase letter followed by
# lowercase letters, digits, underscores or hyphens.
_NAME_RE = re.compile(r"^[a-z][a-z0-9_-]*$")

# One or more lowercase hexadecimal characters.
_HEX_RE = re.compile(r"^[0-9a-f]+$")
|
||||
|
||||
|
||||
class HasherProto(Protocol):
    """Structural type for the streaming hashers held in the registry.

    Only the two hashlib-style methods this module calls are required.
    """

    def update(self, data: bytes, /) -> None:
        """Feed ``data`` into the running hash state."""

    def hexdigest(self) -> str:
        """Return the digest of the data fed so far as a hex string."""
|
||||
|
||||
|
||||
# A zero-argument callable that produces a fresh hasher on every call.
HasherFactory = Callable[[], HasherProto]

# Module-level table mapping algorithm name -> hasher factory.
_registry: dict[str, HasherFactory] = {}
|
||||
|
||||
|
||||
def register_algorithm(name: str, factory: HasherFactory) -> None:
    """Register a streaming hasher factory under ``name``.

    Names must be lowercase and match ``[a-z][a-z0-9_-]*`` in full. A factory
    already registered under the same name is replaced.

    Raises :class:`ValueError` if ``name`` does not match the pattern.
    """
    # fullmatch rather than match: the pattern's ``$`` also matches just
    # before a trailing newline, so match() would accept "sha256\n".
    if _NAME_RE.fullmatch(name) is None:
        raise ValueError(f"algorithm name must match /^[a-z][a-z0-9_-]*$/: {name!r}")
    _registry[name] = factory
|
||||
|
||||
|
||||
def get_algorithm(name: str) -> HasherFactory:
    """Look up the hasher factory registered under ``name``.

    Raises :class:`KeyError` (chained from the lookup failure) when no
    algorithm is registered under that name.
    """
    try:
        factory = _registry[name]
    except KeyError as missing:
        raise KeyError(f"unknown digest algorithm: {name!r}") from missing
    return factory
|
||||
|
||||
|
||||
def list_algorithms() -> list[str]:
    """Return the names of all registered algorithms in sorted order."""
    names = list(_registry)
    names.sort()
    return names
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class Digest:
    """An immutable, hashable digest value (algorithm + lowercase hex).

    Both fields are validated on construction. :class:`ValueError` is raised
    when ``algorithm`` is empty or does not fully match ``[a-z][a-z0-9_-]*``,
    or when ``hex`` is empty, is not entirely lowercase hexadecimal, or has an
    odd number of characters.
    """

    algorithm: str
    hex: str

    def __post_init__(self) -> None:
        """Validate ``algorithm`` and ``hex``; raise ``ValueError`` if invalid."""
        if not self.algorithm:
            raise ValueError("algorithm must be non-empty")
        # fullmatch rather than match: the patterns end in ``$``, which also
        # matches just before a trailing newline, so match() would accept
        # values such as "sha256\n" or "abc\n".
        if _NAME_RE.fullmatch(self.algorithm) is None:
            raise ValueError(f"algorithm must match /^[a-z][a-z0-9_-]*$/: {self.algorithm!r}")
        if not self.hex:
            raise ValueError("hex must be non-empty")
        if _HEX_RE.fullmatch(self.hex) is None:
            raise ValueError(f"hex must be lowercase hexadecimal: {self.hex!r}")
        # Two hex characters per byte, so a whole number of bytes is required.
        if len(self.hex) % 2 != 0:
            raise ValueError(f"hex length must be even: {self.hex!r}")

    @property
    def content_address(self) -> ContentAddress:
        """The canonical ``<algorithm>:<hex>`` form of this digest."""
        return ContentAddress(f"{self.algorithm}:{self.hex}")

    def __str__(self) -> str:
        """Return ``<algorithm>:<hex>``."""
        return f"{self.algorithm}:{self.hex}"
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class ContentAddress:
    """A digest in its canonical string form ``<algorithm>:<hex>`` (ADR-0001).

    Storage backends address bytes by content address rather than by logical
    path; the registry retains :class:`Digest` for in-memory operations.
    """

    value: str

    def __post_init__(self) -> None:
        # Parsing is the validation: to_digest raises ValueError on a missing
        # separator, and Digest rejects a bad algorithm or hex part.
        self.to_digest()

    @classmethod
    def from_digest(cls, digest: Digest) -> ContentAddress:
        """Create the :class:`ContentAddress` that corresponds to ``digest``."""
        text = f"{digest.algorithm}:{digest.hex}"
        return cls(text)

    def to_digest(self) -> Digest:
        """Split this address at the first ``:`` and return it as a :class:`Digest`.

        Raises :class:`ValueError` when the separator is missing or when either
        part fails :class:`Digest` validation.
        """
        algorithm, separator, hex_ = self.value.partition(":")
        if not separator:
            raise ValueError(f"content address missing algorithm separator: {self.value!r}")
        return Digest(algorithm=algorithm, hex=hex_)

    def __str__(self) -> str:
        return self.value
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class DigestPair:
|
||||
"""Dual-digest result of a single hashing pass.
|
||||
|
||||
``primary`` is the algorithm configured for new content (default BLAKE3);
|
||||
``sha256`` is the always-computed interop digest.
|
||||
"""
|
||||
|
||||
primary: Digest
|
||||
sha256: Digest
|
||||
|
||||
|
||||
def _compute_pair_sync(chunks: Iterator[bytes], *, primary: str) -> DigestPair:
    """Hash every chunk with both the ``primary`` and the interop algorithm.

    ``chunks`` is iterated once; each chunk is fed to both hashers.
    """
    primary_hasher = get_algorithm(primary)()
    interop_hasher = get_algorithm(INTEROP_ALGORITHM)()
    hashers = (primary_hasher, interop_hasher)
    for block in chunks:
        for hasher in hashers:
            hasher.update(block)
    primary_digest = Digest(algorithm=primary, hex=primary_hasher.hexdigest())
    interop_digest = Digest(algorithm=INTEROP_ALGORITHM, hex=interop_hasher.hexdigest())
    return DigestPair(primary=primary_digest, sha256=interop_digest)
|
||||
|
||||
|
||||
def digest_bytes(data: bytes, *, primary: str = PRIMARY_ALGORITHM) -> DigestPair:
    """Return the dual digest pair for a byte buffer already held in memory."""
    # The whole buffer is handed over as a single chunk.
    single_chunk = iter((data,))
    return _compute_pair_sync(single_chunk, primary=primary)
|
||||
|
||||
|
||||
async def digest_stream(
    stream: AsyncIterator[bytes],
    *,
    primary: str = PRIMARY_ALGORITHM,
) -> DigestPair:
    """Return the dual digest pair for the bytes yielded by an async stream.

    The stream is iterated once. Every chunk is fed to both the primary hasher
    (default BLAKE3) and the SHA-256 interop hasher.
    """
    primary_hasher = get_algorithm(primary)()
    interop_hasher = get_algorithm(INTEROP_ALGORITHM)()
    hashers = (primary_hasher, interop_hasher)
    async for block in stream:
        for hasher in hashers:
            hasher.update(block)
    primary_digest = Digest(algorithm=primary, hex=primary_hasher.hexdigest())
    interop_digest = Digest(algorithm=INTEROP_ALGORITHM, hex=interop_hasher.hexdigest())
    return DigestPair(primary=primary_digest, sha256=interop_digest)
|
||||
|
||||
|
||||
def _sha256_factory() -> HasherProto:
    """Create a fresh SHA-256 hasher from :mod:`hashlib`."""
    hasher = hashlib.sha256()
    return hasher
|
||||
|
||||
|
||||
def _blake3_factory() -> HasherProto:
    """Create a fresh BLAKE3 hasher, typed as a :class:`HasherProto`."""
    hasher = _blake3.blake3()
    # cast only informs the type checker; the object is returned unchanged.
    return cast(HasherProto, hasher)
|
||||
|
||||
|
||||
# Built-in algorithms, registered as a side effect of importing this module.
register_algorithm(INTEROP_ALGORITHM, _sha256_factory)
register_algorithm(PRIMARY_ALGORITHM, _blake3_factory)
|
||||
|
||||
125
tests/unit/test_identity.py
Normal file
125
tests/unit/test_identity.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""Tests for :mod:`artifactstore.identity` (ARTIFACT-STORE-WP-0001-T009)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import FrozenInstanceError
|
||||
|
||||
import blake3 as _blake3
|
||||
import pytest
|
||||
from hypothesis import given
|
||||
from hypothesis import strategies as st
|
||||
|
||||
from artifactstore.identity import (
|
||||
INTEROP_ALGORITHM,
|
||||
PRIMARY_ALGORITHM,
|
||||
ContentAddress,
|
||||
Digest,
|
||||
digest_bytes,
|
||||
digest_stream,
|
||||
get_algorithm,
|
||||
list_algorithms,
|
||||
register_algorithm,
|
||||
)
|
||||
|
||||
|
||||
def test_registry_has_blake3_and_sha256() -> None:
    """Both the primary and the interop algorithm are registered."""
    registered = list_algorithms()
    for required in (PRIMARY_ALGORITHM, INTEROP_ALGORITHM):
        assert required in registered
|
||||
|
||||
|
||||
def test_get_algorithm_unknown_raises() -> None:
    """Looking up an unregistered name raises ``KeyError``."""
    unregistered = "does-not-exist"
    with pytest.raises(KeyError):
        get_algorithm(unregistered)
|
||||
|
||||
|
||||
def test_register_algorithm_rejects_bad_names() -> None:
    """Names with uppercase letters or a leading digit are refused."""
    bad_names = (
        "UPPER",  # uppercase
        "1bad",  # leading digit
    )
    for bad_name in bad_names:
        with pytest.raises(ValueError):
            register_algorithm(bad_name, hashlib.sha256)
|
||||
|
||||
|
||||
def test_digest_rejects_uppercase_hex() -> None:
    """Uppercase hexadecimal is not a valid digest value."""
    uppercase_hex = "DEADBEEF"
    with pytest.raises(ValueError):
        Digest(algorithm="sha256", hex=uppercase_hex)
|
||||
|
||||
|
||||
def test_digest_rejects_empty_fields() -> None:
    """An empty algorithm or an empty hex value is refused."""
    cases = (
        {"algorithm": "", "hex": "ab"},
        {"algorithm": "sha256", "hex": ""},
    )
    for fields in cases:
        with pytest.raises(ValueError):
            Digest(**fields)
|
||||
|
||||
|
||||
def test_digest_rejects_odd_length_hex() -> None:
    """A hex value with an odd number of characters is refused."""
    odd_hex = "abc"
    with pytest.raises(ValueError):
        Digest(algorithm="sha256", hex=odd_hex)
|
||||
|
||||
|
||||
def test_digest_is_hashable() -> None:
    """Equal digests compare equal, hash equal and collapse in a set."""
    hex_value = "ab" * 32
    first = Digest(algorithm="sha256", hex=hex_value)
    second = Digest(algorithm="sha256", hex=hex_value)
    assert first == second
    assert hash(first) == hash(second)
    # Two equal digests occupy a single slot when used as set members.
    assert {first, second} == {first}
|
||||
|
||||
|
||||
def test_digest_is_frozen() -> None:
    """Assigning to a field of an existing digest raises."""
    digest = Digest(algorithm="sha256", hex="ab" * 32)
    with pytest.raises(FrozenInstanceError):
        digest.algorithm = "blake3"  # type: ignore[misc]
|
||||
|
||||
|
||||
def test_content_address_round_trips() -> None:
    """Digest -> ContentAddress -> Digest yields the starting value."""
    hex_value = "ab" * 32
    digest = Digest(algorithm="sha256", hex=hex_value)
    address = digest.content_address
    assert str(address) == "sha256:" + hex_value
    assert address.to_digest() == digest
    assert ContentAddress.from_digest(digest) == address
|
||||
|
||||
|
||||
def test_content_address_rejects_malformed() -> None:
    """Malformed address strings are refused at construction."""
    malformed = (
        "not-a-digest",
        "sha256:DEADBEEF",  # uppercase hex
        ":abcd",  # empty algorithm
    )
    for text in malformed:
        with pytest.raises(ValueError):
            ContentAddress(text)
|
||||
|
||||
|
||||
@given(st.binary(max_size=4096))
def test_digest_bytes_matches_reference_libs(data: bytes) -> None:
    """digest_bytes agrees with hashlib.sha256 and blake3.blake3 directly."""
    expected_sha256 = hashlib.sha256(data).hexdigest()
    expected_blake3 = _blake3.blake3(data).hexdigest()
    pair = digest_bytes(data)
    assert pair.primary.algorithm == PRIMARY_ALGORITHM
    assert pair.sha256.algorithm == INTEROP_ALGORITHM
    assert pair.sha256.hex == expected_sha256
    assert pair.primary.hex == expected_blake3
|
||||
|
||||
|
||||
async def _chunked(data: bytes, chunk: int) -> AsyncIterator[bytes]:
    """Yield ``data`` as consecutive slices of at most ``chunk`` bytes."""
    starts = range(0, len(data), chunk)
    for start in starts:
        stop = start + chunk
        yield data[start:stop]
|
||||
|
||||
|
||||
async def test_digest_stream_matches_digest_bytes() -> None:
    """Streaming in 17-byte chunks gives the same pair as hashing the buffer."""
    payload = b"the quick brown fox jumps over the lazy dog" * 100
    streamed = await digest_stream(_chunked(payload, 17))
    assert streamed == digest_bytes(payload)
|
||||
|
||||
|
||||
async def test_digest_stream_defaults_blake3_and_always_computes_sha256() -> None:
    """With no ``primary`` given, the pair holds BLAKE3 plus SHA-256."""
    payload = b"hello"
    pair = await digest_stream(_chunked(payload, 2))
    assert pair.primary.algorithm == PRIMARY_ALGORITHM
    assert pair.sha256.algorithm == INTEROP_ALGORITHM
    assert pair.sha256.hex == hashlib.sha256(payload).hexdigest()
    assert pair.primary.hex == _blake3.blake3(payload).hexdigest()
|
||||
|
||||
|
||||
async def test_digest_stream_handles_empty_input() -> None:
    """A stream that yields nothing produces the digests of empty input."""
    empty = b""
    pair = await digest_stream(_chunked(empty, 1))
    assert pair.sha256.hex == hashlib.sha256(empty).hexdigest()
    assert pair.primary.hex == _blake3.blake3(empty).hexdigest()
|
||||
@@ -95,7 +95,7 @@ Acceptance:
|
||||
|
||||
```task
|
||||
id: ARTIFACT-STORE-WP-0001-T009
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "4dc465c5-5c14-412d-b8c0-aa84076e4560"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user