generated from coulomb/repo-seed
feat(incremental): I-2 digest + consistency-checker (WP-0011 T3)
A Merkle-style digest summarizes the derived tier (per-identity fingerprint + incident edges as order-independent leaves) so equal states have equal digests and the digest is stable under equivalent event orders. A ConsistencyChecker recomputes the authoritative fold from the current source, compares it over a sampled region, and on mismatch scoped-recomputes just the affected identities — self-healing missed-delta drift, corrupted internal state, and vanished pages. Makes derived = f(canonical) verified, not asserted. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,12 @@ from shard_wiki.incremental.minhash import (
|
||||
jaccard,
|
||||
shingles,
|
||||
)
|
||||
from shard_wiki.incremental.verification import (
|
||||
ConsistencyChecker,
|
||||
ConsistencyReport,
|
||||
derived_digest,
|
||||
region_digest,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"shingles",
|
||||
@@ -31,4 +37,8 @@ __all__ = [
|
||||
"EquivalenceEdge",
|
||||
"EquivalenceIndex",
|
||||
"normalized_title",
|
||||
"derived_digest",
|
||||
"region_digest",
|
||||
"ConsistencyReport",
|
||||
"ConsistencyChecker",
|
||||
]
|
||||
|
||||
@@ -136,6 +136,15 @@ class EquivalenceIndex:
|
||||
|
||||
# -- queries -------------------------------------------------------------
|
||||
|
||||
def identities(self) -> frozenset[Identity]:
|
||||
"""All identities currently present in the index."""
|
||||
return frozenset(self._entries)
|
||||
|
||||
def fingerprint(self, identity: Identity) -> str | None:
|
||||
"""The content fingerprint indexed for ``identity`` (None if absent) — a digest leaf."""
|
||||
entry = self._entries.get(identity)
|
||||
return entry.fingerprint if entry is not None else None
|
||||
|
||||
def edges(self) -> frozenset[frozenset[Identity]]:
|
||||
"""All equivalence edges (content + curator) among currently present identities."""
|
||||
present = self._entries.keys()
|
||||
|
||||
112
src/shard_wiki/incremental/verification.py
Normal file
112
src/shard_wiki/incremental/verification.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""I-2 verification — digest + background consistency-checker (SHARD-WP-0011 T3).
|
||||
|
||||
``derived = f(canonical)`` is made *verified*, not asserted. A **Merkle-style digest** summarizes
|
||||
the derived tier (each identity's content fingerprint + its incident equivalence edges as a leaf,
|
||||
order-independently combined into a root) so two derived states are equal iff their digests match.
|
||||
A **consistency-checker** recomputes the authoritative fold from the current source, compares it to
|
||||
the maintained index over a (sampled) region, and on mismatch performs a **scoped recompute** of
|
||||
just the affected identities — self-healing drift from a missed delta or corrupted state.
|
||||
|
||||
The digest is a pure function of index state, so it is "maintained alongside deltas" for free and
|
||||
is stable under equivalent event orders (leaves are sorted before combination).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from collections.abc import Callable, Iterable
|
||||
from dataclasses import dataclass
|
||||
|
||||
from shard_wiki.incremental.equivalence import EquivalenceIndex
|
||||
from shard_wiki.model import Identity, Page
|
||||
|
||||
__all__ = ["region_digest", "derived_digest", "ConsistencyReport", "ConsistencyChecker"]
|
||||
|
||||
CuratorEdges = Iterable[tuple[Identity, Identity]]
|
||||
|
||||
|
||||
def _leaf(index: EquivalenceIndex, identity: Identity) -> str:
|
||||
"""A digest leaf for one identity: its fingerprint + its incident edges (as sorted peers)."""
|
||||
fingerprint = index.fingerprint(identity) or "∅"
|
||||
peers = sorted(
|
||||
str(other)
|
||||
for edge in index.edges()
|
||||
if identity in edge
|
||||
for other in edge
|
||||
if other != identity
|
||||
)
|
||||
payload = f"{identity}|{fingerprint}|{','.join(peers)}"
|
||||
return hashlib.blake2b(payload.encode("utf-8"), digest_size=16).hexdigest()
|
||||
|
||||
|
||||
def region_digest(index: EquivalenceIndex, identities: Iterable[Identity]) -> str:
|
||||
"""A Merkle-style root over the given identities' leaves (order-independent)."""
|
||||
leaves = sorted(_leaf(index, identity) for identity in identities)
|
||||
root = hashlib.blake2b(digest_size=16)
|
||||
for leaf in leaves:
|
||||
root.update(leaf.encode("utf-8"))
|
||||
return root.hexdigest()
|
||||
|
||||
|
||||
def derived_digest(index: EquivalenceIndex) -> str:
|
||||
"""The digest of the whole maintained derived tier."""
|
||||
return region_digest(index, index.identities())
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ConsistencyReport:
|
||||
"""Outcome of a consistency check: what was examined, whether it drifted, and if it healed."""
|
||||
|
||||
checked: int
|
||||
drifted: bool
|
||||
repaired: bool
|
||||
healthy: bool
|
||||
|
||||
|
||||
class ConsistencyChecker:
|
||||
"""Compares the maintained index against an authoritative rebuild and repairs drift in place."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
index: EquivalenceIndex,
|
||||
pages: Callable[[], Iterable[Page]],
|
||||
curator_edges: Callable[[], CuratorEdges] = lambda: (),
|
||||
) -> None:
|
||||
self._index = index
|
||||
self._pages = pages
|
||||
self._curator = curator_edges
|
||||
|
||||
def _authoritative(self) -> EquivalenceIndex:
|
||||
expected = EquivalenceIndex(
|
||||
num_bands=self._index.num_bands, threshold=self._index.threshold
|
||||
)
|
||||
expected.build(list(self._pages()), list(self._curator()))
|
||||
return expected
|
||||
|
||||
def check_and_repair(self, sample: Iterable[Identity] | None = None) -> ConsistencyReport:
|
||||
"""Verify the (sampled) region against a from-scratch fold; scoped-recompute on mismatch."""
|
||||
source = {p.identity: p for p in self._pages()}
|
||||
expected = self._authoritative()
|
||||
region = (
|
||||
set(sample)
|
||||
if sample is not None
|
||||
else set(source) | set(self._index.identities())
|
||||
)
|
||||
|
||||
drifted = region_digest(self._index, region) != region_digest(expected, region)
|
||||
if not drifted:
|
||||
return ConsistencyReport(len(region), drifted=False, repaired=False, healthy=True)
|
||||
|
||||
self._repair(region, source)
|
||||
healthy = region_digest(self._index, region) == region_digest(expected, region)
|
||||
return ConsistencyReport(len(region), drifted=True, repaired=True, healthy=healthy)
|
||||
|
||||
def _repair(self, region: set[Identity], source: dict[Identity, Page]) -> None:
|
||||
"""Scoped recompute: reconcile each affected identity to the current source."""
|
||||
present = self._index.identities()
|
||||
for identity in region:
|
||||
page = source.get(identity)
|
||||
if page is not None:
|
||||
self._index.update(page) if identity in present else self._index.add(page)
|
||||
elif identity in present:
|
||||
self._index.remove(identity)
|
||||
89
tests/test_incremental_verification.py
Normal file
89
tests/test_incremental_verification.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""Tests for I-2 verification — digest + consistency-checker (SHARD-WP-0011 T3)."""
|
||||
|
||||
from shard_wiki.incremental import (
|
||||
ConsistencyChecker,
|
||||
EquivalenceIndex,
|
||||
derived_digest,
|
||||
)
|
||||
from shard_wiki.model import Identity, Page
|
||||
from shard_wiki.provenance import ProvenanceEnvelope
|
||||
|
||||
|
||||
def _page(shard, key, body):
|
||||
return Page(
|
||||
identity=Identity(shard, key),
|
||||
body=body,
|
||||
envelope=ProvenanceEnvelope(source_shard=shard),
|
||||
)
|
||||
|
||||
|
||||
def test_digest_is_stable_under_equivalent_event_orders():
|
||||
pages = [
|
||||
_page("A", "Foo", "shared body text here"),
|
||||
_page("B", "Bar", "shared body text here"),
|
||||
_page("C", "Baz", "an entirely separate unrelated document"),
|
||||
]
|
||||
forward = EquivalenceIndex()
|
||||
for p in pages:
|
||||
forward.add(p)
|
||||
reverse = EquivalenceIndex()
|
||||
for p in reversed(pages):
|
||||
reverse.add(p)
|
||||
assert derived_digest(forward) == derived_digest(reverse)
|
||||
|
||||
|
||||
def test_clean_index_reports_healthy():
|
||||
pages = [_page("A", "Foo", "same body"), _page("B", "Bar", "same body")]
|
||||
idx = EquivalenceIndex()
|
||||
idx.build(pages)
|
||||
checker = ConsistencyChecker(idx, pages_fn := (lambda: pages))
|
||||
report = checker.check_and_repair()
|
||||
assert report.drifted is False and report.healthy is True
|
||||
assert pages_fn() # source unchanged
|
||||
|
||||
|
||||
def test_missed_delta_drift_is_detected_and_repaired():
|
||||
a = _page("A", "Foo", "converging target body")
|
||||
b = _page("B", "Bar", "initially unrelated separate text")
|
||||
source = {"pages": [a, b]}
|
||||
idx = EquivalenceIndex()
|
||||
idx.build(source["pages"])
|
||||
assert idx.groups() == () # not equivalent yet
|
||||
|
||||
# Source changes B to match A, but the index is never told (a missed delta → drift).
|
||||
b2 = _page("B", "Bar", "converging target body")
|
||||
source["pages"] = [a, b2]
|
||||
|
||||
checker = ConsistencyChecker(idx, lambda: source["pages"])
|
||||
report = checker.check_and_repair()
|
||||
assert report.drifted is True and report.repaired is True and report.healthy is True
|
||||
# Self-healed: the index now reflects the equivalence.
|
||||
assert idx.equivalent_to(Identity("A", "Foo")) == frozenset(
|
||||
{Identity("A", "Foo"), Identity("B", "Bar")}
|
||||
)
|
||||
|
||||
|
||||
def test_corrupted_internal_state_is_healed():
|
||||
a = _page("A", "Foo", "identical content")
|
||||
b = _page("B", "Bar", "identical content")
|
||||
idx = EquivalenceIndex()
|
||||
idx.build([a, b])
|
||||
# Corrupt the derived tier directly: delete a true edge (simulated index corruption).
|
||||
idx._content_edges.clear()
|
||||
assert idx.groups() == () # corrupted away
|
||||
|
||||
checker = ConsistencyChecker(idx, lambda: [a, b])
|
||||
report = checker.check_and_repair()
|
||||
assert report.drifted is True and report.healthy is True
|
||||
assert idx.groups() # edge restored by scoped recompute
|
||||
|
||||
|
||||
def test_removed_source_page_is_reconciled():
|
||||
a = _page("A", "Foo", "same body")
|
||||
b = _page("B", "Bar", "same body")
|
||||
idx = EquivalenceIndex()
|
||||
idx.build([a, b])
|
||||
checker = ConsistencyChecker(idx, lambda: [a]) # B vanished from source
|
||||
report = checker.check_and_repair()
|
||||
assert report.healthy is True
|
||||
assert Identity("B", "Bar") not in idx.identities()
|
||||
Reference in New Issue
Block a user