generated from coulomb/repo-seed
feat(incremental): I-2 digest + consistency-checker (WP-0011 T3)
A Merkle-style digest summarizes the derived tier (per-identity fingerprint + incident edges as order-independent leaves) so equal states have equal digests and the digest is stable under equivalent event orders. A ConsistencyChecker recomputes the authoritative fold from the current source, compares it over a sampled region, and on mismatch scoped-recomputes just the affected identities — self-healing missed-delta drift, corrupted internal state, and vanished pages. Makes derived = f(canonical) verified, not asserted. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,12 @@ from shard_wiki.incremental.minhash import (
|
|||||||
jaccard,
|
jaccard,
|
||||||
shingles,
|
shingles,
|
||||||
)
|
)
|
||||||
|
from shard_wiki.incremental.verification import (
|
||||||
|
ConsistencyChecker,
|
||||||
|
ConsistencyReport,
|
||||||
|
derived_digest,
|
||||||
|
region_digest,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"shingles",
|
"shingles",
|
||||||
@@ -31,4 +37,8 @@ __all__ = [
|
|||||||
"EquivalenceEdge",
|
"EquivalenceEdge",
|
||||||
"EquivalenceIndex",
|
"EquivalenceIndex",
|
||||||
"normalized_title",
|
"normalized_title",
|
||||||
|
"derived_digest",
|
||||||
|
"region_digest",
|
||||||
|
"ConsistencyReport",
|
||||||
|
"ConsistencyChecker",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -136,6 +136,15 @@ class EquivalenceIndex:
|
|||||||
|
|
||||||
# -- queries -------------------------------------------------------------
|
# -- queries -------------------------------------------------------------
|
||||||
|
|
||||||
|
def identities(self) -> frozenset[Identity]:
    """Return an immutable snapshot of every identity the index currently holds."""
    # The keys of the entry table are the identities; freezing them detaches the
    # result from later mutations of the index.
    return frozenset(self._entries.keys())
|
||||||
|
|
||||||
|
def fingerprint(self, identity: Identity) -> str | None:
    """Return the content fingerprint indexed for ``identity`` — a digest leaf.

    Yields ``None`` when the index has no entry for ``identity``.
    """
    record = self._entries.get(identity)
    if record is None:
        return None
    return record.fingerprint
|
||||||
|
|
||||||
def edges(self) -> frozenset[frozenset[Identity]]:
|
def edges(self) -> frozenset[frozenset[Identity]]:
|
||||||
"""All equivalence edges (content + curator) among currently present identities."""
|
"""All equivalence edges (content + curator) among currently present identities."""
|
||||||
present = self._entries.keys()
|
present = self._entries.keys()
|
||||||
|
|||||||
112
src/shard_wiki/incremental/verification.py
Normal file
112
src/shard_wiki/incremental/verification.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
"""I-2 verification — digest + background consistency-checker (SHARD-WP-0011 T3).
|
||||||
|
|
||||||
|
``derived = f(canonical)`` is made *verified*, not asserted. A **Merkle-style digest** summarizes
|
||||||
|
the derived tier (each identity's content fingerprint + its incident equivalence edges as a leaf,
|
||||||
|
order-independently combined into a root) so equal states share a digest; a mismatch proves drift.
|
||||||
|
A **consistency-checker** recomputes the authoritative fold from the current source, compares it to
|
||||||
|
the maintained index over a (sampled) region, and on mismatch performs a **scoped recompute** of
|
||||||
|
just the affected identities — self-healing drift from a missed delta or corrupted state.
|
||||||
|
|
||||||
|
The digest is a pure function of index state, so it is "maintained alongside deltas" for free and
|
||||||
|
is stable under equivalent event orders (leaves are sorted before combination).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from collections.abc import Callable, Iterable
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from shard_wiki.incremental.equivalence import EquivalenceIndex
|
||||||
|
from shard_wiki.model import Identity, Page
|
||||||
|
|
||||||
|
__all__ = ["region_digest", "derived_digest", "ConsistencyReport", "ConsistencyChecker"]
|
||||||
|
|
||||||
|
CuratorEdges = Iterable[tuple[Identity, Identity]]
|
||||||
|
|
||||||
|
|
||||||
|
def _peers_by_identity(index: EquivalenceIndex) -> dict[Identity, list[str]]:
    """Group the string forms of every identity's edge peers, reading ``index.edges()`` once."""
    peers: dict[Identity, list[str]] = {}
    for edge in index.edges():
        for member in edge:
            peers.setdefault(member, []).extend(str(other) for other in edge if other != member)
    return peers


def _leaf(
    index: EquivalenceIndex,
    identity: Identity,
    peers: dict[Identity, list[str]] | None = None,
) -> str:
    """A digest leaf for one identity: its fingerprint + its incident edges (as sorted peers).

    ``peers`` is an optional precomputed peer map (see ``_peers_by_identity``). Passing it lets
    a caller hashing many identities enumerate the index's edges once instead of once per
    identity. When omitted, the map is built from ``index`` on the spot.

    Returns the 32-character hex BLAKE2b digest of ``"<identity>|<fingerprint>|<peers>"``.
    """
    # A missing (or empty) fingerprint is hashed as the placeholder "∅".
    fingerprint = index.fingerprint(identity) or "∅"
    if peers is None:
        peers = _peers_by_identity(index)
    # Sorting makes the leaf independent of the order in which edges are enumerated.
    joined = ",".join(sorted(peers.get(identity, ())))
    payload = f"{identity}|{fingerprint}|{joined}"
    return hashlib.blake2b(payload.encode("utf-8"), digest_size=16).hexdigest()


def region_digest(index: EquivalenceIndex, identities: Iterable[Identity]) -> str:
    """A Merkle-style root over the given identities' leaves (order-independent).

    ``identities`` is treated as a set: repeated identities contribute a single leaf, so the
    digest depends only on *which* identities are in the region, not on how the caller
    enumerated them. Identities absent from ``index`` still contribute a leaf (with the
    placeholder fingerprint), so presence/absence differences change the digest.

    Returns the 32-character hex BLAKE2b digest of the sorted, concatenated leaves.
    """
    # Enumerate the edges once for the whole region rather than once per identity.
    peers = _peers_by_identity(index)
    leaves = sorted(_leaf(index, identity, peers) for identity in set(identities))
    root = hashlib.blake2b(digest_size=16)
    for leaf in leaves:
        # Leaves are fixed-width hex strings, so plain concatenation is unambiguous.
        root.update(leaf.encode("utf-8"))
    return root.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def derived_digest(index: EquivalenceIndex) -> str:
    """Digest the entire maintained derived tier: the region made of every indexed identity."""
    everything = index.identities()
    return region_digest(index, everything)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
|
||||||
|
class ConsistencyReport:
|
||||||
|
"""Outcome of a consistency check: what was examined, whether it drifted, and if it healed."""
|
||||||
|
|
||||||
|
checked: int
|
||||||
|
drifted: bool
|
||||||
|
repaired: bool
|
||||||
|
healthy: bool
|
||||||
|
|
||||||
|
|
||||||
|
class ConsistencyChecker:
    """Compares the maintained index against an authoritative rebuild and repairs drift in place.

    The checker holds the live index plus two zero-argument callables that yield the current
    source of truth: ``pages`` (the source pages) and ``curator_edges`` (pairs of identities).
    """

    def __init__(
        self,
        index: EquivalenceIndex,
        pages: Callable[[], Iterable[Page]],
        curator_edges: Callable[[], CuratorEdges] = lambda: (),
    ) -> None:
        """Remember the index to verify and the callables that produce the current source."""
        self._index = index
        self._pages = pages
        self._curator = curator_edges

    def _authoritative(self, pages: Iterable[Page] | None = None) -> EquivalenceIndex:
        """Build a fresh index from the source, using the maintained index's parameters.

        ``pages`` lets the caller supply an already-taken snapshot of the source so the rebuild
        is computed from exactly the pages the caller is working with; when omitted the source
        callable is read here.
        """
        snapshot = list(self._pages() if pages is None else pages)
        expected = EquivalenceIndex(
            num_bands=self._index.num_bands, threshold=self._index.threshold
        )
        expected.build(snapshot, list(self._curator()))
        return expected

    def check_and_repair(self, sample: Iterable[Identity] | None = None) -> ConsistencyReport:
        """Verify the (sampled) region against a from-scratch fold; scoped-recompute on mismatch.

        ``sample`` restricts the check to the given identities. With no sample the region is
        every identity found in the source or in the maintained index.

        Returns a ``ConsistencyReport`` whose ``healthy`` flag says whether the region matches
        the authoritative rebuild at the end of the call.
        """
        # Read the source exactly once: the repair map and the expected fold must describe the
        # same snapshot, otherwise a source change between two reads looks like failed healing.
        snapshot = list(self._pages())
        source = {page.identity: page for page in snapshot}
        expected = self._authoritative(snapshot)

        if sample is not None:
            region = set(sample)
        else:
            region = set(source) | set(self._index.identities())

        # ``expected`` is never mutated below, so its digest is computed a single time.
        expected_digest = region_digest(expected, region)
        if region_digest(self._index, region) == expected_digest:
            return ConsistencyReport(len(region), drifted=False, repaired=False, healthy=True)

        self._repair(region, source)
        healthy = region_digest(self._index, region) == expected_digest
        return ConsistencyReport(len(region), drifted=True, repaired=True, healthy=healthy)

    def _repair(self, region: set[Identity], source: dict[Identity, Page]) -> None:
        """Scoped recompute: reconcile each affected identity to the current source.

        An identity with a source page is updated (if already indexed) or added; an indexed
        identity with no source page is removed; anything else is left alone.
        """
        # Membership is judged against the index as it was before this repair started.
        present = self._index.identities()
        for identity in region:
            page = source.get(identity)
            if page is None:
                if identity in present:
                    self._index.remove(identity)
            elif identity in present:
                self._index.update(page)
            else:
                self._index.add(page)
|
||||||
89
tests/test_incremental_verification.py
Normal file
89
tests/test_incremental_verification.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""Tests for I-2 verification — digest + consistency-checker (SHARD-WP-0011 T3)."""
|
||||||
|
|
||||||
|
from shard_wiki.incremental import (
|
||||||
|
ConsistencyChecker,
|
||||||
|
EquivalenceIndex,
|
||||||
|
derived_digest,
|
||||||
|
)
|
||||||
|
from shard_wiki.model import Identity, Page
|
||||||
|
from shard_wiki.provenance import ProvenanceEnvelope
|
||||||
|
|
||||||
|
|
||||||
|
def _page(shard, key, body):
    """Build a ``Page`` identified by ``(shard, key)`` with ``body`` and a shard-sourced envelope."""
    identity = Identity(shard, key)
    envelope = ProvenanceEnvelope(source_shard=shard)
    return Page(identity=identity, body=body, envelope=envelope)
|
||||||
|
|
||||||
|
|
||||||
|
def test_digest_is_stable_under_equivalent_event_orders():
    """Adding the same pages in opposite orders must produce the same derived digest."""
    corpus = (
        _page("A", "Foo", "shared body text here"),
        _page("B", "Bar", "shared body text here"),
        _page("C", "Baz", "an entirely separate unrelated document"),
    )

    def _indexed(sequence):
        # Feed the pages one at a time so insertion order is what is under test.
        index = EquivalenceIndex()
        for page in sequence:
            index.add(page)
        return index

    forward = _indexed(corpus)
    reverse = _indexed(reversed(corpus))
    assert derived_digest(forward) == derived_digest(reverse)
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_index_reports_healthy():
    """An index built from the very pages it is checked against shows no drift."""
    pages = [_page("A", "Foo", "same body"), _page("B", "Bar", "same body")]
    idx = EquivalenceIndex()
    idx.build(pages)

    def pages_fn():
        return pages

    report = ConsistencyChecker(idx, pages_fn).check_and_repair()
    assert report.drifted is False
    assert report.healthy is True
    assert pages_fn()  # source unchanged
|
||||||
|
|
||||||
|
|
||||||
|
def test_missed_delta_drift_is_detected_and_repaired():
    """A source edit the index never heard about is detected and healed by the checker."""
    target = _page("A", "Foo", "converging target body")
    original_b = _page("B", "Bar", "initially unrelated separate text")
    current = [target, original_b]
    idx = EquivalenceIndex()
    idx.build(current)
    assert idx.groups() == ()  # not equivalent yet

    # Source changes B to match A, but the index is never told (a missed delta → drift).
    converged_b = _page("B", "Bar", "converging target body")
    current = [target, converged_b]

    checker = ConsistencyChecker(idx, lambda: current)
    report = checker.check_and_repair()
    assert report.drifted is True
    assert report.repaired is True
    assert report.healthy is True

    # Self-healed: the index now reflects the equivalence.
    a_id = Identity("A", "Foo")
    b_id = Identity("B", "Bar")
    assert idx.equivalent_to(a_id) == frozenset({a_id, b_id})
|
||||||
|
|
||||||
|
|
||||||
|
def test_corrupted_internal_state_is_healed():
    """Content edges wiped from the index are restored by the checker's scoped recompute."""
    twins = [_page("A", "Foo", "identical content"), _page("B", "Bar", "identical content")]
    idx = EquivalenceIndex()
    idx.build(twins)

    # Corrupt the derived tier directly: delete a true edge (simulated index corruption).
    idx._content_edges.clear()
    assert idx.groups() == ()  # corrupted away

    report = ConsistencyChecker(idx, lambda: list(twins)).check_and_repair()
    assert report.drifted is True
    assert report.healthy is True
    assert idx.groups()  # edge restored by scoped recompute
|
||||||
|
|
||||||
|
|
||||||
|
def test_removed_source_page_is_reconciled():
    """A page that disappears from the source is dropped from the index by the checker."""
    kept = _page("A", "Foo", "same body")
    vanished = _page("B", "Bar", "same body")
    idx = EquivalenceIndex()
    idx.build([kept, vanished])

    # B vanished from source: only A is reported by the source callable.
    report = ConsistencyChecker(idx, lambda: [kept]).check_and_repair()

    assert report.healthy is True
    assert Identity("B", "Bar") not in idx.identities()
|
||||||
Reference in New Issue
Block a user