Files
shard-wiki/src/shard_wiki/incremental/equivalence.py
tegwick a8e65235a8 feat(incremental): I-2 digest + consistency-checker (WP-0011 T3)
A Merkle-style digest summarizes the derived tier (per-identity fingerprint +
incident edges as order-independent leaves) so equal states have equal digests
and the digest is stable under equivalent event orders. A ConsistencyChecker
recomputes the authoritative fold from the current source, compares it over a
sampled region, and on mismatch scoped-recomputes just the affected identities —
self-healing missed-delta drift, corrupted internal state, and vanished pages.
Makes derived = f(canonical) verified, not asserted.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 02:16:50 +02:00

222 lines
8.4 KiB
Python

"""Indexed equivalence — blocking + verify, incrementally maintained (SHARD-WP-0011 T1/T2).
Equivalence (two *distinct* identities holding the same page) is detected without pairwise O(N²):
1. **Blocking** generates candidate pairs — pages sharing a normalized-title bucket or an LSH band
(MinHash over content shingles).
2. **Verify** confirms a candidate — exact-body fingerprint match, or shingle Jaccard ≥ threshold —
plus **curator bindings** (explicit decision-log edges) which are always equivalence edges.
The index is **incrementally maintained** (T2): ``add`` / ``update`` / ``remove`` re-bucket the
changed page, **retract** the edges it leaves and **add** the edges it enters; equivalence groups
are the connected components of the current edge set, so a retraction that disconnects a component
**splits** a chorus automatically. A full :meth:`build` is just repeated ``add`` — the bounded
rebuild fallback. The invariant (and the test oracle): incremental state == a from-scratch rebuild.
"""
from __future__ import annotations
import hashlib
import re
from collections.abc import Iterable
from dataclasses import dataclass
from shard_wiki.incremental.minhash import MinHasher, band_keys, jaccard, shingles
from shard_wiki.model import Identity, Page
# Public API: the names exported by ``from <this module> import *``.
__all__ = ["EquivalenceEdge", "EquivalenceIndex", "normalized_title"]
# Matches every run of characters other than ASCII lowercase letters and digits.
_NONALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalized_title(key: str) -> str:
    """Derive the title-blocking bucket key for an identity key.

    Only the text after the final ``/`` is used (the whole key when there is no
    ``/``). It is lowercased, then every character that is not an ASCII letter
    or digit is dropped.
    """
    _, _, tail = key.rpartition("/")
    lowered = tail.lower()
    return _NONALNUM_RE.sub("", lowered)
@dataclass(frozen=True, slots=True)
class EquivalenceEdge:
"""A verified equivalence between two identities, tagged with why it was accepted."""
a: Identity
b: Identity
reason: str # "fingerprint" | "content" | "curator"
@dataclass(frozen=True, slots=True)
class _Entry:
shingle_set: frozenset[str]
bands: tuple[tuple[int, tuple[int, ...]], ...]
title: str
fingerprint: str
def _fingerprint(body: str) -> str:
return hashlib.blake2b(body.strip().encode("utf-8"), digest_size=16).hexdigest()
def _pair(a: Identity, b: Identity) -> frozenset[Identity]:
return frozenset((a, b))
class EquivalenceIndex:
    """An incrementally maintained, blocked-and-verified equivalence relation over union pages.

    Pages are bucketed by normalized title and by band key; pages sharing a bucket are
    candidates, and a candidate pair becomes a *content edge* once :meth:`_verify` accepts it.
    Curator bindings are stored separately and are reported as edges only while both of their
    endpoints are indexed. Equivalence groups are the connected components of the combined
    edge set.
    """

    def __init__(
        self,
        *,
        num_perm: int = 64,
        num_bands: int = 32,
        threshold: float = 0.7,
        hasher: MinHasher | None = None,
    ) -> None:
        """Create an empty index.

        Args:
            num_perm: Passed to the default ``MinHasher``; unused when ``hasher`` is given.
            num_bands: Band count handed to ``band_keys`` for every signature.
            threshold: Minimum shingle Jaccard similarity accepted as a "content" match.
            hasher: Hasher to use instead of building a default one.
        """
        self.threshold = threshold
        self.num_bands = num_bands
        # Compare against None explicitly so a supplied hasher is never replaced
        # just because it happens to be falsy.
        self._hasher = hasher if hasher is not None else MinHasher(num_perm=num_perm)
        self._reset()

    def _reset(self) -> None:
        """Drop every indexed page and edge, keeping the configuration and hasher."""
        self._entries: dict[Identity, _Entry] = {}
        self._band_buckets: dict[tuple[int, tuple[int, ...]], set[Identity]] = {}
        self._title_buckets: dict[str, set[Identity]] = {}
        self._content_edges: dict[frozenset[Identity], str] = {}
        self._curator_edges: set[frozenset[Identity]] = set()

    # -- build / maintain ----------------------------------------------------
    def build(
        self,
        pages: Iterable[Page],
        curator_edges: Iterable[tuple[Identity, Identity]] = (),
    ) -> None:
        """Rebuild from scratch (the bounded fallback): add every page, then curator edges.

        All previous pages, content edges and curator bindings are discarded first;
        ``threshold``, ``num_bands`` and the hasher are kept.
        """
        # Reset the containers directly rather than re-running ``__init__`` so a
        # subclass with a different constructor signature cannot break a rebuild.
        self._reset()
        for page in pages:
            self.add(page)
        for a, b in curator_edges:
            self.bind(a, b)

    def add(self, page: Page) -> None:
        """Index ``page`` and add its equivalence edges.

        If the page's identity is already indexed, the previous entry is retracted
        first, so the old bucket memberships and content edges do not linger.
        """
        identity = page.identity
        # Build the new entry before touching state: if this raises, the index is unchanged.
        entry = self._make_entry(page)
        if identity in self._entries:
            self.remove(identity)
        self._entries[identity] = entry
        for key in entry.bands:
            self._band_buckets.setdefault(key, set()).add(identity)
        self._title_buckets.setdefault(entry.title, set()).add(identity)
        for candidate in self._candidates(identity, entry):
            reason = self._verify(identity, candidate)
            if reason is not None:
                self._content_edges[_pair(identity, candidate)] = reason

    def remove(self, identity: Identity) -> None:
        """Drop a page: de-bucket it and retract every content edge incident to it.

        Unknown identities are ignored. Curator bindings are left in place; they are
        simply not reported by :meth:`edges` while an endpoint is absent.
        """
        entry = self._entries.pop(identity, None)
        if entry is None:
            return
        for key in entry.bands:
            self._discard_bucket(self._band_buckets, key, identity)
        self._discard_bucket(self._title_buckets, entry.title, identity)
        # Collect first: the dict must not change size while it is being iterated.
        stale = [edge for edge in self._content_edges if identity in edge]
        for edge in stale:
            del self._content_edges[edge]

    def update(self, page: Page) -> None:
        """Apply a change as retract-then-add: stale (bucket-exit) edges go, new edges arrive."""
        self.remove(page.identity)
        self.add(page)

    def bind(self, a: Identity, b: Identity) -> None:
        """Record a curator equivalence (an explicit decision-log binding); always an edge.

        Binding an identity to itself is a no-op.
        """
        if a != b:
            self._curator_edges.add(_pair(a, b))

    def unbind(self, a: Identity, b: Identity) -> None:
        """Forget the curator binding between ``a`` and ``b``, if one was recorded."""
        self._curator_edges.discard(_pair(a, b))

    # -- queries -------------------------------------------------------------
    def identities(self) -> frozenset[Identity]:
        """All identities currently present in the index."""
        return frozenset(self._entries)

    def fingerprint(self, identity: Identity) -> str | None:
        """The content fingerprint indexed for ``identity`` (None if absent) — a digest leaf."""
        entry = self._entries.get(identity)
        return entry.fingerprint if entry is not None else None

    def edges(self) -> frozenset[frozenset[Identity]]:
        """All equivalence edges (content + curator) among currently present identities."""
        present = self._entries.keys()
        # A curator binding only counts while both of its endpoints are indexed.
        curator = {edge for edge in self._curator_edges if edge <= present}
        return frozenset(set(self._content_edges) | curator)

    def groups(self) -> tuple[frozenset[Identity], ...]:
        """Equivalence groups: connected components of size ≥ 2 (union-find over the edges).

        The order of the returned groups is unspecified.
        """
        parent: dict[Identity, Identity] = {}

        def find(x: Identity) -> Identity:
            parent.setdefault(x, x)
            root = x
            while parent[root] != root:
                root = parent[root]
            # Path compression: point every node on the walked path at the root.
            while parent[x] != root:
                up = parent[x]
                parent[x] = root
                x = up
            return root

        for edge in self.edges():
            a, b = tuple(edge)
            ra, rb = find(a), find(b)
            if ra != rb:
                parent[ra] = rb

        comps: dict[Identity, set[Identity]] = {}
        for node in parent:
            comps.setdefault(find(node), set()).add(node)
        return tuple(
            frozenset(members) for members in comps.values() if len(members) > 1
        )

    def equivalent_to(self, identity: Identity) -> frozenset[Identity]:
        """The equivalence group containing ``identity`` (including itself), else just itself."""
        for group in self.groups():
            if identity in group:
                return group
        return frozenset({identity})

    # -- internals -----------------------------------------------------------
    def _make_entry(self, page: Page) -> _Entry:
        """Compute the shingles, band keys, title bucket and fingerprint for ``page``."""
        shingle_set = shingles(page.body)
        signature = self._hasher.signature(shingle_set)
        return _Entry(
            shingle_set=shingle_set,
            bands=band_keys(signature, self.num_bands),
            title=normalized_title(page.identity.key),
            fingerprint=_fingerprint(page.body),
        )

    def _candidates(self, identity: Identity, entry: _Entry) -> set[Identity]:
        """Every other identity sharing a band bucket or the title bucket with ``entry``."""
        candidates: set[Identity] = set()
        for key in entry.bands:
            candidates.update(self._band_buckets.get(key, ()))
        candidates.update(self._title_buckets.get(entry.title, ()))
        candidates.discard(identity)
        return candidates

    def _verify(self, a: Identity, b: Identity) -> str | None:
        """Return why ``a`` and ``b`` are equivalent, or None when they are not.

        Equal fingerprints give "fingerprint"; otherwise a shingle Jaccard similarity
        of at least ``threshold`` gives "content". Both identities must be indexed.
        """
        ea, eb = self._entries[a], self._entries[b]
        if ea.fingerprint == eb.fingerprint:
            return "fingerprint"
        if jaccard(ea.shingle_set, eb.shingle_set) >= self.threshold:
            return "content"
        return None

    @staticmethod
    def _discard_bucket(buckets: dict, key: object, identity: Identity) -> None:
        """Remove ``identity`` from ``buckets[key]`` and delete the bucket once it is empty."""
        bucket = buckets.get(key)
        if bucket is not None:
            bucket.discard(identity)
            if not bucket:
                del buckets[key]