Files
shard-wiki/tests/test_incremental_equivalence.py
tegwick 0b3ab2086f feat(incremental): indexed equivalence — blocking + verify (WP-0011 T1)
Detect equivalence (distinct identities holding the same page) without pairwise
O(N²): MinHash/LSH bands over content shingles + normalized-title buckets
generate candidates (blocking), then exact-fingerprint or Jaccard>=threshold
confirm them (verify), with curator decision-log bindings always forming edges.
Groups are the connected components of the edge set. Includes the incremental
add/update/remove internals used by T2. Matches a brute-force oracle. New
incremental/ package (minhash primitives + EquivalenceIndex).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 02:13:06 +02:00

90 lines
3.4 KiB
Python

"""Tests for the indexed equivalence relation — blocking + verify (SHARD-WP-0011 T1)."""
from itertools import combinations
from shard_wiki.incremental import EquivalenceIndex, MinHasher, band_keys, jaccard, shingles
from shard_wiki.incremental.equivalence import _fingerprint
from shard_wiki.model import Identity, Page
from shard_wiki.provenance import ProvenanceEnvelope
def _page(shard, key, body):
    """Build a ``Page`` identified by ``(shard, key)`` carrying ``body``.

    The provenance envelope names ``shard`` as its ``source_shard``.
    """
    identity = Identity(shard, key)
    envelope = ProvenanceEnvelope(source_shard=shard)
    return Page(identity=identity, body=body, envelope=envelope)
def _brute_force_groups(pages, threshold):
"""Oracle: O(N²) verify of every pair, then connected components."""
parent = {p.identity: p.identity for p in pages}
def find(x):
while parent[x] != x:
parent[x] = parent[parent[x]]
x = parent[x]
return x
for p, q in combinations(pages, 2):
same_fp = _fingerprint(p.body) == _fingerprint(q.body)
sim = jaccard(shingles(p.body), shingles(q.body))
if same_fp or sim >= threshold:
parent[find(p.identity)] = find(q.identity)
comps = {}
for p in pages:
comps.setdefault(find(p.identity), set()).add(p.identity)
return {frozenset(v) for v in comps.values() if len(v) > 1}
def test_minhash_lsh_buckets_near_duplicates_together():
    """A near-duplicate text shares a band key with the base text; an unrelated one does not."""
    hasher = MinHasher(num_perm=64)

    def bands_of(text):
        # Same pipeline for every text: shingles -> signature -> band keys.
        return set(band_keys(hasher.signature(shingles(text)), 32))

    base = "the quick brown fox jumps over the lazy dog near the river bank today"
    near = base + " and then some"
    far = "completely unrelated content about astrophysics and distant galaxies far"

    b_base = bands_of(base)
    b_near = bands_of(near)
    b_far = bands_of(far)

    # Near-duplicates share at least one band.
    assert not b_base.isdisjoint(b_near)
    # Unrelated pages share none.
    assert b_base.isdisjoint(b_far)
def test_exact_duplicate_across_shards_is_equivalent():
    """Two identities holding the same body text end up in one equivalence group."""
    body = "identical body text here"
    idx = EquivalenceIndex()
    for shard, key in (("A", "Foo"), ("B", "Bar")):
        idx.add(_page(shard, key, body))

    group = idx.equivalent_to(Identity("A", "Foo"))
    expected = frozenset({Identity("A", "Foo"), Identity("B", "Bar")})
    assert group == expected
def test_unrelated_pages_are_not_equivalent():
    """Two pages with different body text produce no groups at all."""
    entries = (
        ("A", "Foo", "alpha beta gamma delta epsilon"),
        ("B", "Bar", "nothing in common whatsoever entirely"),
    )
    idx = EquivalenceIndex()
    for shard, key, body in entries:
        idx.add(_page(shard, key, body))

    assert idx.groups() == ()
def test_curator_binding_forces_equivalence_regardless_of_content():
    """An explicit ``bind`` groups two identities even though their bodies differ."""
    idx = EquivalenceIndex()
    first = _page("A", "Foo", "one thing")
    idx.add(first)
    second = _page("B", "Bar", "totally different")
    idx.add(second)

    idx.bind(Identity("A", "Foo"), Identity("B", "Bar"))

    members = idx.equivalent_to(Identity("A", "Foo"))
    assert members == frozenset({Identity("A", "Foo"), Identity("B", "Bar")})
def test_index_matches_brute_force_oracle():
    """Groups from a built index equal the groups from the all-pairs oracle."""
    threshold = 0.7
    shared = "shared sentence one shared sentence two shared sentence three end"
    specs = (
        ("A", "Doc1", shared),
        ("B", "Doc1copy", shared + " minor tail"),  # near-dup of A
        ("C", "Other", "a totally distinct page with no overlapping shingles at all here"),
        ("D", "Lonely", "yet another isolated document about unrelated subject matter alone"),
    )
    pages = [_page(shard, key, body) for shard, key, body in specs]

    idx = EquivalenceIndex(threshold=threshold)
    idx.build(pages)

    indexed = set(idx.groups())
    oracle = _brute_force_groups(pages, threshold)
    assert indexed == oracle