generated from coulomb/repo-seed
Detect equivalence (distinct identities holding the same page) without pairwise O(N²): MinHash/LSH bands over content shingles + normalized-title buckets generate candidates (blocking), then exact-fingerprint or Jaccard>=threshold confirm them (verify), with curator decision-log bindings always forming edges. Groups are the connected components of the edge set. Includes the incremental add/update/remove internals used by T2. Matches a brute-force oracle. New incremental/ package (minhash primitives + EquivalenceIndex). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
90 lines
3.4 KiB
Python
90 lines
3.4 KiB
Python
"""Tests for the indexed equivalence relation — blocking + verify (SHARD-WP-0011 T1)."""
|
|
|
|
from itertools import combinations
|
|
|
|
from shard_wiki.incremental import EquivalenceIndex, MinHasher, band_keys, jaccard, shingles
|
|
from shard_wiki.incremental.equivalence import _fingerprint
|
|
from shard_wiki.model import Identity, Page
|
|
from shard_wiki.provenance import ProvenanceEnvelope
|
|
|
|
|
|
def _page(shard, key, body):
    """Build a test ``Page`` identified by ``(shard, key)`` with the given body.

    The provenance envelope is created with ``source_shard`` set to the same
    ``shard`` that is used for the identity.
    """
    ident = Identity(shard, key)
    provenance = ProvenanceEnvelope(source_shard=shard)
    return Page(identity=ident, body=body, envelope=provenance)
|
|
|
|
|
|
def _brute_force_groups(pages, threshold):
    """Reference oracle: compare every pair of pages, then take components.

    Two pages are linked when their body fingerprints are equal or when the
    Jaccard similarity of their shingle sets is at least ``threshold``.
    Returns a set of frozensets of identities, one per connected component
    that has more than one member; singletons are dropped.
    """
    # Union-find forest keyed by identity; every page starts as its own root.
    root_of = {page.identity: page.identity for page in pages}

    def root(node):
        # Walk to the representative, pointing each visited node at its
        # grandparent on the way up to shorten later walks.
        while root_of[node] != node:
            root_of[node] = root_of[root_of[node]]
            node = root_of[node]
        return node

    for left, right in combinations(pages, 2):
        fingerprints_match = _fingerprint(left.body) == _fingerprint(right.body)
        similarity = jaccard(shingles(left.body), shingles(right.body))
        if not (fingerprints_match or similarity >= threshold):
            continue
        # Verified pair: merge the two components.
        root_of[root(left.identity)] = root(right.identity)

    clusters = {}
    for page in pages:
        representative = root(page.identity)
        if representative not in clusters:
            clusters[representative] = set()
        clusters[representative].add(page.identity)

    # Only components with at least two identities count as groups.
    return {frozenset(members) for members in clusters.values() if len(members) > 1}
|
|
|
|
|
|
def test_minhash_lsh_buckets_near_duplicates_together():
    """Near-duplicate texts share an LSH band key; an unrelated text shares none."""
    minhasher = MinHasher(num_perm=64)

    def bands_of(text):
        # Band keys for one text: shingle -> signature -> band_keys(..., 32).
        return set(band_keys(minhasher.signature(shingles(text)), 32))

    original = "the quick brown fox jumps over the lazy dog near the river bank today"
    extended = original + " and then some"
    unrelated = "completely unrelated content about astrophysics and distant galaxies far"

    original_bands = bands_of(original)
    extended_bands = bands_of(extended)
    unrelated_bands = bands_of(unrelated)

    assert original_bands & extended_bands  # near-duplicates share at least one band
    assert not (original_bands & unrelated_bands)  # unrelated pages do not
|
|
|
|
|
|
def test_exact_duplicate_across_shards_is_equivalent():
    """Two pages with identical bodies under different identities form one group."""
    index = EquivalenceIndex()
    index.add(_page("A", "Foo", "identical body text here"))
    index.add(_page("B", "Bar", "identical body text here"))

    found = index.equivalent_to(Identity("A", "Foo"))

    expected = frozenset({Identity("A", "Foo"), Identity("B", "Bar")})
    assert found == expected
|
|
|
|
|
|
def test_unrelated_pages_are_not_equivalent():
    """Adding two pages with different bodies leaves ``groups()`` empty."""
    index = EquivalenceIndex()
    bodies = (
        ("A", "Foo", "alpha beta gamma delta epsilon"),
        ("B", "Bar", "nothing in common whatsoever entirely"),
    )
    for shard, key, body in bodies:
        index.add(_page(shard, key, body))

    assert index.groups() == ()
|
|
|
|
|
|
def test_curator_binding_forces_equivalence_regardless_of_content():
    """An explicit ``bind`` groups two identities even when their bodies differ."""
    index = EquivalenceIndex()
    index.add(_page("A", "Foo", "one thing"))
    index.add(_page("B", "Bar", "totally different"))

    # Explicit binding between the two identities.
    index.bind(Identity("A", "Foo"), Identity("B", "Bar"))

    found = index.equivalent_to(Identity("A", "Foo"))
    assert found == frozenset({Identity("A", "Foo"), Identity("B", "Bar")})
|
|
|
|
|
|
def test_index_matches_brute_force_oracle():
    """Groups from a built index equal those of the all-pairs oracle."""
    threshold = 0.7
    shared = "shared sentence one shared sentence two shared sentence three end"

    doc = _page("A", "Doc1", shared)
    doc_copy = _page("B", "Doc1copy", shared + " minor tail")  # near-dup of A
    other = _page(
        "C", "Other", "a totally distinct page with no overlapping shingles at all here"
    )
    lonely = _page(
        "D", "Lonely", "yet another isolated document about unrelated subject matter alone"
    )
    pages = [doc, doc_copy, other, lonely]

    index = EquivalenceIndex(threshold=threshold)
    index.build(pages)

    indexed_groups = set(index.groups())
    assert indexed_groups == _brute_force_groups(pages, threshold)
|