"""Tests for the indexed equivalence relation — blocking + verify (SHARD-WP-0011 T1).""" from itertools import combinations from shard_wiki.incremental import EquivalenceIndex, MinHasher, band_keys, jaccard, shingles from shard_wiki.incremental.equivalence import _fingerprint from shard_wiki.model import Identity, Page from shard_wiki.provenance import ProvenanceEnvelope def _page(shard, key, body): return Page( identity=Identity(shard, key), body=body, envelope=ProvenanceEnvelope(source_shard=shard), ) def _brute_force_groups(pages, threshold): """Oracle: O(N²) verify of every pair, then connected components.""" parent = {p.identity: p.identity for p in pages} def find(x): while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x for p, q in combinations(pages, 2): same_fp = _fingerprint(p.body) == _fingerprint(q.body) sim = jaccard(shingles(p.body), shingles(q.body)) if same_fp or sim >= threshold: parent[find(p.identity)] = find(q.identity) comps = {} for p in pages: comps.setdefault(find(p.identity), set()).add(p.identity) return {frozenset(v) for v in comps.values() if len(v) > 1} def test_minhash_lsh_buckets_near_duplicates_together(): hasher = MinHasher(num_perm=64) base = "the quick brown fox jumps over the lazy dog near the river bank today" near = base + " and then some" far = "completely unrelated content about astrophysics and distant galaxies far" b_base = set(band_keys(hasher.signature(shingles(base)), 32)) b_near = set(band_keys(hasher.signature(shingles(near)), 32)) b_far = set(band_keys(hasher.signature(shingles(far)), 32)) assert b_base & b_near # near-duplicates share at least one band assert not (b_base & b_far) # unrelated pages do not def test_exact_duplicate_across_shards_is_equivalent(): idx = EquivalenceIndex() idx.add(_page("A", "Foo", "identical body text here")) idx.add(_page("B", "Bar", "identical body text here")) assert idx.equivalent_to(Identity("A", "Foo")) == frozenset( {Identity("A", "Foo"), Identity("B", "Bar")} ) def test_unrelated_pages_are_not_equivalent(): idx = EquivalenceIndex() idx.add(_page("A", "Foo", "alpha beta gamma delta epsilon")) idx.add(_page("B", "Bar", "nothing in common whatsoever entirely")) assert idx.groups() == () def test_curator_binding_forces_equivalence_regardless_of_content(): idx = EquivalenceIndex() idx.add(_page("A", "Foo", "one thing")) idx.add(_page("B", "Bar", "totally different")) idx.bind(Identity("A", "Foo"), Identity("B", "Bar")) assert idx.equivalent_to(Identity("A", "Foo")) == frozenset( {Identity("A", "Foo"), Identity("B", "Bar")} ) def test_index_matches_brute_force_oracle(): threshold = 0.7 shared = "shared sentence one shared sentence two shared sentence three end" pages = [ _page("A", "Doc1", shared), _page("B", "Doc1copy", shared + " minor tail"), # near-dup of A _page("C", "Other", "a totally distinct page with no overlapping shingles at all here"), _page("D", "Lonely", "yet another isolated document about unrelated subject matter alone"), ] idx = EquivalenceIndex(threshold=threshold) idx.build(pages) assert set(idx.groups()) == _brute_force_groups(pages, threshold)