from __future__ import annotations import re from dataclasses import dataclass from typing import Any TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}") @dataclass class OverlapCandidate: left_id: str right_id: str score: float reasons: list[str] def _tokens(text: str) -> set[str]: return set(TOKEN_RE.findall(text.lower())) def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str: discovery = entry.get("discovery") or {} parts = [ index_item.get("name", ""), index_item.get("summary", ""), entry.get("id", ""), " ".join(index_item.get("tags", [])), discovery.get("intent", ""), " ".join(discovery.get("includes", [])), ] return " ".join(str(part) for part in parts if part) def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]: reasons: list[str] = [] left_id = left["id"] right_id = right["id"] relations = left.get("relations") or {} for relation_type, targets in relations.items(): if not isinstance(targets, list): continue if right_id in targets: reasons.append(f"relation:{relation_type}") if left_id.split(".")[1] == right_id.split(".")[1]: reasons.append("shared domain segment") return reasons def find_overlaps( indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]], *, threshold: float = 0.28, ) -> list[OverlapCandidate]: candidates: list[OverlapCandidate] = [] blobs = [ (_entry_blob(entry, index_item), index_item["id"], entry) for index_item, entry in indexed_entries ] for i, (left_blob, left_id, left_entry) in enumerate(blobs): left_tokens = _tokens(left_blob) for j in range(i + 1, len(blobs)): right_blob, right_id, right_entry = blobs[j] right_tokens = _tokens(right_blob) if not left_tokens or not right_tokens: continue score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens) reasons: list[str] = [] if score >= threshold: reasons.append(f"token similarity {score:.2f}") shared_tags = set(left_entry.get("tags", [])) & set( right_entry.get("tags", []) ) if shared_tags: reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}") reasons.extend(_relation_overlap(left_entry, right_entry)) if reasons and (score >= threshold or len(reasons) > 1): candidates.append( OverlapCandidate( left_id=left_id, right_id=right_id, score=score, reasons=reasons, ) ) return sorted(candidates, key=lambda item: item.score, reverse=True)