# reuse-surface/reuse_surface/overlaps.py

from __future__ import annotations

import re
from dataclasses import dataclass
from itertools import combinations
from typing import Any

# A token is a lowercase ASCII letter followed by at least two more characters
# drawn from lowercase letters, digits, or hyphens (so three characters minimum).
TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")


@dataclass
class OverlapCandidate:
    """A pair of entries flagged by ``find_overlaps`` as potentially overlapping."""

    # Index id of the entry that appears earlier in the input list.
    left_id: str
    # Index id of the entry that appears later in the input list.
    right_id: str
    # Jaccard similarity of the two entries' token sets (intersection / union).
    score: float
    # Human-readable explanations of why the pair was flagged, e.g.
    # "token similarity 0.42", "shared tags: ...", "relation:<type>",
    # "shared domain segment".
    reasons: list[str]


def _tokens(text: str) -> set[str]:
    """Return the distinct ``TOKEN_RE`` matches found in the lower-cased *text*."""
    lowered = text.lower()
    return {match.group(0) for match in TOKEN_RE.finditer(lowered)}


def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
    """Concatenate the descriptive text of one entry into a single string.

    The blob is built, in order, from the index item's ``name`` and
    ``summary``, the entry's ``id``, the index item's ``tags``, and the
    entry's ``discovery.intent`` and ``discovery.includes``. Empty or
    missing pieces are skipped.

    Args:
        entry: Full entry mapping; ``id`` and ``discovery`` are read.
        index_item: Index mapping; ``name``, ``summary`` and ``tags`` are read.

    Returns:
        The non-empty pieces joined by single spaces (``""`` if none).
    """
    discovery = entry.get("discovery") or {}
    # ``or ()`` mirrors the ``or {}`` guard above: a key that is present but
    # set to None must not crash ``join``. Items are coerced to ``str`` so a
    # numeric tag does not raise either.
    tags = index_item.get("tags") or ()
    includes = discovery.get("includes") or ()
    parts = [
        index_item.get("name", ""),
        index_item.get("summary", ""),
        entry.get("id", ""),
        " ".join(str(tag) for tag in tags),
        discovery.get("intent", ""),
        " ".join(str(item) for item in includes),
    ]
    return " ".join(str(part) for part in parts if part)


def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
    """Collect structural reasons why *left* and *right* may overlap.

    Two signals are checked:

    * ``relation:<type>`` for every relation type on *left* whose target
      list contains *right*'s id. Relation values that are not lists are
      ignored.
    * ``shared domain segment`` when the second dot-separated segment of
      both ids is equal.

    Args:
        left: Entry mapping; must contain ``id``, may contain ``relations``.
        right: Entry mapping; must contain ``id``.

    Returns:
        The matching reason strings, relation hits first (possibly empty).

    Raises:
        KeyError: If either entry has no ``id`` key.
    """
    left_id = left["id"]
    right_id = right["id"]

    # NOTE(review): only relations declared on ``left`` are inspected; a
    # relation declared on ``right`` that points back at ``left`` is not
    # reported. Confirm whether that asymmetry is intended.
    relations = left.get("relations") or {}
    reasons = [
        f"relation:{relation_type}"
        for relation_type, targets in relations.items()
        if isinstance(targets, list) and right_id in targets
    ]

    # Ids without a dot have no second segment; treat them as having no
    # domain instead of raising IndexError.
    left_segments = left_id.split(".")
    right_segments = right_id.split(".")
    if (
        len(left_segments) > 1
        and len(right_segments) > 1
        and left_segments[1] == right_segments[1]
    ):
        reasons.append("shared domain segment")
    return reasons


def find_overlaps(
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
    *,
    threshold: float = 0.28,
) -> list[OverlapCandidate]:
    """Find pairs of entries that look like they cover the same ground.

    Every unordered pair of entries is scored with the Jaccard similarity of
    their token sets (tokens come from ``_entry_blob`` via ``_tokens``). A
    pair is reported when its score reaches *threshold*, or when at least two
    other signals apply (shared tags, relation hits, shared domain segment).
    Pairs in which either entry yields no tokens are skipped entirely.

    Args:
        indexed_entries: ``(index_item, entry)`` tuples. ``index_item["id"]``
            is used as the candidate id.
        threshold: Minimum token similarity that flags a pair on its own.

    Returns:
        Candidates ordered by descending score; ties keep pair-encounter order.
    """
    # Tokenise each entry once up front instead of once per pair.
    prepared = [
        (index_item["id"], entry, _tokens(_entry_blob(entry, index_item)))
        for index_item, entry in indexed_entries
    ]

    candidates: list[OverlapCandidate] = []
    for left, right in combinations(prepared, 2):
        left_id, left_entry, left_tokens = left
        right_id, right_entry, right_tokens = right
        if not left_tokens or not right_tokens:
            continue

        score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
        reasons: list[str] = []
        if score >= threshold:
            reasons.append(f"token similarity {score:.2f}")

        # NOTE(review): the blob takes tags from the index item, while this
        # comparison reads them from the entry -- confirm both carry tags.
        # ``or ()`` tolerates a ``tags`` key that is explicitly None.
        shared_tags = set(left_entry.get("tags") or ()) & set(
            right_entry.get("tags") or ()
        )
        if shared_tags:
            reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")

        reasons.extend(_relation_overlap(left_entry, right_entry))

        # A score at or above the threshold always contributes a reason, so
        # no separate emptiness check on ``reasons`` is needed.
        if score >= threshold or len(reasons) > 1:
            candidates.append(
                OverlapCandidate(
                    left_id=left_id,
                    right_id=right_id,
                    score=score,
                    reasons=reasons,
                )
            )
    return sorted(candidates, key=lambda item: item.score, reverse=True)