Files
reuse-surface/reuse_surface/overlaps.py
tegwick c366fc4a4e
Some checks failed
ci / validate-registry (push) Has been cancelled
Complete REUSE-WP-0004: CI, overlap detection, and catalog generation
Add Gitea CI workflow for registry validation, reuse-surface overlaps and
catalog commands, generated catalog artifacts, and documentation updates
closing gap analysis priorities 9-11.
2026-06-15 01:20:31 +02:00

87 lines
2.8 KiB
Python

from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Any
# A token is a lowercase ASCII letter followed by at least two more lowercase
# letters, digits, or hyphens, i.e. three or more characters in total. The
# pattern has no word-boundary anchors, so a match may start inside a longer
# run of characters (for example after a leading digit).
TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
@dataclass
class OverlapCandidate:
    """A pair of entries flagged as potentially overlapping.

    Attributes:
        left_id: Identifier of the first entry of the pair.
        right_id: Identifier of the second entry of the pair.
        score: Token-similarity score computed for the pair.
        reasons: Human-readable explanations of why the pair was flagged.
    """

    left_id: str
    right_id: str
    score: float
    reasons: list[str]
def _tokens(text: str) -> set[str]:
    """Return the distinct tokens found in *text*, compared case-insensitively.

    The text is lowercased first; every non-overlapping match of ``TOKEN_RE``
    in the lowercased text becomes one element of the result.
    """
    lowered = text.lower()
    return {match.group(0) for match in TOKEN_RE.finditer(lowered)}
def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
discovery = entry.get("discovery") or {}
parts = [
index_item.get("name", ""),
index_item.get("summary", ""),
entry.get("id", ""),
" ".join(index_item.get("tags", [])),
discovery.get("intent", ""),
" ".join(discovery.get("includes", [])),
]
return " ".join(str(part) for part in parts if part)
def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
reasons: list[str] = []
left_id = left["id"]
right_id = right["id"]
relations = left.get("relations") or {}
for relation_type, targets in relations.items():
if not isinstance(targets, list):
continue
if right_id in targets:
reasons.append(f"relation:{relation_type}")
if left_id.split(".")[1] == right_id.split(".")[1]:
reasons.append("shared domain segment")
return reasons
def find_overlaps(
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
    *,
    threshold: float = 0.28,
) -> list[OverlapCandidate]:
    """Find pairs of entries that look like they overlap.

    Every unordered pair of entries is compared once. The pair's score is the
    Jaccard similarity of their token sets (size of the intersection divided
    by size of the union), where the tokens come from ``_entry_blob``. Reasons
    are collected from three sources: the score reaching *threshold*, tags
    shared by both entries, and whatever ``_relation_overlap`` reports.

    A pair is reported when its score reaches *threshold*, or when at least
    two reasons apply even though the score is below it. Pairs in which either
    entry yields no tokens are skipped entirely.

    Args:
        indexed_entries: ``(index_item, entry)`` pairs. Each ``index_item``
            must contain ``"id"``.
        threshold: Minimum token-similarity score that counts as a reason on
            its own.

    Returns:
        The candidates sorted by descending score; equal scores keep the
        order in which the pairs were compared.

    Raises:
        KeyError: If an ``index_item`` has no ``"id"`` key.
    """
    # Tokenise each entry exactly once up front; doing it inside the pair loop
    # would repeat the regex scan for every pair an entry takes part in.
    prepared = [
        (index_item["id"], entry, _tokens(_entry_blob(entry, index_item)))
        for index_item, entry in indexed_entries
    ]

    candidates: list[OverlapCandidate] = []
    for position, (left_id, left_entry, left_tokens) in enumerate(prepared):
        if not left_tokens:
            # No tokens on the left means every pair with it is skipped.
            continue
        for right_id, right_entry, right_tokens in prepared[position + 1:]:
            if not right_tokens:
                continue
            score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
            reasons: list[str] = []
            if score >= threshold:
                reasons.append(f"token similarity {score:.2f}")
            # NOTE(review): tags are read from the entry here, whereas
            # _entry_blob reads them from the index item - confirm that both
            # mappings carry tags. ``or []`` tolerates an explicit null value.
            shared_tags = set(left_entry.get("tags") or []) & set(
                right_entry.get("tags") or []
            )
            if shared_tags:
                reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")
            reasons.extend(_relation_overlap(left_entry, right_entry))
            # A sub-threshold pair needs at least two independent signals.
            if reasons and (score >= threshold or len(reasons) > 1):
                candidates.append(
                    OverlapCandidate(
                        left_id=left_id,
                        right_id=right_id,
                        score=score,
                        reasons=reasons,
                    )
                )
    return sorted(candidates, key=lambda item: item.score, reverse=True)