Complete REUSE-WP-0004: CI, overlap detection, and catalog generation

Add Gitea CI workflow for registry validation, reuse-surface overlaps and catalog commands, generated catalog artifacts, and documentation updates closing gap analysis priorities 9-11.
2026-06-15 01:20:31 +02:00
parent 5c5023c000
commit c366fc4a4e
12 changed files with 538 additions and 12 deletions
--- a/reuse_surface/overlaps.py
+++ b/reuse_surface/overlaps.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Any
+
+# A token is a lowercase ASCII letter followed by at least two more lowercase
+# letters, digits, or hyphens, so every matched token has three or more characters.
+TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
+
+
+@dataclass
+class OverlapCandidate:
+    """A pair of entries that `find_overlaps` reports as possibly overlapping."""
+
+    # Identifier of the entry that appears earlier in the input sequence.
+    left_id: str
+    # Identifier of the entry that appears later in the input sequence.
+    right_id: str
+    # Token-set similarity of the two entries: shared tokens / all tokens.
+    score: float
+    # Human-readable explanations for why the pair was reported.
+    reasons: list[str]
+
+
+def _tokens(text: str) -> set[str]:
+    """Return the distinct `TOKEN_RE` matches found in the lowercased *text*."""
+    lowered = text.lower()
+    # The pattern has no capture groups, so each match's group() is the token.
+    return {match.group() for match in TOKEN_RE.finditer(lowered)}
+
+
+def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
+    """Build the text blob that represents one entry for token comparison.
+
+    The blob joins, in order: the index item's name and summary, the entry's
+    id, the index item's tags, and the entry's discovery intent and includes.
+    Empty or missing parts are skipped.
+
+    Args:
+        entry: Entry mapping; read for ``id`` and the ``discovery`` sub-mapping.
+        index_item: Index mapping; read for ``name``, ``summary`` and ``tags``.
+
+    Returns:
+        The non-empty parts joined by single spaces.
+    """
+    # `or` fallbacks treat a key that is present but null the same as a missing
+    # key; `.get(key, default)` alone would hand None to `str.join` and raise.
+    discovery = entry.get("discovery") or {}
+    tags = index_item.get("tags") or []
+    includes = discovery.get("includes") or []
+    parts = [
+        index_item.get("name", ""),
+        index_item.get("summary", ""),
+        entry.get("id", ""),
+        " ".join(str(tag) for tag in tags),
+        discovery.get("intent", ""),
+        " ".join(str(item) for item in includes),
+    ]
+    return " ".join(str(part) for part in parts if part)
+
+
+def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
+    """List relation-based reasons why *left* and *right* may overlap.
+
+    Args:
+        left: Entry mapping with an ``id`` and an optional ``relations`` mapping
+            of relation type to a list of target ids.
+        right: Entry mapping with an ``id``.
+
+    Returns:
+        One ``relation:<type>`` reason for every relation list on *left* that
+        contains *right*'s id, followed by ``shared domain segment`` when the
+        second dot-separated segment of both ids is equal.
+
+    Raises:
+        KeyError: If either mapping has no ``id`` key.
+    """
+    reasons: list[str] = []
+    left_id = left["id"]
+    right_id = right["id"]
+    # NOTE(review): only `left`'s relations are inspected, so a relation declared
+    # on `right` that targets `left` is not reported - confirm this is intended.
+    relations = left.get("relations") or {}
+    for relation_type, targets in relations.items():
+        # Relation values that are not lists are ignored.
+        if not isinstance(targets, list):
+            continue
+        if right_id in targets:
+            reasons.append(f"relation:{relation_type}")
+    left_segments = left_id.split(".")
+    right_segments = right_id.split(".")
+    # An id without a dot has no second segment; indexing [1] would raise
+    # IndexError, so such ids never share a domain segment.
+    if (
+        len(left_segments) > 1
+        and len(right_segments) > 1
+        and left_segments[1] == right_segments[1]
+    ):
+        reasons.append("shared domain segment")
+    return reasons
+
+
+def find_overlaps(
+    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
+    *,
+    threshold: float = 0.28,
+) -> list[OverlapCandidate]:
+    """Find pairs of entries that look like they overlap.
+
+    Every unordered pair is scored by the ratio of shared tokens to all tokens
+    in the two entries' text blobs. A pair is reported when that score reaches
+    *threshold*, or when at least two other reasons (shared tags, relations,
+    shared domain segment) apply. Pairs where either entry yields no tokens
+    are skipped.
+
+    Args:
+        indexed_entries: ``(index_item, entry)`` pairs. The index item supplies
+            the reported id; the entry supplies tags and relations.
+        threshold: Minimum token similarity that reports a pair on its own.
+
+    Returns:
+        Candidates sorted by descending score.
+    """
+    # Tokenise each entry and build its tag set once, not once per pair.
+    # NOTE(review): shared tags are read from the entry, while the blob's tags
+    # come from the index item - confirm both carry the same tags.
+    prepared = [
+        (
+            index_item["id"],
+            entry,
+            _tokens(_entry_blob(entry, index_item)),
+            # `or []` treats a null `tags` value like a missing key.
+            set(entry.get("tags") or []),
+        )
+        for index_item, entry in indexed_entries
+    ]
+
+    candidates: list[OverlapCandidate] = []
+    for i, (left_id, left_entry, left_tokens, left_tags) in enumerate(prepared):
+        if not left_tokens:
+            continue
+        for right_id, right_entry, right_tokens, right_tags in prepared[i + 1 :]:
+            if not right_tokens:
+                continue
+            score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
+            reasons: list[str] = []
+            if score >= threshold:
+                reasons.append(f"token similarity {score:.2f}")
+            shared_tags = left_tags & right_tags
+            if shared_tags:
+                reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")
+            reasons.extend(_relation_overlap(left_entry, right_entry))
+            # Below the threshold a single weak reason is not enough.
+            if score >= threshold or len(reasons) > 1:
+                candidates.append(
+                    OverlapCandidate(
+                        left_id=left_id,
+                        right_id=right_id,
+                        score=score,
+                        reasons=reasons,
+                    )
+                )
+    return sorted(candidates, key=lambda item: item.score, reverse=True)