Complete REUSE-WP-0004: CI, overlap detection, and catalog generation

Add a Gitea CI workflow for registry validation, the `reuse-surface overlaps` and `reuse-surface catalog` commands, generated catalog artifacts, and documentation updates that close gap analysis priorities 9-11.
2026-06-15 01:20:31 +02:00
parent 5c5023c000
commit c366fc4a4e
12 changed files with 538 additions and 12 deletions
--- a/reuse_surface/catalog.py
+++ b/reuse_surface/catalog.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import html
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+# Two directory levels above this file: the parent of the directory that
+# holds this module.
+ROOT = Path(__file__).resolve().parent.parent
+# Output path of the generated Markdown catalog.
+CATALOG_MD = ROOT / "docs" / "CapabilityCatalog.md"
+# Output directory and entry page of the generated HTML catalog.
+CATALOG_HTML_DIR = ROOT / "docs" / "catalog"
+CATALOG_HTML = CATALOG_HTML_DIR / "index.html"
+
+
+def _grouped_capabilities(
+    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
+) -> dict[str, list[tuple[dict[str, Any], dict[str, Any]]]]:
+    """Group ``(index_item, entry)`` pairs by the index item's ``domain``.
+
+    Args:
+        indexed_entries: Pairs of index item and parsed entry.
+
+    Returns:
+        A plain dict keyed by domain name, with keys in sorted order and
+        each group's pairs in input order. Items whose ``domain`` is
+        missing, null, or empty are grouped under ``"unknown"``.
+    """
+    grouped: dict[str, list[tuple[dict[str, Any], dict[str, Any]]]] = defaultdict(
+        list
+    )
+    for index_item, entry in indexed_entries:
+        # ``.get(key, default)`` only covers a missing key; an explicit null
+        # would yield None and break the sort below (None vs str). Coercing
+        # to str keeps every key comparable.
+        domain = str(index_item.get("domain") or "unknown")
+        grouped[domain].append((index_item, entry))
+    return {domain: grouped[domain] for domain in sorted(grouped)}
+
+
+def render_markdown(
+    index: dict[str, Any],
+    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
+) -> str:
+    """Render the capability catalog as a Markdown document.
+
+    The document starts with a header built from ``index`` (domain, updated
+    date, entry count), followed by one ``##`` section per domain and one
+    ``###`` block per capability, ordered by capability id. Known
+    limitations from the entry's ``consumer_guidance`` are listed when
+    present.
+
+    Args:
+        index: The loaded index mapping; ``domain`` and ``updated`` are read.
+        indexed_entries: Pairs of index item and parsed entry.
+
+    Returns:
+        The Markdown text, ending in exactly one newline.
+    """
+
+    def _by_id(pair: tuple[dict[str, Any], dict[str, Any]]) -> Any:
+        return pair[0]["id"]
+
+    out = [
+        "# Capability Catalog",
+        "",
+        f"**Domain:** {index.get('domain', 'unknown')}  ",
+        f"**Updated:** {index.get('updated', 'unknown')}  ",
+        f"**Entries:** {len(indexed_entries)}",
+        "",
+        "Generated by `reuse-surface catalog`. Do not edit manually.",
+        "",
+    ]
+    for domain, members in _grouped_capabilities(indexed_entries).items():
+        out += [f"## {domain}", ""]
+        for item, entry in sorted(members, key=_by_id):
+            block = [
+                f"### {item['name']}",
+                "",
+                f"- **ID:** `{item['id']}`",
+                f"- **Vector:** {item['vector']}",
+                f"- **Owner:** {item.get('owner', 'unknown')}",
+                f"- **Path:** `{item['path']}`",
+                f"- **Summary:** {item['summary']}",
+                "",
+            ]
+            out += block
+            limitations = (entry.get("consumer_guidance") or {}).get(
+                "known_limitations"
+            ) or []
+            if not limitations:
+                continue
+            out.append("**Known limitations:**")
+            for limitation in limitations:
+                out.append(f"- {limitation}")
+            out.append("")
+    return "\n".join(out).rstrip() + "\n"
+
+
+def render_html(
+    index: dict[str, Any],
+    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
+) -> str:
+    """Render the capability catalog as a standalone HTML page.
+
+    Capabilities are grouped into one ``<section>`` per domain and rendered
+    as cards ordered by capability id. Every interpolated value is
+    HTML-escaped.
+
+    Args:
+        index: The loaded index mapping; ``domain`` and ``updated`` are read.
+        indexed_entries: Pairs of index item and parsed entry. Only the
+            index item is used here.
+
+    Returns:
+        The complete HTML document as a string.
+
+    Raises:
+        KeyError: If an index item lacks ``id``, ``name``, ``summary``,
+            ``vector`` or ``path``.
+    """
+
+    def _esc(value: Any) -> str:
+        # html.escape() only accepts str; coerce first so non-string values
+        # (numbers, dates) do not raise, matching how ``updated`` is
+        # handled in the page template below.
+        return html.escape(str(value))
+
+    sections: list[str] = []
+    for domain, items in _grouped_capabilities(indexed_entries).items():
+        cards: list[str] = []
+        for index_item, _entry in sorted(items, key=lambda pair: pair[0]["id"]):
+            name = _esc(index_item["name"])
+            summary = _esc(index_item["summary"])
+            cap_id = _esc(index_item["id"])
+            vector = _esc(index_item["vector"])
+            path = _esc(index_item["path"])
+            cards.append(
+                f"""<article class="card">
+  <h3>{name}</h3>
+  <p class="meta"><code>{cap_id}</code> · {vector}</p>
+  <p>{summary}</p>
+  <p class="path">{path}</p>
+</article>"""
+            )
+        sections.append(
+            f"<section><h2>{_esc(domain)}</h2>\n" + "\n".join(cards) + "</section>"
+        )
+
+    body = "\n".join(sections)
+    title = _esc(f"Capability Catalog — {index.get('domain', 'unknown')}")
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <title>{title}</title>
+  <style>
+    body {{ font-family: system-ui, sans-serif; margin: 2rem; line-height: 1.5; }}
+    h1 {{ margin-bottom: 0.2rem; }}
+    .subtitle {{ color: #555; margin-bottom: 2rem; }}
+    section {{ margin-bottom: 2rem; }}
+    .card {{ border: 1px solid #ddd; border-radius: 8px; padding: 1rem; margin: 1rem 0; }}
+    .meta {{ color: #444; font-size: 0.95rem; }}
+    .path {{ font-size: 0.85rem; color: #666; }}
+  </style>
+</head>
+<body>
+  <h1>Capability Catalog</h1>
+  <p class="subtitle">Updated {_esc(index.get('updated', 'unknown'))} · {len(indexed_entries)} entries</p>
+  {body}
+</body>
+</html>
+"""
+
+
+def write_catalog(
+    index: dict[str, Any],
+    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
+) -> tuple[Path, Path]:
+    """Write the Markdown and HTML catalogs to their fixed output paths.
+
+    The HTML output directory (and any missing parents) is created first;
+    each document is then rendered and written as UTF-8, Markdown before
+    HTML.
+
+    Args:
+        index: The loaded index mapping passed through to the renderers.
+        indexed_entries: Pairs of index item and parsed entry.
+
+    Returns:
+        ``(CATALOG_MD, CATALOG_HTML)``, the two paths that were written.
+    """
+    CATALOG_HTML_DIR.mkdir(parents=True, exist_ok=True)
+    outputs = (
+        (CATALOG_MD, render_markdown),
+        (CATALOG_HTML, render_html),
+    )
+    for target, render in outputs:
+        target.write_text(render(index, indexed_entries), encoding="utf-8")
+    return CATALOG_MD, CATALOG_HTML
--- a/reuse_surface/cli.py
+++ b/reuse_surface/cli.py
@@ -9,9 +9,9 @@ from typing import Any
 import yaml
 from jsonschema import Draft202012Validator

+from reuse_surface.catalog import write_catalog
+from reuse_surface.overlaps import find_overlaps
 from reuse_surface.registry import (
-    CAPABILITIES_DIR,
-    INDEX_PATH,
    ROOT,
    capability_paths,
    level_at_least,
@@ -115,6 +115,40 @@ def cmd_query(args: argparse.Namespace) -> int:
    return 0


+def _load_indexed_entries() -> list[tuple[dict[str, Any], dict[str, Any]]]:
+    """Load the index and pair each listed capability with its front matter.
+
+    Returns:
+        One ``(index_item, front_matter)`` tuple per item under the index's
+        ``capabilities`` key, in index order. Each item's ``path`` is
+        resolved relative to ``ROOT``.
+    """
+    capabilities = load_index().get("capabilities", [])
+    return [
+        (item, parse_front_matter(ROOT / item["path"])) for item in capabilities
+    ]
+
+
+def cmd_overlaps(args: argparse.Namespace) -> int:
+    """Print overlap candidates found among the indexed capabilities.
+
+    Args:
+        args: Parsed CLI arguments; ``args.threshold`` is forwarded to
+            ``find_overlaps``.
+
+    Returns:
+        Always ``0``, whether or not candidates were found.
+    """
+    found = find_overlaps(_load_indexed_entries(), threshold=args.threshold)
+    if not found:
+        print("no overlap candidates")
+        return 0
+
+    for hit in found:
+        joined = "; ".join(hit.reasons)
+        print(
+            f"{hit.left_id} <> {hit.right_id}  "
+            f"score={hit.score:.2f}  {joined}"
+        )
+
+    total = len(found)
+    suffix = "" if total == 1 else "s"
+    print(f"\n{total} candidate{suffix}")
+    return 0
+
+
+def cmd_catalog(args: argparse.Namespace) -> int:
+    """Generate the catalog files and report each written path.
+
+    Args:
+        args: Parsed CLI arguments (not read by this command).
+
+    Returns:
+        Always ``0``.
+    """
+    index = load_index()
+    written = write_catalog(index, _load_indexed_entries())
+    for output in written:
+        # Paths are reported relative to ROOT.
+        print(f"ok: wrote {output.relative_to(ROOT)}")
+    return 0
+
+
 def cmd_export(args: argparse.Namespace) -> int:
    index = load_index()
    bundle: dict[str, Any] = {
@@ -184,6 +218,22 @@ def main(argv: list[str] | None = None) -> int:
    )
    export.set_defaults(func=cmd_export)

+    overlaps = subparsers.add_parser(
+        "overlaps", help="detect potential duplicate capabilities"
+    )
+    overlaps.add_argument(
+        "--threshold",
+        type=float,
+        default=0.28,
+        help="token similarity threshold (0-1)",
+    )
+    overlaps.set_defaults(func=cmd_overlaps)
+
+    catalog = subparsers.add_parser(
+        "catalog", help="generate human-readable capability catalog"
+    )
+    catalog.set_defaults(func=cmd_catalog)
+
    args = parser.parse_args(argv)
    return args.func(args)

--- a/reuse_surface/overlaps.py
+++ b/reuse_surface/overlaps.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Any
+
+# A token is a lowercase letter followed by at least two more lowercase
+# letters, digits, or hyphens, so tokens are three or more characters long.
+TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
+
+
+@dataclass
+class OverlapCandidate:
+    """A pair of capabilities flagged as a potential overlap."""
+
+    # Id of the first capability of the pair.
+    left_id: str
+    # Id of the second capability of the pair.
+    right_id: str
+    # Token similarity score computed for the pair.
+    score: float
+    # Human-readable explanations of why the pair was flagged.
+    reasons: list[str]
+
+
+def _tokens(text: str) -> set[str]:
+    """Return the set of distinct ``TOKEN_RE`` matches in lowercased ``text``."""
+    lowered = text.lower()
+    return {match.group(0) for match in TOKEN_RE.finditer(lowered)}
+
+
+def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
+    """Build the text blob used for token similarity.
+
+    The blob joins, with single spaces, the index item's name, summary and
+    tags with the entry's id and its ``discovery`` intent and includes.
+    Empty or missing parts are skipped.
+
+    Args:
+        entry: Parsed entry; ``id`` and ``discovery`` are read.
+        index_item: Index item; ``name``, ``summary`` and ``tags`` are read.
+
+    Returns:
+        The combined text, or ``""`` when no part has content.
+    """
+    discovery = entry.get("discovery") or {}
+    # ``or []`` covers keys present with a null value, and str() covers
+    # non-string list items; both would otherwise make str.join() raise.
+    tags = index_item.get("tags") or []
+    includes = discovery.get("includes") or []
+    parts = [
+        index_item.get("name", ""),
+        index_item.get("summary", ""),
+        entry.get("id", ""),
+        " ".join(str(tag) for tag in tags),
+        discovery.get("intent", ""),
+        " ".join(str(item) for item in includes),
+    ]
+    return " ".join(str(part) for part in parts if part)
+
+
+def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
+    """Return structural reasons why two entries may overlap.
+
+    Two signals are checked:
+
+    * a ``relations`` list on either entry that names the other entry's id,
+      reported as ``relation:<type>`` (once per relation type);
+    * an equal second dot-separated segment in both ids, reported as
+      ``shared domain segment``.
+
+    Args:
+        left: First parsed entry; ``id`` and ``relations`` are read.
+        right: Second parsed entry; ``id`` and ``relations`` are read.
+
+    Returns:
+        The list of reasons, empty when neither signal applies. Missing ids
+        and ids without a second segment produce no reason instead of
+        raising.
+    """
+    left_id = str(left.get("id") or "")
+    right_id = str(right.get("id") or "")
+    reasons: list[str] = []
+
+    # Callers evaluate each unordered pair only once, so a relation declared
+    # on either side has to be honoured here.
+    for source, target_id in ((left, right_id), (right, left_id)):
+        if not target_id:
+            continue
+        relations = source.get("relations") or {}
+        if not isinstance(relations, dict):
+            continue
+        for relation_type, targets in relations.items():
+            if not isinstance(targets, list):
+                continue
+            reason = f"relation:{relation_type}"
+            if target_id in targets and reason not in reasons:
+                reasons.append(reason)
+
+    left_segments = left_id.split(".")
+    right_segments = right_id.split(".")
+    # Guard the index: ids without a dot have no second segment.
+    if (
+        len(left_segments) > 1
+        and len(right_segments) > 1
+        and left_segments[1] == right_segments[1]
+    ):
+        reasons.append("shared domain segment")
+    return reasons
+
+
+def find_overlaps(
+    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
+    *,
+    threshold: float = 0.28,
+) -> list[OverlapCandidate]:
+    """Find pairs of capabilities that look like potential duplicates.
+
+    Every unordered pair is scored with the Jaccard similarity of the two
+    token sets (intersection size divided by union size). A pair is
+    reported when the score reaches ``threshold``, or when at least two
+    other reasons (shared tags, relation/id signals) apply. Pairs in which
+    either side has no tokens are skipped.
+
+    Args:
+        indexed_entries: Pairs of index item and parsed entry.
+        threshold: Minimum token similarity for a pair to qualify on score
+            alone.
+
+    Returns:
+        Candidates sorted by descending score.
+
+    Raises:
+        KeyError: If an index item has no ``id``.
+    """
+    # Tokenize each entry once up front instead of once per pair.
+    prepared = [
+        (index_item["id"], entry, _tokens(_entry_blob(entry, index_item)))
+        for index_item, entry in indexed_entries
+    ]
+
+    candidates: list[OverlapCandidate] = []
+    for position, (left_id, left_entry, left_tokens) in enumerate(prepared):
+        if not left_tokens:
+            continue
+        for right_id, right_entry, right_tokens in prepared[position + 1 :]:
+            if not right_tokens:
+                continue
+            score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
+            reasons: list[str] = []
+            if score >= threshold:
+                reasons.append(f"token similarity {score:.2f}")
+            # ``or []`` tolerates an explicit null ``tags`` value.
+            # NOTE(review): tags are read from the entry here but from the
+            # index item when building the text blob - confirm which source
+            # is authoritative.
+            shared_tags = set(left_entry.get("tags") or []) & set(
+                right_entry.get("tags") or []
+            )
+            if shared_tags:
+                tag_list = ", ".join(sorted(str(tag) for tag in shared_tags))
+                reasons.append(f"shared tags: {tag_list}")
+            reasons.extend(_relation_overlap(left_entry, right_entry))
+            # A qualifying score always contributes a reason, so no separate
+            # emptiness check is needed.
+            if score >= threshold or len(reasons) > 1:
+                candidates.append(
+                    OverlapCandidate(
+                        left_id=left_id,
+                        right_id=right_id,
+                        score=score,
+                        reasons=reasons,
+                    )
+                )
+    return sorted(candidates, key=lambda item: item.score, reverse=True)