Complete REUSE-WP-0004: CI, overlap detection, and catalog generation
Some checks failed
ci / validate-registry (push) Has been cancelled

Add a Gitea CI workflow for registry validation, the `reuse-surface overlaps`
and `reuse-surface catalog` commands, generated catalog artifacts, and
documentation updates that close gap-analysis priorities 9-11.
This commit is contained in:
2026-06-15 01:20:31 +02:00
parent 5c5023c000
commit c366fc4a4e
12 changed files with 538 additions and 12 deletions

122
reuse_surface/catalog.py Normal file
View File

@@ -0,0 +1,122 @@
from __future__ import annotations
import html
from collections import defaultdict
from pathlib import Path
from typing import Any
# Parent of the directory that contains this module (presumably the
# repository root -- confirm against the project layout).
ROOT = Path(__file__).resolve().parent.parent
# Output file for the Markdown catalog written by write_catalog().
CATALOG_MD = ROOT / "docs" / "CapabilityCatalog.md"
# Output directory and entry page for the HTML catalog written by
# write_catalog().
CATALOG_HTML_DIR = ROOT / "docs" / "catalog"
CATALOG_HTML = CATALOG_HTML_DIR / "index.html"
def _grouped_capabilities(
indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> dict[str, list[tuple[dict[str, Any], dict[str, Any]]]]:
grouped: dict[str, list[tuple[dict[str, Any], dict[str, Any]]]] = defaultdict(
list
)
for index_item, entry in indexed_entries:
domain = index_item.get("domain", "unknown")
grouped[domain].append((index_item, entry))
return dict(sorted(grouped.items()))
def render_markdown(
    index: dict[str, Any],
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> str:
    """Render the capability catalog as a Markdown document.

    Args:
        index: Registry index mapping. ``domain`` and ``updated`` are read for
            the header and fall back to ``"unknown"`` when absent.
        indexed_entries: ``(index_item, entry)`` pairs. Each index item must
            provide ``id``, ``name``, ``vector``, ``path`` and ``summary``;
            ``owner`` is optional. ``entry`` may carry a list under
            ``consumer_guidance.known_limitations``.

    Returns:
        The Markdown text, ending in exactly one newline.

    Raises:
        KeyError: If an index item lacks one of the required keys.
    """
    # Markdown only turns a line ending into a hard break when the line ends
    # in two or more spaces; with a single trailing space the three header
    # fields are rendered as one run-on paragraph.
    hard_break = "  "
    lines = [
        "# Capability Catalog",
        "",
        f"**Domain:** {index.get('domain', 'unknown')}{hard_break}",
        f"**Updated:** {index.get('updated', 'unknown')}{hard_break}",
        f"**Entries:** {len(indexed_entries)}",
        "",
        "Generated by `reuse-surface catalog`. Do not edit manually.",
        "",
    ]
    for domain, items in _grouped_capabilities(indexed_entries).items():
        lines.extend([f"## {domain}", ""])
        # Sort by id so the generated file is stable across runs.
        for index_item, entry in sorted(items, key=lambda pair: pair[0]["id"]):
            lines.extend(
                [
                    f"### {index_item['name']}",
                    "",
                    f"- **ID:** `{index_item['id']}`",
                    f"- **Vector:** {index_item['vector']}",
                    f"- **Owner:** {index_item.get('owner', 'unknown')}",
                    f"- **Path:** `{index_item['path']}`",
                    f"- **Summary:** {index_item['summary']}",
                    "",
                ]
            )
            # Both levels may be missing or null in the entry.
            guidance = entry.get("consumer_guidance") or {}
            limitations = guidance.get("known_limitations") or []
            if limitations:
                lines.append("**Known limitations:**")
                lines.extend(f"- {item}" for item in limitations)
                lines.append("")
    # Drop trailing blank lines, then terminate with a single newline.
    return "\n".join(lines).rstrip() + "\n"
def _escape(value: object) -> str:
    """Return *value* converted to text and HTML-escaped."""
    # html.escape only accepts str; index values loaded from YAML may be
    # numbers, dates or booleans, so coerce first.
    return html.escape(str(value))


def render_html(
    index: dict[str, Any],
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> str:
    """Render the capability catalog as a standalone HTML page.

    Args:
        index: Registry index mapping. ``domain`` (page title) and ``updated``
            (subtitle) are read and fall back to ``"unknown"`` when absent.
        indexed_entries: ``(index_item, entry)`` pairs. Each index item must
            provide ``id``, ``name``, ``vector``, ``path`` and ``summary``.
            The ``entry`` half is not used by this renderer.

    Returns:
        The HTML document text, ending in a newline. Every interpolated value
        is converted to ``str`` and HTML-escaped.

    Raises:
        KeyError: If an index item lacks one of the required keys.
    """
    sections: list[str] = []
    for domain, items in _grouped_capabilities(indexed_entries).items():
        cards: list[str] = []
        # Sort by id so the generated file is stable across runs.
        for index_item, _entry in sorted(items, key=lambda pair: pair[0]["id"]):
            name = _escape(index_item["name"])
            summary = _escape(index_item["summary"])
            cap_id = _escape(index_item["id"])
            vector = _escape(index_item["vector"])
            path = _escape(index_item["path"])
            cards.append(
                "\n".join(
                    [
                        '<article class="card">',
                        f"<h3>{name}</h3>",
                        f'<p class="meta"><code>{cap_id}</code> · {vector}</p>',
                        f"<p>{summary}</p>",
                        f'<p class="path">{path}</p>',
                        "</article>",
                    ]
                )
            )
        sections.append(
            f"<section><h2>{_escape(domain)}</h2>\n" + "\n".join(cards) + "</section>"
        )
    body = "\n".join(sections)
    title = _escape(f"Capability Catalog — {index.get('domain', 'unknown')}")
    updated = _escape(index.get("updated", "unknown"))
    page = [
        "<!DOCTYPE html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8">',
        f"<title>{title}</title>",
        "<style>",
        "body { font-family: system-ui, sans-serif; margin: 2rem; line-height: 1.5; }",
        "h1 { margin-bottom: 0.2rem; }",
        ".subtitle { color: #555; margin-bottom: 2rem; }",
        "section { margin-bottom: 2rem; }",
        ".card { border: 1px solid #ddd; border-radius: 8px; padding: 1rem; margin: 1rem 0; }",
        ".meta { color: #444; font-size: 0.95rem; }",
        ".path { font-size: 0.85rem; color: #666; }",
        "</style>",
        "</head>",
        "<body>",
        "<h1>Capability Catalog</h1>",
        f'<p class="subtitle">Updated {updated} · {len(indexed_entries)} entries</p>',
        body,
        "</body>",
        "</html>",
        "",  # trailing empty item makes the document end with a newline
    ]
    return "\n".join(page)
def write_catalog(
    index: dict[str, Any],
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> tuple[Path, Path]:
    """Write the Markdown and HTML catalogs to their fixed output paths.

    The HTML output directory is created first (including parents) when it
    does not exist. The Markdown file is rendered and written before the HTML
    file; both are written as UTF-8.

    Returns:
        ``(CATALOG_MD, CATALOG_HTML)``, the two paths that were written.
    """
    CATALOG_HTML_DIR.mkdir(parents=True, exist_ok=True)

    markdown_text = render_markdown(index, indexed_entries)
    CATALOG_MD.write_text(markdown_text, encoding="utf-8")

    html_text = render_html(index, indexed_entries)
    CATALOG_HTML.write_text(html_text, encoding="utf-8")

    return CATALOG_MD, CATALOG_HTML

View File

@@ -9,9 +9,9 @@ from typing import Any
import yaml
from jsonschema import Draft202012Validator
from reuse_surface.catalog import write_catalog
from reuse_surface.overlaps import find_overlaps
from reuse_surface.registry import (
CAPABILITIES_DIR,
INDEX_PATH,
ROOT,
capability_paths,
level_at_least,
@@ -115,6 +115,40 @@ def cmd_query(args: argparse.Namespace) -> int:
return 0
def _load_indexed_entries() -> list[tuple[dict[str, Any], dict[str, Any]]]:
    """Pair each capability listed in the index with its parsed file.

    For every item under the index's ``capabilities`` key (none when the key
    is absent), the item's ``path`` is resolved against ``ROOT`` and passed to
    ``parse_front_matter``.

    Returns:
        ``(index_item, parsed)`` tuples in index order.
    """
    listed_items = load_index().get("capabilities", [])
    return [
        (listed, parse_front_matter(ROOT / listed["path"]))
        for listed in listed_items
    ]
def cmd_overlaps(args: argparse.Namespace) -> int:
    """Print overlap candidates among the indexed capabilities.

    Uses ``args.threshold`` as the similarity threshold passed to
    ``find_overlaps``. Prints one line per candidate followed by a count, or
    a single notice when there are none.

    Returns:
        Always ``0``.
    """
    found = find_overlaps(_load_indexed_entries(), threshold=args.threshold)
    if not found:
        print("no overlap candidates")
        return 0

    for hit in found:
        why = "; ".join(hit.reasons)
        print(f"{hit.left_id} <> {hit.right_id} score={hit.score:.2f} {why}")

    total = len(found)
    plural = "" if total == 1 else "s"
    print(f"\n{total} candidate{plural}")
    return 0
def cmd_catalog(args: argparse.Namespace) -> int:
    """Generate the catalog files and report where they were written.

    Loads the index and the indexed entries, hands both to ``write_catalog``,
    then prints each written path relative to ``ROOT``.

    Returns:
        Always ``0``.
    """
    registry_index = load_index()
    entries = _load_indexed_entries()
    md_path, html_path = write_catalog(registry_index, entries)
    for written in (md_path, html_path):
        print(f"ok: wrote {written.relative_to(ROOT)}")
    return 0
def cmd_export(args: argparse.Namespace) -> int:
index = load_index()
bundle: dict[str, Any] = {
@@ -184,6 +218,22 @@ def main(argv: list[str] | None = None) -> int:
)
export.set_defaults(func=cmd_export)
overlaps = subparsers.add_parser(
"overlaps", help="detect potential duplicate capabilities"
)
overlaps.add_argument(
"--threshold",
type=float,
default=0.28,
help="token similarity threshold (0-1)",
)
overlaps.set_defaults(func=cmd_overlaps)
catalog = subparsers.add_parser(
"catalog", help="generate human-readable capability catalog"
)
catalog.set_defaults(func=cmd_catalog)
args = parser.parse_args(argv)
return args.func(args)

87
reuse_surface/overlaps.py Normal file
View File

@@ -0,0 +1,87 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Any
# Matches a lowercase letter followed by two or more lowercase letters,
# digits or hyphens, i.e. tokens that are at least three characters long.
TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
@dataclass
class OverlapCandidate:
    """One pair of capabilities flagged as a potential overlap."""

    # Identifier of the first capability in the pair.
    left_id: str
    # Identifier of the second capability in the pair.
    right_id: str
    # Similarity score computed for the pair.
    score: float
    # Human-readable explanations of why the pair was flagged.
    reasons: list[str]
def _tokens(text: str) -> set[str]:
    """Return the distinct ``TOKEN_RE`` matches found in lowercased *text*."""
    lowered = text.lower()
    return {match.group(0) for match in TOKEN_RE.finditer(lowered)}
def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
discovery = entry.get("discovery") or {}
parts = [
index_item.get("name", ""),
index_item.get("summary", ""),
entry.get("id", ""),
" ".join(index_item.get("tags", [])),
discovery.get("intent", ""),
" ".join(discovery.get("includes", [])),
]
return " ".join(str(part) for part in parts if part)
def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
reasons: list[str] = []
left_id = left["id"]
right_id = right["id"]
relations = left.get("relations") or {}
for relation_type, targets in relations.items():
if not isinstance(targets, list):
continue
if right_id in targets:
reasons.append(f"relation:{relation_type}")
if left_id.split(".")[1] == right_id.split(".")[1]:
reasons.append("shared domain segment")
return reasons
def find_overlaps(
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
    *,
    threshold: float = 0.28,
) -> list[OverlapCandidate]:
    """Find pairs of capabilities that look like potential duplicates.

    Every unordered pair of entries is scored with the Jaccard similarity of
    their token sets (shared tokens divided by all distinct tokens). A pair is
    reported when the score reaches *threshold*, or when at least two other
    reasons apply (shared entry tags, a declared relation, a shared domain
    segment). Pairs in which either entry has no tokens are skipped entirely.

    Args:
        indexed_entries: ``(index_item, entry)`` pairs; each index item must
            provide ``id``.
        threshold: Minimum token similarity for a pair to qualify on score
            alone.

    Returns:
        Candidates sorted by descending score; ties keep discovery order.
    """
    # Tokenize every entry once up front instead of once per compared pair.
    prepared = [
        (index_item["id"], entry, _tokens(_entry_blob(entry, index_item)))
        for index_item, entry in indexed_entries
    ]

    candidates: list[OverlapCandidate] = []
    for position, (left_id, left_entry, left_tokens) in enumerate(prepared):
        if not left_tokens:
            continue
        # `or ()` tolerates a `tags` key that is present but null.
        left_tags = set(left_entry.get("tags") or ())
        for right_id, right_entry, right_tokens in prepared[position + 1 :]:
            if not right_tokens:
                continue
            score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
            reasons: list[str] = []
            if score >= threshold:
                reasons.append(f"token similarity {score:.2f}")
            shared_tags = left_tags & set(right_entry.get("tags") or ())
            if shared_tags:
                reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")
            reasons.extend(_relation_overlap(left_entry, right_entry))
            # A qualifying score always contributes a reason, so no separate
            # emptiness check is needed.
            if score >= threshold or len(reasons) > 1:
                candidates.append(
                    OverlapCandidate(
                        left_id=left_id,
                        right_id=right_id,
                        score=score,
                        reasons=reasons,
                    )
                )
    return sorted(candidates, key=lambda item: item.score, reverse=True)