generated from coulomb/repo-seed
Complete REUSE-WP-0004: CI, overlap detection, and catalog generation
Some checks failed
ci / validate-registry (push) Has been cancelled
Some checks failed
ci / validate-registry (push) Has been cancelled
Add Gitea CI workflow for registry validation, reuse-surface overlaps and catalog commands, generated catalog artifacts, and documentation updates closing gap analysis priorities 9-11.
This commit is contained in:
122
reuse_surface/catalog.py
Normal file
122
reuse_surface/catalog.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Directory that contains the package directory (the parent of this file's
# parent directory).
ROOT = Path(__file__).resolve().parent.parent
# Target of the generated Markdown catalog.
CATALOG_MD = ROOT.joinpath("docs", "CapabilityCatalog.md")
# Directory and entry page of the generated HTML catalog.
CATALOG_HTML_DIR = ROOT.joinpath("docs", "catalog")
CATALOG_HTML = CATALOG_HTML_DIR.joinpath("index.html")
|
||||
|
||||
|
||||
def _grouped_capabilities(
|
||||
indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
|
||||
) -> dict[str, list[tuple[dict[str, Any], dict[str, Any]]]]:
|
||||
grouped: dict[str, list[tuple[dict[str, Any], dict[str, Any]]]] = defaultdict(
|
||||
list
|
||||
)
|
||||
for index_item, entry in indexed_entries:
|
||||
domain = index_item.get("domain", "unknown")
|
||||
grouped[domain].append((index_item, entry))
|
||||
return dict(sorted(grouped.items()))
|
||||
|
||||
|
||||
def render_markdown(
    index: dict[str, Any],
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> str:
    """Render the capability catalog as a Markdown document.

    Args:
        index: Registry index mapping; its ``domain`` and ``updated`` values
            (defaulting to ``"unknown"``) are shown in the header.
        indexed_entries: ``(index_item, entry)`` pairs. Each index item must
            provide ``id``, ``name``, ``vector``, ``path`` and ``summary``;
            ``owner`` is optional. The entry's
            ``consumer_guidance.known_limitations`` list, when non-empty, is
            rendered below the capability.

    Returns:
        The Markdown text, ending with exactly one newline.

    Raises:
        KeyError: If an index item lacks one of the required keys.
    """
    # A Markdown hard line break needs at least two trailing spaces; with a
    # single space the three metadata lines collapse into one paragraph.
    hard_break = "  "
    lines = [
        "# Capability Catalog",
        "",
        f"**Domain:** {index.get('domain', 'unknown')}{hard_break}",
        f"**Updated:** {index.get('updated', 'unknown')}{hard_break}",
        f"**Entries:** {len(indexed_entries)}",
        "",
        "Generated by `reuse-surface catalog`. Do not edit manually.",
        "",
    ]
    for domain, items in _grouped_capabilities(indexed_entries).items():
        lines.extend([f"## {domain}", ""])
        # Sort by capability ID so the output is stable between runs.
        for index_item, entry in sorted(items, key=lambda pair: pair[0]["id"]):
            lines.extend(
                [
                    f"### {index_item['name']}",
                    "",
                    f"- **ID:** `{index_item['id']}`",
                    f"- **Vector:** {index_item['vector']}",
                    f"- **Owner:** {index_item.get('owner', 'unknown')}",
                    f"- **Path:** `{index_item['path']}`",
                    f"- **Summary:** {index_item['summary']}",
                    "",
                ]
            )
            # Both levels may be missing or null in the front matter.
            guidance = entry.get("consumer_guidance") or {}
            limitations = guidance.get("known_limitations") or []
            if limitations:
                lines.append("**Known limitations:**")
                lines.extend(f"- {item}" for item in limitations)
                lines.append("")
    # Drop trailing blank lines, then terminate with a single newline.
    return "\n".join(lines).rstrip() + "\n"
|
||||
|
||||
|
||||
def render_html(
    index: dict[str, Any],
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> str:
    """Render the capability catalog as a standalone HTML page.

    Args:
        index: Registry index mapping; its ``domain`` (page title) and
            ``updated`` (subtitle) values default to ``"unknown"``.
        indexed_entries: ``(index_item, entry)`` pairs. Each index item must
            provide ``id``, ``name``, ``vector``, ``path`` and ``summary``.
            The entry half of each pair is not used by this renderer.

    Returns:
        The complete HTML document as a string.

    Raises:
        KeyError: If an index item lacks one of the required keys.
    """

    def esc(value: Any) -> str:
        # Index values may be non-string scalars (numbers, dates);
        # html.escape only accepts str, so coerce before escaping.
        return html.escape(str(value))

    sections: list[str] = []
    for domain, items in _grouped_capabilities(indexed_entries).items():
        cards: list[str] = []
        # Sort by capability ID so the output is stable between runs.
        for index_item, _entry in sorted(items, key=lambda pair: pair[0]["id"]):
            name = esc(index_item["name"])
            summary = esc(index_item["summary"])
            cap_id = esc(index_item["id"])
            vector = esc(index_item["vector"])
            path = esc(index_item["path"])
            cards.append(
                f"""<article class="card">
<h3>{name}</h3>
<p class="meta"><code>{cap_id}</code> · {vector}</p>
<p>{summary}</p>
<p class="path">{path}</p>
</article>"""
            )
        sections.append(
            f"<section><h2>{esc(domain)}</h2>\n" + "\n".join(cards) + "</section>"
        )

    body = "\n".join(sections)
    title = esc(f"Capability Catalog — {index.get('domain', 'unknown')}")
    updated = esc(index.get("updated", "unknown"))
    # Doubled braces in the <style> block are literal braces in the f-string.
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{title}</title>
<style>
body {{ font-family: system-ui, sans-serif; margin: 2rem; line-height: 1.5; }}
h1 {{ margin-bottom: 0.2rem; }}
.subtitle {{ color: #555; margin-bottom: 2rem; }}
section {{ margin-bottom: 2rem; }}
.card {{ border: 1px solid #ddd; border-radius: 8px; padding: 1rem; margin: 1rem 0; }}
.meta {{ color: #444; font-size: 0.95rem; }}
.path {{ font-size: 0.85rem; color: #666; }}
</style>
</head>
<body>
<h1>Capability Catalog</h1>
<p class="subtitle">Updated {updated} · {len(indexed_entries)} entries</p>
{body}
</body>
</html>
"""
|
||||
|
||||
|
||||
def write_catalog(
    index: dict[str, Any],
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
) -> tuple[Path, Path]:
    """Write the Markdown and HTML catalogs to disk.

    Creates the HTML output directory if needed, then renders and writes the
    Markdown file followed by the HTML file, both as UTF-8.

    Returns:
        ``(CATALOG_MD, CATALOG_HTML)`` - the two paths that were written.
    """
    CATALOG_HTML_DIR.mkdir(parents=True, exist_ok=True)
    # Each document is rendered immediately before it is written, Markdown first.
    outputs = ((CATALOG_MD, render_markdown), (CATALOG_HTML, render_html))
    for target, render in outputs:
        target.write_text(render(index, indexed_entries), encoding="utf-8")
    return CATALOG_MD, CATALOG_HTML
|
||||
@@ -9,9 +9,9 @@ from typing import Any
|
||||
import yaml
|
||||
from jsonschema import Draft202012Validator
|
||||
|
||||
from reuse_surface.catalog import write_catalog
|
||||
from reuse_surface.overlaps import find_overlaps
|
||||
from reuse_surface.registry import (
|
||||
CAPABILITIES_DIR,
|
||||
INDEX_PATH,
|
||||
ROOT,
|
||||
capability_paths,
|
||||
level_at_least,
|
||||
@@ -115,6 +115,40 @@ def cmd_query(args: argparse.Namespace) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _load_indexed_entries() -> list[tuple[dict[str, Any], dict[str, Any]]]:
    """Pair every capability listed in the index with its parsed front matter.

    Each index item's ``path`` is resolved relative to ``ROOT`` and handed to
    ``parse_front_matter``. An index without a ``capabilities`` key yields an
    empty list.
    """
    capabilities = load_index().get("capabilities", [])
    return [(item, parse_front_matter(ROOT / item["path"])) for item in capabilities]
|
||||
|
||||
|
||||
def cmd_overlaps(args: argparse.Namespace) -> int:
    """Print overlap candidates found among the indexed capabilities.

    Reads ``args.threshold`` and passes it to ``find_overlaps``. Prints one
    line per candidate followed by a count line, or a single notice when
    nothing was found. Always returns exit code 0.
    """
    found = find_overlaps(_load_indexed_entries(), threshold=args.threshold)
    if not found:
        print("no overlap candidates")
        return 0

    for hit in found:
        joined = "; ".join(hit.reasons)
        print(f"{hit.left_id} <> {hit.right_id} score={hit.score:.2f} {joined}")

    total = len(found)
    suffix = "" if total == 1 else "s"
    print(f"\n{total} candidate{suffix}")
    return 0
|
||||
|
||||
|
||||
def cmd_catalog(args: argparse.Namespace) -> int:
    """Generate the catalog files and report where they were written.

    Loads the index, collects the indexed entries, writes both catalog
    documents via ``write_catalog`` and prints each written path relative to
    ``ROOT``. Always returns exit code 0.
    """
    index = load_index()
    written = write_catalog(index, _load_indexed_entries())
    # write_catalog returns the Markdown path first, then the HTML path.
    for target in written:
        print(f"ok: wrote {target.relative_to(ROOT)}")
    return 0
|
||||
|
||||
|
||||
def cmd_export(args: argparse.Namespace) -> int:
|
||||
index = load_index()
|
||||
bundle: dict[str, Any] = {
|
||||
@@ -184,6 +218,22 @@ def main(argv: list[str] | None = None) -> int:
|
||||
)
|
||||
export.set_defaults(func=cmd_export)
|
||||
|
||||
overlaps = subparsers.add_parser(
|
||||
"overlaps", help="detect potential duplicate capabilities"
|
||||
)
|
||||
overlaps.add_argument(
|
||||
"--threshold",
|
||||
type=float,
|
||||
default=0.28,
|
||||
help="token similarity threshold (0-1)",
|
||||
)
|
||||
overlaps.set_defaults(func=cmd_overlaps)
|
||||
|
||||
catalog = subparsers.add_parser(
|
||||
"catalog", help="generate human-readable capability catalog"
|
||||
)
|
||||
catalog.set_defaults(func=cmd_catalog)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
return args.func(args)
|
||||
|
||||
|
||||
87
reuse_surface/overlaps.py
Normal file
87
reuse_surface/overlaps.py
Normal file
@@ -0,0 +1,87 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
# Matches tokens that start with a lowercase letter followed by at least two
# more characters drawn from lowercase letters, digits, or hyphens, so every
# token is at least three characters long.
TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
|
||||
|
||||
|
||||
@dataclass
class OverlapCandidate:
    """A pair of capabilities flagged as potentially overlapping."""

    # ID of the first capability of the pair.
    left_id: str
    # ID of the second capability of the pair.
    right_id: str
    # Similarity score of the pair; find_overlaps stores the ratio of shared
    # tokens to all tokens of the two entries here.
    score: float
    # Human-readable explanations of why the pair was flagged.
    reasons: list[str]
|
||||
|
||||
|
||||
def _tokens(text: str) -> set[str]:
    """Return the distinct ``TOKEN_RE`` matches found in lower-cased *text*."""
    lowered = text.lower()
    # The pattern has no capture groups, so group(0) is what findall yields.
    return {match.group(0) for match in TOKEN_RE.finditer(lowered)}
|
||||
|
||||
|
||||
def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
|
||||
discovery = entry.get("discovery") or {}
|
||||
parts = [
|
||||
index_item.get("name", ""),
|
||||
index_item.get("summary", ""),
|
||||
entry.get("id", ""),
|
||||
" ".join(index_item.get("tags", [])),
|
||||
discovery.get("intent", ""),
|
||||
" ".join(discovery.get("includes", [])),
|
||||
]
|
||||
return " ".join(str(part) for part in parts if part)
|
||||
|
||||
|
||||
def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
|
||||
reasons: list[str] = []
|
||||
left_id = left["id"]
|
||||
right_id = right["id"]
|
||||
relations = left.get("relations") or {}
|
||||
for relation_type, targets in relations.items():
|
||||
if not isinstance(targets, list):
|
||||
continue
|
||||
if right_id in targets:
|
||||
reasons.append(f"relation:{relation_type}")
|
||||
if left_id.split(".")[1] == right_id.split(".")[1]:
|
||||
reasons.append("shared domain segment")
|
||||
return reasons
|
||||
|
||||
|
||||
def find_overlaps(
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
    *,
    threshold: float = 0.28,
) -> list[OverlapCandidate]:
    """Find pairs of capabilities that look like potential duplicates.

    Every unordered pair of entries is compared once. The score is the number
    of shared tokens divided by the number of distinct tokens across both
    entries' text blobs. A pair is reported when its score reaches
    *threshold*, or when at least two other reasons (shared tags, relations,
    shared domain segment) apply. Pairs in which either entry has no tokens
    are skipped entirely.

    Args:
        indexed_entries: ``(index_item, entry)`` pairs; each index item must
            have an ``"id"``.
        threshold: Minimum token-similarity score (0-1) that flags a pair on
            its own.

    Returns:
        The candidates sorted by descending score.
    """
    # Tokenize each entry once up front instead of re-tokenizing the right-hand
    # entry for every pair inside the inner loop.
    prepared = [
        (index_item["id"], entry, _tokens(_entry_blob(entry, index_item)))
        for index_item, entry in indexed_entries
    ]

    candidates: list[OverlapCandidate] = []
    for i, (left_id, left_entry, left_tokens) in enumerate(prepared):
        if not left_tokens:
            continue
        # ``tags`` may be present but null in the front matter.
        left_tags = set(left_entry.get("tags") or [])
        for right_id, right_entry, right_tokens in prepared[i + 1 :]:
            if not right_tokens:
                continue
            score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
            reasons: list[str] = []
            if score >= threshold:
                reasons.append(f"token similarity {score:.2f}")
            shared_tags = left_tags & set(right_entry.get("tags") or [])
            if shared_tags:
                reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")
            reasons.extend(_relation_overlap(left_entry, right_entry))
            # Below the threshold a single weak signal is not enough.
            if reasons and (score >= threshold or len(reasons) > 1):
                candidates.append(
                    OverlapCandidate(
                        left_id=left_id,
                        right_id=right_id,
                        score=score,
                        reasons=reasons,
                    )
                )
    return sorted(candidates, key=lambda item: item.score, reverse=True)
|
||||
Reference in New Issue
Block a user