generated from coulomb/repo-seed
Some checks failed
ci / validate-registry (push) Has been cancelled
Add Gitea CI workflow for registry validation, reuse-surface overlaps and catalog commands, generated catalog artifacts, and documentation updates closing gap analysis priorities 9-11.
87 lines
2.8 KiB
Python
87 lines
2.8 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
# Matches one keyword token: a lowercase ASCII letter followed by at least two
# more lowercase letters, digits, or hyphens (three characters minimum).
TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
|
|
|
|
|
|
@dataclass
class OverlapCandidate:
    """A pair of entries flagged as overlapping, with the supporting evidence."""

    # Identifier of the first entry of the pair.
    left_id: str
    # Identifier of the second entry of the pair.
    right_id: str
    # Similarity score computed for the pair.
    score: float
    # Human-readable explanations of why the pair was flagged.
    reasons: list[str]
|
|
|
|
|
|
def _tokens(text: str) -> set[str]:
    """Return the distinct ``TOKEN_RE`` matches found in lower-cased *text*."""
    lowered = text.lower()
    matches = TOKEN_RE.findall(lowered)
    return set(matches)
|
|
|
|
|
|
def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
|
|
discovery = entry.get("discovery") or {}
|
|
parts = [
|
|
index_item.get("name", ""),
|
|
index_item.get("summary", ""),
|
|
entry.get("id", ""),
|
|
" ".join(index_item.get("tags", [])),
|
|
discovery.get("intent", ""),
|
|
" ".join(discovery.get("includes", [])),
|
|
]
|
|
return " ".join(str(part) for part in parts if part)
|
|
|
|
|
|
def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
|
|
reasons: list[str] = []
|
|
left_id = left["id"]
|
|
right_id = right["id"]
|
|
relations = left.get("relations") or {}
|
|
for relation_type, targets in relations.items():
|
|
if not isinstance(targets, list):
|
|
continue
|
|
if right_id in targets:
|
|
reasons.append(f"relation:{relation_type}")
|
|
if left_id.split(".")[1] == right_id.split(".")[1]:
|
|
reasons.append("shared domain segment")
|
|
return reasons
|
|
|
|
|
|
def find_overlaps(
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
    *,
    threshold: float = 0.28,
) -> list[OverlapCandidate]:
    """Find pairs of entries that look like they overlap.

    Each unordered pair is scored with the Jaccard similarity of the two
    entries' token sets (size of the intersection over size of the union).
    A pair is reported when the score reaches *threshold*, or when at least
    two other signals (shared tags, relations, shared domain segment) apply.

    Args:
        indexed_entries: ``(index_item, entry)`` tuples. The reported id is
            read from ``index_item["id"]``; tags and relations are read from
            ``entry``.
        threshold: Minimum token similarity for a pair to qualify on
            similarity alone.

    Returns:
        Candidates sorted by descending score. Ties keep the order in which
        the pairs were encountered.
    """
    # Tokenize and build the tag set once per entry. Doing this inside the
    # pair loop would repeat the regex pass for every pair an entry is in.
    prepared = [
        (
            index_item["id"],
            entry,
            _tokens(_entry_blob(entry, index_item)),
            # ``or ()`` makes an explicit ``tags: None`` behave like no tags.
            set(entry.get("tags") or ()),
        )
        for index_item, entry in indexed_entries
    ]

    candidates: list[OverlapCandidate] = []
    for position, (left_id, left_entry, left_tokens, left_tags) in enumerate(prepared):
        if not left_tokens:
            # An entry with no tokens is never paired.
            continue
        for right_id, right_entry, right_tokens, right_tags in prepared[position + 1 :]:
            if not right_tokens:
                continue

            score = len(left_tokens & right_tokens) / len(left_tokens | right_tokens)

            reasons: list[str] = []
            if score >= threshold:
                reasons.append(f"token similarity {score:.2f}")
            shared_tags = left_tags & right_tags
            if shared_tags:
                reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")
            reasons.extend(_relation_overlap(left_entry, right_entry))

            # A qualifying score always contributes a reason, so checking
            # that ``reasons`` is non-empty here would be redundant.
            if score >= threshold or len(reasons) > 1:
                candidates.append(
                    OverlapCandidate(
                        left_id=left_id,
                        right_id=right_id,
                        score=score,
                        reasons=reasons,
                    )
                )

    return sorted(candidates, key=lambda item: item.score, reverse=True)