generated from coulomb/repo-seed
Complete REUSE-WP-0004: CI, overlap detection, and catalog generation
Some checks failed
ci / validate-registry (push) Has been cancelled
Some checks failed
ci / validate-registry (push) Has been cancelled
Add Gitea CI workflow for registry validation, reuse-surface overlaps and catalog commands, generated catalog artifacts, and documentation updates closing gap analysis priorities 9-11.
This commit is contained in:
87
reuse_surface/overlaps.py
Normal file
87
reuse_surface/overlaps.py
Normal file
@@ -0,0 +1,87 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
# Matches a token: one lowercase ASCII letter followed by at least two more
# characters drawn from lowercase letters, digits, or hyphens (so every token
# is at least three characters long).
TOKEN_RE = re.compile(r"[a-z][a-z0-9-]{2,}")
|
||||
|
||||
|
||||
@dataclass
class OverlapCandidate:
    """A pair of entries flagged as possibly overlapping, with the evidence."""

    # Id of the first entry of the pair.
    left_id: str
    # Id of the second entry of the pair.
    right_id: str
    # Similarity score computed for the pair.
    score: float
    # Human-readable explanations of why the pair was flagged.
    reasons: list[str]
|
||||
|
||||
|
||||
def _tokens(text: str) -> set[str]:
    """Return the distinct ``TOKEN_RE`` matches found in the lowercased *text*."""
    lowered = text.lower()
    return {match.group(0) for match in TOKEN_RE.finditer(lowered)}
|
||||
|
||||
|
||||
def _entry_blob(entry: dict[str, Any], index_item: dict[str, Any]) -> str:
|
||||
discovery = entry.get("discovery") or {}
|
||||
parts = [
|
||||
index_item.get("name", ""),
|
||||
index_item.get("summary", ""),
|
||||
entry.get("id", ""),
|
||||
" ".join(index_item.get("tags", [])),
|
||||
discovery.get("intent", ""),
|
||||
" ".join(discovery.get("includes", [])),
|
||||
]
|
||||
return " ".join(str(part) for part in parts if part)
|
||||
|
||||
|
||||
def _relation_overlap(left: dict[str, Any], right: dict[str, Any]) -> list[str]:
|
||||
reasons: list[str] = []
|
||||
left_id = left["id"]
|
||||
right_id = right["id"]
|
||||
relations = left.get("relations") or {}
|
||||
for relation_type, targets in relations.items():
|
||||
if not isinstance(targets, list):
|
||||
continue
|
||||
if right_id in targets:
|
||||
reasons.append(f"relation:{relation_type}")
|
||||
if left_id.split(".")[1] == right_id.split(".")[1]:
|
||||
reasons.append("shared domain segment")
|
||||
return reasons
|
||||
|
||||
|
||||
def find_overlaps(
    indexed_entries: list[tuple[dict[str, Any], dict[str, Any]]],
    *,
    threshold: float = 0.28,
) -> list[OverlapCandidate]:
    """Find pairs of entries that look like they overlap.

    Every unordered pair is scored with the Jaccard similarity of the token
    sets of their text blobs. A pair is reported when the score reaches
    *threshold*, or when at least two other reasons (shared tags, relation
    links, shared id segment) apply. Pairs where either entry yields no
    tokens are skipped entirely.

    Args:
        indexed_entries: ``(index_item, entry)`` pairs; each index item must
            have an ``id`` key.
        threshold: Minimum token similarity that flags a pair on its own.

    Returns:
        The flagged pairs, sorted by descending score.
    """
    # Tokenise each entry once up front instead of once per pair.
    prepared = [
        (index_item["id"], entry, _tokens(_entry_blob(entry, index_item)))
        for index_item, entry in indexed_entries
    ]

    candidates: list[OverlapCandidate] = []
    for i, (left_id, left_entry, left_tokens) in enumerate(prepared):
        if not left_tokens:
            continue
        for right_id, right_entry, right_tokens in prepared[i + 1:]:
            if not right_tokens:
                continue

            shared = len(left_tokens & right_tokens)
            score = shared / len(left_tokens | right_tokens)

            reasons: list[str] = []
            if score >= threshold:
                reasons.append(f"token similarity {score:.2f}")

            # NOTE(review): tags are read from the entry here, whereas
            # _entry_blob reads them from the index item -- confirm which
            # record actually carries the tags.
            shared_tags = set(left_entry.get("tags", [])) & set(
                right_entry.get("tags", [])
            )
            if shared_tags:
                reasons.append(f"shared tags: {', '.join(sorted(shared_tags))}")

            reasons.extend(_relation_overlap(left_entry, right_entry))

            # A score at or above the threshold always contributes a reason,
            # so no separate emptiness check on ``reasons`` is needed.
            if score >= threshold or len(reasons) > 1:
                candidates.append(
                    OverlapCandidate(
                        left_id=left_id,
                        right_id=right_id,
                        score=score,
                        reasons=reasons,
                    )
                )

    return sorted(candidates, key=lambda item: item.score, reverse=True)
|
||||
Reference in New Issue
Block a user