# repo-scoping/src/repo_registry/scope/validator.py

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path

from repo_registry.scope.generator import SCOPE_SECTIONS, ScopeGenerator


@dataclass(frozen=True)
class ScopeDiffSection:
    """Immutable comparison result for a single SCOPE.md section."""

    # Heading of the section this entry describes.
    section: str
    # Outcome label; ScopeValidator.diff assigns "ok", "stale" or "missing".
    status: str
    # Section body found in the existing file, or None when the heading is absent.
    current_text: str | None
    # Section body taken from the freshly generated content.
    proposed_text: str | None


@dataclass(frozen=True)
class ScopeDiff:
    """Per-section comparison of an existing SCOPE.md against generated content."""

    # One entry per compared section.
    sections: list[ScopeDiffSection]

    @property
    def needs_update(self) -> bool:
        """Return True when at least one section has a status other than "ok"."""
        for entry in self.sections:
            if entry.status != "ok":
                return True
        return False


@dataclass(frozen=True)
class ScopeValidationIssue:
    """A single finding reported while validating a SCOPE.md file."""

    # Identifier of the check that produced the finding (e.g. "C5a", "C5b", "C5c").
    check: str
    # Severity label; ScopeValidator.validate emits "error" or "warn".
    severity: str
    # Human-readable description of the problem.
    message: str


@dataclass(frozen=True)
class ValidationResult:
    """Collected findings from one validation run."""

    # Every finding reported, in the order the checks produced them.
    issues: list[ScopeValidationIssue]

    @property
    def ok(self) -> bool:
        """Return True unless some finding has severity "error"."""
        return all(issue.severity != "error" for issue in self.issues)


class ScopeValidator:
    """Validate and diff SCOPE.md files.

    ``diff`` compares an existing file against freshly generated content,
    section by section, and therefore needs a generator. ``validate`` runs
    structural checks on a file and works without one.
    """

    # A level-2 markdown heading confined to one line. ``[^\S\n]`` means
    # "whitespace except newline": with a plain ``\s`` an empty ``##`` line
    # would swallow the *following* line as its title.
    _HEADING_RE = re.compile(r"^##[^\S\n]+(\S.*?)[^\S\n]*$", re.MULTILINE)

    # A closing ``---`` rule at the end of an already-stripped section body,
    # including the case where the rule is the whole body.
    _TRAILING_RULE_RE = re.compile(r"(?:^|\n)---\s*$")

    def __init__(self, generator: ScopeGenerator | None = None) -> None:
        """Store the optional generator; only ``diff`` requires one."""
        self.generator = generator

    def diff(self, repo_slug: str, existing_path: Path) -> ScopeDiff:
        """Compare the file at *existing_path* with generated content.

        A path that does not exist is treated as an empty document, so every
        canonical section is reported as ``"missing"``.

        Args:
            repo_slug: Identifier passed to the generator's ``generate``.
            existing_path: Location of the current SCOPE.md.

        Returns:
            A ``ScopeDiff`` with one entry per name in ``SCOPE_SECTIONS``,
            each marked ``"missing"``, ``"ok"`` or ``"stale"``.

        Raises:
            ValueError: If the validator was created without a generator.
        """
        if self.generator is None:
            raise ValueError("ScopeValidator.diff requires a ScopeGenerator")
        current = existing_path.read_text(encoding="utf-8") if existing_path.exists() else ""
        proposed = self.generator.generate(repo_slug)
        current_sections = self._parse_sections(current)
        proposed_sections = self._parse_sections(proposed)

        entries: list[ScopeDiffSection] = []
        for section in SCOPE_SECTIONS:
            current_text = current_sections.get(section)
            proposed_text = proposed_sections.get(section, "")
            if current_text is None:
                status = "missing"
            elif self._normalize(current_text) == self._normalize(proposed_text):
                # Equal once comments, markdown punctuation, case and
                # whitespace differences are ignored.
                status = "ok"
            else:
                status = "stale"
            entries.append(
                ScopeDiffSection(
                    section=section,
                    status=status,
                    current_text=current_text,
                    proposed_text=proposed_text,
                )
            )
        return ScopeDiff(sections=entries)

    def validate(self, path: Path) -> ValidationResult:
        """Run the structural checks on the SCOPE.md at *path*.

        Checks performed:
            * C5a - the file exists (error otherwise; no further checks run).
            * C5b - every canonical section is present, and the canonical
              sections appear in canonical order.
            * C5c - the Provided Capabilities section contains well-formed
              capability blocks or an explicit empty-state note.

        Args:
            path: Location of the SCOPE.md to validate.

        Returns:
            A ``ValidationResult`` listing every issue found.
        """
        if not path.exists():
            return ValidationResult(
                issues=[
                    ScopeValidationIssue(
                        check="C5a",
                        severity="error",
                        message="SCOPE.md is missing.",
                    )
                ]
            )
        content = path.read_text(encoding="utf-8")
        sections = self._parse_sections(content)
        issues: list[ScopeValidationIssue] = []

        missing = [section for section in SCOPE_SECTIONS if section not in sections]
        if missing:
            # Lacking only the capabilities section is tolerated as a warning.
            severity = "warn" if missing == ["Provided Capabilities"] else "error"
            issues.append(
                ScopeValidationIssue(
                    check="C5b",
                    severity=severity,
                    message=f"Missing SCOPE.md section(s): {', '.join(missing)}.",
                )
            )

        expected_order = [section for section in SCOPE_SECTIONS if section in sections]
        if self._heading_order(content)[: len(expected_order)] != expected_order:
            issues.append(
                ScopeValidationIssue(
                    check="C5b",
                    severity="warn",
                    message="SCOPE.md sections are not in canonical order.",
                )
            )

        issues.extend(self._capability_issues(sections.get("Provided Capabilities")))
        return ValidationResult(issues=issues)

    def _capability_issues(self, capabilities: str | None) -> list[ScopeValidationIssue]:
        """Return the C5c findings for the Provided Capabilities body.

        *capabilities* is the section body, or None when the section is absent.
        """
        if capabilities is None:
            return [
                ScopeValidationIssue(
                    check="C5c",
                    severity="warn",
                    message="Provided Capabilities section is missing.",
                )
            ]

        # Decide on the parsed blocks rather than a substring probe, so the
        # decision uses the same case-insensitive fence matching as the parser.
        # An opening fence that is never closed yields no block and is
        # reported below instead of passing silently.
        blocks = self._capability_blocks(capabilities)
        if blocks:
            findings: list[ScopeValidationIssue] = []
            for index, block in enumerate(blocks, start=1):
                missing_keys = {"type", "title"} - self._capability_keys(block)
                if missing_keys:
                    findings.append(
                        ScopeValidationIssue(
                            check="C5c",
                            severity="warn",
                            message=(
                                f"Capability block {index} is missing required field(s): "
                                f"{', '.join(sorted(missing_keys))}."
                            ),
                        )
                    )
            return findings

        if "No approved capabilities yet" not in capabilities:
            return [
                ScopeValidationIssue(
                    check="C5c",
                    severity="warn",
                    message=(
                        "Provided Capabilities has no capability blocks or explicit "
                        "empty-state note."
                    ),
                )
            ]
        return []

    def _parse_sections(self, content: str) -> dict[str, str]:
        """Split *content* into ``{heading title: body}`` by level-2 headings.

        Each body runs up to the next level-2 heading (or the end of the
        text), is stripped, and has one trailing ``---`` rule removed. Text
        before the first heading is ignored. When a title occurs more than
        once, the last occurrence wins.
        """
        matches = list(self._HEADING_RE.finditer(content))
        sections: dict[str, str] = {}
        for index, match in enumerate(matches):
            title = match.group(1).strip()
            end = matches[index + 1].start() if index + 1 < len(matches) else len(content)
            body = content[match.end():end].strip()
            # Drop a closing separator so it does not count as section text.
            body = self._TRAILING_RULE_RE.sub("", body)
            sections[title] = body.strip()
        return sections

    def _heading_order(self, content: str) -> list[str]:
        """Return the canonical section titles in the order they appear.

        Headings that are not in ``SCOPE_SECTIONS`` are skipped; repeated
        headings are kept.
        """
        titles = (match.group(1).strip() for match in self._HEADING_RE.finditer(content))
        return [title for title in titles if title in SCOPE_SECTIONS]

    def _normalize(self, value: str | None) -> str:
        """Reduce *value* to a form suitable for loose text comparison.

        HTML comments are removed, runs of markdown punctuation
        (`` ` * _ > # - ``) become a space, whitespace is collapsed, and the
        result is lower-cased. ``None`` normalizes to the empty string.
        """
        if value is None:
            return ""
        without_comments = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
        without_markdown = re.sub(r"[`*_>#-]+", " ", without_comments)
        return re.sub(r"\s+", " ", without_markdown).strip().lower()

    def _capability_blocks(self, content: str) -> list[str]:
        """Return the inner text of every fenced ``capability`` block.

        The fence label is matched case-insensitively; a fence that is never
        closed produces no block.
        """
        return re.findall(
            r"```capability\s*(.*?)```",
            content,
            flags=re.DOTALL | re.IGNORECASE,
        )

    def _capability_keys(self, block: str) -> set[str]:
        """Return the ``key:`` names that start a line of *block*.

        Only unindented keys are recognised; indented lines are ignored.
        """
        return {
            match.group(1)
            for match in re.finditer(r"^([A-Za-z_][A-Za-z0-9_-]*):", block, re.MULTILINE)
        }