# repo-scoping/src/repo_registry/content_indexing/extractor.py

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path

from repo_registry.core.models import ObservedFact


# Fact kinds that ContentExtractor.extract turns into chunks; facts of any
# other kind are skipped.
INDEXED_FACT_KINDS = {
    "intent",
    "scope",
    "documentation",
    "example",
    "test",
    "manifest",
    "interface",
    "config",
    "llm_provider",
    "credential_config",
    "provider_registry",
    "fallback_policy",
}
# Maximum number of lines per chunk when a file is split sequentially
# (the path taken when a fact has no integer "line" metadata).
MAX_CHUNK_LINES = 40
# Files whose on-disk size exceeds this many bytes produce no chunks.
MAX_FILE_BYTES = 200_000


@dataclass(frozen=True)
class ContentChunkCandidate:
    """Describe one text chunk cut from a source file.

    The dataclass is frozen, so fields cannot be reassigned after creation.
    As built by ``ContentExtractor``, line numbers are 1-based and
    ``end_line`` is inclusive.
    """

    # File path relative to the extraction root, in POSIX form.
    path: str
    # Kind of the observed fact the chunk was produced for.
    kind: str
    # 1-based number of the first line covered by the chunk.
    start_line: int
    # 1-based number of the last line covered by the chunk (inclusive).
    end_line: int
    # Covered lines joined with "\n", with surrounding whitespace stripped.
    text: str
    # Extra attributes; ContentExtractor stores a "source_role" entry here.
    metadata: dict[str, object] = field(default_factory=dict)


class ContentExtractor:
    """Extract deterministic text chunks from source-linked observed facts.

    Facts whose kind is in ``INDEXED_FACT_KINDS`` and whose path points at a
    regular file inside the source root are turned into
    ``ContentChunkCandidate`` objects: either a single window around the
    fact's ``line`` metadata, or the whole file split into runs of at most
    ``MAX_CHUNK_LINES`` lines.
    """

    def extract(
        self,
        source_path: str | Path,
        facts: list[ObservedFact],
    ) -> list[ContentChunkCandidate]:
        """Return the de-duplicated, sorted chunks for *facts*.

        Args:
            source_path: Root directory that fact paths are resolved against.
            facts: Observed facts to extract content for.

        Returns:
            Chunks sorted by ``(path, start_line, kind)``. A chunk with the
            same path, kind, and line range is emitted only once.
        """
        root = Path(source_path).expanduser().resolve()
        chunks: list[ContentChunkCandidate] = []
        seen: set[tuple[str, str, int, int]] = set()
        for fact in facts:
            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
                continue
            # Resolve before the containment check so "..", absolute paths,
            # and symlinks cannot lead outside the source root.
            path = (root / fact.path).resolve()
            if not self._is_within(root, path) or not path.is_file():
                continue
            for chunk in self._chunks_for_fact(path, root, fact):
                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
                if key in seen:
                    continue
                seen.add(key)
                chunks.append(chunk)
        return sorted(chunks, key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind))

    def _chunks_for_fact(
        self,
        path: Path,
        root: Path,
        fact: ObservedFact,
    ) -> list[ContentChunkCandidate]:
        """Build the chunks for one fact from the file at *path*.

        Returns an empty list when the file is larger than
        ``MAX_FILE_BYTES``, cannot be read, or contains no lines. When the
        fact's metadata holds an integer ``line``, a single window around
        that line is returned; otherwise the file is split sequentially into
        chunks of at most ``MAX_CHUNK_LINES`` lines.
        """
        try:
            if path.stat().st_size > MAX_FILE_BYTES:
                return []
            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
        except OSError:
            # Best effort: an unreadable file simply contributes no chunks.
            return []
        if not lines:
            return []

        line = fact.metadata.get("line")
        if isinstance(line, int):
            # The window is clamped to the file, so a stale or invalid line
            # number can never yield an inverted or negative range.
            start_line, end_line = self._line_window(line, len(lines))
            return [
                self._chunk(
                    path,
                    root,
                    fact.kind,
                    fact.metadata,
                    lines,
                    start_line,
                    end_line,
                )
            ]

        return [
            self._chunk(
                path,
                root,
                fact.kind,
                fact.metadata,
                lines,
                start_index + 1,
                min(len(lines), start_index + MAX_CHUNK_LINES),
            )
            for start_index in range(0, len(lines), MAX_CHUNK_LINES)
        ]

    @staticmethod
    def _line_window(
        line: int,
        total_lines: int,
        before: int = 5,
        after: int = 10,
    ) -> tuple[int, int]:
        """Return the 1-based inclusive ``(start, end)`` window around *line*.

        *line* is first clamped into ``[1, total_lines]`` so the result
        always satisfies ``1 <= start <= end <= total_lines``. For in-range
        lines the window is ``line - before`` .. ``line + after``, cut off at
        the file boundaries. *total_lines* must be at least 1.
        """
        anchor = min(max(line, 1), total_lines)
        return max(1, anchor - before), min(total_lines, anchor + after)

    def _chunk(
        self,
        path: Path,
        root: Path,
        kind: str,
        fact_metadata: dict[str, object],
        lines: list[str],
        start_line: int,
        end_line: int,
    ) -> ContentChunkCandidate:
        """Build a candidate for the 1-based inclusive range of *lines*.

        The stored path is relative to *root* in POSIX form, the text is the
        selected lines joined with newlines and stripped, and only the
        ``source_role`` entry of *fact_metadata* is carried over (defaulting
        to an empty string).
        """
        return ContentChunkCandidate(
            path=path.relative_to(root).as_posix(),
            kind=kind,
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
            metadata={"source_role": fact_metadata.get("source_role", "")},
        )

    def _is_within(self, root: Path, path: Path) -> bool:
        """Return True when *path* is *root* itself or located beneath it."""
        try:
            path.relative_to(root)
        except ValueError:
            return False
        return True