"""Extract deterministic text chunks from files referenced by observed facts."""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path

from repo_registry.core.models import ObservedFact

# Only facts of these kinds have their source files chunked; all others are
# skipped by ``ContentExtractor.extract``.
INDEXED_FACT_KINDS = {
    "intent",
    "scope",
    "documentation",
    "example",
    "test",
    "manifest",
    "interface",
    "config",
    "llm_provider",
    "credential_config",
    "provider_registry",
    "fallback_policy",
}

# Upper bound on the number of lines in a chunk when a file is split whole.
MAX_CHUNK_LINES = 40
# Files larger than this (per ``stat().st_size``) are not read at all.
MAX_FILE_BYTES = 200_000
# Size of the window kept around a fact that carries a ``line`` in its metadata.
CONTEXT_LINES_BEFORE = 5
CONTEXT_LINES_AFTER = 10


@dataclass(frozen=True)
class ContentChunkCandidate:
    """A contiguous span of lines taken from one file.

    Attributes:
        path: POSIX-style path of the file, relative to the extraction root.
        kind: Kind of the fact that caused this chunk to be produced.
        start_line: 1-based number of the first line in the chunk (inclusive).
        end_line: 1-based number of the last line in the chunk (inclusive).
        text: The chunk's lines joined with ``"\\n"`` and stripped of leading
            and trailing whitespace.
        metadata: Extra data; ``ContentExtractor`` stores ``source_role`` here.
    """

    path: str
    kind: str
    start_line: int
    end_line: int
    text: str
    metadata: dict[str, object] = field(default_factory=dict)


class ContentExtractor:
    """Extract deterministic text chunks from source-linked observed facts."""

    def extract(
        self,
        source_path: str | Path,
        facts: list[ObservedFact],
    ) -> list[ContentChunkCandidate]:
        """Return de-duplicated chunks for every indexable fact under a root.

        Facts are skipped when their kind is not in ``INDEXED_FACT_KINDS``,
        when they have no path, or when the path cannot be resolved, resolves
        outside the root, or is not a regular file.

        Args:
            source_path: Root directory; ``~`` is expanded and the path is
                resolved before use.
            facts: Facts whose ``path`` is interpreted relative to the root.

        Returns:
            Chunks sorted by ``(path, start_line, kind)``, with at most one
            chunk per ``(path, kind, start_line, end_line)`` combination.
        """
        root = Path(source_path).expanduser().resolve()
        chunks: list[ContentChunkCandidate] = []
        seen: set[tuple[str, str, int, int]] = set()

        for fact in facts:
            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
                continue

            try:
                path = (root / fact.path).resolve()
            except (OSError, RuntimeError, ValueError):
                # One unresolvable path (symlink loop, embedded NUL, ...) must
                # not abort extraction for all the other facts.
                continue

            # The containment check runs on the resolved path, so ``..``
            # segments and symlinks cannot lead outside the root.
            if not self._is_within(root, path) or not path.is_file():
                continue

            for chunk in self._chunks_for_fact(path, root, fact):
                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
                if key in seen:
                    continue
                seen.add(key)
                chunks.append(chunk)

        return sorted(
            chunks,
            key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind),
        )

    def _chunks_for_fact(
        self,
        path: Path,
        root: Path,
        fact: ObservedFact,
    ) -> list[ContentChunkCandidate]:
        """Build the chunks for a single fact's file.

        When ``fact.metadata["line"]`` is an integer, a single window of
        ``CONTEXT_LINES_BEFORE`` lines before and ``CONTEXT_LINES_AFTER``
        lines after that line is returned. The line number is clamped to the
        file's bounds first, so an out-of-range value still yields a valid,
        non-inverted range. Otherwise the whole file is split into
        consecutive chunks of at most ``MAX_CHUNK_LINES`` lines.

        Args:
            path: Resolved path of the file to read.
            root: Resolved extraction root, used to relativize ``path``.
            fact: Fact supplying the chunk kind and metadata.

        Returns:
            The chunks, or an empty list when the file is larger than
            ``MAX_FILE_BYTES``, cannot be read, or contains no lines.
        """
        try:
            if path.stat().st_size > MAX_FILE_BYTES:
                return []
            # Undecodable bytes are dropped rather than failing the file.
            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
        except OSError:
            return []

        if not lines:
            return []

        total = len(lines)
        anchor = fact.metadata.get("line")

        # ``bool`` is a subclass of ``int``; True/False are not line numbers.
        if isinstance(anchor, int) and not isinstance(anchor, bool):
            # Clamp so that a line number past the end of the file (or <= 0)
            # cannot produce start_line > end_line or a negative end_line.
            anchor = min(max(anchor, 1), total)
            start_line = max(1, anchor - CONTEXT_LINES_BEFORE)
            end_line = min(total, anchor + CONTEXT_LINES_AFTER)
            return [
                self._chunk(
                    path,
                    root,
                    fact.kind,
                    fact.metadata,
                    lines,
                    start_line,
                    end_line,
                )
            ]

        return [
            self._chunk(
                path,
                root,
                fact.kind,
                fact.metadata,
                lines,
                start_index + 1,
                min(total, start_index + MAX_CHUNK_LINES),
            )
            for start_index in range(0, total, MAX_CHUNK_LINES)
        ]

    def _chunk(
        self,
        path: Path,
        root: Path,
        kind: str,
        fact_metadata: dict[str, object],
        lines: list[str],
        start_line: int,
        end_line: int,
    ) -> ContentChunkCandidate:
        """Assemble one chunk from an inclusive, 1-based line range.

        Args:
            path: File the lines came from; must be located under ``root``.
            root: Directory that the stored chunk path is made relative to.
            kind: Fact kind recorded on the chunk.
            fact_metadata: Fact metadata; only ``source_role`` is carried
                over, defaulting to ``""`` when absent.
            lines: All lines of the file.
            start_line: First line of the chunk (1-based, inclusive).
            end_line: Last line of the chunk (1-based, inclusive).

        Returns:
            The chunk candidate for the requested range.
        """
        return ContentChunkCandidate(
            path=path.relative_to(root).as_posix(),
            kind=kind,
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
            metadata={"source_role": fact_metadata.get("source_role", "")},
        )

    def _is_within(self, root: Path, path: Path) -> bool:
        """Return True when ``path`` is ``root`` itself or located beneath it."""
        try:
            path.relative_to(root)
        except ValueError:
            return False
        return True