Files
repo-scoping/src/repo_registry/content_indexing/extractor.py
2026-05-02 00:36:00 +02:00

135 lines
3.7 KiB
Python

from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from repo_registry.core.models import ObservedFact
# Fact kinds whose linked source files are turned into content chunks.
# ContentExtractor.extract skips every fact whose kind is not listed here.
INDEXED_FACT_KINDS = {
    "intent",
    "scope",
    "documentation",
    "example",
    "test",
    "manifest",
    "interface",
    "config",
    "llm_provider",
    "credential_config",
    "provider_registry",
    "fallback_policy",
}

# Upper bound on the number of lines per chunk when a whole file is split
# into consecutive windows.
MAX_CHUNK_LINES = 40

# Files whose on-disk size exceeds this many bytes are not read and
# therefore produce no chunks.
MAX_FILE_BYTES = 200_000
@dataclass(frozen=True)
class ContentChunkCandidate:
    """Immutable description of one text chunk taken from a source file.

    Instances compare equal only when every field, including ``metadata``,
    is equal.  ``metadata`` is a dict and therefore unhashable, so it is
    left out of the generated ``__hash__``; this keeps candidates usable in
    sets and as dict keys while equal candidates still hash alike.
    """

    # File path of the chunk (ContentExtractor stores it POSIX-style,
    # relative to the extraction root).
    path: str
    # Kind of the observed fact the chunk was produced for.
    kind: str
    # 1-based number of the first line covered by the chunk.
    start_line: int
    # 1-based number of the last line covered by the chunk (inclusive).
    end_line: int
    # Chunk text.
    text: str
    # Free-form extra data; takes part in equality but not in hashing.
    metadata: dict[str, object] = field(default_factory=dict, hash=False)
class ContentExtractor:
    """Extract deterministic text chunks from source-linked observed facts."""

    # Number of lines of context kept before / after a fact's anchor line.
    _CONTEXT_LINES_BEFORE = 5
    _CONTEXT_LINES_AFTER = 10

    def extract(
        self,
        source_path: str | Path,
        facts: list[ObservedFact],
    ) -> list[ContentChunkCandidate]:
        """Return the chunk candidates for all indexable ``facts``.

        Args:
            source_path: Root directory; ``~`` is expanded and the path is
                resolved before use.
            facts: Observed facts.  A fact is skipped when its kind is not in
                ``INDEXED_FACT_KINDS``, when it has no path, or when its path
                does not resolve to an existing file under the root.

        Returns:
            Candidates de-duplicated on ``(path, kind, start_line, end_line)``
            and sorted by ``(path, start_line, kind)``.
        """
        root = Path(source_path).expanduser().resolve()
        chunks: list[ContentChunkCandidate] = []
        seen: set[tuple[str, str, int, int]] = set()
        for fact in facts:
            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
                continue
            # Resolve before the containment check so that ".." segments
            # cannot point outside the root.
            path = (root / fact.path).resolve()
            if not self._is_within(root, path) or not path.is_file():
                continue
            for chunk in self._chunks_for_fact(path, root, fact):
                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
                if key in seen:
                    continue
                seen.add(key)
                chunks.append(chunk)
        chunks.sort(key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind))
        return chunks

    def _chunks_for_fact(
        self,
        path: Path,
        root: Path,
        fact: ObservedFact,
    ) -> list[ContentChunkCandidate]:
        """Return the chunks of ``path`` that belong to ``fact``.

        When ``fact.metadata["line"]`` is an int that yields a usable window,
        a single chunk around that line is returned.  Otherwise the whole
        file is split into consecutive chunks of at most ``MAX_CHUNK_LINES``
        lines.  Files larger than ``MAX_FILE_BYTES``, files that cannot be
        read (``OSError``) and files without any line produce no chunks.
        """
        try:
            if path.stat().st_size > MAX_FILE_BYTES:
                return []
            # Undecodable bytes are dropped instead of failing the file.
            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
        except OSError:
            return []
        if not lines:
            return []

        total = len(lines)
        window = self._line_window(fact.metadata.get("line"), total)
        if window is not None:
            spans = [window]
        else:
            spans = [
                (offset + 1, min(total, offset + MAX_CHUNK_LINES))
                for offset in range(0, total, MAX_CHUNK_LINES)
            ]
        return [
            self._chunk(
                path,
                root,
                fact.kind,
                fact.metadata,
                lines,
                start_line,
                end_line,
            )
            for start_line, end_line in spans
        ]

    @classmethod
    def _line_window(cls, line: object, total_lines: int) -> tuple[int, int] | None:
        """Return the 1-based inclusive ``(start, end)`` window around ``line``.

        The window is clamped to ``1..total_lines``.  ``None`` is returned
        when ``line`` is not an int or when it lies so far outside the file
        that the clamped window would be empty or inverted (for example a
        stale line number pointing well past the end of the file); callers
        then fall back to whole-file chunking.
        """
        if not isinstance(line, int):
            return None
        start_line = max(1, line - cls._CONTEXT_LINES_BEFORE)
        end_line = min(total_lines, line + cls._CONTEXT_LINES_AFTER)
        if start_line > end_line:
            return None
        return start_line, end_line

    def _chunk(
        self,
        path: Path,
        root: Path,
        kind: str,
        fact_metadata: dict[str, object],
        lines: list[str],
        start_line: int,
        end_line: int,
    ) -> ContentChunkCandidate:
        """Build the candidate for lines ``start_line..end_line`` (1-based, inclusive).

        The stored path is ``path`` relative to ``root`` in POSIX form, the
        text is the joined lines with surrounding whitespace stripped, and
        only ``source_role`` (default ``""``) is copied from the fact
        metadata.
        """
        return ContentChunkCandidate(
            path=path.relative_to(root).as_posix(),
            kind=kind,
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
            metadata={"source_role": fact_metadata.get("source_role", "")},
        )

    def _is_within(self, root: Path, path: Path) -> bool:
        """Return True when ``path`` is ``root`` itself or located below it."""
        try:
            path.relative_to(root)
        except ValueError:
            return False
        return True