generated from coulomb/repo-seed
135 lines
3.7 KiB
Python
135 lines
3.7 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
from repo_registry.core.models import ObservedFact
|
|
|
|
|
|
# Kinds of observed facts that ContentExtractor.extract turns into chunks;
# a fact of any other kind is skipped.
INDEXED_FACT_KINDS = {
    "intent", "scope", "documentation",
    "example", "test", "manifest",
    "interface", "config", "llm_provider",
    "credential_config", "provider_registry", "fallback_policy",
}

# Number of lines per chunk when a whole file is split into consecutive chunks.
MAX_CHUNK_LINES = 40

# Files whose on-disk size exceeds this many bytes produce no chunks.
MAX_FILE_BYTES = 200_000
|
|
|
|
|
|
@dataclass(frozen=True)
class ContentChunkCandidate:
    """Immutable record describing one extracted span of text from a file."""

    # File path; ContentExtractor fills this with a POSIX-style path relative
    # to the source root.
    path: str
    # Kind of the fact the chunk was produced for.
    kind: str
    # 1-based first and last line of the span (both inclusive).
    start_line: int
    end_line: int
    # The span's lines joined with "\n", stripped of surrounding whitespace.
    text: str
    # Extra key/value data; every instance gets its own empty dict by default.
    metadata: dict[str, object] = field(default_factory=dict)
|
|
|
|
|
|
class ContentExtractor:
    """Extract deterministic text chunks from source-linked observed facts.

    A fact is processed only when its kind is in ``INDEXED_FACT_KINDS`` and
    its path names a regular file located inside the source root.  A fact
    whose metadata carries an integer ``line`` yields a single window of
    context around that line; any other fact yields the whole file split
    into consecutive chunks of at most ``MAX_CHUNK_LINES`` lines.
    """

    # Number of context lines kept before / after a fact's anchor line.
    CONTEXT_LINES_BEFORE = 5
    CONTEXT_LINES_AFTER = 10

    def extract(
        self,
        source_path: str | Path,
        facts: list[ObservedFact],
    ) -> list[ContentChunkCandidate]:
        """Return the de-duplicated chunks for *facts*, in a stable order.

        Args:
            source_path: Root directory the facts' relative paths refer to.
            facts: Observed facts; those with an unindexed kind, an empty
                path, a path escaping the root, or a path that is not a
                regular file are skipped.

        Returns:
            Chunks sorted by ``(path, start_line, kind)``.  Chunks sharing the
            same path, kind and line range are emitted only once.
        """
        root = Path(source_path).expanduser().resolve()
        chunks: list[ContentChunkCandidate] = []
        seen: set[tuple[str, str, int, int]] = set()
        for fact in facts:
            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
                continue
            # Resolve before the containment check so ".." segments and
            # symlinks cannot point outside the root.
            path = (root / fact.path).resolve()
            if not self._is_within(root, path) or not path.is_file():
                continue
            for chunk in self._chunks_for_fact(path, root, fact):
                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
                if key in seen:
                    continue
                seen.add(key)
                chunks.append(chunk)
        return sorted(chunks, key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind))

    def _chunks_for_fact(
        self,
        path: Path,
        root: Path,
        fact: ObservedFact,
    ) -> list[ContentChunkCandidate]:
        """Build the chunks a single fact contributes for the file at *path*.

        Returns an empty list when the file is larger than ``MAX_FILE_BYTES``,
        cannot be read, or contains no lines.
        """
        try:
            if path.stat().st_size > MAX_FILE_BYTES:
                return []
            # Undecodable bytes are dropped rather than failing the whole file.
            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
        except OSError:
            return []
        if not lines:
            return []

        line = fact.metadata.get("line")
        if isinstance(line, int):
            # The anchor is clamped into the file, so a negative or
            # past-the-end line can no longer produce a negative slice index
            # or an inverted (start > end) range.
            start_line, end_line = self._window_around(line, len(lines))
            return [
                self._chunk(
                    path,
                    root,
                    fact.kind,
                    fact.metadata,
                    lines,
                    start_line,
                    end_line,
                )
            ]

        # No anchor line: cover the whole file in fixed-size chunks.
        chunks: list[ContentChunkCandidate] = []
        for start_index in range(0, len(lines), MAX_CHUNK_LINES):
            start_line = start_index + 1
            end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
            chunks.append(
                self._chunk(
                    path,
                    root,
                    fact.kind,
                    fact.metadata,
                    lines,
                    start_line,
                    end_line,
                )
            )
        return chunks

    @classmethod
    def _window_around(cls, line: int, total_lines: int) -> tuple[int, int]:
        """Return the 1-based inclusive ``(start, end)`` window around *line*.

        *line* is first clamped into ``[1, total_lines]``; the window then
        extends ``CONTEXT_LINES_BEFORE`` lines before and
        ``CONTEXT_LINES_AFTER`` lines after it, cut off at the file's bounds.
        *total_lines* must be at least 1.
        """
        anchor = min(max(line, 1), total_lines)
        start_line = max(1, anchor - cls.CONTEXT_LINES_BEFORE)
        end_line = min(total_lines, anchor + cls.CONTEXT_LINES_AFTER)
        return start_line, end_line

    def _chunk(
        self,
        path: Path,
        root: Path,
        kind: str,
        fact_metadata: dict[str, object],
        lines: list[str],
        start_line: int,
        end_line: int,
    ) -> ContentChunkCandidate:
        """Build the candidate for lines *start_line*..*end_line* (1-based, inclusive).

        Only the ``source_role`` entry of *fact_metadata* is carried over; it
        defaults to an empty string when absent.
        """
        return ContentChunkCandidate(
            path=path.relative_to(root).as_posix(),
            kind=kind,
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
            metadata={"source_role": fact_metadata.get("source_role", "")},
        )

    def _is_within(self, root: Path, path: Path) -> bool:
        """Return True when *path* is *root* itself or located beneath it."""
        try:
            path.relative_to(root)
        except ValueError:
            return False
        return True
|