First steps toward better scanning of repos

This commit is contained in:
2026-05-02 00:11:55 +02:00
parent 2c427d253c
commit 89c4081001
9 changed files with 270 additions and 35 deletions

View File

@@ -1,12 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from repo_registry.core.models import ObservedFact
INDEXED_FACT_KINDS = {
"scope",
"documentation",
"example",
"test",
@@ -29,6 +30,7 @@ class ContentChunkCandidate:
start_line: int
end_line: int
text: str
metadata: dict[str, object] = field(default_factory=dict)
class ContentExtractor:
@@ -80,6 +82,7 @@ class ContentExtractor:
path,
root,
fact.kind,
fact.metadata,
lines,
start_line,
end_line,
@@ -91,7 +94,15 @@ class ContentExtractor:
start_line = start_index + 1
end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
chunks.append(
self._chunk(path, root, fact.kind, lines, start_line, end_line)
self._chunk(
path,
root,
fact.kind,
fact.metadata,
lines,
start_line,
end_line,
)
)
return chunks
@@ -100,6 +111,7 @@ class ContentExtractor:
path: Path,
root: Path,
kind: str,
fact_metadata: dict[str, object],
lines: list[str],
start_line: int,
end_line: int,
@@ -110,6 +122,7 @@ class ContentExtractor:
start_line=start_line,
end_line=end_line,
text="\n".join(lines[start_line - 1 : end_line]).strip(),
metadata={"source_role": fact_metadata.get("source_role", "")},
)
def _is_within(self, root: Path, path: Path) -> bool: