generated from coulomb/repo-seed
First steps toward better scanning of repos
This commit is contained in:
@@ -1,12 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from repo_registry.core.models import ObservedFact
|
||||
|
||||
|
||||
INDEXED_FACT_KINDS = {
|
||||
"scope",
|
||||
"documentation",
|
||||
"example",
|
||||
"test",
|
||||
@@ -29,6 +30,7 @@ class ContentChunkCandidate:
|
||||
start_line: int
|
||||
end_line: int
|
||||
text: str
|
||||
metadata: dict[str, object] = field(default_factory=dict)
|
||||
|
||||
|
||||
class ContentExtractor:
|
||||
@@ -80,6 +82,7 @@ class ContentExtractor:
|
||||
path,
|
||||
root,
|
||||
fact.kind,
|
||||
fact.metadata,
|
||||
lines,
|
||||
start_line,
|
||||
end_line,
|
||||
@@ -91,7 +94,15 @@ class ContentExtractor:
|
||||
start_line = start_index + 1
|
||||
end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
|
||||
chunks.append(
|
||||
self._chunk(path, root, fact.kind, lines, start_line, end_line)
|
||||
self._chunk(
|
||||
path,
|
||||
root,
|
||||
fact.kind,
|
||||
fact.metadata,
|
||||
lines,
|
||||
start_line,
|
||||
end_line,
|
||||
)
|
||||
)
|
||||
return chunks
|
||||
|
||||
@@ -100,6 +111,7 @@ class ContentExtractor:
|
||||
path: Path,
|
||||
root: Path,
|
||||
kind: str,
|
||||
fact_metadata: dict[str, object],
|
||||
lines: list[str],
|
||||
start_line: int,
|
||||
end_line: int,
|
||||
@@ -110,6 +122,7 @@ class ContentExtractor:
|
||||
start_line=start_line,
|
||||
end_line=end_line,
|
||||
text="\n".join(lines[start_line - 1 : end_line]).strip(),
|
||||
metadata={"source_role": fact_metadata.get("source_role", "")},
|
||||
)
|
||||
|
||||
def _is_within(self, root: Path, path: Path) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user