First steps toward better scanning of repos

2026-05-02 00:11:55 +02:00
parent 2c427d253c
commit 89c4081001
9 changed files with 270 additions and 35 deletions
--- a/src/repo_registry/content_indexing/extractor.py
+++ b/src/repo_registry/content_indexing/extractor.py
@@ -1,12 +1,13 @@
 from __future__ import annotations

-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path

 from repo_registry.core.models import ObservedFact


 INDEXED_FACT_KINDS = {
+    "scope",
    "documentation",
    "example",
    "test",
@@ -29,6 +30,7 @@ class ContentChunkCandidate:
    start_line: int
    end_line: int
    text: str
+    metadata: dict[str, object] = field(default_factory=dict)


 class ContentExtractor:
@@ -80,6 +82,7 @@ class ContentExtractor:
                    path,
                    root,
                    fact.kind,
+                    fact.metadata,
                    lines,
                    start_line,
                    end_line,
@@ -91,7 +94,15 @@ class ContentExtractor:
            start_line = start_index + 1
            end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
            chunks.append(
-                self._chunk(path, root, fact.kind, lines, start_line, end_line)
+                self._chunk(
+                    path,
+                    root,
+                    fact.kind,
+                    fact.metadata,
+                    lines,
+                    start_line,
+                    end_line,
+                )
            )
        return chunks

@@ -100,6 +111,7 @@ class ContentExtractor:
        path: Path,
        root: Path,
        kind: str,
+        fact_metadata: dict[str, object],
        lines: list[str],
        start_line: int,
        end_line: int,
@@ -110,6 +122,7 @@ class ContentExtractor:
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
+            metadata={"source_role": fact_metadata.get("source_role", "")},
        )

    def _is_within(self, root: Path, path: Path) -> bool: