richer normalized structure, permission context preservation

2026-05-06 13:43:16 +02:00
parent a4a4759ac4
commit 24cb3c5b6a
10 changed files with 636 additions and 15 deletions
--- a/src/kontextual_engine/adapters/builtin_extractors/datasets.py
+++ b/src/kontextual_engine/adapters/builtin_extractors/datasets.py
@@ -4,6 +4,7 @@ from __future__ import annotations

 import csv
 import io
+import re
 from typing import Any

 from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
@@ -31,9 +32,10 @@ class CsvDatasetExtractor:
        reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
        columns = list(reader.fieldnames or [])
        rows = [dict(row) for row in reader]
+        links = _links_from_rows(rows)
        table = {
            "name": payload.title,
-            "columns": columns,
+            "columns": [{"name": column, "index": index} for index, column in enumerate(columns)],
            "rows": rows,
            "row_count": len(rows),
        }
@@ -44,6 +46,9 @@ class CsvDatasetExtractor:
            "column_count": len(columns),
            "row_count": len(rows),
            "table_count": 1,
+            "link_count": len(links),
+            "links": links,
+            "sample_rows": rows[:5],
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
@@ -53,15 +58,19 @@ class CsvDatasetExtractor:
            structure={
                "kind": "dataset",
                "format": metadata["dataset_format"],
-                "columns": columns,
+                "columns": [{"name": column, "index": index} for index, column in enumerate(columns)],
                "row_count": len(rows),
+                "table_count": 1,
+                "sample_rows": rows[:5],
            },
            tables=[table],
+            links=links,
            fields={
                "columns": columns,
                "column_count": len(columns),
                "row_count": len(rows),
                "dataset_format": metadata["dataset_format"],
+                "link_count": len(links),
            },
            confidence=0.95,
            extractor_metadata={
@@ -77,3 +86,21 @@ def _delimiter_for(payload: SourcePayload) -> str:
    if payload.media_type == "text/tab-separated-values" or filename.endswith(".tsv"):
        return "\t"
    return ","
+
+
+def _links_from_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Collect http(s) URLs from the string cells of parsed CSV rows.
+
+    Returns one ``{"url", "table", "row", "column"}`` record per match; ``table``
+    is always 0 and ``row`` is zero-based. Non-string cells are skipped, and
+    punctuation trailing a URL ("see https://x.org/a.") is not kept in it.
+    """
+    links: list[dict[str, Any]] = []
+    for row_index, row in enumerate(rows):
+        for column, value in row.items():
+            if not isinstance(value, str):
+                continue
+            for match in re.finditer(r"https?://[^\s<>)]+", value):
+                url = match.group(0).rstrip(".,;:!?\"'")
+                links.append({"url": url, "table": 0, "row": row_index, "column": column})
+    return links
--- a/src/kontextual_engine/adapters/builtin_extractors/documents.py
+++ b/src/kontextual_engine/adapters/builtin_extractors/documents.py
@@ -63,6 +63,9 @@ class DocumentPlaceholderExtractor:
            "document_kind": document_kind,
            "extraction_depth": "metadata_only",
            "unsupported_elements": [unsupported],
+            "unsupported_count": 1,
+            "link_count": 0,
+            "table_count": 0,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
@@ -72,11 +75,15 @@ class DocumentPlaceholderExtractor:
            structure={
                "kind": document_kind,
                "extraction_depth": "metadata_only",
+                "unsupported_elements": [unsupported],
            },
            fields={
                "document_kind": document_kind,
                "source_media_type": payload.media_type,
                "source_size_bytes": payload.size_bytes,
+                "unsupported_count": 1,
+                "link_count": 0,
+                "table_count": 0,
            },
            confidence=0.0,
            unsupported_elements=[unsupported],
--- a/src/kontextual_engine/adapters/builtin_extractors/text.py
+++ b/src/kontextual_engine/adapters/builtin_extractors/text.py
@@ -2,6 +2,8 @@

 from __future__ import annotations

+import re
+
 from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload


@@ -13,8 +15,8 @@ class PlainTextExtractor:
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
-            extraction_depth="text",
-            produces_structure=False,
+            extraction_depth="text_structure",
+            produces_structure=True,
        )

    def supports(self, media_type: str) -> bool:
@@ -22,10 +24,23 @@ class PlainTextExtractor:

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        text = payload.read_text()
+        lines = _lines(text)
+        paragraphs = _paragraphs(text)
+        links = _links(text)
        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
-            fields={"line_count": len(text.splitlines())},
+            structure={
+                "kind": "plain_text",
+                "lines": lines,
+                "paragraphs": paragraphs,
+            },
+            links=links,
+            fields={
+                "line_count": len(lines),
+                "paragraph_count": len(paragraphs),
+                "link_count": len(links),
+            },
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
@@ -36,7 +51,70 @@ class PlainTextExtractor:
            normalized=normalized,
            metadata={
                "extractor": self.name,
+                "line_count": len(lines),
+                "paragraph_count": len(paragraphs),
+                "link_count": len(links),
+                "links": links,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
            },
        )
+
+
+def _lines(text: str) -> list[dict[str, int | str]]:
+    """Return one record per line of ``text``, in order of appearance."""
+    # ``index`` is zero-based and ``line_number`` one-based; ``text`` is the
+    # line as split by ``str.splitlines`` (terminator not included).
+    numbered = enumerate(text.splitlines(), start=1)
+    return [
+        {"index": number - 1, "line_number": number, "text": content}
+        for number, content in numbered
+    ]
+
+
+def _paragraphs(text: str) -> list[dict[str, int | str]]:
+    """Group consecutive non-blank lines of ``text`` into paragraphs.
+
+    A line that is empty or whitespace-only ends the current paragraph and is
+    never part of one.
+
+    Returns:
+        One record per paragraph, in document order, holding its zero-based
+        ``index``, the one-based inclusive ``line_start`` / ``line_end`` and
+        ``text`` (the member lines joined with newlines).
+    """
+    found: list[dict[str, int | str]] = []
+    pending: list[str] = []
+    # The extra empty line is a sentinel: it flushes a paragraph that runs up
+    # to the very last line, so the flush logic exists only once.
+    for number, line in enumerate([*text.splitlines(), ""], start=1):
+        if line.strip():
+            pending.append(line)
+            continue
+        if pending:
+            found.append(
+                {
+                    "index": len(found),
+                    # ``pending`` holds the lines directly preceding ``number``.
+                    "line_start": number - len(pending),
+                    "line_end": number - 1,
+                    "text": "\n".join(pending),
+                }
+            )
+            pending = []
+    return found
+
+
+def _links(text: str) -> list[dict[str, int | str]]:
+    """Find http(s) URLs in ``text`` and report where each one sits.
+
+    A link holds ``url``, the one-based ``line`` and the ``start`` / ``end`` offsets
+    within that line (``end`` exclusive); punctuation trailing the URL is left out.
+    """
+    links: list[dict[str, int | str]] = []
+    for line_index, line in enumerate(text.splitlines(), start=1):
+        for match in re.finditer(r"https?://[^\s<>)]+", line):
+            url = match.group(0).rstrip(".,;:!?\"'")
+            start = match.start()
+            links.append({"url": url, "line": line_index, "start": start, "end": start + len(url)})
+    return links
--- a/src/kontextual_engine/adapters/markitect_tool/markdown.py
+++ b/src/kontextual_engine/adapters/markitect_tool/markdown.py
@@ -42,8 +42,11 @@ class MarkitectMarkdownExtractor:
        document = self._parse_document(mkt, text, source_path)
        serialized = document.to_dict() if hasattr(document, "to_dict") else {}
        snapshot = self._snapshot(mkt, source_path)
+        links = _links_from_tokens(list(serialized.get("tokens", [])))
+        tables = _tables_from_blocks(list(serialized.get("blocks", [])))
        structure = {
            "frontmatter": dict(serialized.get("frontmatter", {})),
+            "blocks": list(serialized.get("blocks", [])),
            "headings": list(serialized.get("headings", [])),
            "sections": list(serialized.get("sections", [])),
        }
@@ -51,10 +54,15 @@ class MarkitectMarkdownExtractor:
            title=payload.title,
            text=text,
            structure=structure,
+            tables=tables,
+            links=links,
            fields={
                "frontmatter": dict(serialized.get("frontmatter", {})),
+                "block_count": len(structure["blocks"]),
                "heading_count": len(structure["headings"]),
                "section_count": len(structure["sections"]),
+                "table_count": len(tables),
+                "link_count": len(links),
            },
            confidence=1.0,
            extractor_metadata={
@@ -68,7 +76,12 @@ class MarkitectMarkdownExtractor:
            metadata={
                "extractor": self.name,
                "frontmatter": structure["frontmatter"],
+                "blocks": structure["blocks"],
                "headings": structure["headings"],
+                "sections": structure["sections"],
+                "links": links,
+                "link_count": len(links),
+                "table_count": len(tables),
                "snapshot": snapshot,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
@@ -84,3 +97,52 @@ class MarkitectMarkdownExtractor:
        if not source_path or not Path(source_path).exists() or not hasattr(mkt, "snapshot_identity_for_file"):
            return {}
        return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()
+
+
+def _links_from_tokens(tokens: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Collect the targets of the link tokens in a serialized token tree.
+
+    Every ``link_open`` token reached by ``_walk_tokens`` yields one
+    ``{"url": href, "kind": "markdown_link"}`` record, unless its ``href`` is missing or empty.
+    """
+    openers = [node for node in _walk_tokens(tokens) if node.get("type") == "link_open"]
+    hrefs = [_attr_value(node.get("attrs"), "href") for node in openers]
+    return [{"url": href, "kind": "markdown_link"} for href in hrefs if href]
+
+
+def _walk_tokens(tokens: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Flatten a token tree in pre-order; non-dict entries of ``children`` are dropped."""
+    flat: list[dict[str, Any]] = []
+    for node in tokens:
+        nested = node.get("children")
+        dict_children = [kid for kid in nested if isinstance(kid, dict)] if isinstance(nested, list) else []
+        flat += [node, *_walk_tokens(dict_children)]
+    return flat
+
+
+def _attr_value(attrs: Any, name: str) -> str | None:
+    """Return attribute ``name`` as a string, or ``None`` when it is absent or null.
+
+    ``attrs`` may be a mapping or a list of ``[key, value]`` pairs; any other shape gives ``None``.
+    """
+    pairs = list(attrs.items()) if isinstance(attrs, dict) else attrs if isinstance(attrs, list) else []
+    values = [pair[1] for pair in pairs if isinstance(pair, (list, tuple)) and len(pair) == 2 and pair[0] == name]
+    # First match wins; a null value is reported as missing rather than as the string "None".
+    return str(values[0]) if values and values[0] is not None else None
+
+
+def _tables_from_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Summarize every block of type ``table`` found in ``blocks``."""
+    # ``index`` numbers the tables only, while ``source_block_index`` is the block's
+    # position in ``blocks``; a missing ``text`` becomes "" and missing line bounds ``None``.
+    positions = [at for at, block in enumerate(blocks) if block.get("type") == "table"]
+    return [
+        {
+            "index": ordinal,
+            "source_block_index": at,
+            "text": blocks[at].get("text", ""),
+            "line_start": blocks[at].get("line_start"),
+            "line_end": blocks[at].get("line_end"),
+        }
+        for ordinal, at in enumerate(positions)
+    ]