generated from coulomb/repo-seed
richer normalized structure, permission context preservation
This commit is contained in:
91
tests/test_normalized_structure.py
Normal file
91
tests/test_normalized_structure.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from kontextual_engine import SourcePayload, SourceReference, content_digest
|
||||
from kontextual_engine.adapters.builtin_extractors import (
|
||||
CsvDatasetExtractor,
|
||||
DocumentPlaceholderExtractor,
|
||||
PlainTextExtractor,
|
||||
)
|
||||
|
||||
|
||||
def test_plain_text_extractor_emits_structural_units_and_links() -> None:
    """Check the normalized output of ``PlainTextExtractor`` for a short text.

    The input has four lines (one of them blank) forming two paragraphs and
    contains a single URL on its second line; the assertions pin the reported
    structure kind, the line/paragraph counts, the link record and the
    ``link_count`` metadata entry.
    """
    text = "Intro line\nwith https://example.test/ref\n\nSecond paragraph\n"
    # "with " occupies offsets 0-4, so the 24-character URL spans 5..29.
    expected_links = [
        {
            "url": "https://example.test/ref",
            "line": 2,
            "start": 5,
            "end": 29,
        }
    ]

    extraction = PlainTextExtractor().extract(
        source_payload(text, media_type="text/plain")
    )

    assert extraction.normalized.structure["kind"] == "plain_text"
    assert extraction.normalized.fields["line_count"] == 4
    assert extraction.normalized.fields["paragraph_count"] == 2
    assert extraction.normalized.links == expected_links
    assert extraction.metadata["link_count"] == 1
|
||||
|
||||
|
||||
def test_csv_dataset_extractor_emits_table_schema_samples_and_links() -> None:
    """Check the normalized output of ``CsvDatasetExtractor`` for a tiny CSV.

    The input has a header row plus two data rows, the first of which carries
    a URL in its ``source`` column. The assertions pin the structure kind, the
    column schema, the first table's row count and first row, the link record
    and the first entry of the ``sample_rows`` metadata.
    """
    csv_text = "name,source\nalpha,https://example.test/a\nbeta,\n"
    expected_columns = [
        {"name": "name", "index": 0},
        {"name": "source", "index": 1},
    ]
    expected_links = [
        {
            "url": "https://example.test/a",
            "table": 0,
            "row": 0,
            "column": "source",
        }
    ]

    extraction = CsvDatasetExtractor().extract(
        source_payload(csv_text, media_type="text/csv", filename="metrics.csv")
    )

    assert extraction.normalized.structure["kind"] == "dataset"
    assert extraction.normalized.structure["columns"] == expected_columns
    assert extraction.normalized.tables[0]["row_count"] == 2
    assert extraction.normalized.tables[0]["rows"][0]["name"] == "alpha"
    assert extraction.normalized.links == expected_links
    assert extraction.metadata["sample_rows"][0]["source"] == "https://example.test/a"
|
||||
|
||||
|
||||
def test_document_placeholder_exposes_unsupported_structure() -> None:
    """Check what ``DocumentPlaceholderExtractor`` reports for a PDF payload.

    Only a PDF header line is supplied as content. The assertions pin the
    structure kind, the unsupported/link counters, the reason recorded on the
    first unsupported element and the code of the first diagnostic.
    """
    pdf_payload = source_payload(
        b"%PDF-1.7\n",
        media_type="application/pdf",
        filename="brief.pdf",
    )

    extraction = DocumentPlaceholderExtractor().extract(pdf_payload)

    assert extraction.normalized.structure["kind"] == "pdf"
    assert extraction.normalized.fields["unsupported_count"] == 1
    assert extraction.normalized.fields["link_count"] == 0
    first_unsupported_reason = extraction.normalized.unsupported_elements[0]["reason"]
    assert first_unsupported_reason == "deep_extraction_not_available"
    assert extraction.diagnostics[0].code == "extraction.depth_unsupported"
|
||||
|
||||
|
||||
def source_payload(
    content: str | bytes,
    *,
    media_type: str,
    filename: str = "source.txt",
) -> SourcePayload:
    """Build a ``SourcePayload`` for the extractor tests from in-memory content.

    Args:
        content: Payload body; a ``str`` is encoded as UTF-8, ``bytes`` are
            used unchanged.
        media_type: Media type passed through to the payload.
        filename: Name used for the reference path, the source URI, the
            ``test:<filename>`` connector reference, the ``filename`` metadata
            entry and (minus its last dot-suffix) the title.

    Returns:
        A payload for the ``"test"`` connector whose source reference carries
        the ``content_digest`` of the encoded bytes as its checksum.
    """
    if isinstance(content, str):
        raw = content.encode("utf-8")
    else:
        raw = content

    # Strip only the final ".ext"; a name without a dot is kept whole.
    title = filename.rsplit(".", 1)[0]

    reference = SourceReference(
        source_system="test",
        path=filename,
        checksum=content_digest(raw),
        connector_ref=f"test:{filename}",
    )
    return SourcePayload(
        connector_name="test",
        source_uri=filename,
        source_ref=reference,
        media_type=media_type,
        content=raw,
        title=title,
        metadata={"filename": filename},
    )
|
||||
Reference in New Issue
Block a user