from kontextual_engine import SourcePayload, SourceReference, content_digest
from kontextual_engine.adapters.builtin_extractors import (
    CsvDatasetExtractor,
    DocumentPlaceholderExtractor,
    PlainTextExtractor,
)


def test_plain_text_extractor_emits_structural_units_and_links() -> None:
    """Plain text extraction reports its kind, line/paragraph counts and link span."""
    text_source = source_payload(
        "Intro line\nwith https://example.test/ref\n\nSecond paragraph\n",
        media_type="text/plain",
    )

    extraction = PlainTextExtractor().extract(text_source)

    normalized = extraction.normalized
    assert normalized.structure["kind"] == "plain_text"

    fields = normalized.fields
    assert fields["line_count"] == 4
    assert fields["paragraph_count"] == 2

    # The only URL sits on the second line of the input, after the "with " prefix.
    expected_links = [
        {
            "url": "https://example.test/ref",
            "line": 2,
            "start": 5,
            "end": 29,
        }
    ]
    assert normalized.links == expected_links
    assert extraction.metadata["link_count"] == 1


def test_csv_dataset_extractor_emits_table_schema_samples_and_links() -> None:
    """CSV extraction reports column schema, table rows, cell links and sample rows."""
    csv_source = source_payload(
        "name,source\nalpha,https://example.test/a\nbeta,\n",
        media_type="text/csv",
        filename="metrics.csv",
    )

    extraction = CsvDatasetExtractor().extract(csv_source)

    normalized = extraction.normalized
    structure = normalized.structure
    assert structure["kind"] == "dataset"

    expected_columns = [
        {"name": "name", "index": 0},
        {"name": "source", "index": 1},
    ]
    assert structure["columns"] == expected_columns

    first_table = normalized.tables[0]
    assert first_table["row_count"] == 2
    assert first_table["rows"][0]["name"] == "alpha"

    # Only the "alpha" row carries a URL; the "beta" row has an empty source cell.
    expected_links = [
        {
            "url": "https://example.test/a",
            "table": 0,
            "row": 0,
            "column": "source",
        }
    ]
    assert normalized.links == expected_links

    first_sample = extraction.metadata["sample_rows"][0]
    assert first_sample["source"] == "https://example.test/a"


def test_document_placeholder_exposes_unsupported_structure() -> None:
    """The placeholder extractor flags a PDF payload as unsupported for deep extraction."""
    pdf_source = source_payload(
        b"%PDF-1.7\n",
        media_type="application/pdf",
        filename="brief.pdf",
    )

    extraction = DocumentPlaceholderExtractor().extract(pdf_source)

    normalized = extraction.normalized
    assert normalized.structure["kind"] == "pdf"

    fields = normalized.fields
    assert fields["unsupported_count"] == 1
    assert fields["link_count"] == 0

    first_unsupported = normalized.unsupported_elements[0]
    assert first_unsupported["reason"] == "deep_extraction_not_available"

    first_diagnostic = extraction.diagnostics[0]
    assert first_diagnostic.code == "extraction.depth_unsupported"


def source_payload(
    content: str | bytes,
    *,
    media_type: str,
    filename: str = "source.txt",
) -> SourcePayload:
    """Build a ``SourcePayload`` for the "test" connector from in-memory content.

    Args:
        content: Payload body; ``str`` input is encoded as UTF-8, ``bytes`` is
            used as given.
        media_type: Media type recorded on the payload.
        filename: Name used as the source URI, the reference path, the
            ``filename`` metadata entry and, minus its last dot-suffix, the title.

    Returns:
        A payload whose source reference carries the digest of the raw bytes.
    """
    if isinstance(content, str):
        raw = content.encode("utf-8")
    else:
        raw = content

    reference = SourceReference(
        source_system="test",
        path=filename,
        checksum=content_digest(raw),
        connector_ref=f"test:{filename}",
    )

    # Split on the last dot only, so "a.b.csv" yields the title "a.b".
    title = filename.rsplit(".", maxsplit=1)[0]
    return SourcePayload(
        connector_name="test",
        source_uri=filename,
        source_ref=reference,
        media_type=media_type,
        content=raw,
        title=title,
        metadata={"filename": filename},
    )